In [33]:

#Adding libraries
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_file_paths = (
    os.path.join(root, fname)
    for root, _, fnames in os.walk('/kaggle/input')
    for fname in fnames
)
for input_path in input_file_paths:
    print(input_path)


**Approach:**
The first step was to import all the required libraries and load the train and test files into the notebook. By inspecting the data, we found that there are no null values or data redundancies in the train and test datasets, which removes the need for data cleaning/imputation. Our next step was to visualize the training dataset, which showed that content made for students would be the most beneficial to a video platform, since students had the highest engagement score of the three professions. Next, since there are 2 categorical features in our training dataset, we converted them to numeric tags using label encoding. We then separated the train data into training and validation sets in a 3:1 ratio. Next, we imported different ML models and assessed them using r2_score as the evaluation metric. We found that the XGB regressor was the best of the chosen models for predicting the engagement score. Last but not least, we submitted the predictions of the XGB regressor model, replacing any negative predicted engagement score with 0 so that only non-negative scores appear in the sheet.

In [34]:
import matplotlib.pyplot as plt

import seaborn as sns

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn import metrics


import warnings
warnings.filterwarnings('always')
warnings.filterwarnings('ignore')

**1) Reading dataset**

In [35]:
def load_data(data_dir="../input/jobathon-february-2022",
              train_file="train_0OECtn8.csv",
              test_file="test_1zqHu22.csv"):
    """Read the train and test CSV files, indexed by their ``row_id`` column.

    Parameters
    ----------
    data_dir : str or path-like, optional
        Directory that contains both CSV files.
    train_file : str, optional
        File name of the training CSV inside ``data_dir``.
    test_file : str, optional
        File name of the test CSV inside ``data_dir``.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(df_train, df_test)``, each using ``row_id`` as its index.

    Notes
    -----
    NOTE(review): the notebook's own loading cell reads ``train.csv`` and
    ``test.csv`` from the same directory -- confirm which file names are correct.
    """
    # Imported locally because the notebook never imports Path at the top
    # level; without this, calling the function raised NameError.
    from pathlib import Path

    data_dir = Path(data_dir)

    df_train = pd.read_csv(data_dir / train_file, index_col="row_id")
    df_test = pd.read_csv(data_dir / test_file, index_col="row_id")

    return df_train, df_test


In [36]:
# Load the competition train/test CSVs and display their (rows, columns) shapes.
INPUT_DIR = '../input/jobathon-february-2022'
train = pd.read_csv(f'{INPUT_DIR}/train.csv')
test = pd.read_csv(f'{INPUT_DIR}/test.csv')
(train.shape, test.shape)


**2) Data Inspection**

In [37]:
# Percentage of null values in each column of the train dataset.
train.isnull().sum().div(train.shape[0]).mul(100)


We can see above that no data is null in the train dataset.  

In [38]:
# Percentage of null values in each column of the test dataset.
test.isnull().sum().div(test.shape[0]).mul(100)

We can see above that no data is null in the test dataset.  

In [39]:
# Count the categorical and numerical features in the train dataset.
# Categorical features: object-dtype columns. The string alias 'object' is used
# because the `np.object` alias was removed from NumPy (AttributeError on 1.24+).
categorical = train.select_dtypes(include=['object'])
print("Categorical Features present in Train Set:",categorical.shape[1])

# Numerical features: float64 and int64 columns.
numerical = train.select_dtypes(include=[np.float64, np.int64])
print("Numerical Features present in Train Set:",numerical.shape[1])

In [40]:
# Count the categorical and numerical features in the test dataset.
# Categorical features: object-dtype columns. The string alias 'object' is used
# because the `np.object` alias was removed from NumPy (AttributeError on 1.24+).
categorical = test.select_dtypes(include=['object'])
print("Categorical Features present in test Set:",categorical.shape[1])

# Numerical features: float64 and int64 columns.
numerical = test.select_dtypes(include=[np.float64, np.int64])
print("Numerical Features present in test Set:",numerical.shape[1])

***3) Data Cleaning***


In [41]:
# Since there are no null values in the train and test datasets, no data cleaning is required.

**4) Checking Data Redundancies**

In [42]:
# Preview the first rows of the training data to look for redundant or inconsistent values.
train.head()

In [43]:
# List the column names of the training data.
train.columns

In [44]:
# Count the rows per distinct 'profession' label to spot redundant/duplicate
# spellings. The author observed none, so this column is left unchanged.
train['profession'].value_counts()
                                    


In [45]:
# Count the rows per distinct 'gender' label to spot redundant/duplicate
# spellings. The author observed none, so this column is left unchanged.
train['gender'].value_counts()

**5) Data Visualization**

In [46]:
# Number of training rows per gender.
plt.figure(figsize=(25,7))
# The column is passed as `x=`: seaborn >= 0.12 accepts only `data` positionally,
# so the positional form raised a TypeError there.
sns.countplot(x='gender',data=train,palette='spring')

In [47]:
# Number of training rows per profession.
plt.figure(figsize=(25,7))
# The column is passed as `x=`: seaborn >= 0.12 accepts only `data` positionally,
# so the positional form raised a TypeError there.
sns.countplot(x='profession',data=train,palette='summer')

In [48]:
# Number of training rows per age value.
plt.figure(figsize=(25,7))
# The column is passed as `x=`: seaborn >= 0.12 accepts only `data` positionally,
# so the positional form raised a TypeError there.
sns.countplot(x='age',data=train,palette='twilight')

In [49]:
# Engagement score per profession, drawn as horizontal bars.
# Author's reading of the chart: content made for students would be the most
# beneficial to a video platform, since the engagement score of students is
# the greatest of the three professions.
fig, ax = plt.subplots(figsize=(10, 8))
sns.barplot(
    data=train,
    x='engagement_score',
    y='profession',
    palette='flag',
    ax=ax,
)

**7) Encoding Categorical Variables**


In [50]:
# Preview the training rows before the categorical columns are label-encoded.
train.head()

In [51]:
# Label-encode every categorical (object-dtype) column of the train data and
# apply the same mapping to the test data.
var_mod = train.select_dtypes(include='object').columns
for i in var_mod:
    # One encoder per column, fitted on the training values only.
    le = LabelEncoder()
    train[i] = le.fit_transform(train[i])
    # Reuse the fitted mapping for the test set. Re-fitting on test (as before)
    # can assign different integers to the same category whenever the two sets
    # do not contain exactly the same labels. `transform` raises ValueError on a
    # label that was not seen in train, instead of silently mis-encoding it.
    test[i] = le.transform(test[i])

In [52]:
# List the training columns after label encoding.
train.columns

**8) Data Separation**

In [53]:
# Separate the target variable (engagement_score) from the feature columns.
y = train['engagement_score']
X = train.drop('engagement_score', axis=1)

In [54]:
# Hold out 25% of the rows as a validation set; the fixed random_state keeps
# the split reproducible.
splits = train_test_split(X, y, test_size=0.25, random_state=22)
X_train, X_valid, y_train, y_valid = splits

**9) ML Models**

In [55]:
from sklearn.linear_model import Ridge, Lasso, LinearRegression, BayesianRidge
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from xgboost import XGBRegressor

from sklearn.ensemble import RandomForestRegressor, GradientBoostingClassifier
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.pipeline import Pipeline
# from sklearn.impute import SimpleImputer

In [56]:
# my_pipeline = Pipeline(steps=[('preprocessor', SimpleImputer()),
#                               ('model', RandomForestRegressor(n_estimators=50,
#                                                               random_state=0))
#                              ])

In [57]:
# from sklearn.model_selection import cross_val_score
# from sklearn.metrics import r2_score

# # Multiply by -1 since sklearn calculates *negative* MAE
# scores = -1 * cross_val_score(my_pipeline, X, y,
#                               cv=5,
#                               scoring= 'r2')

# print("score :\n", scores)

In [58]:
# Candidate regressors to compare on the validation split.
# The tree-based models get a fixed random_state so that the comparison is
# reproducible from run to run (they were previously unseeded).
technique = [
    LinearRegression(),
    Ridge(),
    Lasso(),
    KNeighborsRegressor(),
    DecisionTreeRegressor(random_state=42),
    XGBRegressor(
        max_depth=10,
        n_estimators=1000,
        min_child_weight=0.5,
        colsample_bytree=0.8,
        subsample=0.8,
        eta=0.1,
        seed=42,
    ),
    RandomForestRegressor(random_state=42),
    BayesianRidge(),
]

# Display names, listed in the same order as `technique`.
technique_names = [
    'Linear Regression',
    'Ridge Regression',
    'Lasso Regression',
    'K Neighbors Regressor',
    'Decision Tree Regressor',
    'XGBRegressor',
    'Random Forest Regressor',
    'Bayesian Ridge',
]

# Validation R2 scores, one per entry of `technique`.
r2_list = []

In [59]:
# Show the score list; it is only populated once the evaluation loop has run.
r2_list

***9.1 Evaluating models using R2 score***


In [60]:
# Fit every candidate on the training split and score it on the validation split.
# r2_list is rebuilt from scratch so that re-running this cell does not append a
# second set of scores (which would make it longer than technique_names).
r2_list = []
for model in technique:
    model.fit(X_train, y_train)
    y_pred = model.predict(X_valid)
    # R2 score is the evaluation metric defined in the problem.
    score = metrics.r2_score(y_valid, y_pred)
    r2_list.append(score)
   

In [61]:
# Show the validation R2 scores collected by the evaluation loop.
r2_list

In [62]:
# Tabulate each model's display name next to its validation R2 score.
evaluation = pd.DataFrame(dict(Model=technique_names, score=r2_list))

In [63]:
# Display the model comparison table.
evaluation

We can see that, among all the models, the XGB Regressor has the best score. We therefore choose the XGB Regressor model for predicting the engagement score.

**10) Submission of Prediction Data**

In [64]:

# Train the selected XGB regressor on the full training data and write the
# submission file.
submission = pd.read_csv('../input/jobathon-february-2022/sample_submission.csv')
model = XGBRegressor(
    max_depth=10,
    n_estimators=1000,
    min_child_weight=0.5,
    colsample_bytree=0.8,
    subsample=0.8,
    eta=0.1,
    seed=42,
)
model.fit(X, y)
# Select the test columns in exactly the order of the training features, so
# that the prediction input always lines up with what the model was fitted on.
final_predictions = model.predict(test[X.columns])
submission['engagement_score'] = final_predictions
# Only non-negative predictions are submitted: negative values are replaced
# with 0 (vectorized clip instead of a per-row lambda).
submission['engagement_score'] = submission['engagement_score'].clip(lower=0)
submission.to_csv('my_submission_XGB_4.csv', index=False)

**Approach:** 
The first step was to import all the required libraries and load the train and test files into the
notebook. By inspecting the data, we found that there are no null values or data redundancies in the train and test datasets,
which removes the need for data cleaning/imputation. Our next step was to visualize the training dataset, which showed
that content made for students would be the most beneficial to a video platform, since students had the highest engagement score
of the three professions. Next, since there are 2 categorical features in our training dataset, we converted them to numeric tags using label encoding.
We then separated the train data into training and validation sets in a 3:1 ratio. Next, we imported different ML models and assessed them
using r2_score as the evaluation metric. We found that the XGB regressor was the best of the chosen models for predicting the engagement score.
Last but not least, we submitted the predictions of the XGB regressor
model, replacing any negative predicted engagement score with 0 so that only non-negative scores appear in the sheet.