In [14]:
# Libraries
import pandas as pd
import numpy as np
import warnings 
warnings.filterwarnings('ignore')

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'
pd.options.display.max_columns = None

from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
import gc
from tqdm import tqdm
path = '/home/sachin.rathi/All_Code/Absolutdata/data/'

In [15]:
# read train and test data
df = pd.read_csv(path+'train_0OECtn8.csv')
test = pd.read_csv(path+'test_1zqHu22.csv')

df.shape,test.shape

((89197, 10), (11121, 9))

### EDA

In [16]:
# check missin values
df.isnull().sum()

row_id              0
user_id             0
category_id         0
video_id            0
age                 0
gender              0
profession          0
followers           0
views               0
engagement_score    0
dtype: int64

In [17]:
# User who upload more than 5 videos
# ab = [i for i in df['user_id'].unique() if len(df[df['user_id'] == i]) > 5]

In [18]:
# fina unique user id both dataset
df['user_id'].nunique()
test['user_id'].nunique()

27734

10384

In [19]:
# find users which are in test data but not in train data
len(set(test['user_id']).difference(set(df['user_id'])))

# find users which are in train data but not in test data
len(set(df['user_id']).difference(set(test['user_id'])))

0

17350

In [20]:
# check unique values in each column
for i in df.columns:
    print(f"columns {i} have {df[i].nunique()} label in data")

columns row_id have 89197 label in data
columns user_id have 27734 label in data
columns category_id have 47 label in data
columns video_id have 175 label in data
columns age have 58 label in data
columns gender have 2 label in data
columns profession have 3 label in data
columns followers have 17 label in data
columns views have 43 label in data
columns engagement_score have 229 label in data


In [21]:
gc.collect()

180

#### Data cleaning

In [22]:
# filter train data with only those users which are in test data
df = df.loc[df.set_index(['user_id']).index.isin(test.set_index(['user_id']).index)]

In [23]:
#check reshape again and unique user in both data
df.shape,test.shape

# Check unique user again on each data
df['user_id'].nunique()
test['user_id'].nunique()

((39204, 10), (11121, 9))

10384

10384

In [24]:
# reset index for both dataset
df.reset_index(drop=True,inplace=True)
test.reset_index(drop=True,inplace=True)

df.head(3)
test.head(3)

Unnamed: 0,row_id,user_id,category_id,video_id,age,gender,profession,followers,views,engagement_score
0,3,1840,12,24,19,Male,Student,180,138,4.35
1,6,9323,25,139,35,Male,Other,240,317,3.33
2,7,2071,7,14,23,Male,Student,160,467,3.8


Unnamed: 0,row_id,user_id,category_id,video_id,age,gender,profession,followers,views
0,89198,7986,12,42,14,Male,Student,180,138
1,89199,11278,34,115,14,Male,Student,230,840
2,89200,17245,8,110,44,Female,Working Professional,280,628


In [25]:
round(df[df['user_id'] == 2]['engagement_score'].min(),2)

2.65

In [26]:
# Get mean of engagement_score for each user in training data.
cv_1 = {}
cv_2 = {}
cv_3 = {}
cv_4 = {}
cv_5 = {}
cv_6 = {}
# cv_4 = {}
for i in tqdm(df['user_id'].unique()):
    # Overall mean
    cv_1[i] = round(df[df['user_id'] == i]['engagement_score'].mean(),2)
    
    # Mean from first 2 row
    cv_2[i] = round(df[df['user_id'] == i].head(2)['engagement_score'].mean(),2)
    
    # over all std
    cv_3[i] = round(df[df['user_id'] == i]['engagement_score'].std(),2)
    
    # std from first 2 row
    cv_4[i] = round(df[df['user_id'] == i].head(2)['engagement_score'].std(),2)
    
        # over all std
    cv_5[i] = round(df[df['user_id'] == i]['engagement_score'].min(),2)
    
    # std from first 2 row
    cv_6[i] = round(df[df['user_id'] == i].head(2)['engagement_score'].max(),2)

# Create df using dictionary
cv_1 = pd.DataFrame.from_dict(cv_1,orient = 'index').reset_index()
cv_1.columns = ['user_id','mean']

cv_2 = pd.DataFrame.from_dict(cv_2,orient = 'index').reset_index()
cv_2.columns = ['user_id','mean_2']

cv_3 = pd.DataFrame.from_dict(cv_3,orient = 'index').reset_index()
cv_3.columns = ['user_id','std']

cv_4 = pd.DataFrame.from_dict(cv_4,orient = 'index').reset_index()
cv_4.columns = ['user_id','std_2']

cv_5 = pd.DataFrame.from_dict(cv_5,orient = 'index').reset_index()
cv_5.columns = ['user_id','min']

cv_6 = pd.DataFrame.from_dict(cv_6,orient = 'index').reset_index()
cv_6.columns = ['user_id','max']

100%|██████████| 10384/10384 [00:30<00:00, 340.67it/s]


In [27]:
# set test data index as negative ( which help to divide the dataset again)
index = np.arange(1,len(test)+1)
inde = [-i for i in index]
test.index= inde

test.head(3)

Unnamed: 0,row_id,user_id,category_id,video_id,age,gender,profession,followers,views
-1,89198,7986,12,42,14,Male,Student,180,138
-2,89199,11278,34,115,14,Male,Student,230,840
-3,89200,17245,8,110,44,Female,Working Professional,280,628


#### Feature Engineering

In [28]:
# concat train and test data to create lag variables
final = pd.concat([df,test],axis=0)

# final = final.sort_values(['user_id'])
final = final.fillna(0)

In [29]:
# Create lag variables

In [30]:
final.head()

Unnamed: 0,row_id,user_id,category_id,video_id,age,gender,profession,followers,views,engagement_score
0,3,1840,12,24,19,Male,Student,180,138,4.35
1,6,9323,25,139,35,Male,Other,240,317,3.33
2,7,2071,7,14,23,Male,Student,160,467,3.8
3,8,21848,8,100,18,Male,Student,280,628,3.87
4,10,16058,5,161,19,Male,Student,240,229,3.8


In [31]:
# create lag feature of engagement_score user wise
final['engagement_score_lag_1'] = final.groupby(['user_id'])['engagement_score'].shift(1)

# create lag feature of engagement_score user wise and category wise
final['engagement_score_cat_wise_1'] = final.groupby(['user_id','category_id'])['engagement_score'].shift(1)

In [32]:
final = final.fillna(0)

In [33]:
# Map column into data
final['Mean'] = final['user_id'].map(dict(zip(cv_1['user_id'],cv_1['mean'])))
final['Mean_2'] = final['user_id'].map(dict(zip(cv_2['user_id'],cv_2['mean_2'])))
final['std'] = final['user_id'].map(dict(zip(cv_3['user_id'],cv_3['std'])))
final['std_2'] = final['user_id'].map(dict(zip(cv_4['user_id'],cv_4['std_2'])))
final['min'] = final['user_id'].map(dict(zip(cv_4['user_id'],cv_5['min'])))
final['max'] = final['user_id'].map(dict(zip(cv_4['user_id'],cv_6['max'])))

In [34]:
final.head()

Unnamed: 0,row_id,user_id,category_id,video_id,age,gender,profession,followers,views,engagement_score,engagement_score_lag_1,engagement_score_cat_wise_1,Mean,Mean_2,std,std_2,min,max
0,3,1840,12,24,19,Male,Student,180,138,4.35,0.0,0.0,4.19,4.06,0.37,0.41,3.77,4.35
1,6,9323,25,139,35,Male,Other,240,317,3.33,0.0,0.0,3.39,3.38,0.05,0.07,3.33,3.43
2,7,2071,7,14,23,Male,Student,160,467,3.8,0.0,0.0,3.64,4.0,0.6,0.29,2.8,4.21
3,8,21848,8,100,18,Male,Student,280,628,3.87,0.0,0.0,3.34,3.49,0.6,0.54,2.6,3.87
4,10,16058,5,161,19,Male,Student,240,229,3.8,0.0,0.0,3.59,3.81,0.42,0.02,2.96,3.83


In [35]:
# final[final['user_id'] == 2]
# final[(final['user_id'] == 2) & (final['category_id'] == 3)]

In [36]:
# separate train and test data again to model training 
df = final[final.index>0]
test = final[final.index<0]

In [37]:
gc.collect()

80

### Try Auto ML

In [38]:
# data = df.drop('row_id',axis=1)
# from pycaret.regression import *
# exp_reg101 = setup(data = data, target = 'engagement_score', session_id=123) 

# best = compare_models()

# evaluate_model(best)

### Check Model accuracy on train data/Validation data with Train-Test Split

#### Train-test Split

In [39]:
# # train_test split
# X = df.drop(['engagement_score'],axis=1)
# y = df['engagement_score']
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

#### Machine Learning Model on Train and Validation data

In [40]:
# ## Train dummification categorical variables
# import xgboost
# from xgboost import plot_importance

# # create copy of data
# X_train_xgb = X_train.copy()
# X_test_xgb = X_test.copy()

# # OHE for categorical variables
# v_final_train = pd.get_dummies(X_train_xgb[['gender', 'profession']])
# X_train_xgb = pd.concat((X_train_xgb[['row_id','user_id', 'category_id', 'video_id', 'age', 'followers', 'views']],v_final_train), axis = 1)

# v_final_test = pd.get_dummies(X_test_xgb[['gender', 'profession']])
# X_test_xgb = pd.concat((X_test_xgb[['row_id','user_id', 'category_id', 'video_id', 'age', 'followers', 'views']],v_final_test), axis = 1)

# # fit model with best tuned parameter

# # xgb_model = xgboost.XGBRegressor(seed=42)

# best_xgb_model = xgboost.XGBRegressor(colsample_bytree=0.4,
#                  gamma=0,                 
#                  learning_rate=0.07,
#                  max_depth=3,
#                  min_child_weight=1.5,
#                  n_estimators=10000,                                                                    
#                  reg_alpha=0.75,
#                  reg_lambda=0.45,
#                  subsample=0.6,
#                  seed=42)

# # Fit train data
# best_xgb_model.fit(X_train_xgb.drop(['row_id'],axis=1),y_train)

# # prediction on validation data
# y_pred_xgb = best_xgb_model.predict(X_test_xgb.drop(['row_id'],axis=1))

# # check model performance.
# print(f"R2 from XGB model is: {r2_score(y_test,y_pred_xgb)}")

In [41]:
gc.collect()

80

### Prediction On Unseen Data

In [42]:
df.head(3)
test.head(3)

Unnamed: 0,row_id,user_id,category_id,video_id,age,gender,profession,followers,views,engagement_score,engagement_score_lag_1,engagement_score_cat_wise_1,Mean,Mean_2,std,std_2,min,max
1,6,9323,25,139,35,Male,Other,240,317,3.33,0.0,0.0,3.39,3.38,0.05,0.07,3.33,3.43
2,7,2071,7,14,23,Male,Student,160,467,3.8,0.0,0.0,3.64,4.0,0.6,0.29,2.8,4.21
3,8,21848,8,100,18,Male,Student,280,628,3.87,0.0,0.0,3.34,3.49,0.6,0.54,2.6,3.87


Unnamed: 0,row_id,user_id,category_id,video_id,age,gender,profession,followers,views,engagement_score,engagement_score_lag_1,engagement_score_cat_wise_1,Mean,Mean_2,std,std_2,min,max
-1,89198,7986,12,42,14,Male,Student,180,138,0.0,4.14,4.14,4.2,4.2,0.08,0.08,4.14,4.25
-2,89199,11278,34,115,14,Male,Student,230,840,0.0,3.87,3.87,4.15,4.28,0.28,0.26,3.87,4.46
-3,89200,17245,8,110,44,Female,Working Professional,280,628,0.0,2.32,2.32,2.23,1.98,0.3,0.37,1.71,2.24


In [43]:
# fit XGB Model
import xgboost

# create copy of data
X_train_xgb = df.drop(['engagement_score'],axis=1)
y_train = df['engagement_score']

del test['engagement_score']
X_test_xgb = test.copy()

# OHE for categorical variables for training data
v_final_train = pd.get_dummies(X_train_xgb[['gender', 'profession','category_id']])
X_train_xgb = pd.concat((X_train_xgb[['row_id','user_id', 'video_id', 'age', 'followers','Mean','Mean_2','std','std_2','min','max',
                                      'views','engagement_score_lag_1', 'engagement_score_cat_wise_1']],v_final_train), axis = 1)

# OHE for categorical variables for testing data
v_final_test = pd.get_dummies(X_test_xgb[['gender', 'profession','category_id']])
X_test_xgb = pd.concat((X_test_xgb[['row_id','user_id', 'video_id', 'age', 'followers','Mean','Mean_2','std','std_2','min','max',
                                    'views','engagement_score_lag_1', 'engagement_score_cat_wise_1']],v_final_test), axis = 1)

# fit model with best tuned parameter
best_xgb_model = xgboost.XGBRegressor(colsample_bytree=0.4,
                 gamma=0,                 
                 learning_rate=0.07,
                 max_depth=3,
                 min_child_weight=1.5,
                 n_estimators=10000,                                                                    
                 reg_alpha=0.75,
                 reg_lambda=0.45,
                 subsample=0.6,
                 seed=42)

# Model Fit train data
best_xgb_model.fit(X_train_xgb.drop(['row_id'],axis=1),y_train)

# Model Prediction on Unseen data
y_pred_xgb = best_xgb_model.predict(X_test_xgb.drop(['row_id'],axis=1))

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.4, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.07, max_delta_step=0, max_depth=3,
             min_child_weight=1.5, missing=nan, monotone_constraints='()',
             n_estimators=10000, n_jobs=0, num_parallel_tree=1, random_state=42,
             reg_alpha=0.75, reg_lambda=0.45, scale_pos_weight=1, seed=42,
             subsample=0.6, tree_method='exact', validate_parameters=1,
             verbosity=None)

In [44]:
gc.collect()

29

### Submission

In [46]:
test['engagement_score'] = y_pred_xgb
submission = test[['row_id','engagement_score']]

submission.to_csv('Final_submission_Sachin_Rathi_3.csv',index=False)