# Prediciting Engagement Score using Ridge Regression

## Import Libraries

### Main Libraries

In [1]:
import pandas as pd
import numpy as np
# Ignore warnings
import warnings
warnings.filterwarnings('ignore')

### Scikit Learn

In [2]:
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

## Load Dataset

In [3]:
df_train = pd.read_csv('C:/Users/cherryb/Desktop/Personal Projects/Datasets/Telus - Fintech/results/postEngagement_train.tsv', index_col='Unnamed: 0', sep='\t')
df_test = pd.read_csv('C:/Users/cherryb/Desktop/Personal Projects/Datasets/Telus - Fintech/results/postEngagement_test.tsv', index_col='Unnamed: 0', sep='\t') 

In [4]:
# Separate features and target for training dataset
X_train = df_train.drop(['engagement_score'], axis=1)
y_train = df_train['engagement_score']

In [5]:
# Separate features and target for testing dataset
X_test = df_test.drop(['engagement_score'], axis=1)
y_test = df_test['engagement_score']

## Ridge Regression

### Feature selection using RandomForestClassifier--feature_importances

In [6]:
# Instantiate RandomForestClassifier
rf = RandomForestClassifier(random_state=35).fit(X_train, y_train)
rf_feat_imp = rf.feature_importances_
# Feature importance
feature_importance = pd.DataFrame(rf_feat_imp, index=X_train.columns, columns=["Feat_Imp"])

In [7]:
#Sort in descending order to get Most Important to Least Important
feature_importance.sort_values("Feat_Imp",ascending=False, inplace=True)
feature_importance.reset_index(inplace=True)

In [8]:
# Top 10 Feature Importance
top5_feature = feature_importance.iloc[:5]
predictor = list(top5_feature['index'])
predictor

['Topic 0', 'hour_day', 'Topic 1', 'Topic 2', 'Topic 4']

### Hyperparameter Optimization using GridSearch

In [9]:
param_grid = {'alpha': [1, 0.1, 0.001, 0.0001, 0.00001, 0.000001],
              'normalize': [True]}

In [10]:
# Instantiate Ridge
reg = Ridge()
# Instantiate GridSearchCV
grid_search = GridSearchCV(reg, param_grid=param_grid, cv=5)

In [11]:
# Fit X_train
grid_search.fit(X_train[predictor], y_train)

GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'alpha': [1, 0.1, 0.001, 0.0001, 1e-05, 1e-06], 'normalize': [True]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [12]:
print("Best estimator:")
print(grid_search.best_estimator_)
print("Best parameters:")
print(grid_search.best_params_)
print("Best score:")
print(grid_search.best_score_)

Best estimator:
Ridge(alpha=1, copy_X=True, fit_intercept=True, max_iter=None, normalize=True,
   random_state=None, solver='auto', tol=0.001)
Best parameters:
{'alpha': 1, 'normalize': True}
Best score:
-0.042569737845562845


### Build Ridge Model

In [13]:
# Instantiate SVC
ridgereg = Ridge(alpha=1, normalize=True)
# Fit the training dataset on the svm classifier
ridgereg.fit(X_train[predictor], y_train)
# Predict the labels on validation dataset
y_pred = ridgereg.predict(X_test[predictor])

### Evaluate SVR Model

In [14]:
# Get Metrics
r2 = r2_score(y_pred, y_test)
MAE = mean_absolute_error(y_pred, y_test)
MSE = mean_squared_error(y_pred, y_test)
# Store metrics in dictionary
score_dict = {'SVR': pd.Series([r2, MAE, MSE], index=['R2', 'MAE', 'MSE'])}
# Transform dictionary to df
df_metrics = pd.DataFrame(score_dict)
df_metrics

Unnamed: 0,SVR
R2,-321.4948
MAE,2356.247
MSE,89652750.0


## Cross Validation

In [15]:
print('Mean Cross validated score -> ', np.mean(cross_val_score(ridgereg, X_train[predictor], y_train, cv=5)*100))

Mean Cross validated score ->  -4.2621143920305515
