## Video course preference
### Predict weather the end user will like the video course based on different video parameters and instructor emotions

In [1]:
import pandas as pd
import numpy as np

In [2]:
#Import dataset
dataset = pd.read_csv('./dataset.csv')
dataset.head()

Unnamed: 0,Video Length,Speaking Rate,Clarity,Gender,Smile,Anger,Contempt,Disgust,Fear,Happiness,Neutral,Sadness,Surprise,Facial Hair,user_preference
0,10,113,0.86,M,0.8,0.31,0.39,0.45,0.38,0.75,0.36,0.42,0.33,0.76,4.95
1,10,138,0.58,F,0.11,0.18,0.03,0.49,0.5,0.49,0.8,0.2,0.69,0.93,4.42
2,3,109,0.03,F,0.85,0.2,0.96,0.47,0.02,0.02,0.04,0.75,0.97,0.5,4.78
3,4,117,0.89,M,0.02,0.44,0.29,0.11,0.77,0.73,0.87,0.15,0.62,0.54,4.54
4,1,135,0.84,M,0.14,0.08,0.64,0.65,0.76,0.83,0.04,0.06,0.6,0.93,4.73


In [3]:
dataset.keys()

Index(['Video Length', 'Speaking Rate', 'Clarity', 'Gender', 'Smile', 'Anger',
       'Contempt', 'Disgust', 'Fear', 'Happiness', 'Neutral', 'Sadness',
       'Surprise', 'Facial Hair', 'user_preference'],
      dtype='object')

In [4]:
# Splitting data into features (X) and labels (y)
X = dataset.drop('user_preference', axis=1)
y = dataset['user_preference']

In [5]:
# Hot encoding the instructor gender
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

hot_encoder = OneHotEncoder()
transformer = ColumnTransformer([('one_hot', hot_encoder, ["Gender"])], remainder='passthrough')
X_transformed = transformer.fit_transform(X=X)
X_transformed = pd.DataFrame(X_transformed)
X_transformed.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,0.0,1.0,10.0,113.0,0.86,0.8,0.31,0.39,0.45,0.38,0.75,0.36,0.42,0.33,0.76
1,1.0,0.0,10.0,138.0,0.58,0.11,0.18,0.03,0.49,0.5,0.49,0.8,0.2,0.69,0.93
2,1.0,0.0,3.0,109.0,0.03,0.85,0.2,0.96,0.47,0.02,0.02,0.04,0.75,0.97,0.5
3,0.0,1.0,4.0,117.0,0.89,0.02,0.44,0.29,0.11,0.77,0.73,0.87,0.15,0.62,0.54
4,0.0,1.0,1.0,135.0,0.84,0.14,0.08,0.64,0.65,0.76,0.83,0.04,0.06,0.6,0.93


In [6]:
from sklearn.model_selection import train_test_split
np.random.seed(42)
X_train, X_test, y_train, y_test = train_test_split(X_transformed, y, shuffle=True, test_size=0.2)
len(X_train),len(X_test)

(800, 200)

In [7]:
from sklearn.ensemble import RandomForestRegressor

# Initialise Random forest classifier
model = RandomForestRegressor(n_estimators=200)
model.fit(X_train, y_train)

RandomForestRegressor(n_estimators=200)

In [8]:
model.score(X_test, y_test)
y_preds = model.predict(X_test)

In [9]:
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

print("Regressor metrics on the test set:")
print(f"R2_Score: {r2_score(y_test, y_preds) * 100:.2f}%")
print(f"Mean Absolute Error: {mean_absolute_error(y_test, y_preds)}")
print(f"Mean Squared Error: {mean_squared_error(y_test, y_preds)}")

Regressor metrics on the test set:
R2_Score: 75.41%
Mean Absolute Error: 0.34855700000000006
Mean Squared Error: 0.18717184690000002


In [15]:
# Tuning Hyperparameters
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

X = dataset.drop('user_preference', axis=1)
y = dataset['user_preference']

# Hot encoding the instructor gender
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

hot_encoder = OneHotEncoder()
transformer = ColumnTransformer([('one_hot', hot_encoder, ["Gender"])], remainder='passthrough')
X_transformed = transformer.fit_transform(X=X)
X_transformed = pd.DataFrame(X_transformed)

X_train, X_test, y_train, y_test = train_test_split(X_transformed, y, test_size=0.2)

In [18]:
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
max_features = ['auto', 'sqrt']
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 2, 4]
bootstrap = [True, False]

random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

model = RandomForestRegressor()
rf_random = RandomizedSearchCV(estimator = model, param_distributions = random_grid, n_iter = 10, cv = 3, verbose=2, random_state=42, n_jobs = -1)
rf_random.fit(X_train, y_train)

Fitting 3 folds for each of 10 candidates, totalling 30 fits
[CV] END bootstrap=True, max_depth=50, max_features=sqrt, min_samples_leaf=2, min_samples_split=10, n_estimators=200; total time=   0.9s
[CV] END bootstrap=True, max_depth=50, max_features=sqrt, min_samples_leaf=2, min_samples_split=10, n_estimators=200; total time=   1.0s
[CV] END bootstrap=True, max_depth=50, max_features=sqrt, min_samples_leaf=2, min_samples_split=10, n_estimators=200; total time=   1.0s
[CV] END bootstrap=False, max_depth=90, max_features=sqrt, min_samples_leaf=4, min_samples_split=10, n_estimators=600; total time=   2.8s
[CV] END bootstrap=False, max_depth=90, max_features=sqrt, min_samples_leaf=4, min_samples_split=10, n_estimators=600; total time=   3.0s
[CV] END bootstrap=False, max_depth=90, max_features=sqrt, min_samples_leaf=4, min_samples_split=10, n_estimators=600; total time=   3.6s
[CV] END bootstrap=False, max_depth=60, max_features=sqrt, min_samples_leaf=1, min_samples_split=10, n_estimators=

RandomizedSearchCV(cv=3, estimator=RandomForestRegressor(), n_jobs=-1,
                   param_distributions={'bootstrap': [True, False],
                                        'max_depth': [10, 20, 30, 40, 50, 60,
                                                      70, 80, 90, 100, 110,
                                                      None],
                                        'max_features': ['auto', 'sqrt'],
                                        'min_samples_leaf': [1, 2, 4],
                                        'min_samples_split': [2, 5, 10],
                                        'n_estimators': [200, 400, 600, 800,
                                                         1000, 1200, 1400, 1600,
                                                         1800, 2000]},
                   random_state=42, verbose=2)

In [19]:
rf_random.score(X_test, y_test)
y_preds = rf_random.predict(X_test)

In [20]:
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

print("Regressor metrics on the test set:")
print(f"R2_Score: {r2_score(y_test, y_preds) * 100:.2f}%")
print(f"Mean Absolute Error: {mean_absolute_error(y_test, y_preds)}")
print(f"Mean Squared Error: {mean_squared_error(y_test, y_preds)}")

Regressor metrics on the test set:
R2_Score: 76.06%
Mean Absolute Error: 0.3393027530856992
Mean Squared Error: 0.17789773085073565
