# **Pawpularity Score Prediction using Regression Models**


# **Importing Required Libraries**

In [None]:
import numpy as np
import pandas as pd 

import random 

from sklearn.model_selection import train_test_split 
from sklearn.ensemble import RandomForestRegressor # used for prediction 
from sklearn.model_selection import RandomizedSearchCV # hyperparameter tuning
from sklearn.metrics import mean_squared_error

import warnings
warnings.filterwarnings("ignore")

# **Model Validation: Holdout**

In [None]:
# Mount Google Drive so the dataset under /content/drive can be read.
# NOTE(review): this cell depends on the google.colab package, so it is
# expected to run only inside Google Colab — confirm before running elsewhere.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Load the training table from the mounted Drive folder (Colab-specific path).
data = pd.read_csv('/content/drive/MyDrive/train.csv')

In [None]:
# Preview the first rows to check the column layout.
data.head()

Unnamed: 0,Id,Subject Focus,Eyes,Face,Near,Action,Accessory,Group,Collage,Human,Occlusion,Info,Blur,Pawpularity
0,0007de18844b0dbbb5e1f607da0606e0,0,1,1,1,0,0,1,0,0,0,0,0,63
1,0009c66b9439883ba2750fb825e1d7db,0,1,1,0,0,0,0,0,0,0,0,0,42
2,0013fd999caf9a3efe1352ca1b0d937e,0,1,1,1,0,0,0,0,1,1,0,0,28
3,0018df346ac9c1d8413cfcc888ca8246,0,1,1,1,0,0,0,0,0,0,0,0,15
4,001dc955e10590d3ca4673f034feeef2,0,0,0,1,0,0,1,0,0,0,0,0,72


In [None]:
# (rows, columns) of the loaded table.
data.shape

(9912, 14)

In [None]:
# Predictors: every column except the first (Id) and the last (target).
feature_columns = data.columns[1:-1]
X = data[feature_columns]

# Target: the Pawpularity score.
y = data.loc[:, "Pawpularity"]

In [None]:
# Hold out 10% of the rows for testing. shuffle=False keeps the file order,
# so the test set is the tail of the table and the split is deterministic.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.10, shuffle=False)

In [None]:
# Sanity-check the sizes of the training and hold-out partitions.
X_train.shape , X_test.shape

((8920, 12), (992, 12))

# **Machine Learning Models**

**RandomForest Regressor without RandomSearch**

In [None]:
# Baseline random forest with hand-picked hyperparameters (no search).
# random_state pins the bootstrap sampling and per-split feature sub-sampling
# so the reported RMSE is reproducible after Restart & Run All.
rf = RandomForestRegressor(
    n_estimators=100,
    max_features='sqrt',
    max_depth=5,
    min_samples_split=4,
    random_state=42,
)

In [None]:
# Train the baseline forest on the training partition.
rf.fit(X_train,y_train)

RandomForestRegressor(max_depth=5, max_features='sqrt', min_samples_split=4)

In [None]:
# Hold-out RMSE of the baseline forest.
prediction_rf = rf.predict(X_test)
rf_mse = mean_squared_error(y_test, prediction_rf)
rf_model_RMSE = np.sqrt(rf_mse)

print(rf_model_RMSE)

20.25172489017028


**Random Forest with RandomizedSearchCV**

In [None]:
# Candidate values for the random-forest hyperparameter search.

# Number of trees in the forest: 100, 200, ..., 1500.
n_estimators = list(range(100, 1501, 100))

# Number of features to consider at every split.
# 1.0 means "all features"; it replaces the former 'auto' option, which
# scikit-learn deprecated in 1.1 and removed in 1.3 for forest regressors.
max_features = [1.0, 'sqrt']

# Maximum number of levels in each tree: 5, 10, ..., 30.
max_depth = list(range(5, 31, 5))

# Minimum number of samples required to split a node.
min_samples_split = [5, 10, 15, 20, 25]

# Minimum number of samples required at each leaf node.
min_samples_leaf = [5, 10, 15]

In [None]:
# Assemble the search space: hyperparameter name -> list of candidate values.
random_grid = dict(
    n_estimators=n_estimators,
    max_features=max_features,
    max_depth=max_depth,
    min_samples_split=min_samples_split,
    min_samples_leaf=min_samples_leaf,
)

print(random_grid)

{'n_estimators': [100, 200, 300, 400, 500, 600, 700, 800, 900, 1000, 1100, 1200, 1300, 1400, 1500], 'max_features': ['auto', 'sqrt'], 'max_depth': [5, 10, 15, 20, 25, 30], 'min_samples_split': [5, 10, 15, 20, 25], 'min_samples_leaf': [5, 10, 15]}


In [None]:
# Random search over the forest hyperparameters, using 4-fold cross-validation.
 
rf_random = RandomizedSearchCV(estimator = rf, 
                               param_distributions = random_grid, # parameter names (str) mapped to the lists of values to sample from
                               scoring='neg_mean_squared_error', # metric computed on each held-out CV fold (negated MSE, so higher is better)
                               n_iter = 10, # number of parameter settings sampled from the grid
                               cv = 4, # number of cross-validation folds
                               refit = True, # refit the best configuration on all data passed to fit()
                               verbose=2, 
                               random_state=42, # makes the sampled candidates reproducible
                               n_jobs = -1 # Number of jobs to run in parallel. -1 means using all processors 
                              )

In [None]:

# Run the search on the training partition only; the hold-out set stays unseen.
rf_random.fit(X_train,y_train)

Fitting 4 folds for each of 10 candidates, totalling 40 fits


RandomizedSearchCV(cv=4,
                   estimator=RandomForestRegressor(max_depth=5,
                                                   max_features='sqrt',
                                                   min_samples_split=4),
                   n_jobs=-1,
                   param_distributions={'max_depth': [5, 10, 15, 20, 25, 30],
                                        'max_features': ['auto', 'sqrt'],
                                        'min_samples_leaf': [5, 10, 15],
                                        'min_samples_split': [5, 10, 15, 20,
                                                              25],
                                        'n_estimators': [100, 200, 300, 400,
                                                         500, 600, 700, 800,
                                                         900, 1000, 1100, 1200,
                                                         1300, 1400, 1500]},
                   random_state=42, scoring='neg_mean_sq

In [None]:
# Best parameters chosen by the search

rf_random.best_params_

{'max_depth': 5,
 'max_features': 'sqrt',
 'min_samples_leaf': 10,
 'min_samples_split': 15,
 'n_estimators': 100}

In [None]:
# Best cross-validated score: neg_mean_squared_error averaged over the CV folds
# (negated MSE, so values closer to zero are better)

rf_random.best_score_ 

-425.2721426274963

In [None]:
# Hold-out RMSE of the forest refitted with the best searched parameters.
predictions_X_test_rf = rf_random.predict(X_test)
rf_search_mse = mean_squared_error(y_test, predictions_X_test_rf)
RMSE_model1_RfR = np.sqrt(rf_search_mse)

print(RMSE_model1_RfR)

20.25923642483292


**Linear Regression**

In [None]:
from sklearn.linear_model import LinearRegression as lin

In [None]:
# Ordinary least squares baseline.
# The former `normalize=True` argument is omitted: scikit-learn deprecated it
# in 1.0 and removed it in 1.2, and for unpenalised least squares with an
# intercept, rescaling the features does not change the predictions.
lin_reg = lin(fit_intercept=True)

In [None]:
# Fit the linear model on the training partition.
lin_reg.fit(X_train,y_train)

LinearRegression(normalize=True)

In [None]:
# Hold-out RMSE of the linear regression.
predictions_X_test_lin_reg = lin_reg.predict(X_test)
lin_reg_mse = mean_squared_error(y_test, predictions_X_test_lin_reg)
lin_reg_model_RMSE = np.sqrt(lin_reg_mse)

print(lin_reg_model_RMSE)

20.297186404781844


**Support Vector Regressor with RandomizedSearchCV**

In [None]:
from sklearn.svm import SVR
from sklearn.model_selection import RandomizedSearchCV as rscv

In [None]:
# SVR with default settings; used as the base estimator of the search below.
svm = SVR()

In [None]:
# Candidate values for the SVR hyperparameter search.
kernel = ['poly', 'sigmoid', 'rbf']
c = [0.01, 0.1, 1, 10]
gamma = [0.01, 0.1, 1]
epsilon = [0.01, 0.1, 1]
shrinking = [True, False]
svm_grid = {
    'kernel': kernel,
    'C': c,
    'gamma': gamma,
    'epsilon': epsilon,
    'shrinking': shrinking,
}

# Sample 40 configurations and score each with 3-fold cross-validation.
# random_state makes the sampled configurations (and therefore the selected
# best parameters) reproducible across runs.
svm_search = rscv(
    svm,
    svm_grid,
    n_iter=40,
    cv=3,
    scoring='neg_mean_squared_error',
    return_train_score=True,
    random_state=42,
    n_jobs=-1,
    verbose=1,
)
svm_search.fit(X_train,y_train)

Fitting 3 folds for each of 40 candidates, totalling 120 fits


RandomizedSearchCV(cv=3, estimator=SVR(), n_iter=40, n_jobs=-1,
                   param_distributions={'C': [0.01, 0.1, 1, 10],
                                        'epsilon': [0.01, 0.1, 1],
                                        'gamma': [0.01, 0.1, 1],
                                        'kernel': ['poly', 'sigmoid', 'rbf'],
                                        'shrinking': [True, False]},
                   return_train_score=True, scoring='neg_mean_squared_error',
                   verbose=1)

In [None]:
# Hyperparameters of the best-scoring SVR candidate.
svm_search.best_params_

{'C': 10, 'epsilon': 1, 'gamma': 0.01, 'kernel': 'sigmoid', 'shrinking': False}

In [None]:
# Take the best estimator found by the search and predict the hold-out set.
svm_confirmed = svm_search.best_estimator_
svm_pred = svm_confirmed.predict(X_test)

In [None]:
# Hold-out RMSE of the searched SVR.
svr_search_mse = mean_squared_error(y_test, svm_pred)
SVR_model_RMSE = np.sqrt(svr_search_mse)

print(SVR_model_RMSE)

20.817459989316166


**LassoLARS**

In [None]:
from sklearn.linear_model import LassoLars as las

In [None]:
# LassoLars fitted on the training partition, then used to predict the hold-out set.
# NOTE(review): `normalize` appears to be deprecated/removed in recent
# scikit-learn releases — confirm the target version. The documented
# replacement is a scaler inside a Pipeline, and the regularisation strength
# may need re-tuning after that change.
laso_model = las(normalize=True,fit_intercept=True)
laso_model.fit(X_train,y_train)
laso_pred = laso_model.predict(X_test)


In [None]:
# Hold-out RMSE of the LassoLars model.
lasso_mse = mean_squared_error(y_test, laso_pred)
lasso_model_RMSE = np.sqrt(lasso_mse)

print(lasso_model_RMSE)

20.27801860667925


**Bayesian Ridge**

In [None]:
from sklearn.linear_model import BayesianRidge as br

In [None]:
# Bayesian ridge regression fitted on the training partition, then used to
# predict the hold-out set.
# NOTE(review): `normalize` appears to be deprecated/removed in recent
# scikit-learn releases — confirm the target version. The documented
# replacement is a scaler inside a Pipeline.
br_model = br(normalize=True,alpha_init=0.01)
br_model.fit(X_train,y_train)
br_pred = br_model.predict(X_test)

In [None]:
# Hold-out RMSE of the Bayesian ridge model.
br_mse = mean_squared_error(y_test, br_pred)
br_model_RMSE = np.sqrt(br_mse)

print(br_model_RMSE)

20.280401677821775


**SVR with Hand-Picked Hyperparameters**

In [None]:
# RBF-kernel SVR with manually chosen hyperparameters.
# The former `coef0=100` argument is omitted: coef0 is only used by the
# 'poly' and 'sigmoid' kernels, so it had no effect on an RBF model.
svr_model = SVR(kernel='rbf', C=100, epsilon=0.001, gamma=0.01)

In [None]:
# Fit the hand-configured SVR and predict the hold-out set in one chain
# (fit() returns the fitted estimator itself).
svr_pred = svr_model.fit(X_train, y_train).predict(X_test)

In [None]:
# Hold-out RMSE of the hand-configured SVR.
svr2_mse = mean_squared_error(y_test, svr_pred)
SVR2_model_RMSE = np.sqrt(svr2_mse)

print(SVR2_model_RMSE)

20.79487582280616


**Decision Tree Regressor**

In [None]:
from sklearn import tree

In [None]:
# Shallow regression tree (depth 2): build and fit in one expression
# (fit() returns the fitted estimator), then predict the hold-out set.
clf = tree.DecisionTreeRegressor(max_depth=2).fit(X_train, y_train)
dtr_pred = clf.predict(X_test)


In [None]:
# Hold-out RMSE of the decision tree.
dtr_mse = mean_squared_error(y_test, dtr_pred)
dtr_model_RMSE = np.sqrt(dtr_mse)

print(dtr_model_RMSE)

20.277137865860233


# **Feature Selection**

In [None]:
#Import Libraries

#feature selection
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2


**Selecting the best 5 Features**

In [None]:

# Score every predictor against the target with the chi-squared statistic,
# fitting on the training partition only, then list the five highest scores.
# NOTE(review): chi2 is documented as a score function for classification
# targets — confirm it is the intended choice for a numeric target.
bestfeatures = SelectKBest(score_func=chi2, k='all')
fit = bestfeatures.fit(X_train, y_train)

# One row per predictor: its name next to its score.
dfcolumns = pd.DataFrame(X.columns)
dfscores = pd.DataFrame(fit.scores_)
featureScores = pd.concat([dfcolumns, dfscores], axis=1)
featureScores.columns = ['feature', 'Score']

top_five = featureScores.nlargest(5, 'Score')
print(top_five)

      feature       Score
4      Action  110.521692
10       Info  110.037115
6       Group  107.661366
9   Occlusion  106.320074
7     Collage   98.043067


In [None]:
# List all column names to decide which ones to drop.
data.columns

Index(['Id', 'Subject Focus', 'Eyes', 'Face', 'Near', 'Action', 'Accessory',
       'Group', 'Collage', 'Human', 'Occlusion', 'Info', 'Blur',
       'Pawpularity'],
      dtype='object')

**Dropping Least Scored Features**

In [None]:
# Keep only the five highest-scoring predictors plus the target.
# The kept columns are derived from `featureScores` rather than a hand-typed
# drop list, so the reduced table always matches the computed ranking.
N_SELECTED = 5
selected = set(featureScores.nlargest(N_SELECTED, 'Score')['feature'])

# Preserve the original column order; the target column stays last.
df = data[[col for col in data.columns if col in selected or col == 'Pawpularity']]

In [None]:

# Preview the reduced table.
df.head()

Unnamed: 0,Action,Group,Collage,Occlusion,Info,Pawpularity
0,0,1,0,0,0,63
1,0,0,0,0,0,42
2,0,0,0,1,0,28
3,0,0,0,0,0,15
4,0,1,0,0,0,72


In [None]:
# (rows, columns) of the reduced table.
df.shape

(9912, 6)

In [None]:
# Predictors: every column of the reduced table except the last (target).
selected_columns = df.columns[:-1]
xf = df[selected_columns]

# Target: the Pawpularity score.
yf = df.loc[:, 'Pawpularity']

In [None]:
# Split the reduced table exactly like the full-feature split: last 10% of the
# rows as hold-out, no shuffling. Using the same rows keeps the RMSE values
# comparable with the full-feature models, keeps the split deterministic, and
# ensures no row used to fit the feature selector ends up in the test set.
Xf_train, Xf_test, yf_train, yf_test = train_test_split(xf, yf, test_size=0.10, shuffle=False)

# **Implementation of Models with Selected Features Dataset**

**Linear Regression with Feature Selection**

In [None]:

# Refit the linear model on the selected features and predict the hold-out
# rows in one chain (fit() returns the fitted estimator itself).
lin_fs_pred = lin_reg.fit(Xf_train, yf_train).predict(Xf_test)

In [None]:
# Hold-out RMSE of the linear model trained on the selected features.
lin_fs_mse = mean_squared_error(yf_test, lin_fs_pred)
lin_fs_RMSE = np.sqrt(lin_fs_mse)

print(lin_fs_RMSE)

19.62682519268789


**Decision Tree with Feature Selection**

In [None]:

# Build a fresh depth-2 regression tree (same setting as the full-feature
# tree) so this cell runs on a fresh kernel: no earlier cell defines `dtr`.
dtr = tree.DecisionTreeRegressor(max_depth=2)
dtr_model = dtr.fit(Xf_train, yf_train)
dtrf2_pred = dtr.predict(Xf_test)

In [None]:
# Hold-out RMSE of the decision tree trained on the selected features.
dtrf2_mse = mean_squared_error(yf_test, dtrf2_pred)
dtrf2_model_RMSE = np.sqrt(dtrf2_mse)

print(dtrf2_model_RMSE)

19.620032780995736


**SVR with Feature Selection**

In [None]:

# Refit the hand-configured SVR on the selected features and predict the
# hold-out rows. This rebinds `svr_pred`, replacing the full-feature result.
svr_pred = svr_model.fit(Xf_train, yf_train).predict(Xf_test)

In [None]:
# Hold-out RMSE of the SVR trained on the selected features.
# This rebinds `SVR2_model_RMSE`, replacing the full-feature value.
svr_fs_mse = mean_squared_error(yf_test, svr_pred)
SVR2_model_RMSE = np.sqrt(svr_fs_mse)

print(SVR2_model_RMSE)

19.98880872301359


**LassoLARS with Feature Selection**

In [None]:

# Refit LassoLars on the selected features and predict the hold-out rows.
# This rebinds `laso_pred`, replacing the full-feature result.
laso_pred = laso_model.fit(Xf_train, yf_train).predict(Xf_test)

In [None]:
# Hold-out RMSE of LassoLars trained on the selected features.
# This rebinds `lasso_model_RMSE`, replacing the full-feature value.
lasso_fs_mse = mean_squared_error(yf_test, laso_pred)
lasso_model_RMSE = np.sqrt(lasso_fs_mse)

print(lasso_model_RMSE)

19.626318348582824


**Random Forest with Feature Selection**

In [None]:

# Refit the baseline forest on the selected features. This retrains the same
# `rf` object in place, so it no longer holds the full-feature fit.
rf.fit(Xf_train,yf_train)

RandomForestRegressor(max_depth=5, max_features='sqrt', min_samples_split=4)

In [None]:
# Hold-out RMSE of the forest trained on the selected features.
# This rebinds `prediction_rf` and `rf_model_RMSE` from the full-feature run.
prediction_rf = rf.predict(Xf_test)
rf_fs_mse = mean_squared_error(yf_test, prediction_rf)
rf_model_RMSE = np.sqrt(rf_fs_mse)

print(rf_model_RMSE)

19.63409012505495
