In [2]:
#########################
## NAME: PRACHETAS DESHPANDE
## DATA ANALYSIS PROJECT
#########################

In [3]:
## Read all the necessary files
import pandas as pd
import numpy as np
user_features_df = pd.read_csv("user_features.csv")
product_features_df = pd.read_csv("product_features.csv")
click_history_df = pd.read_csv("click_history.csv")
print(user_features_df.describe())
print(product_features_df.describe())
print(click_history_df.describe())

            user_id
count   12000.00000
mean   105999.50000
std      3464.24595
min    100000.00000
25%    102999.75000
50%    105999.50000
75%    108999.25000
max    111999.00000
        product_id  number_of_reviews  avg_review_score
count  1000.000000       1.000000e+03       1000.000000
mean   1499.500000       1.157725e+05          2.660656
std     288.819436       5.028997e+05          1.741875
min    1000.000000       6.600000e+01         -1.000000
25%    1249.750000       2.570000e+02          1.428969
50%    1499.500000       4.710000e+02          2.769397
75%    1749.250000       7.042500e+02          4.180860
max    1999.000000       2.307390e+06          5.000000
             user_id    product_id
count   35990.000000  35990.000000
mean   106017.080161   1500.232898
std      3483.480090    288.101984
min    100001.000000   1000.000000
25%    102976.500000   1250.000000
50%    106060.000000   1503.000000
75%    109049.000000   1749.000000
max    111999.000000   1999.000000


In [4]:
# Step 1: In user_features.csv there are some places where the values are empty. I am going to fill these empty values ith 0s
# About 10% of the data had records where one of the columns was empty
user_features_df=user_features_df.fillna(0)

# Step 2: In user_features.csv there is a column 'number_of_clicks_before' where the values are 6+ which are not proper integers. 
# Thus I'm assuming that these values are outliers and as a result I'm dropping those records from the dataframe
# About 25% of the data had records where of the columns had +6 as the number_of_clicks_before
user_features_df = user_features_df.drop(user_features_df[user_features_df['number_of_clicks_before'] == '6+'].index)

# Step 3: In product_features.csv file there are some records where the number of reviews are above 20000 while most other products have 
# the total number of reviews between 100 and 1000. These records where the number of reviews are inordinatily high are outliers and would end up rpoducing inacuurate results by the ML models
# Therefore, I am dropping those records where the number_of_reviews exceed 1000
# About 10% of the records had the number of reviews beyond 1000
product_features_df = product_features_df.drop(product_features_df[product_features_df['number_of_reviews'] > 1000].index)
product_features_df = product_features_df.drop(product_features_df[product_features_df['avg_review_score'] < 0].index)

# Step 4: The fourth step is to merge all the dataframe based on their user_id and product_id
final_df = pd.merge(user_features_df, click_history_df, on='user_id').merge(product_features_df, on='product_id')

# Step 5: The last step os to drop any columns that are not required for predictions to find the number of clicks. In this case personal interests and category are not relevant to predict the clicks
final_df.drop(['personal_interests','category'],axis=1,inplace=True)
print(final_df)

       user_id number_of_clicks_before  ordered_before  product_id  clicked  \
0       104939                       2            True        1212    False   
1       101992                       1            True        1212    False   
2       110175                       0            True        1212     True   
3       111017                       1           False        1212    False   
4       103186                       0           False        1212    False   
...        ...                     ...             ...         ...      ...   
26121   102329                       4            True        1287    False   
26122   100121                       0           False        1287     True   
26123   106853                       0            True        1287     True   
26124   109908                       0            True        1287     True   
26125   104387                       2            True        1287     True   

       on_sale  number_of_reviews  avg_review_score

In [5]:
## Splitting data into train and test sets
# The code below is splitting the data into train and test data. The train and test data have been split into a ratio of 0.7:0.3 as requested
from sklearn.model_selection import train_test_split
import sklearn.metrics
X = final_df.iloc[ : , final_df.columns != 'clicked']
y = final_df.clicked
X_train, X_test,y_train, y_test = train_test_split(X,y,test_size=0.30,shuffle=True,random_state=20)

In [6]:
## MODEL 1: LOGISTIC REGRESSION
# This code is using the logistic regresssion model and the train data and test data that was derived from the previous code and determining how well the model performs
from sklearn.linear_model import LogisticRegression
import warnings
from sklearn.exceptions import ConvergenceWarning
import numpy as np
import sklearn.metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score, recall_score
warnings.filterwarnings("ignore")
lr = LogisticRegression(random_state=1)
lr.fit(X_train,y_train)
l_train_pred = lr.predict(X_train)
l_test_pred = lr.predict(X_test)
print("Accuracy score is :" + str(accuracy_score(y_test,l_test_pred)))
print("precision score is :" + str(precision_score(y_test,l_test_pred,zero_division=0)))
print("F1 score is :" + str(f1_score(y_test,l_test_pred,zero_division=0)))
print("recall score is :" + str(recall_score(y_test,l_test_pred,zero_division=0)))
print(sklearn.metrics.classification_report(y_test,l_test_pred))


Accuracy score is :0.6645828017351365
precision score is :0.0
F1 score is :0.0
recall score is :0.0
              precision    recall  f1-score   support

       False       0.66      1.00      0.80      5209
        True       0.00      0.00      0.00      2629

    accuracy                           0.66      7838
   macro avg       0.33      0.50      0.40      7838
weighted avg       0.44      0.66      0.53      7838



In [7]:
## MODEL 2: GAUSSIAN NB
# This code is using the gaussian nb model and the train data and test data that was derived from the previous code and determining how well the model performs
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score, recall_score
nb = GaussianNB()
nb.fit(X_train,y_train)
nb_train_pred = nb.predict(X_train)
nb_test_pred = nb.predict(X_test)
print("Accuracy score is :" + str(accuracy_score(y_test,nb_test_pred)))
print("precision score is :" + str(precision_score(y_test,nb_test_pred)))
print("F1 score is :" + str(f1_score(y_test,nb_test_pred)))
print("recall score is :" + str(recall_score(y_test,nb_test_pred)))
print(sklearn.metrics.classification_report(y_test,nb_test_pred))

Accuracy score is :0.7217402398571064
precision score is :0.6408805031446541
F1 score is :0.48305285612704435
recall score is :0.38759984785089385
              precision    recall  f1-score   support

       False       0.74      0.89      0.81      5209
        True       0.64      0.39      0.48      2629

    accuracy                           0.72      7838
   macro avg       0.69      0.64      0.65      7838
weighted avg       0.71      0.72      0.70      7838



In [8]:
## MODEL 3: DECISION TREE
# This code is using the decision tree model and the train data and test data that was derived from the previous code to determining how well the model performs
from sklearn import tree
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score, recall_score
DT = tree.DecisionTreeClassifier(max_depth=10,min_samples_split=5,random_state=1)
DT.fit(X_train,y_train)
dt_train_pred = DT.predict(X_train)
dt_test_pred = DT.predict(X_test)
print("Accuracy score is: " + str(accuracy_score(y_test,dt_test_pred)))
print("precision score is: " + str(precision_score(y_test,dt_test_pred)))
print("F1 score is: " + str(f1_score(y_test,dt_test_pred)))
print("recall score is: " + str(recall_score(y_test,dt_test_pred)))
print(sklearn.metrics.classification_report(y_test,dt_test_pred))

Accuracy score is: 0.7438121969890278
precision score is: 0.6253532498990715
F1 score is: 0.6067371719545632
recall score is: 0.5891974134651959
              precision    recall  f1-score   support

       False       0.80      0.82      0.81      5209
        True       0.63      0.59      0.61      2629

    accuracy                           0.74      7838
   macro avg       0.71      0.71      0.71      7838
weighted avg       0.74      0.74      0.74      7838



In [9]:
## MODEL 4: NEURAL NETWORKING
# This code is using the neural network model and the train data and test data that was derived from the previous code to determining how well the model performs
from sklearn.preprocessing import MinMaxScaler
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score, recall_score
scalar = MinMaxScaler()
X_train_scaled = scalar.fit_transform(X_train)
X_test_scaled = scalar.transform(X_test)
NN= MLPClassifier(solver="lbfgs",alpha=1e-5,hidden_layer_sizes=(10,4),random_state=1)
NN.fit(X_train_scaled,y_train)
NN_pred = NN.predict(X_test_scaled)
print("Accuracy score is :" + str(accuracy_score(NN_pred,y_test)))
print("precision score is :" + str(precision_score(NN_pred,y_test)))
print("F1 score is :" + str(f1_score(NN_pred,y_test)))
print("recall score is :" + str(recall_score(NN_pred,y_test)))
print(sklearn.metrics.classification_report(y_test,NN_pred))


Accuracy score is :0.7612911457004338
precision score is :0.5781666032712058
F1 score is :0.6190185298309918
recall score is :0.6660823838737949
              precision    recall  f1-score   support

       False       0.80      0.85      0.83      5209
        True       0.67      0.58      0.62      2629

    accuracy                           0.76      7838
   macro avg       0.73      0.72      0.72      7838
weighted avg       0.76      0.76      0.76      7838



In [10]:
## MODEL 5: SUPPORT VECTOR MACHINES - LINEAR
from sklearn import svm
from sklearn import metrics
svm_linear = svm.SVC(kernel='linear')
svm_linear.fit(X_train,y_train)
svm_linear_pred = svm_linear.predict(X_test)
print("The accuracy score of linear kernel is: " + str(metrics.accuracy_score(y_test,svm_linear_pred)))
print("precision score is :" + str(precision_score(y_test,svm_linear_pred)))
print("F1 score is :" + str(f1_score(y_test,svm_linear_pred)))
print("recall score is :" + str(recall_score(y_test,svm_linear_pred)))

The accuracy score of linear kernel is: 0.6593518754784384
precision score is :0.14035087719298245
F1 score is :0.005956813104988831
recall score is :0.0030429821224800305


In [11]:
## MODEL 6: SUPPORT VECTOR MACHINES - RBF
from sklearn import svm
from sklearn import metrics
svm_rbf = svm.SVC(kernel='rbf')
svm_rbf.fit(X_train,y_train)
svm_rbf_pred = svm_rbf.predict(X_test)
print("The accuracy score of rbf kernel is: " + str(metrics.accuracy_score(y_test,svm_rbf_pred)))
print("precision score is :" + str(precision_score(y_test,svm_rbf_pred)))
print("F1 score is :" + str(f1_score(y_test,svm_rbf_pred)))
print("recall score is :" + str(recall_score(y_test,svm_rbf_pred)))

The accuracy score of rbf kernel is: 0.6645828017351365
precision score is :0.0
F1 score is :0.0
recall score is :0.0


In [12]:
## MODEL 7: SUPPORT VECTOR MACHINES - poly
from sklearn import svm
from sklearn import metrics
svm_poly = svm.SVC(kernel='poly')
svm_poly.fit(X_train,y_train)
svm_poly_pred = svm_poly.predict(X_test)
print("The accuracy score of poly kernel is: " + str(metrics.accuracy_score(y_test,svm_poly_pred)))
print("precision score is :" + str(precision_score(y_test,svm_poly_pred)))
print("F1 score is :" + str(f1_score(y_test,svm_poly_pred)))
print("recall score is :" + str(recall_score(y_test,svm_poly_pred)))

The accuracy score of poly kernel is: 0.6645828017351365
precision score is :0.0
F1 score is :0.0
recall score is :0.0


In [13]:
## MODEL 8: RANDOM FOREST CLASSIFIER
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
import numpy as np

# create a random forest classifier
rf = RandomForestClassifier(random_state=0)

# define a range of values for n_estimators to try
n_estimators_values = [100, 200, 300, 400, 500, 600]

# perform 5-fold cross-validation for each value of n_estimators
cv_scores = []
for n_estimator in n_estimators_values:
    rf.set_params(n_estimators=n_estimator)
    scores = cross_val_score(rf, X_train, y_train, cv=10)
    cv_scores.append(np.mean(scores))

# find the best value of n_estimators
best_n_estimators = n_estimators_values[np.argmax(cv_scores)]

# train the final model using the best value of n_estimators
rf.set_params(n_estimators=best_n_estimators)
rf.fit(X_train, y_train)
rf_pred = rf.predict(X_test)
print("The accuracy score is: " + str(metrics.accuracy_score(rf_pred,y_test)))
print("precision score is :" + str(precision_score(rf_pred,y_test)))
print("F1 score is :" + str(f1_score(rf_pred,y_test)))
print("recall score is :" + str(recall_score(rf_pred,y_test)))

The accuracy score is: 0.7241643276346007
precision score is :0.5424115633320654
F1 score is :0.5688073394495413
recall score is :0.5979035639412998


In [14]:
## MODEL 9: ADA BOOST CLASSIFIER
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import cross_val_score
import numpy as np

# create a random forest classifier
AD = AdaBoostClassifier(random_state=0)

# define a range of values for n_estimators to try
n_estimators_values = [100, 200, 300, 400, 500, 600]

# perform 5-fold cross-validation for each value of n_estimators
cv_scores = []
for n_estimators in n_estimators_values:
    AD.set_params(n_estimators=n_estimators,learning_rate=0.05)
    scores = cross_val_score(AD, X_train, y_train)
    cv_scores.append(scores.mean())

# find the best value of n_estimators
best_n_estimators = n_estimators_values[np.argmax(cv_scores)]
# train the final model using the best value of n_estimators
AD.set_params(n_estimators=best_n_estimators)
AD.fit(X_train, y_train)
AD_pred = AD.predict(X_test)
print("The accuracy score is: " + str(metrics.accuracy_score(AD_pred,y_test)))
print("precision score is :" + str(precision_score(AD_pred,y_test)))
print("F1 score is :" + str(f1_score(AD_pred,y_test)))
print("recall score is :" + str(recall_score(AD_pred,y_test)))

The accuracy score is: 0.7633324827762185
precision score is :0.5793077215671358
F1 score is :0.6215058151397674
recall score is :0.6703345070422535
