### Import packages

In [6]:
import numpy as np
import pandas as pd
import csv
import chardet
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
import random
from scipy import stats

# Model we generated

In [7]:
# Load the classifier output: one row per review, holding the review text, the
# star ratings and the per-aspect *_label columns used as features below.
df = pd.read_csv('classifier_output_latest.csv')
df

Unnamed: 0.1,Unnamed: 0,Index,Type,Make,Model,Year,Review,Total_star,Safety,Technology,...,Value,Value_label,Size_label,Comfort_Drive_label,Interior_label,Appearance_Exterior_label,Power_Performance_label,Safety_label,Mpg_Efficiency_label,Maintanence_label
0,0,0,SUVs,volkswagen,taos,2022,"I recently traded in my 2017 Honda HR-V in ""Ba...",5,5,4,...,5,1,1,0,1,1,0,0,1,0
1,1,2,SUVs,volkswagen,taos,2022,Recently purchased a Taos in the base trim (S)...,5,5,5,...,5,0,0,1,0,0,0,0,1,0
2,2,3,SUVs,volkswagen,taos,2022,This car feels premium and looks handsome. It...,5,0,5,...,5,1,0,1,0,1,0,0,0,0
3,3,4,SUVs,volkswagen,taos,2022,"Bought the White SEL, love everything about it...",1,1,3,...,2,0,0,0,0,1,0,1,0,-1
4,4,5,SUVs,volkswagen,taos,2022,"The FWD Taos S is a sporty, fun drive. It look...",5,4,4,...,5,1,1,1,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18444,18444,27642,Minivans,kia,carnival,2022,My wife and I recently retired and wanted a ne...,5,5,5,...,5,1,0,1,1,0,0,0,0,1
18445,18445,27643,Minivans,kia,carnival,2022,Just bought one yesterday in the SX trim and i...,5,5,5,...,5,0,0,0,0,0,0,0,0,0
18446,18446,27644,Minivans,kia,carnival,2022,It is great. Enough room for everyone plus car...,5,5,5,...,5,0,1,0,0,0,0,0,0,0
18447,18447,27645,Minivans,kia,carnival,2022,Other than UVO not available in my state is th...,3,5,5,...,3,0,0,0,0,0,0,0,0,0


In [8]:
# Features: the nine per-aspect labels produced by our classifier.
# Selected by name rather than by position so that a shifted or re-ordered
# CSV column cannot silently feed the wrong columns to the model.
X = df[['Value_label', 'Size_label', 'Comfort_Drive_label', 'Interior_label',
        'Appearance_Exterior_label', 'Power_Performance_label', 'Safety_label',
        'Mpg_Efficiency_label', 'Maintanence_label']]
# Target: the overall star rating of the review.
y = df["Total_star"]

In [9]:
# Display the selected feature matrix as a sanity check.
X

Unnamed: 0,Value_label,Size_label,Comfort_Drive_label,Interior_label,Appearance_Exterior_label,Power_Performance_label,Safety_label,Mpg_Efficiency_label,Maintanence_label
0,1,1,0,1,1,0,0,1,0
1,0,0,1,0,0,0,0,1,0
2,1,0,1,0,1,0,0,0,0
3,0,0,0,0,1,0,1,0,-1
4,1,1,1,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...
18444,1,0,1,1,0,0,0,0,1
18445,0,0,0,0,0,0,0,0,0
18446,0,1,0,0,0,0,0,0,0
18447,0,0,0,0,0,0,0,0,0


In [10]:
# Tune the depth of a decision-tree regressor with 5-fold cross-validation,
# predicting the star rating y from the label features in X.
model = DecisionTreeRegressor(random_state=0)
params = {'max_depth': list(range(3, 30))}
grid = GridSearchCV(model, param_grid=params, cv=5)
grid.fit(X, y)
print(grid.best_params_)

# NOTE(review): the predictions and both metrics below are computed on the
# same X, y that were passed to fit(), so they are in-sample figures;
# grid.best_score_ holds the cross-validated score of the selected depth.
y_pred = grid.predict(X)
R2 = grid.score(X, y)
MAE = mean_absolute_error(y, y_pred)
print("The R square of Decision Tree Regression model is:", R2)
print("The MAE of Decision Tree Regression model is:", MAE)

{'max_depth': 8}
The R square of Drcision Tree Regression model is: 0.29226408485815536
The MAE of Decision Tree Regression model is: 0.8476653384845251


In [11]:
# Snap the continuous tree outputs to the nearest whole star so they can be
# scored like class labels, then tabulate true ratings against predictions.
# y_pred1 is kept under its own name because later sections compare against it.
y_true = y.to_numpy()
y_pred1 = np.round(y_pred).astype(np.int64)
confusion_matrix(y_true, y_pred1)

array([[  11,  742,  138,  695,   72],
       [   0,  385,  124,  594,  134],
       [   0,  302,  209,  845,  283],
       [   0,  178,  157, 1607, 1217],
       [   0,  180,  184, 5170, 5222]])

In [12]:
def evaluate_model(X, y):
    """Return accuracy and support-weighted precision, recall and F1.

    Despite their names, the arguments are two label vectors rather than a
    feature matrix and a target: every call in this notebook passes the true
    labels as ``X`` and the predicted labels as ``y``, and they are forwarded
    to the scikit-learn metric functions in that order.

    Parameters
    ----------
    X : array-like
        Ground-truth labels (forwarded as the first metric argument).
    y : array-like
        Predicted labels (forwarded as the second metric argument).

    Returns
    -------
    tuple
        ``(accuracy, precision, recall, f1)``; the last three are computed
        with ``average='weighted'``.
    """
    acc = accuracy_score(X, y)
    # zero_division=0 keeps the score of a class that is never predicted (or
    # never present) at 0 -- the value scikit-learn already falls back to --
    # without emitting an UndefinedMetricWarning on every call.
    pre = precision_score(X, y, average='weighted', zero_division=0)
    rec = recall_score(X, y, average='weighted', zero_division=0)
    f1s = f1_score(X, y, average='weighted', zero_division=0)
    return acc, pre, rec, f1s

In [13]:
# Accuracy / weighted precision / weighted recall / weighted F1 of the model
# trained on our classifier labels.
(model1_acc,
 model1_pre,
 model1_rec,
 model1_f1s) = evaluate_model(y_true, y_pred1)
print(model1_acc, model1_pre, model1_rec, model1_f1s)

0.40294866930456935 0.5975078763908014 0.40294866930456935 0.42332483150334593


In [14]:
# Plain accuracy of the rounded predictions, computed directly.
model1_acc = accuracy_score(y_true=y_true, y_pred=y_pred1)
model1_acc

0.40294866930456935

In [15]:
# Macro averaging: score every star class on its own, then take the plain
# (unweighted) mean, so label imbalance is not taken into account.
# Printed in the order precision, recall, F1.
for score_fn in (precision_score, recall_score, f1_score):
    print(score_fn(y_true, y_pred1, average='macro'))

0.48138516721062885
0.28791797897413957
0.25904484168707853


In [16]:
# Micro averaging: pool the true positives, false negatives and false
# positives of all classes and compute each metric once on the totals.
# Printed in the order precision, recall, F1.
for score_fn in (precision_score, recall_score, f1_score):
    print(score_fn(y_true, y_pred1, average='micro'))

0.40294866930456935
0.40294866930456935
0.40294866930456935


In [17]:
# Weighted averaging: score every star class on its own, then average the
# per-class values weighted by support (the number of true instances of each
# class). This adjusts 'macro' for label imbalance and can yield an F-score
# that does not lie between precision and recall.
# Printed in the order precision, recall, F1.
for score_fn in (precision_score, recall_score, f1_score):
    print(score_fn(y_true, y_pred1, average='weighted'))

0.5975078763908014
0.40294866930456935
0.42332483150334593


In [18]:
# Random baseline: for every label column draw a random label per review,
# using the empirical frequency of each label in the real classifier output.
category = ['Value_label','Size_label','Comfort_Drive_label','Interior_label','Appearance_Exterior_label','Power_Performance_label','Safety_label','Mpg_Efficiency_label','Maintanence_label']
# Dedicated, seeded generator so the baseline (and everything derived from
# it) is reproducible without touching the global random state.
rng = random.Random(0)
random_dict = {}
for i in category:
    weight = df[i].value_counts(normalize = True)
    # value_counts() orders its result by frequency, not by label value, so
    # the labels must be read from its index; pairing the weights with a
    # hard-coded [0, 1, -1] mislabels them whenever that is not the frequency
    # order, and fails outright for a column with fewer than three labels.
    random_score = rng.choices(weight.index.tolist(),
                               weights = weight.tolist(), k=df.shape[0])
    random_dict[i] = random_score

df_random = pd.DataFrame(random_dict)
df_random

Unnamed: 0,Value_label,Size_label,Comfort_Drive_label,Interior_label,Appearance_Exterior_label,Power_Performance_label,Safety_label,Mpg_Efficiency_label,Maintanence_label
0,0,1,0,0,0,1,0,0,0
1,0,0,0,0,0,1,0,1,1
2,0,0,1,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0
4,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...
18444,0,0,1,0,0,0,0,0,1
18445,0,0,1,0,0,1,0,0,0
18446,0,0,0,0,0,0,0,0,0
18447,1,1,0,-1,0,0,0,0,0


In [19]:
# Swap in the randomly generated labels as features; the target is unchanged.
y = df["Total_star"]
X = df_random.iloc[:, :9]

In [20]:
# Same depth search as for the real labels, now fitted on the random-label
# baseline features currently held in X.
model = DecisionTreeRegressor(random_state=0)
params = {'max_depth': list(range(3, 30))}
grid = GridSearchCV(model, param_grid=params, cv=5)
grid.fit(X, y)
print(grid.best_params_)

# NOTE(review): the predictions and both metrics below are computed on the
# same X, y that were passed to fit(), so they are in-sample figures;
# grid.best_score_ holds the cross-validated score of the selected depth.
y_pred = grid.predict(X)
R2 = grid.score(X, y)
MAE = mean_absolute_error(y, y_pred)
print("The R square of Decision Tree Regression model is:", R2)
print("The MAE of Decision Tree Regression model is:", MAE)

{'max_depth': 3}
The R square of Drcision Tree Regression model is: 0.0016761869598874712
The MAE of Decision Tree Regression model is: 1.0594939808008579


In [21]:
# Round the baseline predictions to whole stars (replacing the float
# predictions in y_pred) and tabulate true ratings against predictions.
y_true = y.to_numpy()
y_pred = np.round(y_pred).astype(np.int64)
confusion_matrix(y_true, y_pred)

array([[    2,     2,     4,  1650,     0],
       [    0,     0,     5,  1232,     0],
       [    0,     1,     1,  1637,     0],
       [    0,     1,     6,  3152,     0],
       [    0,     0,     8, 10748,     0]])

In [22]:
# Accuracy / weighted precision / weighted recall / weighted F1 of the
# random-label baseline.
(model2_acc,
 model2_pre,
 model2_rec,
 model2_f1s) = evaluate_model(y_true, y_pred)
print(model2_acc, model2_pre, model2_rec, model2_f1s)

0.17101197896904982 0.12287299480880902 0.17101197896904982 0.05034778621900876


  _warn_prf(average, modifier, msg_start, len(result))


In [23]:
# Paired t-test between the rounded predictions of the label-based model
# (y_pred1) and those of the random baseline (y_pred), row by row.
# NOTE(review): this tests whether the mean difference between the two
# prediction vectors is zero; y_true is not involved, so it does not by
# itself compare the accuracy of the two models.
stats.ttest_rel(y_pred1, y_pred)

Ttest_relResult(statistic=21.031519543686592, pvalue=4.645381618206501e-97)

# Model based on Edmunds factors

In [24]:
# Show the full frame again to locate the sub-rating columns used next.
df

Unnamed: 0.1,Unnamed: 0,Index,Type,Make,Model,Year,Review,Total_star,Safety,Technology,...,Value,Value_label,Size_label,Comfort_Drive_label,Interior_label,Appearance_Exterior_label,Power_Performance_label,Safety_label,Mpg_Efficiency_label,Maintanence_label
0,0,0,SUVs,volkswagen,taos,2022,"I recently traded in my 2017 Honda HR-V in ""Ba...",5,5,4,...,5,1,1,0,1,1,0,0,1,0
1,1,2,SUVs,volkswagen,taos,2022,Recently purchased a Taos in the base trim (S)...,5,5,5,...,5,0,0,1,0,0,0,0,1,0
2,2,3,SUVs,volkswagen,taos,2022,This car feels premium and looks handsome. It...,5,0,5,...,5,1,0,1,0,1,0,0,0,0
3,3,4,SUVs,volkswagen,taos,2022,"Bought the White SEL, love everything about it...",1,1,3,...,2,0,0,0,0,1,0,1,0,-1
4,4,5,SUVs,volkswagen,taos,2022,"The FWD Taos S is a sporty, fun drive. It look...",5,4,4,...,5,1,1,1,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18444,18444,27642,Minivans,kia,carnival,2022,My wife and I recently retired and wanted a ne...,5,5,5,...,5,1,0,1,1,0,0,0,0,1
18445,18445,27643,Minivans,kia,carnival,2022,Just bought one yesterday in the SX trim and i...,5,5,5,...,5,0,0,0,0,0,0,0,0,0
18446,18446,27644,Minivans,kia,carnival,2022,It is great. Enough room for everyone plus car...,5,5,5,...,5,0,1,0,0,0,0,0,0,0
18447,18447,27645,Minivans,kia,carnival,2022,Other than UVO not available in my state is th...,3,5,5,...,3,0,0,0,0,0,0,0,0,0


In [25]:
# Features: the seven reviewer-supplied sub-ratings.
# Selected by name rather than by position so that a shifted or re-ordered
# CSV column cannot silently feed the wrong columns to the model.
X = df[['Safety', 'Technology', 'Performance', 'Interior', 'Comfort',
        'Reliability', 'Value']]
# Target: the overall star rating of the review.
y = df["Total_star"]

In [26]:
# Class balance of the target: number of reviews per overall star rating.
y.value_counts()

5    10756
4     3159
1     1658
3     1639
2     1237
Name: Total_star, dtype: int64

In [27]:
# Summary statistics of the sub-rating features.
# NOTE(review): a value of 0 presumably marks a sub-rating the reviewer left
# blank rather than a real score -- confirm before treating it as numeric.
X.describe()

Unnamed: 0,Safety,Technology,Performance,Interior,Comfort,Reliability,Value
count,18449.0,18449.0,18449.0,18449.0,18449.0,18449.0,18449.0
mean,3.412109,3.226787,3.468643,3.454334,3.479647,3.100005,3.060274
std,2.113483,2.061752,1.966299,2.00487,1.975602,2.193611,2.106794
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1.0,1.0,2.0,2.0,2.0,0.0,0.0
50%,5.0,4.0,4.0,4.0,4.0,4.0,4.0
75%,5.0,5.0,5.0,5.0,5.0,5.0,5.0
max,5.0,5.0,5.0,5.0,5.0,5.0,5.0


In [28]:
# Same depth search as before, now fitted on the sub-rating features
# currently held in X.
model = DecisionTreeRegressor(random_state=0)
params = {'max_depth': list(range(3, 30))}
grid = GridSearchCV(model, param_grid=params, cv=5)
grid.fit(X, y)
print(grid.best_params_)

# NOTE(review): the predictions and both metrics below are computed on the
# same X, y that were passed to fit(), so they are in-sample figures;
# grid.best_score_ holds the cross-validated score of the selected depth.
y_pred = grid.predict(X)
R2 = grid.score(X, y)
MAE = mean_absolute_error(y, y_pred)
print("The R square of Decision Tree Regression model is:", R2)
print("The MAE of Decision Tree Regression model is:", MAE)

{'max_depth': 7}
The R square of Drcision Tree Regression model is: 0.6680110866675395
The MAE of Decision Tree Regression model is: 0.4726547338070822


In [29]:
# Round the predictions to whole stars (replacing the float predictions in
# y_pred) and tabulate true ratings against predictions.
y_true = y.to_numpy()
y_pred = np.round(y_pred).astype(np.int64)
confusion_matrix(y_true, y_pred)

array([[ 816,  386,   89,  352,   15],
       [ 150,  490,  233,  343,   21],
       [  21,  270,  566,  700,   82],
       [   2,   28,  280, 1909,  940],
       [   1,   12,   75, 2155, 8513]])

In [30]:
# Accuracy / weighted precision / weighted recall / weighted F1 of the model
# trained on the sub-ratings.
# NOTE(review): the TF-IDF section further down stores its scores under the
# same model4_* names, so these values are overwritten once that cell runs.
(model4_acc,
 model4_pre,
 model4_rec,
 model4_f1s) = evaluate_model(y_true, y_pred)
print(model4_acc, model4_pre, model4_rec, model4_f1s)

0.6663775814407285 0.7206724456763008 0.6663775814407285 0.6815943678606567


In [31]:
# Paired t-test between the rounded predictions of the label-based model
# (y_pred1) and those of the sub-rating model (y_pred), row by row.
# NOTE(review): this tests whether the mean difference between the two
# prediction vectors is zero; y_true is not involved, so it does not by
# itself compare the accuracy of the two models.
stats.ttest_rel(y_pred1, y_pred)

Ttest_relResult(statistic=-3.2095305623420565, pvalue=0.0013317934563494916)

# All Edmunds columns except year

In [52]:
# One-hot encode the categorical vehicle descriptors and discard our own
# classifier labels, leaving only information that came with the review.
df2 = (
    pd.get_dummies(data=df, columns=['Type', 'Make','Model'])
    .drop(columns=['Value_label','Size_label','Comfort_Drive_label','Interior_label','Appearance_Exterior_label','Power_Performance_label','Safety_label','Mpg_Efficiency_label','Maintanence_label'])
)
df2

Unnamed: 0.1,Unnamed: 0,Index,Year,Review,Total_star,Safety,Technology,Performance,Interior,Comfort,...,Model_xc40,Model_xc60,Model_xc90,Model_xe,Model_xt4,Model_xt5,Model_xt6,Model_yaris,Model_yukon,Model_z4
0,0,0,2022,"I recently traded in my 2017 Honda HR-V in ""Ba...",5,5,4,4,4,5,...,0,0,0,0,0,0,0,0,0,0
1,1,2,2022,Recently purchased a Taos in the base trim (S)...,5,5,5,5,3,4,...,0,0,0,0,0,0,0,0,0,0
2,2,3,2022,This car feels premium and looks handsome. It...,5,0,5,5,5,5,...,0,0,0,0,0,0,0,0,0,0
3,3,4,2022,"Bought the White SEL, love everything about it...",1,1,3,3,4,4,...,0,0,0,0,0,0,0,0,0,0
4,4,5,2022,"The FWD Taos S is a sporty, fun drive. It look...",5,4,4,5,4,5,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18444,18444,27642,2022,My wife and I recently retired and wanted a ne...,5,5,5,5,5,5,...,0,0,0,0,0,0,0,0,0,0
18445,18445,27643,2022,Just bought one yesterday in the SX trim and i...,5,5,5,5,5,5,...,0,0,0,0,0,0,0,0,0,0
18446,18446,27644,2022,It is great. Enough room for everyone plus car...,5,5,5,5,5,5,...,0,0,0,0,0,0,0,0,0,0
18447,18447,27645,2022,Other than UVO not available in my state is th...,3,5,5,5,5,5,...,0,0,0,0,0,0,0,0,0,0


In [53]:
# Features: columns 5..314 of df2 by position, i.e. everything after the
# Total_star column (the sub-ratings followed by the one-hot dummies).
# NOTE(review): the upper bound 315 is hard-coded; confirm it equals
# df2.shape[1], otherwise trailing dummy columns are silently left out.
X = df2.iloc[:,5:315]
# Target: the overall star rating of the review.
y = df2["Total_star"]

In [54]:
# Same depth search as before, now fitted on the sub-ratings plus the one-hot
# encoded vehicle descriptors currently held in X.
model = DecisionTreeRegressor(random_state=0)
params = {'max_depth': list(range(3, 30))}
grid = GridSearchCV(model, param_grid=params, cv=5)
grid.fit(X, y)
print(grid.best_params_)

# NOTE(review): the predictions and both metrics below are computed on the
# same X, y that were passed to fit(), so they are in-sample figures;
# grid.best_score_ holds the cross-validated score of the selected depth.
y_pred = grid.predict(X)
R2 = grid.score(X, y)
MAE = mean_absolute_error(y, y_pred)
print("The R square of Decision Tree Regression model is:", R2)
print("The MAE of Decision Tree Regression model is:", MAE)

{'max_depth': 7}
The R square of Drcision Tree Regression model is: 0.6714462480118661
The MAE of Decision Tree Regression model is: 0.47539076389914453


In [55]:
# Round the predictions to whole stars (replacing the float predictions in
# y_pred) and tabulate true ratings against predictions.
y_true = y.to_numpy()
y_pred = np.round(y_pred).astype(np.int64)
confusion_matrix(y_true, y_pred)

array([[ 815,  389,  114,  324,   16],
       [ 146,  487,  237,  348,   19],
       [  17,  264,  554,  726,   78],
       [   2,   26,  277, 1933,  921],
       [   0,    7,  106, 2252, 8391]])

In [56]:
# Accuracy / weighted precision / weighted recall / weighted F1 of the model
# trained on the sub-ratings plus vehicle descriptors.
(model3_acc,
 model3_pre,
 model3_rec,
 model3_f1s) = evaluate_model(y_true, y_pred)
print(model3_acc, model3_pre, model3_rec, model3_f1s)

0.6601983847363001 0.7191235009550382 0.6601983847363001 0.676798701887198


In [57]:
# Paired t-test between the rounded predictions of the label-based model
# (y_pred1) and those of the all-columns model (y_pred), row by row.
# NOTE(review): this tests whether the mean difference between the two
# prediction vectors is zero; y_true is not involved, so it does not by
# itself compare the accuracy of the two models.
stats.ttest_rel(y_pred1, y_pred)

Ttest_relResult(statistic=-2.309080008019058, pvalue=0.020950135487802016)

# TF-IDF model

In [58]:
# Raw review text, used as the corpus for the TF-IDF representation.
X_sample = df.loc[:, 'Review']
X_sample

0        I recently traded in my 2017 Honda HR-V in "Ba...
1        Recently purchased a Taos in the base trim (S)...
2        This car feels premium and looks handsome.  It...
3        Bought the White SEL, love everything about it...
4        The FWD Taos S is a sporty, fun drive. It look...
                               ...                        
18444    My wife and I recently retired and wanted a ne...
18445    Just bought one yesterday in the SX trim and i...
18446    It is great. Enough room for everyone plus car...
18447    Other than UVO not available in my state is th...
18448                      Drives smoothly. Plenty of room
Name: Review, Length: 18449, dtype: object

In [59]:
# Turn every review into a TF-IDF weighted bag-of-words vector.
vectorizer = TfidfVectorizer()
# .toarray() yields a plain ndarray. The previous .todense() returned an
# np.matrix, which recent scikit-learn releases refuse as estimator input.
# NOTE(review): the dense array is large (reviews x vocabulary); KMeans also
# accepts the sparse matrix directly if memory becomes a problem.
review_vector = vectorizer.fit_transform(X_sample).toarray()

In [60]:
# Group the reviews into 8 K-means clusters on their TF-IDF vectors and keep
# the cluster id of each review as a named Series.
# Expect this cell to run for several minutes.
KM = KMeans(n_clusters=8, random_state=10)
clusters = pd.Series(KM.fit_predict(review_vector), name='cluster')

In [61]:
# (number of reviews, vocabulary size) of the TF-IDF matrix.
review_vector.shape

(18449, 24986)

In [62]:
# Assemble the model input: one column per TF-IDF term plus the K-means
# cluster id of each review as the last column.
dfc = pd.DataFrame(review_vector)
dfc = pd.concat([dfc,clusters],axis = 1)
# The term columns are labelled with integers and the cluster column with a
# string; recent scikit-learn releases reject frames whose column names mix
# types, so make every label a string.
dfc.columns = dfc.columns.astype(str)

In [63]:
# Display the combined TF-IDF + cluster frame as a sanity check.
dfc

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,24977,24978,24979,24980,24981,24982,24983,24984,24985,cluster
0,0.0,0.109258,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3
1,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4
2,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4
3,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2
4,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18444,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6
18445,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3
18446,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6
18447,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0


In [64]:
# Features: every column of dfc (all TF-IDF terms plus the cluster id).
# A full slice replaces the hard-coded upper bound, which was tied to the
# vocabulary size of one particular run and would silently drop trailing
# columns if the vocabulary grew.
X = dfc.iloc[:, :]
# Target: the overall star rating of the review.
y = df["Total_star"]

In [65]:
# max_depth is pinned to 7 here instead of searching 3-29 as in the other
# sections, because the full search takes hours on this feature matrix.
model = DecisionTreeRegressor(random_state=0)
params = {'max_depth': [7]}
grid = GridSearchCV(model, param_grid=params, cv=5)
grid.fit(X, y)
print(grid.best_params_)
# NOTE(review): the predictions and both metrics below are computed on the
# same X, y that were passed to fit(), so they are in-sample figures;
# grid.best_score_ holds the cross-validated score.
y_pred = grid.predict(X)
R2 = grid.score(X, y)
MAE = mean_absolute_error(y, y_pred)
print("The R square of Decision Tree Regression model is:", R2)
print("The MAE of Decision Tree Regression model is:", MAE)

{'max_depth': 7}
The R square of Drcision Tree Regression model is: 0.37882460153920505
The MAE of Decision Tree Regression model is: 0.7805722263365299


In [66]:
# Round the predictions to whole stars (replacing the float predictions in
# y_pred) and tabulate true ratings against predictions.
y_true = y.to_numpy()
y_pred = np.round(y_pred).astype(np.int64)
confusion_matrix(y_true, y_pred)

array([[ 204,  792,   67,  315,  280],
       [  36,  488,   92,  362,  259],
       [  15,  397,  104,  679,  444],
       [   1,  222,   71, 1211, 1654],
       [   0,  144,   80, 2716, 7816]])

In [67]:
# Accuracy / weighted precision / weighted recall / weighted F1 of the
# TF-IDF model.
# NOTE(review): the model4_* names were already assigned for the sub-rating
# model earlier in the notebook; this assignment overwrites those values.
(model4_acc,
 model4_pre,
 model4_rec,
 model4_f1s) = evaluate_model(y_true, y_pred)
print(model4_acc, model4_pre, model4_rec, model4_f1s)

0.5324407826982492 0.5851324962047044 0.5324407826982492 0.5269415015834539


In [68]:
# Paired t-test between the rounded predictions of the label-based model
# (y_pred1) and those of the TF-IDF model (y_pred), row by row.
# NOTE(review): this tests whether the mean difference between the two
# prediction vectors is zero; y_true is not involved, so it does not by
# itself compare the accuracy of the two models.
stats.ttest_rel(y_pred1, y_pred)

Ttest_relResult(statistic=-21.10772371616324, pvalue=9.651166055779857e-98)