In [None]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn import svm
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn import model_selection
from sklearn.model_selection import cross_val_score
from sklearn.impute import SimpleImputer

### Load and preprocess the manually labelled data

In [None]:
# Read every sheet of the labelled workbook; sheet_name=None returns a dict of DataFrames.
# NOTE(review): absolute local path — the notebook only runs on the author's machine as written.
df_all = pd.read_excel(
    "/Users/mengkaiqi/Documents/ISE 540/project/manual_labelling_data.xlsx",
    sheet_name=None,
)
# Stack the sheets into one frame and key the rows by their Row_number column.
df = pd.concat(df_all, ignore_index=True).set_index("Row_number")
df.head()

In [None]:
# Column dtypes of the labelled frame.
df.dtypes
# Manually fill the two empty rows.
# NOTE(review): no code in this notebook fills them, so this was presumably done in the workbook — confirm.

Review                 object
Total_star              int64
Value                   int64
Size                    int64
Comfort_Drive           int64
Interior                int64
Appearance_Exterior     int64
Power_Performance       int64
Safety                  int64
Mpg_Efficiency          int64
Maintanence             int64
dtype: object

In [None]:
# Dtypes and non-null counts of the labelled frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1000 entries, 14823 to 12006
Data columns (total 11 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   Review               1000 non-null   object
 1   Total_star           1000 non-null   int64 
 2   Value                1000 non-null   int64 
 3   Size                 1000 non-null   int64 
 4   Comfort_Drive        1000 non-null   int64 
 5   Interior             1000 non-null   int64 
 6   Appearance_Exterior  1000 non-null   int64 
 7   Power_Performance    1000 non-null   int64 
 8   Safety               1000 non-null   int64 
 9   Mpg_Efficiency       1000 non-null   int64 
 10  Maintanence          1000 non-null   int64 
dtypes: int64(10), object(1)
memory usage: 93.8+ KB


### Load the test data

In [None]:
# Full review dataset; df_test keeps only the review text that the pipelines predict on.
# NOTE(review): absolute local path — the notebook only runs on the author's machine as written.
df_a = pd.read_csv(
    "/Users/mengkaiqi/Documents/ISE 540/project/data_cleaned.csv"
)
df_test = df_a[["Review"]]
df_test.shape

(18449, 1)

In [None]:
# Last 1000 reviews of the full dataset, shown for inspection.
df_cc = df_test.iloc[-1000:]
df_cc

Unnamed: 0,Review
17449,Bought this car new in April 2015. Will save ...
17450,"It’s a strikingly beautiful car, mine is Subli..."
17451,I love this 'new to me' Challenger R/T Plus HE...
17452,This car is extremely dangerous. We purchased...
17453,Very great cars to buy!
...,...
18444,My wife and I recently retired and wanted a ne...
18445,Just bought one yesterday in the SX trim and i...
18446,It is great. Enough room for everyone plus car...
18447,Other than UVO not available in my state is th...


### Classification Models

In [None]:
# RandomForest on TF-IDF features, trained on the "Value" labels.
tfidf_vectorizer_sub = TfidfVectorizer()
# random_state pins the forest's bootstrap and feature sampling so the class counts
# below are reproducible from run to run.
# NOTE(review): max_features=7075 is presumably the TF-IDF vocabulary size of the
# labelled set (i.e. every feature is considered at each split) — confirm.
rf = RandomForestClassifier(
    max_features=7075,
    max_depth=50,
    n_estimators=1000,
    random_state=42,
)
tfidf_rf_pipe = Pipeline([('tfidf', tfidf_vectorizer_sub), ('rf', rf)])  # vectorise, then classify
tfidf_rf_pipe.fit(df.Review, df.Value)  # train on the manually labelled reviews
# Label every review in df_test and show the predicted class distribution.
# No evaluation happens here: df_test carries no ground-truth labels.
y_sub_preds_rf = tfidf_rf_pipe.predict(df_test.Review)
np.unique(y_sub_preds_rf, return_counts=True)

(array([-1,  0,  1]), array([   33, 16522,  1894]))

In [None]:
# 4-fold cross-validated accuracy of the random-forest pipeline on the labelled reviews.
scores = cross_val_score(
    estimator=tfidf_rf_pipe,
    X=df.Review,
    y=df.Value,
    cv=4,
    scoring="accuracy",
)
scores.mean()

0.857


In [None]:
# KNN on TF-IDF features (KNeighborsClassifier with its default settings).
tfidf_vectorizer_sub = TfidfVectorizer()
knn = KNeighborsClassifier()
# The variable name says `lr`, but the classifier wrapped here is KNN.
tfidf_lr_sub_pipe = Pipeline(steps=[
    ('tfidf', tfidf_vectorizer_sub),
    ('knn', knn),
])
# Train on the labelled reviews with "Value" as the target.
tfidf_lr_sub_pipe.fit(df["Review"], df["Value"])
# Label every review in df_test and show the predicted class distribution.
y_sub_preds_lr = tfidf_lr_sub_pipe.predict(df_test["Review"])
np.unique(y_sub_preds_lr, return_counts=True)

(array(['-1.0', '0.0', '1.0'], dtype=object), array([   38, 17904,   507]))

In [None]:
# 4-fold cross-validated accuracy of the pipeline currently bound to tfidf_lr_sub_pipe.
scores = cross_val_score(
    estimator=tfidf_lr_sub_pipe,
    X=df.Review,
    y=df.Value,
    cv=4,
    scoring="accuracy",
)
scores.mean()



0.846

In [None]:
# Multinomial naive Bayes on TF-IDF features (default settings).
tfidf_vectorizer_sub = TfidfVectorizer()
mnb = MultinomialNB()
# The variable name says `lr`, but the classifier wrapped here is MultinomialNB.
tfidf_lr_sub_pipe = Pipeline(steps=[
    ('tfidf', tfidf_vectorizer_sub),
    ('mnb', mnb),
])
# Train on the labelled reviews with "Value" as the target.
tfidf_lr_sub_pipe.fit(df["Review"], df["Value"])
# Label every review in df_test and show the predicted class distribution.
y_sub_preds_lr = tfidf_lr_sub_pipe.predict(df_test["Review"])
np.unique(y_sub_preds_lr, return_counts=True)

(array(['0.0'], dtype='<U4'), array([18449]))

In [None]:
# 4-fold cross-validated accuracy of the pipeline currently bound to tfidf_lr_sub_pipe.
scores = cross_val_score(
    estimator=tfidf_lr_sub_pipe,
    X=df.Review,
    y=df.Value,
    cv=4,
    scoring="accuracy",
)
scores.mean()



0.851

In [None]:
# Support-vector classifier on TF-IDF features.
# NOTE(review): the variable and the pipeline step are called `rbf`, but the kernel
# configured here is polynomial (degree 3) — confirm which one was intended.
tfidf_vectorizer_sub = TfidfVectorizer()
rbf = SVC(C=1, kernel='poly', degree=3)
tfidf_lr_sub_pipe = Pipeline(steps=[
    ('tfidf', tfidf_vectorizer_sub),
    ('rbf', rbf),
])
# Train on the labelled reviews with "Value" as the target.
tfidf_lr_sub_pipe.fit(df["Review"], df["Value"])
# Label every review in df_test and show the predicted class distribution.
y_sub_preds_lr = tfidf_lr_sub_pipe.predict(df_test["Review"])
np.unique(y_sub_preds_lr, return_counts=True)

(array(['-1.0', '0.0', '1.0', 'nan'], dtype=object),
 array([   30, 18298,   119,     2]))

In [None]:
# 4-fold cross-validated accuracy of the pipeline currently bound to tfidf_lr_sub_pipe.
scores = cross_val_score(
    estimator=tfidf_lr_sub_pipe,
    X=df.Review,
    y=df.Value,
    cv=4,
    scoring="accuracy",
)
scores.mean()



0.851

### Predict

In [None]:
# RandomForest on TF-IDF features, refit on the "Value" labels before the prediction run.
tfidf_vectorizer_sub = TfidfVectorizer()
# random_state pins the forest's bootstrap and feature sampling so repeated runs
# produce the same predictions.
# NOTE(review): max_features=7075 is presumably the TF-IDF vocabulary size of the
# labelled set (i.e. every feature is considered at each split) — confirm.
rf = RandomForestClassifier(
    max_features=7075,
    max_depth=50,
    n_estimators=1000,
    random_state=42,
)
tfidf_rf_pipe = Pipeline([('tfidf', tfidf_vectorizer_sub), ('rf', rf)])  # vectorise, then classify
tfidf_rf_pipe.fit(df.Review, df.Value)  # train on the manually labelled reviews
# Label every review in df_test and show the predicted class distribution.
# No evaluation happens here: df_test carries no ground-truth labels.
y_sub_preds_rf = tfidf_rf_pipe.predict(df_test.Review)
np.unique(y_sub_preds_rf, return_counts=True)

In [None]:
# Output frame with the same columns as the labelled data. Review and Total_star are
# copied from the full dataset; the remaining columns stay empty until the prediction
# loop fills them.
table = pd.DataFrame(columns=df.columns)
table['Review'] = df_test.Review
table['Total_star'] = df_a.Total_star
# The preview must come after the definition so the notebook survives Restart & Run All.
table.head()


In [None]:
# Train one random-forest pipeline per aspect column and label every review in df_test.
# Positions 0 and 1 of df are Review and Total_star; positions 2..10 hold the aspect labels.
for aspect in df.columns[2:11]:
    tfidf_vectorizer_sub = TfidfVectorizer()
    # random_state makes the per-aspect predictions reproducible.
    # NOTE(review): n_estimators is left at the library default here, unlike the
    # 1000 trees used for the "Value" model elsewhere in this notebook — confirm intended.
    rf = RandomForestClassifier(max_features=7075, max_depth=50, random_state=42)
    tfidf_rf_pipe = Pipeline([('tfidf', tfidf_vectorizer_sub), ('rf', rf)])  # vectorise, then classify
    tfidf_rf_pipe.fit(df.Review, df[aspect])  # train on this aspect's manual labels
    y_sub_preds_rf = tfidf_rf_pipe.predict(df_test.Review)

    # Assign by column name: this replaces the column, so it takes the dtype of the
    # predictions. Positional `table.iloc[:, i] = ...` writes in place on recent pandas
    # and would leave the column as object dtype.
    table[aspect] = y_sub_preds_rf

In [None]:
# Inspect dtypes and non-null counts of the prediction table.
table.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18449 entries, 0 to 18448
Data columns (total 11 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   Review               18449 non-null  object
 1   Total_star           18449 non-null  int64 
 2   Value                18449 non-null  int64 
 3   Size                 18449 non-null  int64 
 4   Comfort_Drive        18449 non-null  int64 
 5   Interior             18449 non-null  int64 
 6   Appearance_Exterior  18449 non-null  int64 
 7   Power_Performance    18449 non-null  int64 
 8   Safety               18449 non-null  int64 
 9   Mpg_Efficiency       18449 non-null  int64 
 10  Maintanence          18449 non-null  int64 
dtypes: int64(10), object(1)
memory usage: 1.5+ MB


In [None]:
# Display the prediction table (pandas truncates the rendering of large frames).
table

Unnamed: 0,Review,Total_star,Value,Size,Comfort_Drive,Interior,Appearance_Exterior,Power_Performance,Safety,Mpg_Efficiency,Maintanence
0,"I recently traded in my 2017 Honda HR-V in ""Ba...",5,1,1,0,1,1,0,0,1,0
1,Recently purchased a Taos in the base trim (S)...,5,0,0,1,0,0,0,0,1,0
2,This car feels premium and looks handsome. It...,5,1,0,1,0,1,0,0,0,0
3,"Bought the White SEL, love everything about it...",1,0,0,0,0,1,0,1,0,-1
4,"The FWD Taos S is a sporty, fun drive. It look...",5,1,1,1,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...
18444,My wife and I recently retired and wanted a ne...,5,1,0,1,1,0,0,0,0,1
18445,Just bought one yesterday in the SX trim and i...,5,0,0,0,0,0,0,0,0,0
18446,It is great. Enough room for everyone plus car...,5,0,1,0,0,0,0,0,0,0
18447,Other than UVO not available in my state is th...,3,0,0,0,0,0,0,0,0,0


In [None]:
# Write the predictions to CSV with a header row and without the index column.
# NOTE(review): absolute local path — the notebook only runs on the author's machine as written.
file_name = '/Users/mengkaiqi/Documents/ISE 540/project/classifier_output.csv'
table.to_csv(path_or_buf=file_name, header=True, index=False)