In [47]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn import tree
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
import pandas as pd
import numpy as np
from zipfile import ZipFile


In [48]:
# get the zip file
file = ZipFile('../adjs.zip')

# extract the adjs.csv
df = pd.read_csv(file.open("adjs.csv"), encoding='latin1')

df

Unnamed: 0,Stars,Review,adjectives,has_great,has_good,has_nice,has_other,has_delicious,has_friendly,has_little,has_amazing,has_new,has_fresh,has_bad,has_first,has_last,has_same,has_few,has_much
0,1,I got 'new' tires from them and within two wee...,"['flat', 'local', 'previous', 'new', 'resentfu...",0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
1,1,All I can say is the worst! We were the only 2...,"['only', 'electronic', 'fish', 'filthy', 'slim...",0,1,0,0,0,0,0,0,0,0,0,1,1,0,0,0
2,1,I have been to this restaurant twice and was d...,"['first', 'empty', 'rude', 'Ridiculous', 'long...",0,1,0,1,0,0,0,0,0,0,0,1,0,0,0,0
3,1,Food was NOT GOOD at all! My husband & I ate h...,"['first', 'huge', 'much', 'runny/watery', 'muc...",0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,1
4,3,This is a tiny Starbucks and it locations like...,"['tiny', 'good', 'nice', 'central', 'favorite'...",0,1,1,0,0,0,1,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33625,1,"Not worth the time, much less the price of adm...","['colorful', 'small', 'bitter', '\\nThe', 'sev...",0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,1
33626,1,Just wanted to write a review to chip in with ...,"['little', 'worth', 'accurate', 'same', 'cool'...",0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0
33627,4,I have been to the other Monks locations so I ...,"['other', 'disappointed', 'different', 'fish',...",1,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0
33628,2,Don't go here. I know you might want to try i...,"['good', 'positive', '..', 'right', 'only', 'f...",0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0


In [49]:
# initialize features
X   = df.iloc[:,3:].values
y   = df['Stars'].values     

y.reshape(-1, 1)

X_train, X_test, y_train, y_test = train_test_split(X,              #the input features
                                                    y,              #the label
                                                    test_size=0.3,  #set aside 30% of the data as the test set
                                                    random_state=7 #reproduce the result
                                                   )

In [50]:
# initialize Random Forest model and fit
rf = RandomForestClassifier(n_estimators=100, random_state=7, max_depth = 3)
rf.fit(X_train, y_train)

In [51]:
# predict
y_pred   = rf.predict(X_test)

In [52]:
# compute accuracy
accuracy_score(y_true = y_test, y_pred = y_pred)

0.3157894736842105

In [53]:
# initialize Decision Tree model and fit
dec_tree_clf = tree.DecisionTreeClassifier(max_depth = 3)
dec_tree_clf.fit(X_train, y_train)

In [54]:
# predict
y_pred = dec_tree_clf.predict(X_test);

In [55]:
# compute accuracy
accuracy_score(y_true = y_test, y_pred = y_pred)

0.2937853107344633

In [56]:
# number of neighbors is 10
k = 10

# initialize a knn_classifier
knn_classifier = KNeighborsClassifier(n_neighbors = k)

# construction of kfold object
kfold = StratifiedKFold(n_splits = 3, shuffle = True, random_state = 1)

# allocate an empty array to store predictions in 
y_pred = np.empty_like(y)

for train_idx, test_idx in kfold.split(X, y):
    # build arrays which correspond to x, y train /test
    x_test = X[test_idx, :]
    x_train = X[train_idx, :]
    y_true_train = y[train_idx]
    


    # fit the classifier (use all available data)
    knn_classifier.fit(X, y)

    # estimate each review's star
    y_pred[test_idx] = knn_classifier.predict(x_test)

In [57]:
# compute accuracy
accuracy_score(y_true = y, y_pred = y_pred)

0.3463574189711567

##### HyperParameter Tuning for KNN

In [58]:
# list hyperparameters that we want to tune.
n_neighbors = list(range(1,30))

# cross validation method
cv = StratifiedKFold(n_splits = 3, shuffle = True, random_state = 1)

# Convert to dictionary
hyperparameters = dict(n_neighbors = n_neighbors)

# Create new KNN object

knn_2 = KNeighborsClassifier()

# Use GridSearch
clf = GridSearchCV(knn_2, hyperparameters, cv = cv, scoring='accuracy')


In [59]:
# fit the model
best_models = clf.fit(X,y)

In [60]:
# get the best model parameters
best_models.best_estimator_.get_params()

{'algorithm': 'auto',
 'leaf_size': 30,
 'metric': 'minkowski',
 'metric_params': None,
 'n_jobs': None,
 'n_neighbors': 27,
 'p': 2,
 'weights': 'uniform'}

#### Create a new model with the best parameters

In [70]:
# initialize a knn_classifier
knn_classifier = KNeighborsClassifier(algorithm = 'auto', leaf_size = 30, metric = 'minkowski', metric_params = None, 
                                      n_jobs = None, n_neighbors =  27, p = 2, weights = 'uniform')

# construction of kfold object
kfold = StratifiedKFold(n_splits = 3, shuffle = True, random_state = 1)

# allocate an empty array to store predictions in 
y_pred = np.empty_like(y)

for train_idx, test_idx in kfold.split(X, y):
    # build arrays which correspond to x, y train /test
    x_test = X[test_idx, :]
    x_train = X[train_idx, :]
    y_true_train = y[train_idx]
    


    # fit the classifier (use all available data)
    knn_classifier.fit(X, y)

    # estimate each review's star
    y_pred[test_idx] = knn_classifier.predict(x_test)

In [71]:
# compute accuracy
accuracy_score(y_true = y, y_pred = y_pred)

0.35804341361879277

##### HyperParameter Tuning for Decision Tree Classifier

In [72]:
# list hyperparameters that we want to tune.
max_depth = list(range(1,30))

# cross validation method
cv = StratifiedKFold(n_splits = 3, shuffle = True, random_state = 1)

# Convert to dictionary
hyperparameters = dict(max_depth = max_depth)

# Create new Decision Tree object

dt_2 = tree.DecisionTreeClassifier()

# Use GridSearch
clf = GridSearchCV(dt_2, hyperparameters, cv = cv, scoring='accuracy')


In [73]:
# fit the model
best_models = clf.fit(X,y)

In [74]:
# get the best model parameters
best_models.best_estimator_.get_params()

{'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': 6,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'random_state': None,
 'splitter': 'best'}

In [75]:
### Create a new model with the best parameters

# initialize a decision tree classifier
dt_classifier = tree.DecisionTreeClassifier(ccp_alpha = 0.0, class_weight = None, criterion = 'gini', 
                                            max_depth = 6, max_features = None, max_leaf_nodes = None, 
                                            min_impurity_decrease = 0.0, min_samples_leaf = 1, min_samples_split = 2, 
                                            min_weight_fraction_leaf = 0.0, random_state = None, splitter = 'best')

# construction of kfold object
kfold = StratifiedKFold(n_splits = 3, shuffle = True, random_state = 1)

# allocate an empty array to store predictions in 
y_pred = np.empty_like(y)

for train_idx, test_idx in kfold.split(X, y):
    # build arrays which correspond to x, y train /test
    x_test = X[test_idx, :]
    x_train = X[train_idx, :]
    y_true_train = y[train_idx]
    


    # fit the classifier (use all available data)
    dt_classifier.fit(X, y)

    # estimate each review's star
    y_pred[test_idx] = dt_classifier.predict(x_test)

In [76]:
# compute accuracy
accuracy_score(y_true = y, y_pred = y_pred)

0.32694023193577165

##### HyperParameter Tuning for Random Forest Classifier

In [77]:
# list hyperparameters that we want to tune.
n_estimators = list(range(10, 150, 10))
max_depth = list(range(1,30))

# cross validation method
cv = StratifiedKFold(n_splits = 3, shuffle = True, random_state = 1)

# Convert to dictionary
hyperparameters = dict(n_estimators = n_estimators, max_depth = max_depth)

# Create new Random Forest Classifier object

rf_2 = RandomForestClassifier()

# Use GridSearch
clf = GridSearchCV(rf_2, hyperparameters, cv = cv, scoring='accuracy')


In [78]:
# fit the model
best_models = clf.fit(X,y)

In [79]:
# get the best model parameters
best_models.best_estimator_.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': 8,
 'max_features': 'sqrt',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 110,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [100]:
### Create a new model with the best parameters

# initialize a random forest classifier
rf_classifier = RandomForestClassifier(bootstrap = True, ccp_alpha = 0.0, class_weight = None, criterion = 'gini', 
                                       max_depth = 8, max_features = 'sqrt', max_leaf_nodes = None, 
                                       max_samples = None, min_impurity_decrease = 0.0, min_samples_leaf = 1,
                                       min_samples_split = 2, min_weight_fraction_leaf = 0.0,
                                       n_estimators = 110, n_jobs = None, oob_score = False, random_state = None,
                                       verbose = 0, warm_start = False)

# construction of kfold object
kfold = StratifiedKFold(n_splits = 3, shuffle = True, random_state = 1)

# allocate an empty array to store predictions in 
y_pred = np.empty_like(y)

for train_idx, test_idx in kfold.split(X, y):
    # build arrays which correspond to x, y train /test
    x_test = X[test_idx, :]
    x_train = X[train_idx, :]
    y_true_train = y[train_idx]
    


    # fit the classifier (use all available data)
    rf_classifier.fit(X, y)

    # estimate each review's star
    y_pred[test_idx] = rf_classifier.predict(x_test)

In [101]:
# compute accuracy
accuracy_score(y_true = y, y_pred = y_pred)

0.43462309542902966

## Based on HyperParameters Random Forest performed the best with .4346 accuracy