**Project: Restaurant Rating Predictor**

**Name:**    `Shelanah Rahman`


In [6]:
import numpy as np
import pandas as pd
from sklearn.feature_selection import SelectKBest, chi2, mutual_info_classif
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB
import pickle
import scipy
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [7]:
# Stacking classifier: takes a list of base classifiers and a meta-classifier
# that is trained on the base classifiers' predicted probabilities.
from sklearn.metrics import accuracy_score

# Seed NumPy's global RNG once, up front.
# NOTE(review): presumably this is meant to make estimators that fall back to
# the global NumPy random state (random_state=None) repeatable -- confirm.
np.random.seed(1)

class StackingClassifier():
    """Two-level stacking ensemble.

    Each base classifier is fitted on the input features. Their
    ``predict_proba`` outputs are concatenated column-wise and used as the
    feature matrix for the meta-classifier.

    Parameters
    ----------
    classifiers : list
        Base estimators exposing ``fit(X, y)`` and ``predict_proba(X)``.
        They are fitted in place (not cloned).
    metaclassifier : estimator
        Estimator exposing ``fit(X, y)`` and ``predict(X)``. Fitted in place.
    """

    def __init__(self, classifiers, metaclassifier):
        self.classifiers = classifiers
        self.metaclassifier = metaclassifier

    def fit(self, X, y):
        """Fit all base classifiers on ``(X, y)``, then the meta-classifier.

        Returns
        -------
        self : StackingClassifier
            The fitted instance, so calls can be chained
            (``StackingClassifier(...).fit(X, y).predict(X)``).
        """
        for clf in self.classifiers:
            clf.fit(X, y)
        # NOTE(review): the meta-features are the base classifiers' predictions
        # on the same rows they were just trained on, so they are optimistic.
        # Out-of-fold predictions would avoid that, but would change results,
        # so the original behaviour is kept here.
        X_meta = self._predict_base(X)
        self.metaclassifier.fit(X_meta, y)
        return self

    def _predict_base(self, X):
        """Return the base classifiers' class probabilities stacked side by side.

        The result has one row per row of ``X``; its columns are the
        ``predict_proba`` columns of each base classifier, in list order.
        """
        yhats = [clf.predict_proba(X) for clf in self.classifiers]
        # axis=1: every classifier contributes its own block of columns.
        yhats = np.concatenate(yhats, axis=1)
        assert yhats.shape[0] == X.shape[0]
        return yhats

    def predict(self, X):
        """Predict labels for ``X`` with the meta-classifier."""
        X_meta = self._predict_base(X)
        yhat = self.metaclassifier.predict(X_meta)
        return yhat

    def score(self, X, y):
        """Return the accuracy of ``predict(X)`` against the true labels ``y``."""
        yhat = self.predict(X)
        return accuracy_score(y, yhat)
    

In [8]:
# Load the training metadata and keep its last column as the target vector `y`
# (presumably the rating label -- verify against the CSV schema).
X_train_meta = pd.read_csv("review_meta_train.csv")
y = X_train_meta.iloc[:,-1]


In [10]:
# Load the fitted CountVectorizer and the vectorised review text.
# NOTE: pickle.load can execute arbitrary code -- only load trusted files.
# The file is opened in a `with` block so the handle is always closed.
with open("review_text_features_countvec/train_countvectorizer.pkl", "rb") as vec_file:
    vocab = pickle.load(vec_file)
vocab_dict = vocab.vocabulary_
X = scipy.sparse.load_npz('review_text_features_countvec/review_text_train_vec.npz')
# Hold out 40% of the labelled rows for evaluation (fixed seed for repeatability).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=0)
# Unlabelled test matrix whose classes are predicted later in the notebook.
X_pred = scipy.sparse.load_npz('review_text_features_countvec/review_text_test_vec.npz')

In [11]:
# Chi-squared feature selection: keep the k highest-scoring features.
# The selector is fitted on the training split only, then applied to both splits.
k = 1000
x2 = SelectKBest(score_func=chi2, k=k).fit(X_train, y_train)
X_train_x2 = x2.transform(X_train)
X_test_x2 = x2.transform(X_test)
print("Training size: ", X_train_x2.shape[0],
      "\nTesting size:", X_test_x2.shape[0])

Training size:  16840 
Testing size: 11228


In [12]:
# Base classifiers: fit each one on the chi2-selected training features and
# print its accuracy on the held-out split.
models = [MultinomialNB(),
          LogisticRegression()]
titles = ['MNB',
          'Logistic Regression']


for title, model in zip(titles, models):
    print('\n',title, '(with k=',k,'features):')
    # Pass the sparse matrices straight through: both estimators accept sparse
    # input, whereas .todense() produces np.matrix, which recent scikit-learn
    # versions reject with a TypeError.
    model.fit(X_train_x2, y_train)
    acc = model.score(X_test_x2, y_test)
    print('x2', '\t acc',  acc)
    



 MNB (with k= 1000 features):
x2 	 acc 0.8362130388314927

 Logistic Regression (with k= 1000 features):




x2 	 acc 0.84244745279658


In [13]:
# Train one stacker per meta-classifier, print its held-out accuracy, and
# collect its predictions for the unlabelled test matrix.
meta_classifiers = [DecisionTreeClassifier(),
                    GaussianNB()]
meta_titles= ['\nDecision Tree', '\nGaussian Naive Bayes']
y_pred_lst = []
for meta_classifier, meta_title in zip(meta_classifiers, meta_titles):
    # NOTE: StackingClassifier.fit refits the shared `models` instances in
    # place, here on the full (unselected) feature matrix.
    stacker = StackingClassifier(models, meta_classifier)
    # Sparse matrices are passed directly: the base estimators accept them, and
    # .todense() would both allocate a full dense copy and yield np.matrix,
    # which recent scikit-learn versions reject with a TypeError.
    stacker.fit(X_train, y_train)
    print(meta_title, '\nStacker Accuracy:', stacker.score(X_test, y_test))
    # Predict the classes of the unlabelled test data.
    y_pred = stacker.predict(X_pred)
    y_pred = y_pred.tolist()
    y_pred_lst.append(y_pred)




Decision Tree 
Stacker Accuracy: 0.8288208051300321





Gaussian Naive Bayes 
Stacker Accuracy: 0.8341645885286783


In [14]:
# Write each stacker's predictions to its own CSV file.
# Instance ids are 1-based row numbers; y_pred_lst[0] goes to the "dec" file
# and y_pred_lst[1] to the "NB" file.
inst_id_col = list(range(1, len(y_pred_lst[0]) + 1))
for idx, out_path in enumerate(('rating_predictions_dec.csv', 'rating_predictions_NB.csv')):
    df = pd.DataFrame({'Instance_id': inst_id_col, 'rating': y_pred_lst[idx]})
    df.to_csv(out_path, index=False)