In [1]:
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
import joblib, string, os
import numpy as np

os.chdir(r'C:\Users\user\Desktop\Revature\Projects\Yelp\stacked')
path = r'C:\Users\user\Desktop\Revature\Projects\Yelp\yelp_reviews_sample.csv'

df = pd.read_csv(path)

stop_words = set(stopwords.words('english')) #'if', 'and', 'the', etc.

def preprocess(text):
    translation = str.maketrans('', '', string.punctuation)
    text = text.translate(translation)
    text = text.lower()
    text = ' '.join(word for word in text.split() if word not in stop_words)
    return text

df['text'] = df.apply(lambda row: preprocess(row['text']), axis=1)

print(df.head())

filenames = ['linearSVC.joblib', 'NB_model.joblib', 'SGD_class.joblib']


Unnamed: 0               review_id                 user_id  \
0           0  Q1sbwvVQXV2734tPgoKj4Q  hG7b0MtEbXx5QzbzE6C_VA   
1           1  GJXCdrto3ASJOqKeVWPi6Q  yXQM5uF2jS6es16SJzNHfg   
2           2  2TzJjDVDEuAW6MR5Vuc1ug  n6-Gk65cPZL6Uz8qRm3NYw   
3           3  yi0R0Ugj_xUx_Nek0-_Qig  dacAIZ6fTM6mqwW5uxkskg   
4           4  11a8sVPMUFtaC7_ABRkmtw  ssoyf2_x0EQMed6fgHeMyQ   

              business_id  stars  useful  funny  cool  \
0  ujmEBvifdJM6h6RLv4wQIg    1.0       6      1     0   
1  NZnhc2sEQy3RmzKTZnqtwQ    5.0       0      0     0   
2  WTqjgwHlXbSFevF32_DJVw    5.0       3      0     0   
3  ikCg8xy5JIg_NGPx-MSIDA    5.0       0      0     0   
4  b1b1eb3uo-w561D0ZfCEiQ    1.0       7      0     0   

                                                text                 date  
0  total bill horrible service 8gs crooks actuall...  2013-05-07 04:34:36  
1  adore travis hard rocks new kelly cardenas sal...  2017-01-14 21:30:33  
2  say office really together organized f

In [2]:
## Loading the models ##
linearSVC = joblib.load('linearSVC.joblib')
NB_model = joblib.load('NB_model.joblib')
SGD_class = joblib.load('SGD_class.joblib')
#vectors = joblib.load('vectors.joblib')

models = [linearSVC, NB_model, SGD_class]

## Vectorizing again - ideally we load the old vectorizer in production ##
vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform(df.text)
df_test = vectorizer.transform(df.text)

## Doing a train-test split ##
x = df.text
x = vectorizer.transform(x) #preparing the data
y = df.stars
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = .2)

In [3]:
## Testing the models ##
for m in models:
    m.fit(x_train, y_train)
    predictions = m.predict(x_test)
    print(m, '\n')
    print(accuracy_score(y_test, predictions))

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=1000,
          multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
          verbose=0) 

0.6409
MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True) 

0.49595
SGDClassifier(alpha=0.0001, average=False, class_weight=None,
              early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
              l1_ratio=0.15, learning_rate='optimal', loss='hinge',
              max_iter=1000, n_iter_no_change=5, n_jobs=None, penalty='l2',
              power_t=0.5, random_state=None, shuffle=True, tol=0.001,
              validation_fraction=0.1, verbose=0, warm_start=False) 

0.6295


In [4]:
def stack_predict(models, X):
    '''Making predictions with each model.'''
    predictions = []
    for m in models:
        prediction = m.predict(X)
        #If there are no predictions yet...
        if len(predictions) == 0:
            #the predictions is a list of our one set of predictions...
            predictions = list(map(lambda p:[p], prediction))
        else:
            #if there are, then we append to each prediction.
            for i in range(len(prediction)):
                predictions[i].append(prediction[i])
    print(predictions[0:20])
    return predictions

In [5]:
def fit_stack(models, X, Y):
    '''Fit - Make predictions based on the other models' predictions.'''
    stacked_predictions = stack_predict(models, X)
    model = LogisticRegression()
    model.fit(stacked_predictions, Y)
    return model

In [6]:
def stacked_prediction(models, model, X):
    '''Make a prediction with the trained ensemble model.'''
    stacked_prediction = stack_predict(models, X)
    prediction = model.predict(stacked_prediction)
    return prediction

In [7]:
model = fit_stack(models, x_train, y_train)
predictions = stacked_prediction(models, model, x_test)
accuracy = accuracy_score(y_test, predictions)
print(f"Stacked accuracy: {accuracy}")

[[2.0, 5.0, 5.0], [5.0, 5.0, 5.0], [2.0, 5.0, 5.0], [4.0, 5.0, 5.0], [5.0, 5.0, 5.0], [3.0, 5.0, 1.0], [4.0, 4.0, 4.0], [5.0, 5.0, 5.0], [4.0, 5.0, 5.0], [1.0, 5.0, 1.0], [5.0, 5.0, 5.0], [5.0, 5.0, 5.0], [4.0, 5.0, 5.0], [5.0, 5.0, 5.0], [4.0, 5.0, 5.0], [5.0, 5.0, 5.0], [5.0, 5.0, 5.0], [1.0, 1.0, 1.0], [5.0, 5.0, 5.0], [5.0, 5.0, 5.0]]
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html.
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
[[2.0, 5.0, 1.0], [4.0, 5.0, 5.0], [3.0, 5.0, 4.0], [5.0, 5.0, 5.0], [5.0, 5.0, 5.0], [5.0, 5.0, 5.0], [1.0, 5.0, 1.0], [5.0, 5.0, 5.0], [1.0, 1.0, 1.0], [5.0, 5.0, 5.0], [4.0, 5.0, 5.0], [5.0, 5.0, 5.0], [1.0, 5.0, 1.0], [5.0, 5.0, 5.0], [4.0, 5.0, 5.0], [5.0, 5.0, 5.0], [2.0, 5.0, 5.0], [5.0, 5.0, 5.0], [5.0, 5.0, 5.