In [None]:
import ast
import csv
import json
import os
from pprint import pprint

import gensim
import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
from nltk import ngrams
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from wordcloud import WordCloud

# Download the NLTK resources named below; the 'stopwords' corpus is read
# later via nltk.corpus.stopwords.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

In [None]:
# test classification dataset
from collections import Counter
from sklearn.datasets import make_classification

# evaluate multinomial logistic regression model
from numpy import mean
from numpy import std
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.linear_model import LogisticRegression

In [None]:
# Load one comma-separated review export per sandwich chain from the working
# directory; the targets on the left line up with the file names in order.
firehouse, subway, jmike, jj, potbelly, quiznos = (
    pd.read_csv(csv_path, sep=',')
    for csv_path in (
        'firehouse_with_reviews.csv',
        'subway_with_reviews.csv',
        'jersey_mike_with_reviews.csv',
        'jimmy_johns_with_reviews.csv',
        'potbelly_with_reviews.csv',
        'quiznos_with_reviews.csv',
    )
)

In [None]:
# Preview two of the loaded frames. A cell only renders its last expression
# automatically, so the first .head() was silently discarded; display() shows
# both previews.
display(firehouse.head(), subway.head())

In [None]:
# Stack the six per-chain frames row-wise into a single frame.
# NOTE(review): the name `df` is rebound to the TF-IDF matrix in a later cell,
# so cells that need this concatenated frame must run before that one.
frames = [firehouse, subway, jmike, jj, potbelly, quiznos]
df = pd.concat(frames)

In [None]:
# Inspect the last rows of the concatenated frame.
df.tail()

In [None]:
# Collapse the combined frame to one row per business_id.
# NOTE(review): GroupBy.first() keeps the first non-null value of each column
# per group, so `text` ends up with a single value per business rather than a
# concatenation of that business's reviews — confirm this is intended.
df_merge = df.groupby(['business_id'], as_index=False).first()

In [None]:
# Display the per-business frame.
df_merge

In [None]:
# Keep only the columns used downstream. The explicit .copy() makes df_review
# an independent frame, so adding columns to it in later cells does not
# trigger SettingWithCopyWarning.
df_review = df_merge[['business_id', 'name', 'address', 'stars_x', 'text']].copy()
df_review.head()

## NLP Text Clean

In [None]:
# Load nltk's English stopword list.
# NOTE(review): no cell visible here passes this list to the TfidfVectorizer or
# otherwise reads it — confirm whether stop-word removal was intended.
stopwords = nltk.corpus.stopwords.words('english') #stopwords.append("n't")

In [None]:
# Load the regular expression library
import re

# Normalise the review text: strip the punctuation characters , . ! ? and
# lowercase the result.
# - Series.str methods are vectorised and pass missing values through instead
#   of raising inside a lambda.
# - The raw-string pattern avoids the invalid "\." escape of the original.
# - assign() builds a new frame, which avoids SettingWithCopyWarning and keeps
#   the cell safe to re-run.
df_review = df_review.assign(
    text_processed=df_review['text']
    .str.replace(r'[,.!?]', '', regex=True)
    .str.lower()
)
# Show the last few processed reviews
df_review['text_processed'].tail()

In [None]:
# Check that the text_processed column was added.
df_review.head()

In [None]:
# Preview the cleaned review text. The original cell referenced
# `processed_txt`, which no visible cell defines, so it raised NameError on a
# fresh kernel.
df_review['text_processed'].head()

In [None]:
# Materialise the cleaned reviews as a plain Python list, one entry per row of
# df_review, in row order.
merge_review = list(df_review['text_processed'])

In [None]:
# Preview only the first few reviews; echoing the whole list floods the cell
# output with every review text.
merge_review[:5]

## TF-IDF Vectorizer

In [None]:
# Build the TF-IDF document-term matrix over unigrams and bigrams
# (ngram_range=(1, 2)), print its shape, and expose it as a dense DataFrame
# with one column per vocabulary term.
# NOTE(review): this rebinds `df`, which previously held the concatenated
# review frame; re-running earlier cells that read `df` after this one will
# operate on the TF-IDF matrix instead.
vect = TfidfVectorizer(ngram_range = (1, 2))
tfidf_matrix = vect.fit_transform(merge_review)
print(tfidf_matrix.shape)
df = pd.DataFrame(tfidf_matrix.toarray(), columns = vect.get_feature_names_out())

In [None]:
# Preview the first rows of the TF-IDF frame.
df.head()

In [None]:
# List the vocabulary terms (column labels) learned by the vectorizer.
vect.get_feature_names_out()

## N-Grams

In [None]:
from nltk import ngrams

# Demo: split one hard-coded review into word bigrams.
lst_grams = []
sentence = 'My husband and I ordered sandwiches for pickup today, which we do a couple times a month. His pastrami was well-made with a good portion of deli meat. My tuna... well at first I thought they entirely forgot the tuna! Pulled it apart to find a tiny spoonful on each half-- absolutely laughable amount of filling! I order this sandwich regularly and it\'s NEVER like that. He took it back and asked for a remake,l. The assistant managers said "it\'s our portion size", and never apologized or offered either a refund or a remake. So $7 for a teaspoon of tuna, and apathetic response from the employees... you ARE KIDDING ME!! Customer service standards and quality need an overhaul in this location.'
# Pair every whitespace-separated token with its successor; this yields the
# same 2-tuples as nltk's ngrams(tokens, 2) using only built-ins.
tokens = sentence.split()
n_grams = zip(tokens, tokens[1:])
# Collect into lst_grams. The original appended to `lst_bigrams`, which is not
# defined until a later cell, so this cell raised NameError on a fresh kernel
# and left lst_grams empty.
lst_grams.extend(n_grams)

With review sentences:

In [None]:
# Concatenate all cleaned reviews into one string. Join on a space: the corpus
# is tokenised with str.split() below, and joining on ',' fused the last word
# of each review with the first word of the next into one bogus token.
corpus = ' '.join(merge_review)

In [None]:
# Preview the start of the corpus; echoing the full string floods the cell
# output.
corpus[:500]

In [None]:
# split reviews corpus into bigrams
from nltk import ngrams
# Tokenise the corpus on whitespace and collect every adjacent word pair.
n_grams = ngrams(corpus.split(), 2)
lst_bigrams = list(n_grams)

In [None]:
# Preview the first few bigrams instead of echoing the entire list.
lst_bigrams[:10]

In [None]:
# Tally how often each bigram occurs. Counter keeps keys in first-seen order;
# dict() converts the tally back to a plain dict before printing.
bigram_frequency = dict(Counter(lst_bigrams))
print(bigram_frequency)

In [None]:
# Order the (bigram, count) pairs from most to least frequent. Sorting on the
# negated count is stable, so ties keep their original relative order.
bigram_frequency_sorted = sorted(bigram_frequency.items(), key=lambda pair: -pair[1])
type(bigram_frequency_sorted)

In [None]:
# Tabulate the sorted (bigram, count) pairs. Naming the columns in the
# constructor also works for an empty list, where assigning `.columns`
# afterwards raised a length-mismatch error.
bigram_df = pd.DataFrame(bigram_frequency_sorted, columns=['bigrams', 'counts'])

In [None]:
# Persist the bigram counts to a CSV in the working directory.
# NOTE(review): the default index=True also writes the row index as an unnamed
# first column — confirm that is wanted.
bigram_df.to_csv('bigram_counts.csv')

## Linear Regression

In [None]:
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm
from sklearn.metrics import classification_report
import scipy.stats as stats

In [None]:
# Show the business/review columns side by side with the TF-IDF matrix (`df`).
# NOTE(review): the concatenated frame is only displayed, not assigned, so no
# later cell uses it.
pd.concat([df_review, df], axis=1)

MLR choosing single words and n-grams:

In [None]:
# Hand-picked vocabulary terms whose TF-IDF columns are used as model features.
selected_terms = [
    "bread", "staff", "beef", "lunch", "cheese", "fresh", "fast",
    "flatbread", "teriyaki", 'meatball', 'italian', 'tomato', 'steak',
]
df_select = df[selected_terms]

In [None]:
# Display the selected feature columns.
df_select

In [None]:
# Features and target for the linear regression.
# NOTE(review): the features are bound to lowercase `x` here, while the
# logistic-regression section binds the same frame to uppercase `X`.
x = df_select
y = df_review['stars_x']

In [None]:
# 80/20 train/test split of the linear-regression data.
# The original call read `X` and `train_samples`, which are only defined in the
# logistic-regression section further down, so it failed on a fresh kernel.
# A fractional train_size removes that dependency and the hard-coded row count
# (834); for 834 rows it yields the same 667/167 split.
X_train, X_test, y_train, y_test = train_test_split(
    x, y, train_size=0.8, random_state=11)

In [None]:
# Convert the training split to NumPy arrays before fitting.
X_train = np.array(X_train)
y_train = np.array(y_train)

In [None]:
# Fit an ordinary least-squares model on the training split.
model = LinearRegression().fit(X_train, y_train)

In [None]:
# R^2 of the fitted model.
# NOTE(review): this is scored on the full x/y (training and test rows
# together), not on the held-out split — confirm that is intended.
r_sq = model.score(x, y)
r_sq

In [None]:
# Report the fitted intercept.
print(f"intercept: {model.intercept_}")

In [None]:
# Report the fitted coefficients (one per feature column).
print(f"coefficients: {model.coef_}")

In [None]:
# Predict star ratings for the held-out rows.
yhat = model.predict(X_test)

In [None]:
# Round the predictions to one decimal place.
yhat = np.around(yhat, decimals = 1)

In [None]:
# Cast to int, which truncates toward zero (e.g. 3.9 -> 3) rather than
# rounding to the nearest whole star.
yhat = yhat.astype(int)

In [None]:
# accuracy for MLR
# The predictions were truncated to whole stars above, so truncate the true
# ratings the same way; if stars_x holds fractional ratings, accuracy_score
# otherwise rejects the mix of continuous and integer labels.
# accuracy_score expects (y_true, y_pred), so the true labels go first.
score = accuracy_score(np.asarray(y_test).astype(int), yhat)
score

## Multinomial Logistic Regression

In [None]:
from sklearn.datasets import fetch_openml
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.utils import check_random_state
from sklearn.utils.multiclass import type_of_target
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score

Prepare the train and test sets:

In [None]:
# Features and integer-cast star ratings for the classifier, plus the number
# of rows reserved for training.
X = df_select
y = df_review['stars_x'].astype('int')
train_samples = 667

# Report how scikit-learn classifies the target.
type_of_target(y)

In [None]:
# Hold out every row beyond the first `train_samples` for testing. The test
# size is derived from len(X) instead of a hard-coded 834, so the split stays
# valid if the number of businesses changes (667/167 for 834 rows, ~80/20).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=train_samples, test_size=len(X) - train_samples, random_state=11)

In [None]:
# Inspect the held-out target values.
y_test

model accuracy with CV:

In [None]:
# define the multinomial logistic regression model with a default L2 penalty.
# `multi_class` is no longer passed: the argument is deprecated in recent
# scikit-learn releases, and with the lbfgs solver a multinomial model is
# already fitted whenever the target has three or more classes.
model = LogisticRegression(solver='lbfgs', penalty='l2',
                           C=1.0, max_iter=1000000)

# define the model evaluation procedure: 10-fold stratified CV, repeated 3 times
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)

# evaluate the model on the training split only and collect the fold scores
n_scores = cross_val_score(model, X_train, y_train, scoring='accuracy', cv=cv, n_jobs=-1)

# report the mean and standard deviation of the fold accuracies
print('Mean Accuracy: %.3f (%.3f)' % (mean(n_scores), std(n_scores)))

In [None]:
# Fit the final model on the training split only. Fitting on the full X/y, as
# the original did, lets the model see the rows in X_test and inflates the
# test-set accuracy computed below.
result = model.fit(X_train, y_train)

In [None]:
# Held-out target values, shown for comparison with the predictions below.
y_test

In [None]:
# `row` holds the entire test set; slice with X_test.iloc[0:1, :] to score
# only the first test row instead.
row = X_test
# predict a multinomial probability distribution (one row of class
# probabilities per test sample)
p = model.predict_proba(row)
# summarize the predicted probabilities
print(p)

In [None]:
# predict the class label for every row in `row`
yhat = model.predict(row)
print(yhat)  # predicted labels

In [None]:
# print the fitted intercepts and coefficient matrix of the logistic model
print(result.intercept_)
print(result.coef_)

In [None]:
# Pair each feature name with its coefficient.
# Only the first row of coef_ is used, i.e. the coefficients of a single class.
summary = pd.DataFrame({'features': X.columns, 'coef': result.coef_[0]})

In [None]:
# Show the feature/coefficient table as plain text.
print(summary)

In [None]:
# test set accuracy
# accuracy_score's signature is (y_true, y_pred); pass the arguments by name so
# the true labels and the predictions cannot be swapped.
score = accuracy_score(y_true=y_test, y_pred=yhat)
score