In [83]:
import pandas as pd
import numpy as np
import sklearn
import random
import nltk
import string
import re
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, f1_score, recall_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn import metrics

In [2]:
#I don't think you guys would need to run this at all
# import os
# os.getcwd()
# os.chdir('d:\\Harris Course work\\MLPP\\ml_proj\\obamacare-ml-project')

In [3]:
# Setting file path, chunk size and the fraction of records to keep
file_path = 'data/yelp_academic_dataset_review.json'
chunk_size = 1000
sample_fraction = 0.1  # load 10% of the records

# Seeding the generator so that a fresh kernel draws the same subset every time
random.seed(42)

# Calculating size of the dataset (one JSON record per line)
with open(file_path, 'r', encoding='utf-8') as f:
    num_records = sum(1 for line in f)

# Generating a random set of row positions to subset
subset_size = int(num_records * sample_fraction)
indices = random.sample(range(num_records), subset_size)

# Boolean lookup table built once: keep_row[i] is True when row i was sampled.
# Indexing it with a chunk's row labels is proportional to the chunk size,
# instead of re-scanning the full list of sampled indices for every chunk.
keep_row = np.zeros(num_records, dtype=bool)
keep_row[indices] = True

# Loading the rows corresponding to the randomly generated indices
chunks = []
json_reader = pd.read_json(file_path, lines=True, chunksize=chunk_size)
for chunk in json_reader:
    # chunk.index holds the row positions that the sampled indices refer to
    filtered_chunk = chunk[keep_row[chunk.index.to_numpy()]]
    if not filtered_chunk.empty:
        chunks.append(filtered_chunk)

# concatenate the chunks into a single DataFrame
df = pd.concat(chunks)


In [4]:
df.head()

Unnamed: 0,review_id,user_id,business_id,stars,useful,funny,cool,text,date
12,XW_LfMv0fV21l9c6xQd_lw,9OAtfnWag-ajVxRbUTGIyg,lj-E32x9_FA7GmUrBGBEWg,4,0,0,0,Love going here for happy hour or dinner! Gre...,2014-06-27 22:44:01
16,oyaMhzBSwfGgemSGuZCdwQ,Dd1jQj7S-BFGqRbApFzCFw,YtSqYv1Q_pOltsVPSx54SA,5,0,0,0,Tremendous service (Big shout out to Douglas) ...,2013-06-24 11:21:25
19,Xs8Z8lmKkosqW5mw_sVAoA,IQsF3Rc6IgCzjVV9DE8KXg,eFvzHawVJofxSnD7TgbZtg,5,0,0,0,My absolute favorite cafe in the city. Their b...,2014-11-12 15:30:27
25,qS6kE7CDoDagyPZwmueJaQ,zoBajEyVA0z4IjbFsMJksg,c-IgS6Pk6vMyax7Rbr38eA,4,0,0,0,Went for lunch. Beef brisket sandwich was awes...,2015-06-08 19:45:48
28,DyrAIuKl60j_X8Yrrv-kpg,mNsVyC9tQVYtzLOCbh2Piw,MWmXGQ98KbRo3vsS5nZhMA,5,1,0,0,I recently had dinner here with my wife over t...,2014-10-27 02:47:28


In [5]:
df.loc[:,'stars'].value_counts(normalize=True)

5    0.462854
4    0.208492
1    0.152344
3    0.098301
2    0.078008
Name: stars, dtype: float64

In [6]:
# Keep only the modelling columns. The explicit copy makes yelp_reviews an
# independent frame, so the in-place .loc assignments in later cells neither
# touch df nor trigger SettingWithCopy warnings.
yelp_reviews = df.loc[:, ['text','stars', 'useful','funny','cool']].copy()

In [7]:
yelp_reviews.shape

(699028, 5)

In [8]:
yelp_reviews.head()

Unnamed: 0,text,stars,useful,funny,cool
12,Love going here for happy hour or dinner! Gre...,4,0,0,0
16,Tremendous service (Big shout out to Douglas) ...,5,0,0,0
19,My absolute favorite cafe in the city. Their b...,5,0,0,0
25,Went for lunch. Beef brisket sandwich was awes...,4,0,0,0
28,I recently had dinner here with my wife over t...,5,1,0,0


In [9]:
# Tokenizing and lemmatizing: download the NLTK resources requested below
# (punkt, stopwords, wordnet) before the preprocessing cells run.
# NOTE: these steps were written without access to Eshan's branch;
# please edit if a preprocessing step is missing.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\soumy\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\soumy\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\soumy\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [10]:
# Pre-processing strings: fold every review to lower case
lowercased_text = yelp_reviews['text'].str.lower()
yelp_reviews.loc[:, 'text'] = lowercased_text

In [11]:
# Removing punctuation
# string.punctuation contains characters that are special inside a regex
# character class (\, ], ^, -). Interpolated raw, the "\]" pair is read as an
# escaped "]" and the backslash itself is never matched, so re.escape is
# needed to treat every punctuation character literally.
punc = r'[{}]+'.format(re.escape(string.punctuation))
yelp_reviews.loc[:,'text'] = yelp_reviews.loc[:,'text'].apply(lambda x: re.sub(punc,'',x))

In [12]:
# Split each cleaned review string into a list of word tokens
tokenized_text = yelp_reviews['text'].apply(word_tokenize)
yelp_reviews.loc[:, 'text'] = tokenized_text

In [13]:
# Drop English stop words from every token list.
# A set gives constant-time membership tests; the list returned by NLTK would
# be scanned linearly for every single token.
# NOTE(review): punctuation was stripped in an earlier cell, so contractions
# such as "don't" now appear as "dont" and will not match stop-word entries
# that still contain an apostrophe -- confirm whether that is intended.
stop_words = set(nltk.corpus.stopwords.words('english'))
yelp_reviews.loc[:,'text'] = yelp_reviews.loc[:,'text'].apply(lambda x: [i for i in x if i not in stop_words])

In [14]:
# Reduce every token to its WordNet lemma
lemmatizer = WordNetLemmatizer()
lemmatized_text = yelp_reviews['text'].apply(
    lambda tokens: list(map(lemmatizer.lemmatize, tokens))
)
yelp_reviews.loc[:, 'text'] = lemmatized_text

Model 1: Binary Logistic Regression

In [15]:
yelp_reviews.head()

Unnamed: 0,text,stars,useful,funny,cool
12,"[love, going, happy, hour, dinner, great, pati...",4,0,0,0
16,"[tremendous, service, big, shout, douglas, com...",5,0,0,0
19,"[absolute, favorite, cafe, city, black, white,...",5,0,0,0
25,"[went, lunch, beef, brisket, sandwich, awesome...",4,0,0,0
28,"[recently, dinner, wife, weekend, could, pleas...",5,1,0,0


In [None]:
# Cross-tabulate the `useful` vote count against the star rating
table_useful = pd.crosstab(index=yelp_reviews['useful'], columns=yelp_reviews['stars'])
table_useful

`useful` has 134 unique values, but 91% of the reviews have `useful` < 5.

In [None]:
yelp_reviews.loc[:,'useful'].value_counts(normalize=True)

In [None]:
# Cross-tabulate the `funny` vote count against the star rating
table_funny = pd.crosstab(index=yelp_reviews['funny'], columns=yelp_reviews['stars'])
table_funny

In [None]:
yelp_reviews.loc[:,'funny'].value_counts(normalize=True)

84% of the reviews have `funny` = 0. The column takes about 100 distinct values but shows very little variance.

In [None]:
# Cross-tabulate the `cool` vote count against the star rating
table_cool = pd.crosstab(index=yelp_reviews['cool'], columns=yelp_reviews['stars'])
table_cool

In [None]:
yelp_reviews.loc[:,'cool'].value_counts(normalize=True)

76% of the reviews have `cool` = 0 and 14% have `cool` = 1.

**Logistic Regression- Part 1**
starsV1:
Creating two classes:
    labels 1-3 = bad (0)
    labels 4-5 = good (1)

In [16]:
# Binary target: 0 for ratings below 4 stars, 1 for everything else
is_low_rating = yelp_reviews['stars'] < 4
yelp_reviews.loc[:, 'starsv1'] = (~is_low_rating).astype('int64')

In [87]:
yelp_reviews.loc[:,'starsv1'].value_counts(normalize=True)

1    0.671346
0    0.328654
Name: starsv1, dtype: float64

Case 2: Creating 3 classes
starsV2:
    labels 1-2 = bad (0)
    Label 3 = Neutral (1)
    labels 4-5 = good (2)

In [17]:
# Three-class target: start everyone at 2, then overwrite the lower ratings
star_rating = yelp_reviews['stars']
is_neutral = star_rating == 3
is_bad = star_rating < 3

yelp_reviews.loc[:, 'starsv2'] = 2
yelp_reviews.loc[is_neutral, 'starsv2'] = 1
yelp_reviews.loc[is_bad, 'starsv2'] = 0

In [18]:
yelp_reviews.head()

Unnamed: 0,text,stars,useful,funny,cool,starsv1,starsv2
12,"[love, going, happy, hour, dinner, great, pati...",4,0,0,0,1,2
16,"[tremendous, service, big, shout, douglas, com...",5,0,0,0,1,2
19,"[absolute, favorite, cafe, city, black, white,...",5,0,0,0,1,2
25,"[went, lunch, beef, brisket, sandwich, awesome...",4,0,0,0,1,2
28,"[recently, dinner, wife, weekend, could, pleas...",5,1,0,0,1,2


In [19]:
# Word-frequency tables for the binary target (starsv1): one table for reviews
# labelled 1 and one for all other reviews.
# NOTE(review): the tables are built from every row of yelp_reviews, including
# rows that later land in the test split, so the derived features carry label
# information from the test set -- consider counting on the training rows only.
dict_pos_v1, dict_neg_v1 = {}, {}
labelled_reviews_v1 = zip(yelp_reviews.loc[:,'text'], yelp_reviews.loc[:,'starsv1'])
for tokens, label in labelled_reviews_v1:
    bucket = dict_pos_v1 if label == 1 else dict_neg_v1
    for token in tokens:
        bucket[token] = bucket.get(token, 0) + 1




In [29]:
# Word-frequency tables for the three-class target (starsv2):
# label 2 -> positive table, label 0 -> negative table, anything else -> neutral
dict_pos_v2, dict_neg_v2, dict_neut_v2 = {}, {}, {}
tables_by_label = {2: dict_pos_v2, 0: dict_neg_v2}
labelled_reviews_v2 = zip(yelp_reviews.loc[:,'text'], yelp_reviews.loc[:,'starsv2'])
for tokens, label in labelled_reviews_v2:
    bucket = tables_by_label.get(label, dict_neut_v2)
    for token in tokens:
        bucket[token] = bucket.get(token, 0) + 1

In [30]:
def cumulative_count(list_tokens, pos_dict, neg_dict, neut_dict=None):
    """Sum the per-class frequencies of every token in a review.

    Parameters
    ----------
    list_tokens : iterable of str
        Tokens of one review; repeated tokens are counted once per occurrence.
    pos_dict, neg_dict : dict
        Map a word to its frequency in the positive / negative class.
    neut_dict : dict, optional
        Map a word to its frequency in the neutral class. When omitted, the
        'neut' total stays at 0.

    Returns
    -------
    dict
        ``{'pos': int, 'neg': int, 'neut': int}`` with the summed frequencies.
        Tokens missing from a table contribute 0 to that total.
    """
    # None instead of a shared ``{}`` default avoids the mutable-default pitfall
    if neut_dict is None:
        neut_dict = {}
    count_dict = {'pos': 0, 'neg': 0, 'neut': 0}
    for word in list_tokens:
        count_dict['pos'] += pos_dict.get(word, 0)
        count_dict['neg'] += neg_dict.get(word, 0)
        count_dict['neut'] += neut_dict.get(word, 0)
    return count_dict

In [31]:
# Per-review frequency totals for the binary (v1) and three-class (v2) tables
review_tokens = yelp_reviews['text']
yelp_reviews.loc[:, 'cum_count_v1'] = review_tokens.apply(
    lambda tokens: cumulative_count(tokens, dict_pos_v1, dict_neg_v1)
)
yelp_reviews.loc[:, 'cum_count_v2'] = review_tokens.apply(
    lambda tokens: cumulative_count(tokens, dict_pos_v2, dict_neg_v2, dict_neut_v2)
)


In [33]:
def split_dict(dict_c, key):
    """Return the entry of ``dict_c`` stored under ``key``.

    Raises ``KeyError`` when ``key`` is not present.
    """
    value = dict_c[key]
    return value


In [34]:
# Unpack the count dictionaries into one numeric column per class
# (target column, source column, dictionary key)
count_columns = [
    ('posv1', 'cum_count_v1', 'pos'),
    ('negv1', 'cum_count_v1', 'neg'),
    ('posv2', 'cum_count_v2', 'pos'),
    ('neutv2', 'cum_count_v2', 'neut'),
    ('negv2', 'cum_count_v2', 'neg'),
]
for target_col, source_col, count_key in count_columns:
    yelp_reviews.loc[:, target_col] = yelp_reviews.loc[:, source_col].apply(
        lambda counts, k=count_key: split_dict(counts, k)
    )

In [36]:
# The dictionary-valued helper columns are no longer needed once unpacked
yelp_reviews.drop(columns=['cum_count_v1', 'cum_count_v2'], inplace=True)

In [70]:
yelp_reviews.head()

Unnamed: 0,text,stars,useful,funny,cool,starsv1,starsv2,posv1,negv1,posv2,neutv2,negv2
12,"[love, going, happy, hour, dinner, great, pati...",4,0,0,0,1,2,815923,373849,815923,132754,241095
16,"[tremendous, service, big, shout, douglas, com...",5,0,0,0,1,2,926870,513365,926870,170857,342508
19,"[absolute, favorite, cafe, city, black, white,...",5,0,0,0,1,2,1403905,670485,1403905,232332,438153
25,"[went, lunch, beef, brisket, sandwich, awesome...",4,0,0,0,1,2,456859,220638,456859,94089,126549
28,"[recently, dinner, wife, weekend, could, pleas...",5,1,0,0,1,2,1303879,684392,1303879,226474,457918


**Binary Data with all Labels**

In [62]:
# Binary model on all five candidate features
binary_features = ['posv1', 'negv1', 'useful', 'funny', 'cool']
X = yelp_reviews[binary_features].to_numpy()
y = yelp_reviews['starsv1'].to_numpy()

# 80/20 split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

clf = LogisticRegression()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

In [63]:
# Test-set scores for the five-feature binary model; precision, F1 and recall
# are averaged with the 'weighted' scheme
weighted = {'average': 'weighted'}
accuracy_v1 = accuracy_score(y_test, y_pred)
precision_v1 = precision_score(y_test, y_pred, **weighted)
f1_v1 = f1_score(y_test, y_pred, **weighted)
recall_v1 = recall_score(y_test, y_pred, **weighted)

In [64]:
# Report the scores of the five-feature binary model, one per line
scores_v1 = (
    ("Accuracy", accuracy_v1),
    ("Precision", precision_v1),
    ("Recall", recall_v1),
    ("F1 Score", f1_v1),
)
for score_name, score_value in scores_v1:
    print(f"{score_name}: {score_value}")

Accuracy: 0.8119680128177618
Precision: 0.8079780190065572
Recall: 0.8119680128177618
F1 Score: 0.8079392347761343


**Binary data with random forest to select important features**

In [65]:
# Feature selection driven by random-forest importances.
# random_state pins the forest so that the set of selected features is the
# same on every run (the same seed as the train/test split is used).
selection = SelectFromModel(RandomForestClassifier(n_estimators=100, random_state=42))
selection.fit(X_train, y_train)

In [66]:
selection.get_support()

array([ True,  True, False, False, False])

The random forest selection keeps only the first two features, i.e., posv1 and negv1, as important. We therefore run the model again using just these two features.

In [88]:
# Binary model restricted to the two features kept by the random-forest selection
X_rf = yelp_reviews.loc[:,['posv1', 'negv1']].to_numpy()
y_rf = yelp_reviews.loc[:,'starsv1'].to_numpy()
clf_rf = LogisticRegression()
X_train_rf, X_test_rf, y_train_rf, y_test_rf = train_test_split(X_rf, y_rf, test_size=0.2, random_state=42)
clf_rf.fit(X_train_rf, y_train_rf)
# Predict with clf_rf, the model fitted on these two features -- not with clf,
# which was fitted on five features and does not match X_test_rf
y_pred_rf = clf_rf.predict(X_test_rf)

In [89]:
# Test-set scores for the two-feature binary model; precision, F1 and recall
# are averaged with the 'weighted' scheme
weighted = {'average': 'weighted'}
accuracy_rf = accuracy_score(y_test_rf, y_pred_rf)
precision_rf = precision_score(y_test_rf, y_pred_rf, **weighted)
f1_rf = f1_score(y_test_rf, y_pred_rf, **weighted)
recall_rf = recall_score(y_test_rf, y_pred_rf, **weighted)

In [90]:
# Report the scores of the two-feature binary model, one per line
scores_rf = (
    ("Accuracy", accuracy_rf),
    ("Precision", precision_rf),
    ("Recall", recall_rf),
    ("F1 Score", f1_rf),
)
for score_name, score_value in scores_rf:
    print(f"{score_name}: {score_value}")

Accuracy: 0.8119680128177618
Precision: 0.8079780190065572
Recall: 0.8119680128177618
F1 Score: 0.8079392347761343


**3 classes**

In [72]:
# Three-class model inputs: the v2 frequency totals against the starsv2 target.
# X, y and the split variables replace the ones from the binary model above.
three_class_features = ['posv2', 'neutv2', 'negv2']
X = yelp_reviews[three_class_features]
y = yelp_reviews['starsv2']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

In [74]:
# Logistic regression for the starsv2 target, configured as one-vs-rest
# (multi_class='ovr') with the liblinear solver, fitted on the training split
lrm = LogisticRegression(multi_class='ovr', solver='liblinear')
lrm.fit(X_train, y_train)

In [82]:
yhat = lrm.predict(X_test)

In [85]:
metrics.confusion_matrix(y_test ,yhat)

array([[21894,  1428,  9022],
       [ 3615,  1888,  8347],
       [ 5102,  2154, 86356]], dtype=int64)

In [86]:
# classification_report returns a pre-formatted multi-line string; printing it
# renders the table, whereas the bare expression shows the repr with literal \n
print(metrics.classification_report(y_test, yhat))

'              precision    recall  f1-score   support\n\n           0       0.72      0.68      0.70     32344\n           1       0.35      0.14      0.20     13850\n           2       0.83      0.92      0.88     93612\n\n    accuracy                           0.79    139806\n   macro avg       0.63      0.58      0.59    139806\nweighted avg       0.76      0.79      0.77    139806\n'

It seems that the logistic regression model with two classes (0 for stars 1-3 and 1 for stars 4-5) performs better than the three-class model. Furthermore, using the random forest model, we were able to identify the best features for our model, i.e., posv1 (the total number of occurrences, within positively labelled reviews, of the words in the text) and negv1 (the total number of occurrences of those words within negatively labelled reviews).