In [117]:
import pandas as pd
import string
import nltk
import numpy as np
import sklearn
import seaborn as sns
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from scipy.sparse import hstack
from sklearn.metrics import classification_report,accuracy_score

In [2]:
# Load dataset from metadata file
file=open("/media/spaggy/New Volume/Sem VI/ML/Project/YelpZip/metadata")
data=file.readlines()
file.close()
for ind in range(len(data)):
    data[ind]=data[ind].split()

In [3]:
# Load reviews from reviewcontent file
file2=open("/media/spaggy/New Volume/Sem VI/ML/Project/YelpZip/reviewContent")
text=file2.readlines()
file2.close()
for i in range(len(text)):
    text[i]=text[i].split("\t")
text=[t[3] for t in text]

In [4]:
Dataset=pd.DataFrame(data,columns=["user_id",'product_id','rating','label','date'])
Dataset["review_text"]=text

In [9]:
Dataset.shape

(608598, 7)

In [5]:
Dataset['rating']=Dataset['rating'].astype('float')
Dataset['date']=pd.to_datetime(Dataset['date'])
Dataset['day']=Dataset['date'].dt.day_name()

In [31]:
Dataset.head()

Unnamed: 0,user_id,product_id,rating,label,date,review_text,day
0,5044,0,1.0,-1,2014-11-16,"Drinks were bad, the hot chocolate was watered...",Sunday
1,5045,0,1.0,-1,2014-09-08,This was the worst experience I've ever had a ...,Monday
2,5046,0,3.0,-1,2013-10-06,This is located on the site of the old Spruce ...,Sunday
3,5047,0,5.0,-1,2014-11-30,I enjoyed coffee and breakfast twice at Toast ...,Sunday
4,5048,0,5.0,-1,2014-08-28,I love Toast! The food choices are fantastic -...,Thursday


### Split train-test dataset

In [30]:
y=pd.DataFrame({'label':Dataset["label"]})
x=Dataset.drop(['label'],axis=1)
X_train,X_test,y_train,y_test=train_test_split(x,y,test_size=0.3)

### Hand Crafted Features

In [34]:
# X_train['upper_case_word_count']=X_train['review_text'].apply(lambda x:len([y for y in x.split() if y) )
#Train Dataset
#Review centric features
X_train['word_count']=X_train['review_text'].apply(lambda x: len(x.split()))
X_train['punctuation_count']=X_train['review_text'].apply(lambda x: len(''.join(c for c in x if c in string.punctuation)))
X_train['char_count']=X_train['review_text'].apply(lambda x: len(x))
X_train['title_count']=X_train['review_text'].apply(lambda x: len([word for word in x.split() if word.istitle()]))
print("-----------review_centric features extracted-------------/")
# user-centric features
X_train['user_id_no_of_review'] = X_train.groupby('user_id')['user_id'].transform('size')
X_train['user_id_ave_rating'] = X_train.groupby('user_id')['rating'].transform('mean')
X_train['user_id_ave_no_words'] = X_train.groupby('user_id')['word_count'].transform('mean')
X_train['user_id_max_review_a_day'] = X_train.groupby(['user_id','day'])['user_id'].transform('size')
print("-----------user_centric features extracted-------------/")
#Product centric feature
X_train['product_id_no_of_review']=X_train.groupby('product_id')['product_id'].transform('size')
X_train['product_id_ave_rating']=X_train.groupby('product_id')['rating'].transform('mean')
X_train['product_id_ave_no_of_words']=X_train.groupby('product_id')['word_count'].transform('mean')
X_train['product_id_max_review_a_day']=X_train.groupby(['product_id','day'])['user_id'].transform('size')
print("-----------product_centric features extracted-------------/\n\n")

#Test dataset
X_test['word_count']=X_test['review_text'].apply(lambda x: len(x.split()))
X_test['punctuation_count']=X_test['review_text'].apply(lambda x: len(''.join(c for c in x if c in string.punctuation)))
X_test['char_count']=X_test['review_text'].apply(lambda x: len(x))
X_test['title_count']=X_test['review_text'].apply(lambda x: len([word for word in x.split() if word.istitle()]))
print("-----------review_centric features extracted-------------/")
# user-centric features
X_test['user_id_no_of_review'] = X_test.groupby('user_id')['user_id'].transform('size')
X_test['user_id_ave_rating'] = X_test.groupby('user_id')['rating'].transform('mean')
X_test['user_id_ave_no_words'] = X_test.groupby('user_id')['word_count'].transform('mean')
X_test['user_id_max_review_a_day'] = X_test.groupby(['user_id','day'])['user_id'].transform('size')
print("-----------user_centric features extracted-------------/")
#Product centric feature
X_test['product_id_no_of_review']=X_test.groupby('product_id')['product_id'].transform('size')
X_test['product_id_ave_rating']=X_test.groupby('product_id')['rating'].transform('mean')
X_test['product_id_ave_no_of_words']=X_test.groupby('product_id')['word_count'].transform('mean')
X_test['product_id_max_review_a_day']=X_test.groupby(['product_id','day'])['user_id'].transform('size')
print("-----------product_centric features extracted-------------/")

-----------review_centric features extracted-------------/
-----------user_centric features extracted-------------/
-----------product_centric features extracted-------------/


-----------review_centric features extracted-------------/
-----------user_centric features extracted-------------/
-----------product_centric features extracted-------------/


### Text Preprocessing

In [72]:
#Function for expanding all the contractions in the paragraph
with open("Contractions.json",'r') as file:
   Contractions=json.load(file)
c_re = re.compile('(%s)' % '|'.join(Contractions.keys()))

def expandContractions(text, c_re=c_re):
    def replace(match):
        return Contractions[match.group(0)]
    return c_re.sub(replace, text)

In [None]:
# lowercase,remove digits,punctuations,expand contractions,stopwords and lemmatization
stop_words = stopwords.words('english')
lemmatizer = WordNetLemmatizer()
X_train['review_text']=X_train['review_text'].apply(lambda x: x.decode('utf-8','ignore'))
X_train['review_text']=X_train['review_text'].apply(lambda x: expandContractions(x))
X_train['review_text']=X_train['review_text'].apply(lambda x:x.lower())   #Convert into lowercase
X_train['review_text']=X_train['review_text'].apply(lambda x : ''.join([c for c in x if not c.isdigit()])) #Remove numeric digits
X_train['review_text']=X_train['review_text'].apply(lambda x:  ''.join([c for c in x if c not in string.punctuation]))  # Remove punctuations
X_train['review_text']=X_train['review_text'].apply(lambda x: ' '.join([word for word in word_tokenize(x) if word not in stop_words]))
X_train['review_text']=X_train['review_text'].apply(lambda x: ' '.join([lemmatizer.lemmatize(word) for word in word_tokenize(x)]))

# Test Dataset
X_test['review_text']=X_test['review_text'].apply(lambda x: x.decode('utf-8','ignore'))
X_test['review_text']=X_test['review_text'].apply(lambda x: expandContractions(x))
X_test['review_text']=X_test['review_text'].apply(lambda x:x.lower())   #Convert into lowercase
X_test['review_text']=X_test['review_text'].apply(lambda x : ''.join([c for c in x if not c.isdigit()])) #Remove numeric digits
X_test['review_text']=X_test['review_text'].apply(lambda x:  ''.join([c for c in x if c not in string.punctuation]))  # Remove punctuations
X_test['review_text']=X_test['review_text'].apply(lambda x: ' '.join([word for word in word_tokenize(x) if word not in stop_words]))
X_test['review_text']=X_test['review_text'].apply(lambda x: ' '.join([lemmatizer.lemmatize(word) for word in word_tokenize(x)]))

In [33]:
pd.set_option('mode.chained_assignment', None)

In [47]:
X_train

Unnamed: 0,user_id,product_id,rating,date,review_text,day,word_count,punctuation_count,char_count,title_count,user_id_no_of_review,user_id_ave_rating,user_id_ave_no_words,user_id_max_review_a_day,product_id_no_of_review,product_id_ave_rating,product_id_ave_no_of_words,product_id_max_review_a_day
353682,185036,3085,1.0,2013-02-20,star one overrated new hipster spot montclair,Wednesday,12,2,68,3,1,1.000000,12.000000,1,178,4.202247,102.719101,22
313172,41781,1433,5.0,2014-04-15,came last friday wedding rehearsal dinner exce...,Tuesday,78,10,419,6,5,4.600000,73.400000,3,875,3.864000,129.496000,119
105323,34537,4922,5.0,2014-07-09,dear cucina zapata please forgive strayed in f...,Wednesday,206,26,1087,27,3,4.000000,244.000000,1,96,4.708333,121.833333,20
338630,46438,2940,4.0,2007-12-23,yes name address may take little effort reserv...,Sunday,178,20,922,14,4,3.250000,104.500000,3,42,4.761905,194.523810,8
126404,36282,1153,3.0,2014-12-14,high hope brunch cafe ghia felt food okay two ...,Sunday,68,13,368,10,4,4.500000,100.500000,3,181,3.883978,101.928177,38
210174,128176,1222,5.0,2013-06-26,im gluten intolerant eating tricky havent thai...,Wednesday,54,8,271,8,1,5.000000,54.000000,1,496,3.826613,125.175403,69
403531,202275,437,1.0,2012-10-21,food bland dry ordered chef medley garlic bok ...,Sunday,52,11,277,9,1,1.000000,52.000000,1,132,3.643939,104.446970,22
468328,195527,3888,4.0,2014-07-06,small little restaurant hell kitchen area huge...,Sunday,27,6,169,5,2,4.000000,26.500000,1,1035,4.019324,101.642512,159
503706,139576,4169,4.0,2011-01-11,near credit suisse meeting stopped see hype we...,Tuesday,86,13,440,16,4,4.250000,87.500000,3,144,4.312500,85.076389,25
516681,8410,4299,3.0,2011-07-10,wonderful dinner food coma induced stroke like...,Sunday,338,58,1851,32,31,3.645161,209.838710,13,526,4.309886,149.674905,64


### Model

### Classification on Processed review text

In [96]:
# Count vectorizer for review_text
count_vect=CountVectorizer(analyzer='word')
count_vect.fit(X_train.review_text)
feature_vector_train=count_vect.transform(X_train.review_text)
feature_vector_test =count_vect.transform(X_test.review_text)
#Count Vectorizer for day
count_vect_=CountVectorizer(analyzer='word')
count_vect_.fit(X_train.day)

CountVectorizer(analyzer='word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern=u'(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [111]:
# Count vectors + Features
feature_vector_train=hstack((feature_vector_train,np.array(X_train["user_id_no_of_review"])[:,None]))
feature_vector_train=hstack((feature_vector_train,np.array(X_train["user_id_ave_rating"])[:,None]))
feature_vector_train=hstack((feature_vector_train,np.array(X_train["user_id_ave_no_words"])[:,None]))
feature_vector_train=hstack((feature_vector_train,np.array(X_train["user_id_max_review_a_day"])[:,None]))
feature_vector_train=hstack((feature_vector_train,np.array(X_train["product_id_no_of_review"])[:,None]))
feature_vector_train=hstack((feature_vector_train,np.array(X_train["product_id_ave_rating"])[:,None]))
feature_vector_train=hstack((feature_vector_train,np.array(X_train["product_id_ave_no_of_words"])[:,None]))
feature_vector_train=hstack((feature_vector_train,np.array(X_train["product_id_max_review_a_day"])[:,None]))
feature_vector_train=hstack((feature_vector_train,np.array(X_train["word_count"])[:,None]))
feature_vector_train=hstack((feature_vector_train,np.array(X_train["punctuation_count"])[:,None]))
feature_vector_train=hstack((feature_vector_train,np.array(X_train["char_count"])[:,None]))
feature_vector_train=hstack((feature_vector_train,np.array(X_train["title_count"])[:,None]))
feature_vector_train=hstack((feature_vector_train,count_vect_.transform(X_train["day"])))

feature_vector_test=hstack((feature_vector_test,np.array(X_test["user_id_no_of_review"])[:,None]))
feature_vector_test=hstack((feature_vector_test,np.array(X_test["user_id_ave_rating"])[:,None]))
feature_vector_test=hstack((feature_vector_test,np.array(X_test["user_id_ave_no_words"])[:,None]))
feature_vector_test=hstack((feature_vector_test,np.array(X_test["user_id_max_review_a_day"])[:,None]))
feature_vector_test=hstack((feature_vector_test,np.array(X_test["product_id_no_of_review"])[:,None]))
feature_vector_test=hstack((feature_vector_test,np.array(X_test["product_id_ave_rating"])[:,None]))
feature_vector_test=hstack((feature_vector_test,np.array(X_test["product_id_ave_no_of_words"])[:,None]))
feature_vector_test=hstack((feature_vector_test,np.array(X_test["product_id_max_review_a_day"])[:,None]))
feature_vector_test=hstack((feature_vector_test,np.array(X_test["word_count"])[:,None]))
feature_vector_test=hstack((feature_vector_test,np.array(X_test["punctuation_count"])[:,None]))
feature_vector_test=hstack((feature_vector_test,np.array(X_test["char_count"])[:,None]))
feature_vector_test=hstack((feature_vector_test,np.array(X_test["title_count"])[:,None]))
feature_vector_test=hstack((feature_vector_test,count_vect_.transform(X_test["day"])))

In [112]:
LR= LogisticRegression()
LR.fit(feature_vector_train,y_train)
predictions_train=LR.predict(feature_vector_train)
y_pred=LR.predict(feature_vector_test)

  y = column_or_1d(y, warn=True)


In [110]:
feature_vector_test

<182580x264573 sparse matrix of type '<type 'numpy.float64'>'
	with 14358599 stored elements in COOrdinate format>

In [115]:
print(classification_report(y_train,predictions_train))

print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

          -1       0.65      0.05      0.10     56387
           1       0.87      1.00      0.93    369631

   micro avg       0.87      0.87      0.87    426018
   macro avg       0.76      0.52      0.51    426018
weighted avg       0.84      0.87      0.82    426018

              precision    recall  f1-score   support

          -1       0.19      0.41      0.26     24079
           1       0.89      0.74      0.81    158501

   micro avg       0.70      0.70      0.70    182580
   macro avg       0.54      0.58      0.54    182580
weighted avg       0.80      0.70      0.74    182580



In [118]:
print("Accuracy of LR on train dataset:",accuracy_score(y_train,predictions_train))
print("Accuracy of LR on test dataset :",accuracy_score(y_test,y_pred))

('Accuracy of LR on train dataset:', 0.8707378561469233)
('Accuracy of LR on test dataset :', 0.6965713659765582)


### Classification on Processed review text

In [119]:
# count_vect=CountVectorizer(analyzer='word')
# count_vect.fit(X_train.review_text)
X_train_count=count_vect.transform(X_train.review_text)
X_test_count =count_vect.transform(X_test.review_text)

LR= LogisticRegression()
LR.fit(X_train_count,y_train)
predictions_train=LR.predict(X_train_count)
predictions_test=LR.predict(X_test_count)

In [120]:
print(classification_report(y_train,predictions_train))

print(classification_report(y_test,predictions_test))

              precision    recall  f1-score   support

          -1       0.88      0.19      0.31     56387
           1       0.89      1.00      0.94    369631

   micro avg       0.89      0.89      0.89    426018
   macro avg       0.89      0.59      0.62    426018
weighted avg       0.89      0.89      0.86    426018

              precision    recall  f1-score   support

          -1       0.14      0.49      0.22     24079
           1       0.88      0.56      0.68    158501

   micro avg       0.55      0.55      0.55    182580
   macro avg       0.51      0.52      0.45    182580
weighted avg       0.78      0.55      0.62    182580



In [121]:
print("Accuracy of LR on train dataset:",accuracy_score(y_train,predictions_train))
print("Accuracy of LR on test dataset :",accuracy_score(y_test,predictions_test))

('Accuracy of LR on train dataset:', 0.8892042120285998)
('Accuracy of LR on test dataset :', 0.5489703143827364)


### Classification on review text

In [129]:
count_vect=CountVectorizer(analyzer='word')
count_vect.fit(X_train.review_text)
X_train_count=count_vect.transform(X_train.review_text)
X_test_count =count_vect.transform(X_test.review_text)

LR= LogisticRegression()
LR.fit(X_train_count,y_train)
predictions_train=LR.predict(X_train_count)
predictions_test=LR.predict(X_test_count)

In [133]:
print(classification_report(y_train,predictions_train))

print(classification_report(y_test,predictions_test))

              precision    recall  f1-score   support

          -1       0.83      0.17      0.29     56193
           1       0.89      0.99      0.94    369825

   micro avg       0.89      0.89      0.89    426018
   macro avg       0.86      0.58      0.61    426018
weighted avg       0.88      0.89      0.85    426018

              precision    recall  f1-score   support

          -1       0.40      0.08      0.13     24273
           1       0.87      0.98      0.93    158307

   micro avg       0.86      0.86      0.86    182580
   macro avg       0.64      0.53      0.53    182580
weighted avg       0.81      0.86      0.82    182580



In [131]:
print(accuracy_score(y_train,predictions_train))

0.8863850823204653


In [132]:
print(accuracy_score(y_test,predictions_test))

0.862115237156315


In [128]:
y=pd.DataFrame({'label':Dataset["label"]})
x=Dataset.drop(['label'],axis=1)
X_train,X_test,y_train,y_test=train_test_split(x,y,test_size=0.3)