In [1]:
#installing packages + libraries (contain necessary functions) necessary to manipulate the dataset
# pandas + numpy to manipulate dataset
# Scikit-learn for splitting data into testing + training sets
! pip install tensorflow scikit-learn pandas numpy nltk



In [2]:
# importing some libraries
import numpy as np
import pandas as pd
import nltk

In [3]:
df = pd.read_csv("yelp_1000.csv")
#this is just a larger version of the dataset df = pd.read_csv("https://github.com/justmarkham/DAT7/raw/master/data/yelp.csv")
df.head()

Unnamed: 0,dir,file,cool,review_id,stars,date,business_id,funny,text,user_id,useful,score
0,sentiyelp/Charlotte_North_Carolina,-cZ6Hhc9F7VkKXxHMVZSQ.csv,0,BLfLwh7w4NGHU7eBLgnaFg,4,2013-04-28,-cZ6Hhc9F7VkKXxHMVZSQ,0,SO GOOD!!! I did not dine in I ordered take ou...,YmAqQyaFli8H9LVGhGhR9w,0,0.970178
1,sentiyelp/Charlotte_North_Carolina,-cZ6Hhc9F7VkKXxHMVZSQ.csv,0,X6-KGabYKJTS1Dsipo4XIw,3,2013-05-08,-cZ6Hhc9F7VkKXxHMVZSQ,0,"The food is tasty, no doubt, but rotisserie ch...",ciXjBfJrAEteIKpzZg4I9g,1,0.782714
2,sentiyelp/Charlotte_North_Carolina,-cZ6Hhc9F7VkKXxHMVZSQ.csv,1,b_OaEC8uyqIUN-BtX6KNQA,4,2013-05-11,-cZ6Hhc9F7VkKXxHMVZSQ,1,Pio Pio was recommended by a friend a few mont...,17qZxRhTcp_JSnEg65COkA,3,0.950852
3,sentiyelp/Charlotte_North_Carolina,-cZ6Hhc9F7VkKXxHMVZSQ.csv,0,QBe4o5-8NNzDuHkM7Jc05A,2,2013-05-18,-cZ6Hhc9F7VkKXxHMVZSQ,0,"I didnt hate Pio Pio, but I didnt like it enou...",gc4WUy07eaQYPGOly1t-Ww,1,0.123886
4,sentiyelp/Charlotte_North_Carolina,-cZ6Hhc9F7VkKXxHMVZSQ.csv,0,I_Fm3v1N9nrWqNqTS80ZMQ,5,2013-05-18,-cZ6Hhc9F7VkKXxHMVZSQ,0,I returned to Pio Pio with my family on a Frid...,EHq8HvrDG-VaJ8rjyjVUYQ,1,0.790467


In [4]:
#Removing Extraneous Columns
remove_columns = ['dir', 'file', 'cool', 'review_id', 'stars','date','business_id', 'funny', 'user_id', 'useful', 'score']
df = df.drop(remove_columns, axis = 1)
df.head()

Unnamed: 0,text
0,SO GOOD!!! I did not dine in I ordered take ou...
1,"The food is tasty, no doubt, but rotisserie ch..."
2,Pio Pio was recommended by a friend a few mont...
3,"I didnt hate Pio Pio, but I didnt like it enou..."
4,I returned to Pio Pio with my family on a Frid...


In [5]:
#DATA PREPROCESSING
# this tokenizes the data then joins the string back together
# making data in 'text' column lowercase
df['lower'] = df['text'].str.lower()
# Use regex substitution to remove punctuation from 'lower' column
import re
pattern = r'[^\w\s]'
df['no_punctuation'] = df['lower'].apply(lambda x: re.sub(pattern, '', x))
# removing stopwords from 'no_punctuation' column
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords #nltk is a massive nlp dataset that already has a usable list of stopwords
stop_words = stopwords.words('english')
df['cleaned'] = df['no_punctuation'].apply(lambda x: ' '.join(word for word in x.split() if word not in stop_words))
# df.head()
# print(df.loc[0:5,'text'])
print(df.loc[0,'cleaned'])

good dine ordered take took little longer expected get food well worth got home order roasted chicken rice beans fried yucca boom delicious could something simple good pio pio knows everything full flavor portion size typical top like spanish restaurants price paying worth back


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/rheaagrawal/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [6]:
#Lemmatization
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
lemmatizer = WordNetLemmatizer()
df['lemmatized'] = df['cleaned'].apply(lambda x: ' '.join(lemmatizer.lemmatize(word) for word in x.split()))
# df.iloc[:10]
print(df.loc[0,'lemmatized'])

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/rheaagrawal/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


good dine ordered take took little longer expected get food well worth got home order roasted chicken rice bean fried yucca boom delicious could something simple good pio pio know everything full flavor portion size typical top like spanish restaurant price paying worth back


In [7]:
#Stemming
from nltk.stem import PorterStemmer
stemmer = PorterStemmer()
df['stemmed'] = df['cleaned'].apply(lambda x: ' '.join(stemmer.stem(word) for word in x.split()))
# df.head()
print(df.loc[0,'stemmed'])

good dine order take took littl longer expect get food well worth got home order roast chicken rice bean fri yucca boom delici could someth simpl good pio pio know everyth full flavor portion size typic top like spanish restaur price pay worth back


In [8]:
#Sentiment Analysis with Vader
from nltk.sentiment.vader import SentimentIntensityAnalyzer
nltk.download('vader_lexicon')
analyzer = SentimentIntensityAnalyzer()
def calculate_sentiment(review): 
    polarity = analyzer.polarity_scores(review)
#     print(review)
#    print(polarity)
    #positive sentiment
    if polarity['compound']>=0.05:
        return "1"
    #negative sentiment
    elif polarity['compound']<=-0.05:
        return "-1"
    #neutral sentiment
    elif polarity['compound']>-0.05 and polarity['compound']<0.05:
        return "0"

df['sentiment'] = df['stemmed'].apply(calculate_sentiment)
#print(df.loc[0:50, 'sentiment'])
#print(df.loc[19,['text','sentiment']])
#print(df.loc[40, ['text', 'stemmed','sentiment']])
# df.iloc[0:50]
df.head()

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/rheaagrawal/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


Unnamed: 0,text,lower,no_punctuation,cleaned,lemmatized,stemmed,sentiment
0,SO GOOD!!! I did not dine in I ordered take ou...,so good!!! i did not dine in i ordered take ou...,so good i did not dine in i ordered take out i...,good dine ordered take took little longer expe...,good dine ordered take took little longer expe...,good dine order take took littl longer expect ...,1
1,"The food is tasty, no doubt, but rotisserie ch...","the food is tasty, no doubt, but rotisserie ch...",the food is tasty no doubt but rotisserie chic...,food tasty doubt rotisserie chicken similar ev...,food tasty doubt rotisserie chicken similar ev...,food tasti doubt rotisseri chicken similar eve...,1
2,Pio Pio was recommended by a friend a few mont...,pio pio was recommended by a friend a few mont...,pio pio was recommended by a friend a few mont...,pio pio recommended friend months ago group us...,pio pio recommended friend month ago group u w...,pio pio recommend friend month ago group us we...,1
3,"I didnt hate Pio Pio, but I didnt like it enou...","i didnt hate pio pio, but i didnt like it enou...",i didnt hate pio pio but i didnt like it enoug...,didnt hate pio pio didnt like enough go back e...,didnt hate pio pio didnt like enough go back e...,didnt hate pio pio didnt like enough go back e...,1
4,I returned to Pio Pio with my family on a Frid...,i returned to pio pio with my family on a frid...,i returned to pio pio with my family on a frid...,returned pio pio family friday night see food ...,returned pio pio family friday night see food ...,return pio pio famili friday night see food wo...,1


In [9]:
#Creating the Training Model
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(strip_accents=None, lowercase=False, preprocessor=None)
X = tfidf.fit_transform(df['stemmed'])
from sklearn.model_selection import train_test_split
Y = df['sentiment']
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, train_size=0.8, test_size=0.2, random_state=42)
from sklearn.linear_model import LogisticRegression
logisticR = LogisticRegression(solver="liblinear")
logisticR.fit(X_train, Y_train)
predictions = logisticR.predict(X_test)
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(predictions, Y_test)
print(accuracy)

0.935
