## Reading required packages:
#### sklearn: package which has functions used to perform machine learning 
#### nltk: package used to perform natural language processing
#### pandas: package used to read and write files

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from nltk.stem.snowball import SnowballStemmer
from sklearn.metrics import confusion_matrix

### Copying stop words into stop object

In [2]:
from nltk.corpus import stopwords
stop = stopwords.words('english')

### Defining a stemmer object to extract root words

In [3]:
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize

ps = PorterStemmer()

### Stop words:
Some examples of stop words are: "a", "and", "but", "how", "or", and "what." While the majority of all Internet search engines utilize stop words, they do not prevent a user from using them, but they are ignored.

### Stemming:
A stemmer for English, for example, should identify the string "cats" (and possibly "catlike", "catty" etc.) as based on the root "cat", and "stems", "stemmer", "stemming", "stemmed" as based on "stem". A stemming algorithm reduces the words "fishing", "fished", and "fisher" to the root word, "fish"

In [4]:
# read yelp.csv into a DataFrame

yelp = pd.read_csv("yelp.csv",encoding = "ISO-8859-1")

yelp.head()

Unnamed: 0.1,Unnamed: 0,business_id,date,review_id,stars,text,type,user_id,cool,useful,funny
0,0,9yKzy9PApeiPPOUJEtnvkg,2011-01-26,fWKvX83p0-ka4JS3dc6E5A,5,My wife took me here on my birthday for breakf...,review,rLtl8ZkDX5vH5nAx9C3q5Q,2,5,0
1,1,ZRJwVLyzEJq1VAihDhYiow,2011-07-27,IjZ33sJrzXqU-0X6U8NwyA,5,I have no idea why some people give bad review...,review,0a2KyEL0d3Yb1V6aivbIuQ,0,0,0
2,2,6oRAC4uyJCsJl1X0WZpVSA,2012-06-14,IESLBzqUCLdSzSqm0eCSxQ,4,love the gyro plate. Rice is so good and I als...,review,0hT2KtfLiobPvh6cDC8JQg,0,1,0
3,3,_1QQZuf4zZOyFCvXc0o6Vg,2010-05-27,G-WvGaISbqqaMHlNnByodA,5,"Rosie, Dakota, and I LOVE Chaparral Dog Park!!...",review,uZetl9T0NcROGOyFfughhg,1,2,0
4,4,6ozycU1RpktNG2-1BroVtw,2012-01-05,1uJFq2r5QfJG_6ExMRCaGw,5,General Manager Scott Petello is a good egg!!!...,review,vYmM4KTsC8ZfQBg-j5MWkw,0,0,0


In [5]:
# create a new DataFrame that only contains the 5-star and 1-star reviews
yelp_best_worst = yelp[(yelp.stars==5) | (yelp.stars==1)]

yelp_best_worst.head(5)

Unnamed: 0.1,Unnamed: 0,business_id,date,review_id,stars,text,type,user_id,cool,useful,funny
0,0,9yKzy9PApeiPPOUJEtnvkg,2011-01-26,fWKvX83p0-ka4JS3dc6E5A,5,My wife took me here on my birthday for breakf...,review,rLtl8ZkDX5vH5nAx9C3q5Q,2,5,0
1,1,ZRJwVLyzEJq1VAihDhYiow,2011-07-27,IjZ33sJrzXqU-0X6U8NwyA,5,I have no idea why some people give bad review...,review,0a2KyEL0d3Yb1V6aivbIuQ,0,0,0
3,3,_1QQZuf4zZOyFCvXc0o6Vg,2010-05-27,G-WvGaISbqqaMHlNnByodA,5,"Rosie, Dakota, and I LOVE Chaparral Dog Park!!...",review,uZetl9T0NcROGOyFfughhg,1,2,0
4,4,6ozycU1RpktNG2-1BroVtw,2012-01-05,1uJFq2r5QfJG_6ExMRCaGw,5,General Manager Scott Petello is a good egg!!!...,review,vYmM4KTsC8ZfQBg-j5MWkw,0,0,0
6,6,zp713qNhx8d9KCJJnrw1xA,2010-02-12,riFQ3vxNpP4rWLk_CSri2A,5,Drop what you're doing and drive here. After I...,review,wFweIWhv2fREZV_dYkz_1g,7,7,4


In [6]:
yelp.shape

(10000, 11)

In [7]:
# define X and y
X = yelp_best_worst.text
X.apply(lambda x: [item for item in x if item not in stop])

def stem1(text1):
    return ps.stem(text1)

X=X.apply(stem1)
print(X)

0       my wife took me here on my birthday for breakf...
1       i have no idea why some people give bad review...
3       rosie, dakota, and i love chaparral dog park!!...
4       general manager scott petello is a good egg!!!...
6       drop what you're doing and drive here. after i...
9       nobuo shows his unique talents with everything...
10      the oldish man who owns the store is as sweet ...
11      wonderful vietnamese sandwich shoppe. their ba...
12      they have a limited time thing going on right ...
17      okay this is the best place ever! i grew up sh...
21      this place shouldn't even be reviewed - becaus...
22      first time my friend and i went there... it wa...
23      u can go there n check the car out. if u wanna...
24      i love this place! i have been coming here for...
26      i love love love this place. my boss (who is i...
30      disclaimer: like many of you, i am a sucker fo...
31      disgusting!  had a groupon so my daughter and ...
32      never 

In [8]:
y = yelp_best_worst.stars
print(y)

0       5
1       5
3       5
4       5
6       5
9       5
10      5
11      5
12      5
17      5
21      5
22      5
23      1
24      5
26      5
30      5
31      1
32      5
35      1
46      5
51      5
54      5
59      5
61      1
64      1
65      1
66      5
67      5
69      5
71      1
       ..
9941    5
9942    5
9943    5
9945    5
9947    5
9951    5
9953    1
9956    5
9957    5
9959    5
9965    5
9966    5
9969    5
9970    5
9971    5
9973    5
9975    5
9977    5
9978    5
9979    5
9980    5
9981    5
9984    1
9987    1
9989    5
9990    5
9991    5
9992    5
9994    5
9999    5
Name: stars, Length: 4086, dtype: int64


In [9]:
# split the new DataFrame into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

### Displaying top few rows of yelp dataframe

In [10]:
cveg = CountVectorizer()

x_example = cveg.fit_transform(["how are you", "We are are fine", "are we fine", "yes we are fine"])
x_example.toarray()

array([[1, 0, 1, 0, 0, 1],
       [2, 1, 0, 1, 0, 0],
       [1, 1, 0, 1, 0, 0],
       [1, 1, 0, 1, 1, 0]], dtype=int64)

In [11]:
cveg.get_feature_names()

['are', 'fine', 'how', 'we', 'yes', 'you']

### Vectorizer transformation example for one hot encoding

|are | fine | how | we | yes | you |        
| :- |: | :| :|: | : 
|1| 0  | 1 | 0 | 0 | 1 
|2| 1  | 0 | 1 | 0 | 0 
|1| 1  | 0 | 1 | 0 | 0 
|1| 1  | 0 | 1 | 1 | 0 


In [12]:
# use CountVectorizer to create document-term matrices from X_train and X_test
vect = CountVectorizer()
X_train_dtm = vect.fit_transform(X_train)
X_test_dtm = vect.transform(X_test)

In [13]:
# split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)

In [14]:
# use CountVectorizer with text column only
vect = CountVectorizer()
X_train_dtm = vect.fit_transform(X_train)
X_test_dtm = vect.transform(X_test)


In [15]:
print(X_train_dtm)

  (0, 5447)	1
  (0, 9565)	1
  (0, 7704)	1
  (0, 11053)	1
  (0, 14210)	1
  (0, 16038)	1
  (0, 3541)	1
  (0, 2187)	1
  (0, 971)	1
  (0, 14582)	1
  (0, 5406)	1
  (0, 13327)	1
  (0, 5967)	2
  (0, 768)	1
  (0, 2188)	1
  (0, 6471)	1
  (0, 12796)	1
  (0, 5423)	1
  (1, 5804)	1
  (1, 1268)	1
  (1, 15555)	1
  (2, 3329)	1
  (2, 5192)	1
  (2, 5826)	1
  (2, 8574)	1
  :	:
  (2859, 9022)	1
  (2859, 4464)	1
  (2859, 14592)	1
  (2859, 11667)	1
  (2859, 5146)	1
  (2859, 15491)	1
  (2859, 330)	1
  (2859, 9143)	1
  (2859, 14275)	1
  (2859, 14641)	2
  (2859, 14774)	1
  (2859, 7935)	2
  (2859, 1480)	1
  (2859, 2227)	1
  (2859, 6642)	2
  (2859, 839)	1
  (2859, 9507)	1
  (2859, 15360)	1
  (2859, 6944)	1
  (2859, 6790)	1
  (2859, 10035)	2
  (2859, 15769)	2
  (2859, 5804)	2
  (2859, 14582)	2
  (2859, 768)	2


In [16]:
print(X_test_dtm)

  (0, 329)	1
  (0, 716)	2
  (0, 761)	2
  (0, 768)	6
  (0, 799)	1
  (0, 1023)	1
  (0, 1216)	1
  (0, 1587)	4
  (0, 1928)	2
  (0, 2215)	1
  (0, 2745)	1
  (0, 2850)	1
  (0, 3825)	1
  (0, 4072)	1
  (0, 4170)	1
  (0, 4464)	1
  (0, 4592)	1
  (0, 4806)	1
  (0, 5153)	1
  (0, 5826)	1
  (0, 6318)	1
  (0, 6766)	1
  (0, 6807)	3
  (0, 6827)	1
  (0, 6989)	3
  :	:
  (1225, 10109)	2
  (1225, 10169)	1
  (1225, 12063)	2
  (1225, 12141)	1
  (1225, 12317)	1
  (1225, 12394)	1
  (1225, 13023)	1
  (1225, 13327)	1
  (1225, 13390)	1
  (1225, 13599)	1
  (1225, 13822)	1
  (1225, 14506)	1
  (1225, 14617)	2
  (1225, 14641)	1
  (1225, 14681)	1
  (1225, 14774)	2
  (1225, 15360)	1
  (1225, 15701)	1
  (1225, 15823)	3
  (1225, 15893)	1
  (1225, 15939)	1
  (1225, 15963)	2
  (1225, 15984)	1
  (1225, 16038)	1
  (1225, 16114)	1


### Apply logistic regression for classification

In [17]:
# create a model use logistic regression with text column only
logreg = LogisticRegression(C=1e9)
logreg.fit(X_train_dtm, y_train)
#Do the prediction
y_pred_class = logreg.predict(X_test_dtm)
#check the model accuracy
print(metrics.accuracy_score(y_test, y_pred_class))

0.9216965742251223




### Print confusion matrix

In [18]:
confusion_matrix(y_test, y_pred_class)

array([[174,  49],
       [ 47, 956]], dtype=int64)

In [19]:
from sklearn.metrics import confusion_matrix, classification_report
print(confusion_matrix(y_test, y_pred_class))
print('\n')
print(classification_report(y_test, y_pred_class))

[[174  49]
 [ 47 956]]


              precision    recall  f1-score   support

           1       0.79      0.78      0.78       223
           5       0.95      0.95      0.95      1003

   micro avg       0.92      0.92      0.92      1226
   macro avg       0.87      0.87      0.87      1226
weighted avg       0.92      0.92      0.92      1226



Looks like our model has achieved 92% accuracy! This means that our model can predict whether a user liked a local business or not, based on what they typed!


https://www.kaggle.com/omkarsabnis/sentiment-analysis-on-the-yelp-reviews-dataset

https://medium.com/tensorist/classifying-yelp-reviews-using-nltk-and-scikit-learn-c58e71e962d9