In [1]:
import pandas as pd

In [2]:
import numpy as np

In [3]:
df = pd.read_csv("tripadvisor_hotel_reviews.csv")

In [4]:
df["sentiment"] = df['Rating'].apply(lambda x : 1 if x > 3 else 0)

In [5]:
import re

In [6]:
import nltk

In [7]:
from sklearn.model_selection import train_test_split

In [8]:
from sklearn.metrics import confusion_matrix

In [9]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ravin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [10]:
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

In [11]:
df.head()

Unnamed: 0,Review,Rating,sentiment
0,nice hotel expensive parking got good deal sta...,4,1
1,ok nothing special charge diamond member hilto...,2,0
2,nice rooms not 4* experience hotel monaco seat...,3,0
3,"unique, great stay, wonderful time hotel monac...",5,1
4,"great stay great stay, went seahawk game aweso...",5,1


In [12]:
ps = PorterStemmer()

In [13]:
def cleaning(x):
    review = re.sub('[^a-zA-Z]', ' ',x)
    review = review.lower()
    review = review.split()
    review = [ps.stem(i) for i in review if not i in stopwords.words('english') ]
    review = " ".join(review)
    return review

In [14]:
df.sentiment.value_counts()

1    15093
0     5398
Name: sentiment, dtype: int64

In [15]:
part1 = df[df.sentiment == 0]

In [18]:
part2 = df[df.sentiment == 1].sample(frac = .36)

In [17]:
part2.shape

(5433, 3)

In [20]:
df_new = pd.concat([part1,part2]).sample(frac = 1)

In [21]:
df_new.sentiment.value_counts()

1    5433
0    5398
Name: sentiment, dtype: int64

In [22]:
df_new

Unnamed: 0,Review,Rating,sentiment
15642,super customer service husband just come trip ...,4,1
5891,hotel toledano aware place booking phone webpa...,2,0
11196,"did acheive 5 star rating, just not 5 star hot...",2,0
18050,usual seasons excellence great seasons hotel e...,5,1
13442,just imagined web page hotel v web page descri...,4,1
...,...,...,...
6394,disappointing service experience benjamin echo...,2,0
12545,beautiful beach bad food drinks hotel grounds ...,2,0
20041,"hotel helll, concern letter account horrible e...",1,0
13368,"poor service impressions n't great, arrival yo...",2,0


In [23]:
df_new.iloc[0:50 , 0].apply(cleaning)

15642    super custom servic husband come trip asia sta...
5891     hotel toledano awar place book phone webpag wa...
11196    acheiv star rate star hotel room tub rust toil...
18050    usual season excel great season hotel exce exp...
13442    imagin web page hotel v web page descript accu...
12147    nice hotel close circular quay pitt street mal...
11292    fabul vacat arriv hotel numer bad review head ...
19163    good experi recent day holiday bali stay night...
16776    good time n eat bit hesit write follow review ...
17043    best kept secret averag best give hotel probab...
8201     dump dump dump rate place four star dump room ...
7632     great best thing resort beauti ground except f...
9678     excel staff small room staff nadia hotel n fri...
16412    go impress staff excel servicesm fantast love ...
1968     great holiday hotel great locat right street i...
1081     expect nicer come seattl busi coupl peopl reco...
310      absolutli worst book room wife trip seattl edg.

In [24]:
df_new['review_clean'] = df_new.iloc[0:,0].apply(cleaning)

In [25]:
df_new

Unnamed: 0,Review,Rating,sentiment,review_clean
15642,super customer service husband just come trip ...,4,1,super custom servic husband come trip asia sta...
5891,hotel toledano aware place booking phone webpa...,2,0,hotel toledano awar place book phone webpag wa...
11196,"did acheive 5 star rating, just not 5 star hot...",2,0,acheiv star rate star hotel room tub rust toil...
18050,usual seasons excellence great seasons hotel e...,5,1,usual season excel great season hotel exce exp...
13442,just imagined web page hotel v web page descri...,4,1,imagin web page hotel v web page descript accu...
...,...,...,...,...
6394,disappointing service experience benjamin echo...,2,0,disappoint servic experi benjamin echo vari ex...
12545,beautiful beach bad food drinks hotel grounds ...,2,0,beauti beach bad food drink hotel ground espec...
20041,"hotel helll, concern letter account horrible e...",1,0,hotel helll concern letter account horribl exp...
13368,"poor service impressions n't great, arrival yo...",2,0,poor servic impress n great arriv young ladi d...


In [26]:
from sklearn.feature_extraction.text import CountVectorizer

In [27]:
x = df_new.iloc[: , [1,3]]
y = df_new.iloc[: , [2]]

In [28]:
X_train, X_test, y_train, y_test = train_test_split(x,y,test_size = .2)

In [29]:
cv1 = CountVectorizer()

In [30]:
cv2 = CountVectorizer(max_features=12000)

In [31]:
df_new.shape

(10831, 4)

In [32]:
train_matrix1 = cv1.fit_transform(X_train.review_clean)
train_matrix2 = cv2.fit_transform(X_train.review_clean)

In [33]:
test_matrix1 = cv1.transform(X_test.review_clean)
test_matrix2 = cv2.transform(X_test.review_clean)

In [34]:
train_matrix1

<8664x23140 sparse matrix of type '<class 'numpy.int64'>'
	with 672433 stored elements in Compressed Sparse Row format>

In [38]:
len(cv1.get_feature_names())

23140

In [36]:
from sklearn.naive_bayes import  MultinomialNB

In [37]:
model1 = MultinomialNB().fit(train_matrix1, y_train)
model2 = MultinomialNB().fit(train_matrix2, y_train)

  return f(**kwargs)
  return f(**kwargs)


In [39]:
model1.score(train_matrix1, y_train)

0.8940443213296398

In [42]:
model2.score(train_matrix2, y_train)

0.8836565096952909

In [43]:
print(model1.score(test_matrix1, y_test))
print(model2.score(test_matrix2, y_test))

0.8371019843101062
0.8375634517766497


In [45]:
confusion_matrix(y_test , model1.predict(test_matrix1))

array([[843, 240],
       [113, 971]], dtype=int64)

In [46]:
confusion_matrix(y_test , model2.predict(test_matrix2))

array([[839, 244],
       [108, 976]], dtype=int64)

In [47]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [48]:
tfidf = TfidfVectorizer()

In [51]:
tf_train = tfidf.fit_transform(X_train.review_clean)
tf_test = tfidf.transform(X_test.review_clean)

In [53]:
tf_test.get_shape()

(2167, 23140)

In [268]:
tf[0].toarray().sort()

In [54]:
model_tf = MultinomialNB().fit(tf_train, y_train)

  return f(**kwargs)


In [55]:
model_tf.score(tf_train, y_train)

0.8936980609418282

In [56]:
model_tf.score(tf_test, y_test)

0.8412551915089986

In [75]:
np.argmax(tf_train[2].toarray()[0])

17216

In [76]:
tfidf.get_feature_names()[17216]

'rollaway'