In [1]:
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from string import punctuation
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import accuracy_score, confusion_matrix, mean_squared_error
from sklearn.model_selection import GridSearchCV

In [2]:
data = pd.read_csv('Zomato_reviews.csv')
data.head()

Unnamed: 0,rating,review_text
0,1.0,"Their service is worst, pricing in menu is dif..."
1,5.0,really appreciate their quality and timing . I...
2,4.0,"Went there on a Friday night, the place was su..."
3,4.0,A very decent place serving good food.\r\nOrde...
4,5.0,One of the BEST places for steaks in the city....


In [3]:
data['rating'].value_counts()

4.0    8632
5.0    8118
3.0    3762
1.0    3126
2.0    1675
3.5    1078
4.5     933
2.5     261
1.5     177
Name: rating, dtype: int64

In [4]:
data.isnull().sum()

rating          0
review_text    14
dtype: int64

In [5]:
#droppng null rows
data.dropna(axis = 0, inplace = True)

In [6]:
data.isnull().sum()

rating         0
review_text    0
dtype: int64

In [7]:
data.head()

Unnamed: 0,rating,review_text
0,1.0,"Their service is worst, pricing in menu is dif..."
1,5.0,really appreciate their quality and timing . I...
2,4.0,"Went there on a Friday night, the place was su..."
3,4.0,A very decent place serving good food.\r\nOrde...
4,5.0,One of the BEST places for steaks in the city....


In [8]:
data.shape

(27748, 2)

In [9]:
reviews = data['review_text'].tolist()

In [10]:
reviews[0:2]

['Their service is worst, pricing in menu is different from bill. They can give you a bill with increased pricing. Even for serving water,menu, order you need to call them 3-4 times even on a non busy day.',
 "really appreciate their quality and timing . I have tried the thattil kutti dosa I've been addicted to the dosa really and the chutney... really good and money worth much better than a thattukada must try it"]

In [11]:
reviews_lower = [review.lower() for review in reviews]

In [12]:
reviews_lower[:5]

['their service is worst, pricing in menu is different from bill. they can give you a bill with increased pricing. even for serving water,menu, order you need to call them 3-4 times even on a non busy day.',
 "really appreciate their quality and timing . i have tried the thattil kutti dosa i've been addicted to the dosa really and the chutney... really good and money worth much better than a thattukada must try it",
 'went there on a friday night, the place was surprisingly empty. interesting menu which is almost fully made of dosas. i had bullseye dosa and cheese masala dosa. the bullseye dosa was really good, with the egg perfectly cooked to a half boiled state. the masala in the cheese masala was good, but the cheese was a bit too chewy for my liking. the chutney was good, the sambar was average. the dishes are reasonably priced.',
 'a very decent place serving good food.\r\nordered chilli fish, chicken & pork sizzler.\r\neverything tasted good but pork could have been slightly bett

In [13]:
#getting rid of extra line breaks
reviews1 = [' '.join(txt.split()) for txt in reviews_lower]

In [14]:
reviews1[:5]

['their service is worst, pricing in menu is different from bill. they can give you a bill with increased pricing. even for serving water,menu, order you need to call them 3-4 times even on a non busy day.',
 "really appreciate their quality and timing . i have tried the thattil kutti dosa i've been addicted to the dosa really and the chutney... really good and money worth much better than a thattukada must try it",
 'went there on a friday night, the place was surprisingly empty. interesting menu which is almost fully made of dosas. i had bullseye dosa and cheese masala dosa. the bullseye dosa was really good, with the egg perfectly cooked to a half boiled state. the masala in the cheese masala was good, but the cheese was a bit too chewy for my liking. the chutney was good, the sambar was average. the dishes are reasonably priced.',
 'a very decent place serving good food. ordered chilli fish, chicken & pork sizzler. everything tasted good but pork could have been slightly better coo

In [15]:
#tokenizing
tokenized = [word_tokenize(txt) for txt in reviews1]

In [16]:
tokenized[:2]

[['their',
  'service',
  'is',
  'worst',
  ',',
  'pricing',
  'in',
  'menu',
  'is',
  'different',
  'from',
  'bill',
  '.',
  'they',
  'can',
  'give',
  'you',
  'a',
  'bill',
  'with',
  'increased',
  'pricing',
  '.',
  'even',
  'for',
  'serving',
  'water',
  ',',
  'menu',
  ',',
  'order',
  'you',
  'need',
  'to',
  'call',
  'them',
  '3-4',
  'times',
  'even',
  'on',
  'a',
  'non',
  'busy',
  'day',
  '.'],
 ['really',
  'appreciate',
  'their',
  'quality',
  'and',
  'timing',
  '.',
  'i',
  'have',
  'tried',
  'the',
  'thattil',
  'kutti',
  'dosa',
  'i',
  "'ve",
  'been',
  'addicted',
  'to',
  'the',
  'dosa',
  'really',
  'and',
  'the',
  'chutney',
  '...',
  'really',
  'good',
  'and',
  'money',
  'worth',
  'much',
  'better',
  'than',
  'a',
  'thattukada',
  'must',
  'try',
  'it']]

In [17]:
#remove stopwords
stopword = stopwords.words('english')
for i in ['no', 'not', 'don', 'won']:
    stopword.remove(i)
reviews_stop_rem = [[txt for txt in review if txt not in stopword] for review in tokenized]

In [18]:
#remove punctuations
punct = list(punctuation) + ["...", "``","''", "===="]
reviews_pun = [[txt for txt in review if txt not in punct] for review in reviews_stop_rem]

In [19]:
reviews_pun[:2]

[['service',
  'worst',
  'pricing',
  'menu',
  'different',
  'bill',
  'give',
  'bill',
  'increased',
  'pricing',
  'even',
  'serving',
  'water',
  'menu',
  'order',
  'need',
  'call',
  '3-4',
  'times',
  'even',
  'non',
  'busy',
  'day'],
 ['really',
  'appreciate',
  'quality',
  'timing',
  'tried',
  'thattil',
  'kutti',
  'dosa',
  "'ve",
  'addicted',
  'dosa',
  'really',
  'chutney',
  'really',
  'good',
  'money',
  'worth',
  'much',
  'better',
  'thattukada',
  'must',
  'try']]

In [20]:
#join the review text for model building
review_text = [' '.join(review)  for review in reviews_pun]

In [21]:
review_text[:2]

['service worst pricing menu different bill give bill increased pricing even serving water menu order need call 3-4 times even non busy day',
 "really appreciate quality timing tried thattil kutti dosa 've addicted dosa really chutney really good money worth much better thattukada must try"]

In [22]:
X = review_text
y = data['rating']

In [23]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 42)

In [46]:
#feature extraction from our reviews by tfidf vectorizer
vectorizer = TfidfVectorizer(max_features = 5000)
X_train_vect = vectorizer.fit_transform(X_train)
X_test_vect = vectorizer.transform(X_test)

In [47]:
randforest = RandomForestRegressor()
randforest.fit(X_train_vect, y_train)



RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=10,
                      n_jobs=None, oob_score=False, random_state=None,
                      verbose=0, warm_start=False)

In [48]:
pred = randforest.predict(X_test_vect)

In [49]:
print(pred)

[4.  1.6 4.7 ... 3.9 3.  4. ]


In [50]:
print('MSE :', mean_squared_error(y_test, pred))

MSE : 0.26342529369691153


## Hyper parameter Tuning

In [51]:
param_grid = {
    'max_features': [500, "sqrt", "log2", "auto"],
    'max_depth': [10, 15, 20, 25]
}

In [52]:
grid_search = GridSearchCV(estimator = randforest, param_grid = param_grid, 
                          cv = 5, n_jobs = -1, verbose = 1, scoring = "neg_mean_squared_error" )

In [53]:
grid_search.fit(X_train_vect, y_train)

Fitting 5 folds for each of 16 candidates, totalling 80 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   41.7s
[Parallel(n_jobs=-1)]: Done  80 out of  80 | elapsed:  2.1min finished


GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=RandomForestRegressor(bootstrap=True, criterion='mse',
                                             max_depth=None,
                                             max_features='auto',
                                             max_leaf_nodes=None,
                                             min_impurity_decrease=0.0,
                                             min_impurity_split=None,
                                             min_samples_leaf=1,
                                             min_samples_split=2,
                                             min_weight_fraction_leaf=0.0,
                                             n_estimators=10, n_jobs=None,
                                             oob_score=False, random_state=None,
                                             verbose=0, warm_start=False),
             iid='warn', n_jobs=-1,
             param_grid={'max_depth': [10, 15, 20, 25],
      

In [32]:
grid_search.best_estimator_

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=25,
                      max_features=500, max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=10,
                      n_jobs=None, oob_score=False, random_state=None,
                      verbose=0, warm_start=False)

In [54]:
pred = grid_search.best_estimator_.predict(X_test_vect)

In [55]:
print('MSE :', mean_squared_error(y_test, pred))

MSE : 0.4711784459787698


In [56]:
## Mismach identifying

In [57]:
data_test = pd.DataFrame({'review' : X_test, 'rating' : y_test, 'rating_pred' : pred})

In [58]:
#mismatched reviews
data_test[(data_test.rating - data_test.rating_pred)>=2]

Unnamed: 0,review,rating,rating_pred
7279,life saviours serving excellent food worst tim...,5.0,1.706052
7300,veg biriyani loved ordered night 10 got within...,5.0,2.651404
4771,not good,5.0,2.399398
16512,may not polished serving packaging etc never b...,5.0,1.885702
14847,oh memories place first drink bangalore almost...,5.0,2.31544
15203,sauce not included,4.0,1.879089
16517,may not polished serving packaging etc never b...,5.0,1.885702
1796,veg meals serve banana leaf food like home mad...,4.0,1.998439


In [59]:
data_test[(data_test.rating - data_test.rating_pred)>=2].shape

(8, 3)

## There are 8 mismatched reviews which are shown above