In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('Restaurant_Reviews.tsv', sep='\t')

In [3]:
df.head()

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


In [4]:
df.shape

(1000, 2)

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Review  1000 non-null   object
 1   Liked   1000 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 15.8+ KB


### Start Processing

In [6]:
df.Liked.value_counts(1)*100

1    50.0
0    50.0
Name: Liked, dtype: float64

In [7]:
a = df.iloc[1]['Review']
a

'Crust is not good.'

In [8]:
import re
# from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
# ps = PorterStemmer()
lemmatizer = WordNetLemmatizer()
corpus=[]
for i in range(0,len(df)):
    message = re.sub('[^a-zA-Z]',' ',df['Review'][i])
    message = message.lower()
    message = message.split()
#     message = [ps.stem(word) for word in message]
    message = [lemmatizer.lemmatize(word) for word in message]
    message = ' '.join(message)
    corpus.append(message)

In [9]:
len(corpus)

1000

In [32]:
# corpus

In [11]:
df['Review'].shape

(1000,)

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(max_features=1500)
corpus = vectorizer.fit_transform(corpus)

In [13]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(corpus,df['Liked'],test_size=0.25,random_state=42)

### Model Build

In [14]:
from sklearn.metrics import accuracy_score, classification_report

## Logistic

In [15]:
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression(solver='lbfgs')
clf.fit(X_train,y_train)

LogisticRegression()

In [16]:
y_pred = clf.predict(X_test)
accuracy_score(y_test,y_pred)

0.82

In [17]:
cr = classification_report(y_test, y_pred)
print(cr)

              precision    recall  f1-score   support

           0       0.80      0.86      0.83       128
           1       0.84      0.78      0.81       122

    accuracy                           0.82       250
   macro avg       0.82      0.82      0.82       250
weighted avg       0.82      0.82      0.82       250



In [18]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, y_pred)

array([[110,  18],
       [ 27,  95]], dtype=int64)

### Naive Bayes

In [19]:
from sklearn.naive_bayes import MultinomialNB
classifier = MultinomialNB(alpha=0.2)
classifier.fit(X_train,y_train)

MultinomialNB(alpha=0.2)

In [20]:
y_pred = classifier.predict(X_test)
accuracy_score(y_test,y_pred)

0.808

In [21]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, y_pred)

array([[103,  25],
       [ 23,  99]], dtype=int64)

## SVC

In [22]:
from sklearn.svm import SVC
svc = SVC()
svc.fit(X_train,y_train)

SVC()

In [23]:
y_pred = svc.predict(X_test)
accuracy_score(y_test,y_pred)

0.82

In [24]:
cr = classification_report(y_test, y_pred)
print(cr)

              precision    recall  f1-score   support

           0       0.79      0.88      0.83       128
           1       0.85      0.76      0.81       122

    accuracy                           0.82       250
   macro avg       0.82      0.82      0.82       250
weighted avg       0.82      0.82      0.82       250



In [25]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, y_pred)

array([[112,  16],
       [ 29,  93]], dtype=int64)

## RandomForest

In [27]:
# params = random_search.best_params_
from sklearn.ensemble import RandomForestClassifier
rfr = RandomForestClassifier().fit(X_train,y_train)

In [28]:
from sklearn.model_selection import RandomizedSearchCV
params={
    "n_estimators":[200,300,400,500,600],
"max_depth" : [50,100,150,200],
"min_samples_split" : [1,2,3,4,5,6,7,8,9],
"min_samples_leaf" : [1,2,3,4,5,6,7,8,9] }

random_search = RandomizedSearchCV (estimator=rfr,param_distributions=params,n_iter
=5,scoring='roc_auc',n_jobs=-1,cv=5,verbose=3,random_state=45)
random_search.fit(corpus,df['Liked'])

print('Best roc_auc: {:.4}, with best C: {}'.format(random_search.best_score_,random_search.best_params_))

Fitting 5 folds for each of 5 candidates, totalling 25 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 out of  25 | elapsed:   33.9s finished


Best roc_auc: 0.8627, with best C: {'n_estimators': 600, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_depth': 100}


In [33]:
y_pred = random_search.predict(X_test)
accuracy_score(y_test,y_pred)

1.0

In [34]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, y_pred)

array([[128,   0],
       [  0, 122]], dtype=int64)

In [35]:
cr = classification_report(y_test, y_pred)
print(cr)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00       128
           1       1.00      1.00      1.00       122

    accuracy                           1.00       250
   macro avg       1.00      1.00      1.00       250
weighted avg       1.00      1.00      1.00       250



In [36]:
## use random_search or classifier for deployment

### User

In [37]:
strr = input("Enter a Message: ")
print("-------------------------------")
examples = strr

#preprocess
a = re.sub('[^a-zA-Z]',' ',examples)
a = a.lower()
a = a.split()
a = [lemmatizer.lemmatize(word) for word in a ]
a = ' '.join(a)  
print(a)
print("--------------------------------")
#apply
example_counts = vectorizer.transform([a])
prediction =classifier.predict(example_counts)
prediction[0]

if prediction[0]==0:
    print("This is Negative Review")
elif prediction[0]==1:
    print("This is Positive Review")

Enter a Message: Pratik Gandhi is simply the best... The famous dialogue "Risk hai to Ishq hai" keeps going on and on in your mind even after the series is over.  The BGM is outstanding goes well with the series, Great job Achint!  After a long time came across a clean series with a good story...
-------------------------------
pratik gandhi is simply the best the famous dialogue risk hai to ishq hai keep going on and on in your mind even after the series is over the bgm is outstanding go well with the series great job achint after a long time came across a clean series with a good story
--------------------------------
This is Positive Review
