In [1]:
import pandas as pd

In [2]:
Data = pd.read_csv(r"C:/Users/User/Restaurant_Reviews.tsv",delimiter = '\t')

In [3]:
Data.head()

Unnamed: 0,Review,Sentiment
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


In [4]:
len(Data)

1000

In [5]:
Data.shape

(1000, 2)

# pre-processing steps include reomoval of stopwords, punctation removal, stemming


In [6]:
import re
import nltk
nltk.download("stopwords")
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [7]:
Corpus = []
for i in range(0,1000):
    review = re.sub('[^a-zA-Z]', ' ', Data['Review'][i])
    review = review.lower()
    review = review.split()
    ps = PorterStemmer()
    review = [ps.stem(word) for word in review if not word in set(stopwords.words("english"))]
    review = " ".join(review)
    Corpus.append(review)

In [8]:
Y = Data.iloc[:,1].values

# Data is divided into Training set, Validation set and Testing set

In [9]:
from sklearn.model_selection import train_test_split
X_train, X_temp, y_train, y_temp = train_test_split(Corpus, Y, test_size=0.30, random_state=None)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, stratify = y_temp, test_size=0.5, random_state=None)

# Vectorization
Machine learning model understands only numerical data,
to convert word to vector i used TFIDF

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vt = TfidfVectorizer(lowercase=False,vocabulary=None,tokenizer=None)
X_vt = tfidf_vt.fit_transform(X_train)
X_vt_1 = tfidf_vt.transform(X_test)
X_vt_2 = tfidf_vt.transform(X_val)

In [11]:
from sklearn import svm

lin = svm.SVC()
lin.fit(X_vt, y_train)
y= lin.predict(X_vt_1)
from sklearn.metrics import confusion_matrix, classification_report
print(confusion_matrix(y_test, y))
print('\n')
print(classification_report(y_test, y))



[[74  0]
 [76  0]]


              precision    recall  f1-score   support

           0       0.49      1.00      0.66        74
           1       0.00      0.00      0.00        76

    accuracy                           0.49       150
   macro avg       0.25      0.50      0.33       150
weighted avg       0.24      0.49      0.33       150



  'precision', 'predicted', average, warn_for)


# Model hyperparameter tuning

In [12]:
from sklearn.model_selection import GridSearchCV
from sklearn import svm
#create new a knn model
lin_clf2 = svm.SVC()

#create a dictionary of all values we want to test for n_neighbors
tuned_parameters = [{'kernel': ['rbf'], 'gamma': [1e-2, 1e-3, 1e-4, 1e-5],
                     'C': [0.001, 0.10, 0.1, 10, 25, 50, 100, 1000]},
                    {'kernel': ['sigmoid'], 'gamma': [1e-2, 1e-3, 1e-4, 1e-5],
                     'C': [0.001, 0.10, 0.1, 10, 25, 50, 100, 1000]},
                    {'kernel': ['linear'], 'C': [0.001, 0.10, 0.1, 10, 25, 50, 100, 1000]}
                   ]

scores = ['precision', 'recall']

for score in scores:
    print("# Tuning hyper-parameters for %s" % score)
    print()

#use gridsearch to test all values for n_neighbors
svm_gscv = GridSearchCV(lin_clf2, tuned_parameters, cv=5)

#fit model to data
svm_gscv.fit(X_vt, y_train)

# Tuning hyper-parameters for precision

# Tuning hyper-parameters for recall



GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='auto_deprecated', kernel='rbf', max_iter=-1,
                           probability=False, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid='warn', n_jobs=None,
             param_grid=[{'C': [0.001, 0.1, 0.1, 10, 25, 50, 100, 1000],
                          'gamma': [0.01, 0.001, 0.0001, 1e-05],
                          'kernel': ['rbf']},
                         {'C': [0.001, 0.1, 0.1, 10, 25, 50, 100, 1000],
                          'gamma': [0.01, 0.001, 0.0001, 1e-05],
                          'kernel': ['sigmoid']},
                         {'C': [0.001, 0.1, 0.1, 10, 25, 50, 100, 1000],
                          'kernel': ['linear']}],
             pre_dispatch='2*n_jobs', refit=True, ret

In [13]:
svm_gscv.best_params_


{'C': 100, 'gamma': 0.01, 'kernel': 'sigmoid'}

In [14]:
svm_gscv.best_score_


0.7471428571428571

In [15]:
import pickle
pickle_file = 'Resturant.pkl'
with open(pickle_file,'wb')  as file:
    pickle.dump(svm_gscv,file)

In [16]:
import pickle
pickle_file = 'Resturant_t.pkl'
with open(pickle_file,'wb')  as file:
    pickle.dump(tfidf_vt,file)

#Fit the best parameters and test on test data 

In [17]:
User = input("Enter a Review")
User =  User.split(",")
user = tfidf_vt.transform(User)
o = svm_gscv.predict(user)
if o==1:
    print("Recomended")
else:
    print("Not recomended")


Enter a Reviewthe food is worest
Not recomended
