In [1]:
import pandas as pd
import torch
import torch.nn as nn
import numpy as np
import matplotlib.pyplot as plt
from torch.utils.data import DataLoader
import os
import random
import pickle

In [2]:
device = torch.device('cuda') # torch.device('cpu')
device

device(type='cuda')

In [3]:
rng = np.random.default_rng(73512)

In [4]:
torch.cuda.set_device(0)
device = torch.device("cuda")

def set_all_seeds(seed):
    os.environ["PL_GLOBAL_SEED"] = str(seed)
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

set_all_seeds(42)

In [5]:
train_data = pd.read_csv("train_data.csv")
train_data.head(10)

Unnamed: 0,review,rating
0,location not palace excellent hotel booke dthe...,4
1,respite definitely not place stay looking ultr...,3
2,stunning truly memorable spot right beach nusa...,4
3,solid business hotel near embassy stayed hotel...,3
4,nice place make sure lock money warning money ...,3
5,good hotel crowded kids weekend review like go...,3
6,good hotel overrated arrived hotel check-in ti...,2
7,"okay not fantastic, checking price upscale hot...",1
8,resort beautiful thats ends just returned 5 da...,1
9,good choice solo traveller recently stayed 4 n...,3


In [6]:
test_data = pd.read_csv("test_data.csv")
test_data.head(20)

Unnamed: 0,"great hotel location stayed 4 nts 24th 28th jan celebrating daughter 21st birthday.staff helpful getting balloons cake ordered occassion gave daughter chocolates vouchers free drink cellar bar.they recommended phillepe chow celebration meal lovely pricey special occasion booking recommended busy friday night.hotel rooms appointed turn service night molton brown products bathroom huge windows bathroom bit draughty needed towel having bath.we 18th 22nd floors no trouble noise traffic nightclub.location good minutes walk times square main shopping areas.got good deli right corner hotel called pax good downside hotel no tea/coffee facilities rooms available hotel room service expensive just tea coffee 20,"
0,"n't return overall disappointed hotel, no hot ..."
1,great value location desired problem hotel loc...
2,kind helpfull people people kind helpful.we no...
3,absolutely fabulous melia comfortable star hot...
4,"trip hell thoughts gotten, trip airport van no..."
5,lost madrid wife 10 month old recently stayed ...
6,exceptional value money thoroughly recommend h...
7,no breakfast just returned home san francisco ...
8,"great place friend stayed end april 5 nights, ..."
9,pleasant change hotels new orleans suite great...


In [14]:
train_data.shape[0]

16392

In [18]:
import re
import nltk
from tqdm.auto import tqdm
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
corpus = []
for i in tqdm(range(0, train_data.shape[0])):
    review = re.sub("[^a-zA-z]", ' ', train_data["review"][i])
    review = review.lower()
    review = review.split()
    ps = PorterStemmer()
    review = [ps.stem(word) for word in review if not word in set(stopwords.words('english'))]
    review = " ".join(review)
    corpus.append(review)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\PC\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


  0%|          | 0/16392 [00:00<?, ?it/s]

In [19]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features = 1500)
X = cv.fit_transform(corpus).toarray()
y = train_data["rating"].values

In [45]:
from sklearn.feature_extraction.text import TfidfVectorizer
cv = TfidfVectorizer()
X = cv.fit_transform(corpus).toarray()
y = train_data["rating"].values

In [46]:
X.shape

(16392, 32471)

In [47]:
y.shape

(16392,)

In [48]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

# Making predictions based on Naive Bayes

In [23]:
from sklearn.naive_bayes import GaussianNB
classifier = GaussianNB()
classifier.fit(X_train, y_train)
#Predicting
y_pred = classifier.predict(X_test)

In [24]:
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)

In [25]:
cm

array([[ 149,   44,   14,    2,    6],
       [ 137,   70,   53,   14,   27],
       [  99,   57,   67,   44,   81],
       [ 101,   59,  101,  183,  517],
       [ 104,   48,   49,  171, 1082]], dtype=int64)

In [27]:
# import matplotlib.pyplot as plt
# from sklearn.metrics import plot_confusion_matrix
# plot_confusion_matrix(classifier, X_test, y_test, cmap=plt.cm.Blues)
# plt.show()

ImportError: cannot import name 'plot_confusion_matrix' from 'sklearn.metrics' (E:\Python VirtualEnvs\ssne\lib\site-packages\sklearn\metrics\__init__.py)

In [28]:
from sklearn.metrics import classification_report

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.25      0.69      0.37       215
           1       0.25      0.23      0.24       301
           2       0.24      0.19      0.21       348
           3       0.44      0.19      0.27       961
           4       0.63      0.74      0.68      1454

    accuracy                           0.47      3279
   macro avg       0.36      0.41      0.35      3279
weighted avg       0.47      0.47      0.45      3279



# Making predictions using Logistic Regression

In [32]:
from sklearn.ensemble import AdaBoostClassifier
lr = AdaBoostClassifier()
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)

In [33]:
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.51      0.70      0.59       215
           1       0.36      0.25      0.29       301
           2       0.36      0.14      0.20       348
           3       0.46      0.45      0.46       961
           4       0.66      0.78      0.72      1454

    accuracy                           0.56      3279
   macro avg       0.47      0.46      0.45      3279
weighted avg       0.53      0.56      0.54      3279



In [40]:
from sklearn.ensemble import RandomForestClassifier
lr = RandomForestClassifier()
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)

In [41]:
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.63      0.59      0.61       215
           1       0.35      0.06      0.10       301
           2       0.53      0.05      0.09       348
           3       0.42      0.41      0.41       961
           4       0.61      0.88      0.72      1454

    accuracy                           0.56      3279
   macro avg       0.51      0.40      0.39      3279
weighted avg       0.53      0.56      0.50      3279



In [49]:
from sklearn.svm import SVC
lr = SVC()
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)

In [50]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.73      0.66      0.69       215
           1       0.54      0.45      0.49       301
           2       0.55      0.19      0.28       348
           3       0.51      0.54      0.53       961
           4       0.72      0.84      0.77      1454

    accuracy                           0.63      3279
   macro avg       0.61      0.53      0.55      3279
weighted avg       0.62      0.63      0.62      3279

