In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

In [3]:
data = pd.read_csv("./Sentiment_analysis/Reviews.csv")
data.head()

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


In [4]:
data.Liked.unique()

array([1, 0])

In [5]:
data.shape

(1000, 2)

In [6]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Review  1000 non-null   object
 1   Liked   1000 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 15.8+ KB


In [7]:
data.Liked.value_counts()

Liked
1    500
0    500
Name: count, dtype: int64

In [8]:
corpus=[]
stemmer = PorterStemmer()
for sen in data.Review:
    sen = sen.lower()
    sen = re.sub(r'[^A-Za-z0-9]'," ",sen)
    sen = sen.split()
    sen = [stemmer.stem(word) for word in sen if word not in stopwords.words("english")]
    sen = " ".join(sen)
    corpus.append(sen)

In [9]:
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
train = TfidfVectorizer().fit_transform(corpus).toarray()


In [10]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(train,data.Liked,test_size=0.2,random_state=42)


In [11]:
from sklearn.naive_bayes import GaussianNB
model = GaussianNB()
model.fit(x_train,y_train)

In [12]:
from sklearn.metrics import accuracy_score as ac
ac(y_test,model.predict(x_test))

0.68

In [24]:
import xgboost as xgb
model = xgb.XGBClassifier(n_estimators=50)
model.fit(x_train,y_train)
ac(y_test,model.predict(x_test))

0.705

In [40]:
from sklearn.svm import SVC
model = SVC(kernel="rbf",C=2)
model.fit(x_train,y_train)
ac(y_test,model.predict(x_test))


0.76

In [37]:
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(n_estimators=80)
model.fit(x_train,y_train)
ac(y_test,model.predict(x_test))


0.715

In [41]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()
model.fit(x_train,y_train)
ac(y_test,model.predict(x_test))

0.73

In [107]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense,Dropout,LSTM,Embedding,Bidirectional,GRU
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping




In [46]:
max([len(i.split()) for i in data.Review])

32

In [90]:
max_len=30
tokenizer=Tokenizer(num_words=500,oov_token="<OOV>")

In [91]:
x_train,x_test,y_train,y_test = train_test_split(data.Review,data.Liked,test_size=0.2,random_state=42)

In [92]:
tokenizer.fit_on_texts(x_train)
train_seq = tokenizer.texts_to_sequences(x_train)
print(train_seq[0])
train_seq = pad_sequences(train_seq,maxlen=max_len)

[2, 138, 5, 2, 305, 369]


In [93]:
data.Review[0]

'Wow... Loved this place.'

In [94]:
train_seq[0]

array([  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   2, 138,
         5,   2, 305, 369], dtype=int32)

In [95]:
test_seq = tokenizer.texts_to_sequences(x_test)
test_seq = pad_sequences(test_seq,maxlen=max_len)


In [96]:
train_seq.shape


(800, 30)

In [97]:
test_seq.shape

(200, 30)

In [128]:
model = Sequential()
model.add(Embedding(500,32))
model.add(LSTM(128,activation="relu",input_shape=(30,)))
model.add(Dense(64,activation="relu"))
# model.add(Dropout(0.2))
model.add(Dense(1,activation="sigmoid"))
model.compile(optimizer="adam",loss="binary_crossentropy",metrics=["accuracy"])

  super().__init__(**kwargs)


In [139]:
num_epochs = 15

history = model.fit(train_seq,y_train,epochs=num_epochs,batch_size=20,validation_data=(test_seq,y_test),verbose=1)


Epoch 1/15
[1m 1/40[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m1s[0m 31ms/step - accuracy: 1.0000 - loss: 5.3760e-04

[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 17ms/step - accuracy: 0.9991 - loss: 0.0072 - val_accuracy: 0.7250 - val_loss: 331.1568
Epoch 2/15
[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 15ms/step - accuracy: 0.9980 - loss: 0.0059 - val_accuracy: 0.7350 - val_loss: 264.4519
Epoch 3/15
[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 15ms/step - accuracy: 0.9980 - loss: 0.0092 - val_accuracy: 0.7300 - val_loss: 311.3526
Epoch 4/15
[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 15ms/step - accuracy: 0.9984 - loss: 0.0056 - val_accuracy: 0.7300 - val_loss: 350.0914
Epoch 5/15
[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 17ms/step - accuracy: 0.9983 - loss: 0.0068 - val_accuracy: 0.7250 - val_loss: 371.6112
Epoch 6/15
[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 16ms/step - accuracy: 0.9983 - loss: 0.0076 - val_accuracy: 0.7300 - val_loss: 400.3835
Epoch 7/15
[1m40/40[0m [32m━━━

In [140]:
ac(y_test,[1 if i>0.5 else 0 for i in model.predict(test_seq) ])

[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step 


0.72