In [1]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
stop = stopwords.words('english')
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten, LSTM, Conv1D, MaxPooling1D, Dropout, Activation
from keras.layers.embeddings import Embedding

In [2]:
### Load the dataset

In [3]:
df = pd.read_csv("IMDB Dataset.csv")

In [4]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [5]:
## To remove tags associated with it
def remove_tags(string):
    result = re.sub('<.*?>','',string)
    return result



In [6]:
## Removing the tags
df['review']=df['review'].apply(lambda cw : remove_tags(cw))

In [7]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. The filming tec...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [8]:
#df2.head()

In [9]:
## Only retaning text
def remove_tags_1(string):
    result = re.sub('[^a-z\sA-Z]+','',string)
    return result

In [10]:
## Only retaning and text and removing other things
df['review']=df['review'].apply(lambda cw : remove_tags_1(cw))

In [11]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production The filming tech...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically theres a family where a little boy J...,negative
4,Petter Matteis Love in the Time of Money is a ...,positive


In [12]:
len(df)

50000

In [13]:
## removing stop words
df['review'] = df['review'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))

In [14]:
df.head()

Unnamed: 0,review,sentiment
0,One reviewers mentioned watching Oz episode yo...,positive
1,A wonderful little production The filming tech...,positive
2,I thought wonderful way spend time hot summer ...,positive
3,Basically theres family little boy Jake thinks...,negative
4,Petter Matteis Love Time Money visually stunni...,positive


In [15]:
## Coverting the data into lower
df['review'] = df["review"].map(lambda x: x.lower())

In [16]:
## Label encoding the sentiments
from sklearn import preprocessing
label_encoder = preprocessing.LabelEncoder() 
df['sentiment']= label_encoder.fit_transform(df['sentiment'])

In [17]:
## Splitting the data into train and test
df_train = df[:40000]
df_test = df[40000:]

In [18]:
vocabulary_size = 20000
tokenizer = Tokenizer(num_words= vocabulary_size)
tokenizer.fit_on_texts(df_train['review'])
sequences = tokenizer.texts_to_sequences(df_train['review'])
data = pad_sequences(sequences, maxlen=50)

In [19]:
data

array([[1481,    1,  195, ..., 1088, 3991,  419],
       [1651, 7064, 6418, ..., 1835,   17,  136],
       [4977,    1,  109, ...,   61,   14,  245],
       ...,
       [  64,  514, 5024, ...,   47,   56,   49],
       [4274,   73,  611, ...,  201,   11,   75],
       [1755,   46,  189, ...,   24,  353,   11]], dtype=int32)

In [20]:
df.head()

Unnamed: 0,review,sentiment
0,one reviewers mentioned watching oz episode yo...,1
1,a wonderful little production the filming tech...,1
2,i thought wonderful way spend time hot summer ...,1
3,basically theres family little boy jake thinks...,0
4,petter matteis love time money visually stunni...,1


In [21]:
## Network architecture
model = Sequential()
model.add(Embedding(20000, 100, input_length=50))
model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
## Fit the model
model.fit(data,df_train['sentiment'], validation_split=0.2, epochs=2)

Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7fa839d30c50>

In [22]:
txt = ["The movie was very good and watch the movie","The movie was bad"]
seq = tokenizer.texts_to_sequences(txt)
padded = pad_sequences(seq, maxlen=50)
pred = model.predict(padded)
labels = ['sport', 'bussiness', 'politics', 'tech', 'entertainment'] 

print(pred)

[[0.82287955]
 [0.22799715]]


In [30]:
from sklearn.metrics import classification_report

In [31]:
seq = tokenizer.texts_to_sequences(df_test['review'])
padded = pad_sequences(seq, maxlen=50)
pred = model.predict(padded)

In [37]:
test_pred = [1 if num>0.5  else 0 for num in pred]

In [38]:
print(classification_report(df_test['sentiment'],test_pred))

              precision    recall  f1-score   support

           0       0.86      0.81      0.84      4993
           1       0.82      0.87      0.85      5007

    accuracy                           0.84     10000
   macro avg       0.84      0.84      0.84     10000
weighted avg       0.84      0.84      0.84     10000



In [39]:
## A decent performance of model

In [24]:
from tensorflow.keras.models import save_model

In [25]:
## saving the model
save_model(model,"model_text_1.hdf5",)

In [26]:
import pickle

In [27]:
## saving the tokeniser
with open('tokenizer1.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)