In [1]:
import sys, os, re, csv, codecs, numpy as np, pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation
from keras.layers import Bidirectional, GlobalMaxPool1D
from keras.models import Model,Sequential
from keras import initializers, regularizers, constraints, optimizers, layers

In [None]:
import requests,io
from zipfile import ZipFile
# Download the zipped review corpus and unpack it under /content/Imdb-dataset.
r = requests.get(
    'https://github.com/pulkitt15/imdb-dataset/blob/main/imdb.zip?raw=true',
    timeout=60,  # do not let a stalled connection hang the kernel forever
)
# Fail loudly on an HTTP error. Without this check an error page would be
# handed to ZipFile and surface as a confusing BadZipFile exception.
r.raise_for_status()

# The archive is unpacked straight from memory; nothing is written to disk
# except the extracted files.
with ZipFile(io.BytesIO(r.content), 'r') as zip_ref:
    zip_ref.extractall('/content/Imdb-dataset')

In [None]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from bs4 import BeautifulSoup
from nltk.tokenize.toktok import ToktokTokenizer
import re
import os

# Download the NLTK stop-word corpus needed for stop-word filtering.
nltk.download('stopwords')

# Module-level Toktok word tokenizer instance.
tokenizer = ToktokTokenizer()

# Helpers for review_to_words, built once instead of on every call.
# The tokenizer deliberately has its own private name: a later cell rebinds the
# global `tokenizer` to a Keras Tokenizer, which has no .tokenize() method, so
# relying on the global would break any re-run of the cleaning step.
_WORD_TOKENIZER = ToktokTokenizer()
_STEMMER = PorterStemmer()
_STOP_WORDS = frozenset(stopwords.words('english'))


def review_to_words(text):
    """Clean one raw review and return it as a space-separated token string.

    Steps, in order:
      1. Strip HTML markup with BeautifulSoup.
      2. Drop anything enclosed in square brackets.
      3. Replace every non-ASCII-letter character with a space.
      4. Lower-case and tokenize.
      5. Remove English stop words.
      6. Porter-stem the remaining tokens.

    Stop words are removed *before* stemming. Filtering after stemming
    compares stems against an unstemmed stop-word list, so stop words whose
    stem differs from the listed form are never removed.

    Args:
        text: Raw review text, possibly containing HTML.

    Returns:
        The cleaned, stemmed tokens joined by single spaces (an empty string
        when nothing survives the filtering).
    """
    text = BeautifulSoup(text, "html.parser").get_text()
    # Raw strings avoid the invalid-escape warning that '\[' triggers in a
    # normal string literal.
    text = re.sub(r'\[[^]]*\]', '', text)
    text = re.sub(r"[^a-zA-Z]", " ", text)

    tokens = (token.strip() for token in _WORD_TOKENIZER.tokenize(text.lower()))
    kept_tokens = [token for token in tokens if token and token not in _STOP_WORDS]
    return ' '.join(_STEMMER.stem(token) for token in kept_tokens)


def _rating_from_filename(filename):
    """Return the integer rating encoded in a review file name.

    Reads all digits after the last underscore of the name without its
    extension, e.g. ``123_10.txt`` -> 10. The ``<id>_<rating>.txt`` layout is
    presumed from how the ratings are used later (thresholded at 7) --
    TODO confirm against the extracted archive.

    Names without such a suffix fall back to the previous behaviour of reading
    the single character at position -5.
    """
    stem = os.path.splitext(filename)[0]
    match = re.search(r'_(\d+)$', stem)
    if match:
        return int(match.group(1))
    return int(filename[-5])


def _read_labelled_reviews(directory):
    """Read every regular file directly inside `directory`.

    Returns a (texts, ratings) pair of parallel lists, in sorted file-name order.
    """
    texts, ratings = [], []
    # Sorted so the sample order (and therefore any seeded split performed on
    # it) does not depend on the file system's listing order.
    for filename in sorted(os.listdir(directory)):
        path = os.path.join(directory, filename)
        if not os.path.isfile(path):
            continue
        # Explicit encoding: the platform default is not guaranteed to be UTF-8.
        with open(path, 'r', encoding='utf-8') as f:
            texts.append(f.read())
        ratings.append(_rating_from_filename(filename))
    return texts, ratings


def get_data(data_dir='/content/Imdb-dataset/imdb'):
    """Load the review texts and their ratings from the `pos` and `neg` folders.

    Args:
        data_dir: Directory that contains the `pos` and `neg` sub-directories.
            Defaults to the location the download cell extracts to.

    Returns:
        (x_train, y_train): the review texts and the integer ratings parsed
        from the file names, `pos` files first, then `neg` files.

    Raises:
        FileNotFoundError: if either sub-directory is missing.
        ValueError / IndexError: if a rating cannot be parsed from a file name.
    """
    x_train, y_train = [], []
    for label_dir in ('pos', 'neg'):
        texts, ratings = _read_labelled_reviews(os.path.join(data_dir, label_dir))
        x_train.extend(texts)
        y_train.extend(ratings)
    return x_train, y_train

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Load the raw review texts together with the values get_data returns as
# labels, then run every review through the text-cleaning function.
reviews, y = get_data()
X = [review_to_words(review) for review in reviews]

In [None]:
# Vocabulary cap passed to the Keras tokenizer as num_words.
max_features = 10000

# NOTE: this rebinds the global `tokenizer`, which an earlier cell created as
# a ToktokTokenizer.
tokenizer = Tokenizer(oov_token="<OOV>", num_words=max_features)
tokenizer.fit_on_texts(X)

# Convert each cleaned review into a list of integer word indices.
list_tokenized_train = tokenizer.texts_to_sequences(X)

In [None]:
# Fixed length every index sequence is brought to.
maxlen = 700

# 'post' puts the padding at the end of each sequence.
X_t = pad_sequences(
    list_tokenized_train,
    padding='post',
    maxlen=maxlen,
)

In [None]:
# Binarise the labels: values of 7 or more become 1, everything else 0.
# NOTE: `y` is overwritten below, so running this cell a second time would
# turn every label into 0 (neither 0 nor 1 reaches the threshold).
z = [int(t >= 7) for t in y]

y = z

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the padded sequences as a test set; the fixed random_state
# makes the split repeatable for a given input order.
(X_train_input, X_test_input,
 y_train, y_test) = train_test_split(
    X_t,
    y,
    test_size=0.2,
    random_state=2020,
)

In [None]:
# Materialise both splits (features and labels) as NumPy arrays.
X_train, y_train = np.array(X_train_input), np.array(y_train)
X_test, y_test = np.array(X_test_input), np.array(y_test)

In [5]:
# Sequence length the model expects; also used as the Embedding input_length.
max_review_length = 700

# Both splits descend from X_t, which an earlier cell already padded to 700
# columns, so this pass only re-asserts that length.
X_train, X_test = (
    pad_sequences(split, maxlen=max_review_length)
    for split in (X_train, X_test)
)

In [6]:
# Size of each learned word vector.
embedding_vector_length = 32

# Embedding -> dropout -> bidirectional LSTM -> dense head with a single
# sigmoid output for the binary label.
model = Sequential([
    Embedding(
        input_dim=max_features,
        output_dim=embedding_vector_length,
        input_length=max_review_length,
    ),
    Dropout(0.5),
    Bidirectional(LSTM(64)),
    Dense(units=256, activation='relu'),
    Dropout(0.4),
    Dense(units=1, activation='sigmoid'),
])
model.summary()

opt = optimizers.Adam(learning_rate=0.003)
model.compile(
    optimizer=opt,
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 700, 32)           128000    
_________________________________________________________________
dropout (Dropout)            (None, 700, 32)           0         
_________________________________________________________________
bidirectional (Bidirectional (None, 128)               49664     
_________________________________________________________________
dense (Dense)                (None, 256)               33024     
_________________________________________________________________
dropout_1 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 257       
Total params: 210,945
Trainable params: 210,945
Non-trainable params: 0
__________________________________________________

In [7]:
# Train for 10 epochs in mini-batches of 32, with 20% of the training arrays
# held out for validation; verbose=2 prints one line per epoch.
train_history = model.fit(
    x=X_train,
    y=y_train,
    epochs=10,
    batch_size=32,
    validation_split=0.2,
    verbose=2,
)

Epoch 1/10
469/469 - 25s - loss: 0.5126 - accuracy: 0.7411 - val_loss: 0.5961 - val_accuracy: 0.7195
Epoch 2/10
469/469 - 24s - loss: 0.5027 - accuracy: 0.7574 - val_loss: 0.4802 - val_accuracy: 0.8075
Epoch 3/10
469/469 - 24s - loss: 0.3545 - accuracy: 0.8566 - val_loss: 0.3366 - val_accuracy: 0.8555
Epoch 4/10
469/469 - 24s - loss: 0.2491 - accuracy: 0.9033 - val_loss: 0.3026 - val_accuracy: 0.8696
Epoch 5/10
469/469 - 24s - loss: 0.2109 - accuracy: 0.9179 - val_loss: 0.3487 - val_accuracy: 0.8683
Epoch 6/10
469/469 - 24s - loss: 0.1857 - accuracy: 0.9261 - val_loss: 0.3389 - val_accuracy: 0.8712
Epoch 7/10
469/469 - 24s - loss: 0.1593 - accuracy: 0.9384 - val_loss: 0.3324 - val_accuracy: 0.8792
Epoch 8/10
469/469 - 24s - loss: 0.1420 - accuracy: 0.9462 - val_loss: 0.3439 - val_accuracy: 0.8773
Epoch 9/10
469/469 - 24s - loss: 0.1301 - accuracy: 0.9500 - val_loss: 0.3339 - val_accuracy: 0.8792
Epoch 10/10
469/469 - 24s - loss: 0.1167 - accuracy: 0.9572 - val_loss: 0.4709 - val_accura

In [8]:
# Score the trained model on the held-out test split; `scores` keeps whatever
# model.evaluate returns for the compiled loss and metrics.
scores = model.evaluate(x=X_test, y=y_test, verbose=1)


