In [None]:
import sys, os, re, csv, codecs, numpy as np, pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation
from keras.layers import Bidirectional, GlobalMaxPool1D
from keras.datasets import imdb
from keras.models import Model,Sequential
from keras import initializers, regularizers, constraints, optimizers, layers

In [None]:
import requests,io
from zipfile import ZipFile
# Download the zipped review dataset and unpack it under /content/Imdb-dataset.
r = requests.get(
    'https://github.com/pulkitt15/imdb-dataset/blob/main/imdb.zip?raw=true',
    timeout=60,  # do not hang the notebook forever on a stalled connection
)
# Fail loudly on an HTTP error instead of handing an error page to ZipFile,
# which would otherwise surface as a confusing BadZipFile exception.
r.raise_for_status()

with ZipFile(io.BytesIO(r.content), 'r') as zip_ref:
    zip_ref.extractall('/content/Imdb-dataset')

In [None]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from bs4 import BeautifulSoup
from nltk.tokenize.toktok import ToktokTokenizer
import re
import os

# Fetch the NLTK stopword corpus that stopwords.words('english') reads.
nltk.download('stopwords')
# Toktok word tokenizer instance. NOTE: the name `tokenizer` is reassigned to a
# Keras Tokenizer in a later cell of this notebook.
tokenizer=ToktokTokenizer()

def review_to_words(text):
    """Normalize one raw review into a space-separated string of stemmed tokens.

    Steps: strip HTML markup, drop square-bracketed spans, replace every
    non-letter with a space, lower-case, tokenize, remove English stopwords,
    then Porter-stem the remaining tokens.

    Args:
        text: Raw review text, possibly containing HTML markup.

    Returns:
        The cleaned, stemmed tokens joined by single spaces.
    """
    text = BeautifulSoup(text, "html.parser").get_text()
    # Raw strings: the backslashes belong to the regex, not to Python string escapes.
    text = re.sub(r'\[[^]]*\]', '', text)
    text = re.sub(r"[^a-zA-Z]", " ", text)

    # Use a local tokenizer rather than the module-level name `tokenizer`, which a
    # later cell rebinds to a Keras Tokenizer (that object has no .tokenize()).
    word_tokenizer = ToktokTokenizer()
    tokens = [token.strip() for token in word_tokenizer.tokenize(text.lower())]

    # Filter stopwords BEFORE stemming: the stopword list holds unstemmed words,
    # so stemming first lets e.g. "this" -> "thi" or "was" -> "wa" slip through.
    stop = set(stopwords.words('english'))
    kept_tokens = [token for token in tokens if token not in stop]

    ps = PorterStemmer()
    return ' '.join(ps.stem(word) for word in kept_tokens)


def _rating_from_filename(filename):
    """Return the integer rating encoded at the end of a review filename."""
    stem = os.path.splitext(filename)[0]
    _, sep, tail = stem.rpartition('_')
    if sep and tail.isdecimal():
        # "<id>_<rating>.<ext>": take the whole number so "12_10.txt" is 10, not 0.
        return int(tail)
    # No "<id>_<rating>" pattern: keep the historical rule (the single character
    # that precedes a four-character extension such as ".txt").
    return int(filename[-5])


def _read_reviews(directory):
    """Read every regular file directly inside `directory`, in sorted name order."""
    texts, ratings = [], []
    # sorted() makes the sample order (and any later seeded split) reproducible.
    for filename in sorted(os.listdir(directory)):
        path = os.path.join(directory, filename)
        if not os.path.isfile(path):
            continue
        with open(path, 'r', encoding='utf-8') as f:
            texts.append(f.read())
        ratings.append(_rating_from_filename(filename))
    return texts, ratings


def get_data(data_dir='/content/Imdb-dataset/imdb'):
    """Load the raw reviews and their ratings from the `pos` and `neg` folders.

    Args:
        data_dir: Directory that contains the `pos` and `neg` sub-folders.
            Defaults to the location the dataset is extracted to in this notebook.

    Returns:
        A pair (x_train, y_train): the review texts and the integer rating parsed
        from each filename, with all `pos` reviews first, then all `neg` reviews.

    Raises:
        FileNotFoundError: if `pos` or `neg` does not exist under `data_dir`.
        ValueError: if a filename does not end in a parsable rating.
    """
    x_train = []
    y_train = []
    for label_dir in ('pos', 'neg'):
        texts, ratings = _read_reviews(os.path.join(data_dir, label_dir))
        x_train.extend(texts)
        y_train.extend(ratings)
    return x_train, y_train

In [None]:
# Load the raw reviews with their ratings, then clean every review text.
reviews, y = get_data()
X = [review_to_words(review) for review in reviews]

In [None]:
# Vocabulary cap handed to the Keras Tokenizer as num_words.
max_features = 10000
# NOTE: `tokenizer` was bound to a ToktokTokenizer in an earlier cell; from here
# on the name refers to this Keras Tokenizer. Words outside the kept vocabulary
# are encoded with the "<OOV>" token.
tokenizer = Tokenizer(num_words=max_features, oov_token="<OOV>")
# Learn the vocabulary from the cleaned reviews ...
tokenizer.fit_on_texts(X)
# ... and turn every review into a list of integer token ids.
list_tokenized_train = tokenizer.texts_to_sequences(X)

In [None]:
# Fixed sequence length used for every review.
maxlen = 700
# Bring every id sequence to exactly maxlen entries; padding='post' appends the
# padding at the end of shorter sequences.
X_t = pad_sequences(list_tokenized_train, maxlen=maxlen, padding = 'post')

In [None]:
# Binarize the ratings: 7 and above is positive (1), everything else negative (0).
# The result gets its own name instead of overwriting `y`, so re-running this cell
# cannot re-threshold already-binary labels (which would turn every label into 0).
labels = np.array([1 if rating >= 7 else 0 for rating in y])

from sklearn.model_selection import train_test_split

# Hold out 20% of the samples for testing; the fixed random_state keeps the
# split reproducible.
X_train_input, X_test_input, y_train, y_test = train_test_split(
    X_t, labels, test_size=0.2, random_state=2020)

X_train = np.array(X_train_input)
y_train = np.array(y_train)
X_test = np.array(X_test_input)
y_test = np.array(y_test)

In [None]:
max_review_length = maxlen
# The rows were already padded to maxlen before the split, so these calls leave
# them unchanged. padding='post' is passed explicitly so that, should
# max_review_length ever differ from maxlen, the extra padding goes on the same
# side as in the first pad_sequences call instead of silently switching to the front.
X_train = pad_sequences(X_train, maxlen=max_review_length, padding='post')
X_test = pad_sequences(X_test, maxlen=max_review_length, padding='post')

In [None]:
# Use the vocabulary of the Tokenizer that produced the id sequences, so that
# embedding rows line up with the token ids fed to the model.
# keras.datasets.imdb.get_word_index() describes the vocabulary of the built-in
# Keras IMDB dataset, whose (unstemmed) words and ids are unrelated to the ids
# this notebook's Tokenizer assigned to the stemmed reviews.
word_index = tokenizer.word_index

In [None]:
import gensim.models.keyedvectors as word2vec
import gc

In [None]:
# Shared-drive folder that holds all pre-trained embedding files.
_EMBEDDINGS_DIR = "/content/drive/Shareddrives/Imdb-dataset/embeddings"

# Text-format vectors read when loading "glove".
gl_path = _EMBEDDINGS_DIR + "/glove.twitter.27B.25d.txt"
# Text-format vectors read when loading "fasttext".
ft_path = _EMBEDDINGS_DIR + "/wiki.simple.vec"
# Binary file loaded through gensim when loading "word2vec".
wv_path = _EMBEDDINGS_DIR + "/model.bin"

In [None]:
def _read_text_vectors(path, embed_size):
    """Parse a whitespace-separated text embedding file into {word: float32 vector}."""
    vectors = dict()
    # `with` closes the file even if parsing raises; the encoding is explicit so the
    # result does not depend on the platform default.
    with open(path, encoding='utf-8') as f:
        for line in f:
            values = line.split()
            if not values:
                continue  # blank line
            try:
                coefs = np.asarray(values[1:], dtype='float32')
            except ValueError:
                # A non-numeric field in the vector columns: skip the malformed line.
                continue
            if len(coefs) != embed_size:
                # Wrong width, e.g. a "<count> <dim>" header line.
                continue
            vectors[values[0]] = coefs
    return vectors


def loadEmbeddingMatrix(typeToLoad):
    """Build an embedding matrix for the module-level `word_index` from pre-trained vectors.

    Args:
        typeToLoad: "glove" (reads gl_path, 25 dimensions), "fasttext" (reads
            ft_path, 300 dimensions) or "word2vec" (loads wv_path through
            gensim, 300 dimensions).

    Returns:
        An array of shape (len(word_index), embed_size). Row i holds the
        pre-trained vector of the word whose index is i. Rows without a
        pre-trained vector keep a random draw from a normal distribution with
        the mean/std of the loaded vectors.

    Raises:
        ValueError: if typeToLoad is not a supported name or no vectors were loaded.
    """
    if typeToLoad == "glove":
        embed_size = 25
        embeddings_index = _read_text_vectors(gl_path, embed_size)
    elif typeToLoad == "fasttext":
        embed_size = 300
        embeddings_index = _read_text_vectors(ft_path, embed_size)
    elif typeToLoad == "word2vec":
        embed_size = 300
        word2vecDict = word2vec.KeyedVectors.load_word2vec_format(wv_path, binary=True)
        # gensim >= 4 exposes the vocabulary as `key_to_index`; older releases as `vocab`.
        vocab = getattr(word2vecDict, 'key_to_index', None)
        if vocab is None:
            vocab = word2vecDict.vocab
        embeddings_index = {word: word2vecDict[word] for word in vocab}
    else:
        raise ValueError(
            "typeToLoad must be 'glove', 'word2vec' or 'fasttext', got %r" % (typeToLoad,))
    print('Loaded %s word vectors.' % len(embeddings_index))

    if not embeddings_index:
        raise ValueError('No word vectors could be loaded for %r' % (typeToLoad,))

    gc.collect()
    all_embs = np.stack(list(embeddings_index.values()))
    emb_mean, emb_std = all_embs.mean(), all_embs.std()
    del all_embs

    nb_words = len(word_index)
    # Random initialisation so that words without a pre-trained vector still get
    # values on the same scale as the real embeddings.
    embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))
    gc.collect()

    embeddedCount = 0
    for word, i in word_index.items():
        # An Embedding layer looks token id i up in row i, so the vector must be
        # stored at row i (not i - 1, which would hand every token the vector of
        # its neighbour). Ids that do not fit in the matrix cannot be looked up.
        if i >= nb_words:
            continue
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
            embeddedCount += 1
    print('total embedded:', embeddedCount, 'common words')

    del embeddings_index
    gc.collect()

    return embedding_matrix

In [None]:
# Build the embedding matrix from the GloVe vectors (the file at gl_path).
embedding_matrix = loadEmbeddingMatrix('glove')

Loaded 1193514 word vectors.
total embedded: 52944 common words


In [None]:
# Inspect the matrix size: (number of rows, embedding dimension).
embedding_matrix.shape

(88584, 25)

In [None]:
embedding_vector_length = 32

# Frozen pre-trained embeddings -> bidirectional LSTM encoder -> dense
# classifier head with a single sigmoid output.
model = Sequential([
    Embedding(
        len(word_index),
        embedding_matrix.shape[1],
        input_length=max_review_length,
        weights=[embedding_matrix],
        trainable=False,
    ),
    Dropout(0.5),
    Bidirectional(LSTM(64)),
    Dense(units=256, activation='relu'),
    Dropout(0.4),
    Dense(units=1, activation='sigmoid'),
])
model.summary()

# Binary classification: cross-entropy loss, Adam optimizer, accuracy metric.
opt = optimizers.Adam(learning_rate=0.003)
model.compile(
    loss='binary_crossentropy',
    optimizer=opt,
    metrics=['accuracy'],
)

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 700, 25)           2214600   
_________________________________________________________________
dropout_2 (Dropout)          (None, 700, 25)           0         
_________________________________________________________________
bidirectional_1 (Bidirection (None, 128)               46080     
_________________________________________________________________
dense_2 (Dense)              (None, 256)               33024     
_________________________________________________________________
dropout_3 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 257       
Total params: 2,293,961
Trainable params: 2,293,961
Non-trainable params: 0
____________________________________________

In [None]:
# Train for 10 epochs in batches of 32, holding back 20% of the training data
# for validation; verbose=2 prints one summary line per epoch.
train_history = model.fit(
    X_train,
    y_train,
    batch_size=32,
    epochs=10,
    verbose=2,
    validation_split=0.2,
)

Epoch 1/10
500/500 - 34s - loss: 0.1838 - accuracy: 0.9275 - val_loss: 0.2855 - val_accuracy: 0.9022
Epoch 2/10
500/500 - 34s - loss: 0.1758 - accuracy: 0.9311 - val_loss: 0.2799 - val_accuracy: 0.9013
Epoch 3/10
500/500 - 33s - loss: 0.1703 - accuracy: 0.9359 - val_loss: 0.2843 - val_accuracy: 0.8997
Epoch 4/10
500/500 - 33s - loss: 0.1617 - accuracy: 0.9384 - val_loss: 0.2646 - val_accuracy: 0.8978
Epoch 5/10
500/500 - 33s - loss: 0.1476 - accuracy: 0.9421 - val_loss: 0.3292 - val_accuracy: 0.8917
Epoch 6/10
500/500 - 33s - loss: 0.1462 - accuracy: 0.9432 - val_loss: 0.2926 - val_accuracy: 0.9020
Epoch 7/10
500/500 - 33s - loss: 0.1417 - accuracy: 0.9465 - val_loss: 0.3235 - val_accuracy: 0.8992
Epoch 8/10
500/500 - 33s - loss: 0.1368 - accuracy: 0.9484 - val_loss: 0.2752 - val_accuracy: 0.9047
Epoch 9/10
500/500 - 33s - loss: 0.1377 - accuracy: 0.9470 - val_loss: 0.3107 - val_accuracy: 0.8992
Epoch 10/10
500/500 - 33s - loss: 0.1214 - accuracy: 0.9544 - val_loss: 0.3112 - val_accura

In [None]:
# Evaluate on the held-out test split. With the compile settings used for this
# model the result should be [loss, accuracy] -- confirm against the installed
# Keras version. The value is stored but not displayed by this cell.
scores = model.evaluate(X_test, y_test, verbose=1)

