## Classification of reviews from imdb.com

Download the dataset from [Kaggle](https://www.kaggle.com/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews).

## Import libraries

In [25]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow import keras

## Load dataset

In [11]:
# Path of the CSV to load (an absolute path — presumably a mounted Colab drive;
# adjust when running elsewhere).
DATA_PATH = '/content/drive/MyDrive/Colab_Notebooks/IMDB_dataset/clean_tweets'
df = pd.read_csv(DATA_PATH)

In [12]:
# Split the frame into model input (text column) and target (label column).
X, Y = df["tweet"], df["sentiment"]

In [13]:
# Stratified 80/20 hold-out split; the fixed random_state makes the
# partition reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.20,
    random_state=900,
    shuffle=True,
    stratify=Y,
)
# Display the number of samples in each partition.
X_train.shape, X_test.shape

((40000,), (10000,))

## TextVectorization

In [14]:
# Upper bound on the vocabulary size kept by the text vectorizer.
VOCAB_SIZE = 1000
encoder = tf.keras.layers.experimental.preprocessing.TextVectorization(
    max_tokens=VOCAB_SIZE)
# Build the vocabulary from the training split only. Adapting on the full
# frame would let held-out text influence preprocessing (data leakage), and
# selecting the column by name (via X_train) instead of by position avoids
# picking up a different column if the CSV layout changes.
encoder.adapt(X_train.values)

In [15]:
# Mini-batch size constant.
# NOTE(review): confirm this is actually passed to model.fit(batch_size=...).
BATCH_SIZE = 64

## Training

In [17]:
# Stacked bidirectional-LSTM binary classifier that consumes raw strings:
# the adapted `encoder` is the first layer, so no separate tokenization step
# is needed before calling fit/predict.
model = tf.keras.Sequential([
    # Raw text -> padded sequence of integer token ids.
    encoder,
    tf.keras.layers.Embedding(
        input_dim=len(encoder.get_vocabulary()),
        output_dim=64,
        # Use masking to handle the variable sequence lengths
        mask_zero=True,
    ),
    # First recurrent layer returns the full sequence so it can feed the
    # second recurrent layer.
    tf.keras.layers.Bidirectional(
        tf.keras.layers.LSTM(64, dropout=.3, return_sequences=True)),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32)),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dropout(0.5),
    # Sigmoid so the output is a probability in [0, 1]. The notebook compiles
    # with BinaryCrossentropy() (from_logits=False by default) and the
    # 'accuracy' metric, both of which expect probabilities rather than the
    # raw logits a bare Dense(1) would emit.
    tf.keras.layers.Dense(1, activation='sigmoid'),
])

In [18]:
# Training configuration: Adam with a small learning rate, binary
# cross-entropy loss and plain accuracy as the monitored metric.
# BinaryCrossentropy() defaults to from_logits=False, i.e. it expects the
# model's final layer to emit probabilities.
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-4),
    loss=tf.keras.losses.BinaryCrossentropy(),
    metrics=['accuracy'],
)

In [19]:
# Print the layer-by-layer architecture with output shapes and parameter counts.
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 text_vectorization (TextVec  (None, None)             0         
 torization)                                                     
                                                                 
 embedding (Embedding)       (None, None, 64)          64000     
                                                                 
 bidirectional (Bidirectiona  (None, None, 128)        66048     
 l)                                                              
                                                                 
 bidirectional_1 (Bidirectio  (None, 64)               41216     
 nal)                                                            
                                                                 
 dense (Dense)               (None, 64)                4160      
                                                        

In [20]:
# Encode the string labels with ONE mapping shared by both splits.
# pd.factorize assigns codes in order of first appearance, so calling it
# separately on y_train and y_test can give the two splits opposite
# encodings, which makes the validation metrics meaningless.
label_classes = sorted(y_train.unique())
y_train_enc = pd.Categorical(y_train, categories=label_classes).codes.astype("float32")
y_test_enc = pd.Categorical(y_test, categories=label_classes).codes.astype("float32")

# Train for 10 epochs and validate on the complete held-out split.
# `validation_steps` is dropped: with in-memory data it capped validation at
# the first 30 batches instead of all test samples. BATCH_SIZE is passed
# explicitly so the constant defined earlier in the notebook is honoured.
history = model.fit(X_train, y_train_enc, epochs=10,
                    batch_size=BATCH_SIZE,
                    validation_data=(X_test, y_test_enc))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
