In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras import layers
import matplotlib.pyplot as plt
plt.style.use('ggplot')
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.metrics import confusion_matrix

df = pd.read_csv('data/movie_review.csv')
df = pd.DataFrame(df)
df = df.drop(columns=['fold_id', 'cv_tag', 'html_id', 'sent_id'])
#Convert tag into 1 or 0 based on positive or negative
df['tag'] = df.tag.astype('category').cat.codes
review = df['text'].values
y = df['tag'].values
review_train, review_test, y_train, y_test = train_test_split(
    review, y, test_size=0.3, random_state=1)
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(review_train)
token_X_train = tokenizer.texts_to_sequences(review_train)
token_X_test = tokenizer.texts_to_sequences(review_test)
vocab_size = len(tokenizer.word_index) + 1  # Adding 1 because of reserved 0 index
token_X_train_text = tokenizer.sequences_to_texts(token_X_train)
maxlen = 100
padded_X_train = pad_sequences(token_X_train, padding='post', maxlen=maxlen)
padded_X_test = pad_sequences(token_X_test, padding='post', maxlen=maxlen)
embedding_dim = 30
model = Sequential()
model.add(layers.Embedding(input_dim=vocab_size,output_dim=embedding_dim,input_length=maxlen))
model.add(layers.Flatten())
model.add(layers.Dense(10, activation='relu'))
model.add(layers.Dense(1, activation='sigmoid'))
model.compile(optimizer='adam',loss='binary_crossentropy',metrics=['accuracy'])
model.summary()
history = model.fit(padded_X_train, y_train,epochs=10,verbose=2,validation_data=(padded_X_test, y_test),batch_size=10)
model.save('model.h1')



Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 100, 30)           1135110   
_________________________________________________________________
flatten (Flatten)            (None, 3000)              0         
_________________________________________________________________
dense (Dense)                (None, 10)                30010     
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 11        
Total params: 1,165,131
Trainable params: 1,165,131
Non-trainable params: 0
_________________________________________________________________
Epoch 1/10
4531/4531 - 102s - loss: 0.6512 - accuracy: 0.6065 - val_loss: 0.6155 - val_accuracy: 0.6591
Epoch 2/10
4531/4531 - 98s - loss: 0.5277 - accuracy: 0.7325 - val_loss: 0.6502 - val_accuracy: 0.6399
Epoch 3/10
4531/4531 - 98s - loss: 0.