In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the pre-processed corpus; the columns read later in this notebook
# are `major_sent` (text) and `label` (class id).
df = pd.read_csv(filepath_or_buffer='./logs/processedData.csv')

In [3]:
# Peek at the first five rows to sanity-check the loaded columns.
df.head(n=5)

Unnamed: 0,major_sent,label
0,go saw movie last night coax friend mine admit...,1
1,actor turn director bill paxton follow promisi...,1
2,recreational golfer knowledge sport history pl...,1
3,saw film sneak preview delightful cinematograp...,1
4,bill paxton take true story golf open make fil...,1


In [4]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [5]:
# Fit the tokenizer on the whole corpus WITHOUT a `num_words` cap.
# Keras only emits indices strictly below `num_words`, so capping at the
# number of unique words silently drops the rarest word from every sequence.
tokenizer=Tokenizer()
tokenizer.fit_on_texts(df.major_sent)

# Take the vocabulary size from the tokenizer itself rather than from
# `str.split()`, whose notion of a "word" can differ from Keras' filtering.
# Word indices run 1..unique_words_count (0 is never assigned to a word), so an
# embedding with `unique_words_count+1` rows covers every index produced below.
unique_words_count=len(tokenizer.word_index)

# Integer-encode each text as a list of word indices.
encoded_sentances=tokenizer.texts_to_sequences(df.major_sent)

In [6]:
# Force every encoded sequence to exactly MAX_SEQ_LEN entries.
# `padding` / `truncating` are spelled out with their Keras defaults ('pre'):
# short sequences get leading zeros, long ones lose their earliest tokens.
MAX_SEQ_LEN=500
padded_encoded_sequences=pad_sequences(
    sequences=encoded_sentances,
    maxlen=MAX_SEQ_LEN,
    padding='pre',
    truncating='pre',
)

In [7]:
# Display the padded matrix (NumPy abbreviates large arrays automatically).
padded_encoded_sequences

array([[    0,     0,     0, ...,   769,     6,   992],
       [    0,     0,     0, ...,   424,    38,   162],
       [    0,     0,     0, ...,   373,    39,    97],
       ...,
       [    0,     0,     0, ...,     8,     1,    12],
       [    0,     0,     0, ..., 13516,  1019,   531],
       [    0,     0,     0, ...,   212,     9,  1151]])

In [8]:
from sklearn.model_selection import train_test_split

# Hold out 33% of the rows for evaluation; the fixed seed keeps the split
# identical from run to run.
X_train,X_test,y_train,y_test=train_test_split(
    padded_encoded_sequences,
    df['label'].values,
    test_size=0.33,
    random_state=42,
)

In [9]:
import tensorflow.keras as keras

In [10]:
# Sequence classifier assembled layer by layer:
# embedding -> LSTM -> two ReLU dense layers -> 2-unit softmax.
model=keras.Sequential()
# One embedding row per word index, plus one for index 0.
model.add(keras.layers.Embedding(input_dim=unique_words_count+1, output_dim=64, input_length=MAX_SEQ_LEN))
model.add(keras.layers.LSTM(128))
model.add(keras.layers.Dense(64, activation='relu'))
model.add(keras.layers.Dense(32, activation='relu'))
model.add(keras.layers.Dense(2, activation='softmax'))

# The sparse loss takes integer class ids as targets, not one-hot vectors.
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

In [11]:
# Print the layer-by-layer architecture and parameter counts.
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 500, 64)           5602816   
_________________________________________________________________
lstm (LSTM)                  (None, 128)               98816     
_________________________________________________________________
dense (Dense)                (None, 64)                8256      
_________________________________________________________________
dense_1 (Dense)              (None, 32)                2080      
_________________________________________________________________
dense_2 (Dense)              (None, 2)                 66        
Total params: 5,712,034
Trainable params: 5,712,034
Non-trainable params: 0
_________________________________________________________________


In [12]:
# Train for two passes over the training split in mini-batches of 128;
# the returned object is kept in `history`.
history=model.fit(
    x=X_train,
    y=y_train,
    batch_size=128,
    epochs=2,
)

Epoch 1/2
Epoch 2/2


In [82]:
# Score the held-out split; yields [loss, accuracy] per the compiled metrics.
model.evaluate(x=X_test, y=y_test)



[0.2834964096546173, 0.8840606212615967]

In [38]:
# Save the tokenizer and the model

In [39]:
import json

In [40]:
# Persist the fitted tokenizer. Per the Keras docs `to_json()` already returns
# a JSON string, and it is JSON-encoded once more on write, so a reader must
# `json.load` the file first to get that string back.
tokenizer_json=tokenizer.to_json()
with open('./logs/tokenizer.json','w') as fh:
    fh.write(json.dumps(tokenizer_json))

In [79]:
# Write the trained model to disk; the .h5 suffix selects the HDF5 format.
model.save(filepath='./logs/model.h5')