In [1]:
import pickle
import re

import cv2
import numpy as np
import pandas as pd
import scipy as spy
from skimage import color
import matplotlib.pyplot as plt

from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.preprocessing.text import hashing_trick
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import RepeatVector
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Input,GlobalMaxPool1D,Dropout
from keras.utils import plot_model

import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, f1_score, roc_auc_score

Using TensorFlow backend.


In [2]:
# Read the training and test splits from the dataset folder.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('./../Dataset/csv/train.csv', './../Dataset/csv/test.csv')
)

In [3]:
# Check for NaN values in train (nothing is dropped in this cell)
#print(train.isnull().sum())

# check na in test
#print(test.isnull().sum())

In [4]:
# Fill missing article bodies with the article title so every row has text to
# tokenise. fillna() with a Series aligns on the index and is the idiomatic
# form of the original replace(np.nan, <Series>).
# Rows where the title is missing as well fall back to an empty string, so the
# regex-based cleaning step further down never receives a float NaN.
train['text'] = train['text'].fillna(train['title']).fillna('')

# Same treatment for the test split.
test['text'] = test['text'].fillna(test['title']).fillna('')

In [5]:
# Separate the dependent feature (the 'label' column) from the training
# frame; the test frame is used as-is.
y_train = train['label']
X_train = train.drop(columns='label')
X_test = test

In [6]:
# Vocabulary size shared by the word hashing step and the embedding layer.
vo_size = 500

# reset_index() returns a new frame with a fresh 0..n-1 index and keeps the
# previous index as a regular 'index' column; X_train / X_test stay untouched.
messages = X_train.reset_index()
messages_test = X_test.reset_index()

In [None]:
# Dataset preprocessing: keep letters only, lowercase, drop English stop words
# and stem whatever is left.

# Build the stop-word set once. The original evaluated
# stopwords.words('english') for every single token and then scanned the
# resulting list linearly; a set gives the same membership answer in O(1).
english_stopwords = set(stopwords.words('english'))


def clean_text(text, stemmer, stop_words=english_stopwords):
    """Return `text` as a space-joined string of stemmed, non-stop-word tokens.

    text: raw document string.
    stemmer: object exposing .stem(word), e.g. a PorterStemmer.
    stop_words: collection of lowercase words to discard.
    """
    tokens = re.sub('[^a-zA-Z]', ' ', text).lower().split()
    return ' '.join(stemmer.stem(word) for word in tokens if word not in stop_words)


def build_corpus(texts, stemmer):
    """Clean every document in `texts`, printing a one-line progress status.

    Returns a list of cleaned strings in the same order as `texts`.
    """
    total = len(texts)
    cleaned = []
    for i, text in enumerate(texts):
        # "\r" keeps the status on a single, continuously overwritten line.
        print("Status: %s / %s" %(i, total), end="\r")
        cleaned.append(clean_text(text, stemmer))
    return cleaned


#Train Dataset
ps = PorterStemmer()
corpus = build_corpus(messages['text'], ps)

print("\n")

#Test Dataset
ps_test = PorterStemmer()
corpus_test = build_corpus(messages_test['text'], ps_test)

In [None]:
# One-hot (hashing) representation: every word becomes an integer in
# [1, vo_size).
# one_hot() hashes with Python's built-in hash(), which is salted per
# interpreter process for strings, so the word -> index mapping changes between
# kernel restarts and would not match encodings that were pickled earlier.
# hashing_trick(..., hash_function='md5') is the same transform with a stable
# hash, which makes the encoding reproducible.
onehot_rep = [hashing_trick(words, vo_size, hash_function='md5') for words in corpus]
onehot_rep_test = [hashing_trick(words, vo_size, hash_function='md5') for words in corpus_test]

In [None]:
# Pad every encoded document at the front so all rows share one fixed length.
sent_length = 1000
embedded_doc, embedded_doc_test = (
    pad_sequences(encoded, maxlen=sent_length, padding='pre')
    for encoded in (onehot_rep, onehot_rep_test)
)
print(embedded_doc)
print(embedded_doc_test)

In [None]:
# Cache the padded sequences on disk so the preprocessing cells above do not
# have to be re-run. The `with` blocks guarantee each file is flushed and
# closed (the original left both handles open).
with open("../Dataset/embedded_doc.pickle", "wb") as fh:
    pickle.dump(embedded_doc, fh)
with open("../Dataset/embedded_doc_test.pickle", "wb") as fh:
    pickle.dump(embedded_doc_test, fh)

In [11]:
# Reload the cached padded sequences written by the previous cell.
# NOTE: pickle.load can execute arbitrary code; only load files this notebook
# produced itself.
sent_length = 1000
with open("../Dataset/embedded_doc.pickle", "rb") as fh:
    embedded_doc = pickle.load(fh)
with open("../Dataset/embedded_doc_test.pickle", "rb") as fh:
    embedded_doc_test = pickle.load(fh)

In [12]:
# Convert the network inputs and the target to NumPy arrays.
X_final, y_final, X_final_test = (
    np.array(values) for values in (embedded_doc, y_train, embedded_doc_test)
)
#print(X_final.shape,y_final.shape,X_final_test.shape)

In [16]:
# model-1: Embedding -> LSTM -> single sigmoid unit (binary classifier)
embedding_vector_feature = 128
model = Sequential()
# Embedding layer: one dense vector of size embedding_vector_feature per word index.
model.add(Embedding(vo_size, embedding_vector_feature, input_length=sent_length))
# Recurrent layer with 64 units.
model.add(LSTM(64))

# Output layer. A softmax over a single unit always evaluates to exactly 1.0,
# so the network could never predict class 0 or receive a useful gradient;
# a sigmoid is the activation that pairs with binary_crossentropy.
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# summary() prints the table itself and returns None, so it is not wrapped in print().
model.summary()

ValueError: Input 0 of layer lstm_5 is incompatible with the layer: expected ndim=3, found ndim=2. Full shape received: [None, 64]

In [None]:
# model-2: Embedding -> LSTM -> Dense(32) -> Dropout -> single sigmoid unit
embedding_vector_feature = 128
model = Sequential()
# Embedding layer: one dense vector of size embedding_vector_feature per word index.
model.add(Embedding(vo_size, embedding_vector_feature, input_length=sent_length))
model.add(LSTM(64))
# DNN layer, followed by dropout for regularisation.
model.add(Dense(32, activation='sigmoid'))
model.add(Dropout(0.5))
# Output layer. A softmax over a single unit always evaluates to exactly 1.0,
# so the network could never predict class 0 or receive a useful gradient;
# a sigmoid is the activation that pairs with binary_crossentropy.
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# summary() prints the table itself and returns None, so it is not wrapped in print().
model.summary()

In [None]:
# Render the model architecture to a PNG and load that image back as greyscale.
model_plot_path = 'lstm_model_1_plot.png'
plot_model(model, to_file=model_plot_path, show_shapes=True, show_layer_names=True)
# Read the very file that was just written. The original read
# "./lstm_1model_1_plot.png" (typo), for which cv2.imread silently returns None.
model_img = cv2.imread(model_plot_path, cv2.IMREAD_GRAYSCALE)

In [14]:
# Train the current model, holding out 20% of the rows for validation.
history = model.fit(
    X_final,
    y_final,
    batch_size=256,
    epochs=10,
    validation_split=0.2,
)

Train on 16640 samples, validate on 4160 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10


KeyboardInterrupt: 

In [None]:
# Summarize history for accuracy.
# The metric key depends on the Keras version: TF 2.x records 'accuracy' /
# 'val_accuracy', older releases record 'acc' / 'val_acc'. Pick whichever is
# present instead of hard-coding one and risking a KeyError.
acc_key = 'accuracy' if 'accuracy' in history.history else 'acc'
fig, ax = plt.subplots()
ax.plot(history.history[acc_key])
ax.plot(history.history['val_' + acc_key])
ax.set(title='model accuracy', ylabel='accuracy', xlabel='epoch')
# The second curve comes from validation_split, not from the test set.
ax.legend(['train', 'validation'], loc='upper left')
plt.show()

In [None]:
# Summarize history for loss.
fig, ax = plt.subplots()
ax.plot(history.history['loss'])
ax.plot(history.history['val_loss'])
ax.set(title='model loss', ylabel='loss', xlabel='epoch')
# 'val_loss' is measured on the validation split taken from the training
# data, so label it as validation rather than test.
ax.legend(['train', 'validation'], loc='upper left')
plt.show()

In [None]:
# Predict Test
# The padded test array is named X_final_test (the original referenced the
# undefined X_test_final). Sequential.predict_classes() is not available in
# recent TensorFlow releases; thresholding the predicted probability at 0.5
# yields the same 0/1 labels for a single-unit output.
y_prob_final = model.predict(X_final_test)
y_pred_final = pd.DataFrame((y_prob_final > 0.5).astype('int32'))

In [None]:
# Persist the trained weights only (not the architecture or optimizer state).
model.save_weights(
    "lstm_model_1.h5"
)