In [1]:
from IPython.display import display, set_matplotlib_formats
from collections import Counter
from itertools import chain
from keras.layers import Dense,Embedding,LSTM,Dropout
from keras.models import Sequential
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from nltk.corpus import stopwords
from keras.callbacks import ModelCheckpoint
import pickle
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score
from sklearn.model_selection import train_test_split
import gensim,keras
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import re,nltk
import seaborn as sns
import warnings
!pip install texttable
from texttable import Texttable  
set_matplotlib_formats('svg')
warnings.filterwarnings('ignore')
nltk.download('stopwords')
size_embedding = 100
windows = 2
min_count = 1
maxlen = 500

Collecting texttable
  Downloading https://files.pythonhosted.org/packages/06/f5/46201c428aebe0eecfa83df66bf3e6caa29659dbac5a56ddfd83cae0d4a4/texttable-1.6.3-py2.py3-none-any.whl
Installing collected packages: texttable
Successfully installed texttable-1.6.3
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [2]:
def clean_text(text):
    stop_words = set(stopwords.words('english'))
    to_remove = ['•', '!', '"', '#', '”', '“', '$', '%', '&', "'", '–', '(', ')', '*', '+', ',', '-', '.', '/', ':', ';', '<', '=', '>', '?', '@', '[', '\\', ']', '^', '_', '`', '{', '|', '}', '~', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '…']
    stop_words.update(to_remove)
    text = text.lower()
    text = re.sub(r'http\S+', '', text)
    text = re.sub('\[[^]]*\]', '', text)
    text = (" ").join([word for word in text.split() if not word in stop_words])
    text = "".join([char for char in text if not char in to_remove])
    return text


In [3]:
def createTabel(Test_loss,Test_accu,Train_loss,Train_accu):
  rows = [["Parameters", "Accuracy", "Loss"], ["Training", Train_accu, Train_loss], ["Testing", Test_accu, Test_loss]]
  table = Texttable()
  table.add_rows(rows)
  print(table.draw())

In [4]:
def plot_loss_epochs(history):
    epochs = np.arange(1,len(history.history['accuracy']) + 1,1)
    train_acc = history.history['accuracy']
    train_loss = history.history['loss']
    val_acc = history.history['val_accuracy']
    val_loss = history.history['val_loss']

    fig , ax = plt.subplots(1,2, figsize=(7,3))
    ax[0].plot(epochs , train_acc , '.-' , label = 'Train Accuracy')
    ax[0].plot(epochs , val_acc , '.-' , label = 'Validation Accuracy')
    ax[0].set_title('Train & Validation Accuracy')
    ax[0].legend()
    ax[0].set_xlabel("Epochs")
    ax[0].set_ylabel("Accuracy")

    ax[1].plot(epochs , train_loss , '.-' , label = 'Train Loss')
    ax[1].plot(epochs , val_loss , '.-' , label = 'Validation Loss')
    ax[1].set_title('Train & Validation Loss')
    ax[1].legend()
    ax[1].set_xlabel("Epochs")
    ax[1].set_ylabel("Loss")
    fig.tight_layout()
    fig.show()

In [5]:
#convert the glove.6B.50d.txt data to dictionary
def to_word_vector_matrix():
  gloveVectors = dict()
  glove=open(r"/content/drive/MyDrive/Colab Notebooks/ML/glove.6B.100d.txt",encoding='utf-8')
  for line in glove:
    values=line.split()
    word=values[0]
    vectors=np.asarray(values[1:],dtype=np.float64)
    gloveVectors[word]=vectors

  glove.close()
  return gloveVectors

In [7]:
dataset=pd.read_csv('/content/drive/MyDrive/Colab Notebooks/ML/train.csv')
dataset['Text']=dataset['title']+" "+dataset['text']
dataset = dataset.drop(columns = ['title', 'text','author'])
dataset['Text'].fillna("NA", inplace=True)

In [22]:
dataset['Text'] = dataset['Text'].apply(clean_text)
text_train=dataset['Text']
y_train=dataset['label']

In [23]:
glove_vectors=to_word_vector_matrix()
text_train_lis=text_train.tolist()
token = Tokenizer()
token.fit_on_texts(text_train_lis)
vocab_size=len(token.word_index)+1

In [24]:
embed_vector_len = glove_vectors['the'].shape[0]
word_vector_matrix = np.zeros((vocab_size,embed_vector_len))
unknownWords=[]
for word, index in token.word_index.items():
  vector = glove_vectors.get(word)
  if vector is not None:
    word_vector_matrix[index]=vector
  else:
    unknownWords+=[word]


In [25]:
text_train_tok = token.texts_to_sequences(text_train) #to sequences of numbers
text_train_tok_pad = pad_sequences(text_train_tok, maxlen=maxlen, padding='post') #padding so all inputs will be of same length

In [31]:
model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=embed_vector_len,weights=[word_vector_matrix], input_length=maxlen, trainable=False))
model.add(LSTM(units=32))
model.add(Dense(1, activation='sigmoid'))

In [32]:
model.summary()
checkpoint = ModelCheckpoint("best_weights.hdf5", monitor='val_accuracy', verbose=0,save_best_only=True, mode='auto', period=1,save_weights_only=False) #checkpoint at best epoch
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
history = model.fit(text_train_tok_pad, y_train, validation_split=0.05,callbacks=[checkpoint], epochs=15, batch_size = 64, verbose = 1)

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 500, 100)          22069300  
_________________________________________________________________
lstm_3 (LSTM)                (None, 32)                17024     
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 33        
Total params: 22,086,357
Trainable params: 17,057
Non-trainable params: 22,069,300
_________________________________________________________________
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


In [16]:
y_train.shape

(16640,)

In [33]:
dfTest=pd.read_csv('/content/drive/MyDrive/Colab Notebooks/ML/test.csv')
dfTest['Text']=dfTest['title']+" "+dfTest['text']
dfTest = dfTest.drop(columns = ['title', 'text','author'])
dfTest['Text'].fillna("NA", inplace=True)
dfTest['Text'] = dfTest['Text'].apply(clean_text)
text_test=dfTest['Text']

text_test_tok = token.texts_to_sequences(text_test)
text_test_tok_pad = pad_sequences(text_test_tok, maxlen=maxlen, padding='post')  
# model.load_weights("/content/best_weights.hdf5")
predicted = (model.predict(text_test_tok_pad) > 0.5).astype("int32")

In [34]:
id=dfTest['id']
ll=predicted.tolist()
import csv
row_list=[['id','label']]
for i in range(len(id)):
  row_list+=[[id[i],ll[i][0]]]

filename='submission_Glove_LSTM.csv'

with open(filename, 'w', newline='') as file:
    writer = csv.writer(file)
    writer.writerows(row_list)