In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

**Loading Dataset**

In [None]:
# Read the training split (includes the `target` column) and preview it.
train_data = pd.read_csv(
    '/kaggle/input/nlp-getting-started/train.csv'
)
train_data.head()

In [None]:
# Read the test split that is scored at the end of the notebook and preview it.
test_data = pd.read_csv(
    '/kaggle/input/nlp-getting-started/test.csv'
)
test_data.head()

**EDA**

In [None]:
# Class balance of the training labels.
# The vector is passed as `x=` explicitly: recent seaborn releases treat the
# first positional argument as `data`, so a bare positional Series no longer
# produces one bar per class.
target_counts = train_data['target'].value_counts()  # computed once, reused in the title
sns.countplot(x=train_data['target'])
plt.title('Count for Zeros:'+str(target_counts[0])+'\n'+
         'Count for Ones:'+str(target_counts[1]))
plt.show()

In [None]:
# Number of whitespace-separated words in each raw tweet.
# `split()` (no argument) collapses runs of spaces and also splits on
# newlines/tabs, so repeated spaces no longer count as empty "words".
# Iterating over the values avoids relying on the index being 0..n-1.
len_sent=[len(tweet.split()) for tweet in train_data['text']]

In [None]:
# Distribution of per-tweet word counts for the raw training text.
plt.figure(figsize=(12,6))
# Pass the values as `x=`; recent seaborn releases treat the first positional
# argument as `data`, so a bare positional list is no longer plotted this way.
sns.countplot(x=len_sent)
plt.xlabel("Word lengths:")
plt.ylabel('Counts:')
plt.title('Train Data \n Max length='+str(max(len_sent)))
plt.show()

In [None]:
# Keep only the columns used below by discarding `keyword` and `location`.
# Reassigning (instead of inplace=True) together with errors='ignore' makes the
# cell safe to re-run: a second execution no longer raises KeyError because the
# columns are already gone.
train_data=train_data.drop(columns=['keyword','location'],errors='ignore')
test_data=test_data.drop(columns=['keyword','location'],errors='ignore')

In [None]:
# Build one corpus string from all raw tweets for the word cloud.
# Tweets are joined with a space so the last word of one tweet does not fuse
# with the first word of the next; a single join also avoids the quadratic
# cost of repeated string concatenation.
sent=' '.join(train_data['text'])

In [None]:
from wordcloud import WordCloud

# Word cloud of the raw (uncleaned) tweet corpus held in `sent`.
word_cloud2 = WordCloud(
    collocations=False,
    background_color='white',
).generate(sent)

plt.figure(figsize=(10, 10))
plt.imshow(word_cloud2, interpolation='bilinear')
plt.axis("off")
plt.show()

**Data Cleaning**

The word cloud contains many words that carry no significant meaning, so we remove these unnecessary words and other noise from the corpus to allow better training of the models.

In [None]:
import nltk
from nltk.corpus import stopwords
import re

In [None]:
# English stop-word list from NLTK, held as a set for fast membership tests.
stop_words = {*stopwords.words('english')}

def text_cleaner(text, stopword_set=None, min_word_len=4):
    """Normalise a raw tweet into a space-separated string of kept words.

    Steps, in order: lowercase the text, strip hyperlinks, drop text enclosed
    in ``()`` and in ``[]``, replace every non-letter with a space, remove
    stop words, and drop words shorter than ``min_word_len`` characters.

    Parameters
    ----------
    text : str
        The raw text to clean.
    stopword_set : collection of str, optional
        Words to discard. When omitted, the module-level ``stop_words`` set
        is used.
    min_word_len : int, optional
        Minimum number of characters a word needs to be kept (default 4).

    Returns
    -------
    str
        The surviving words joined by single spaces; an empty string when
        nothing survives.
    """
    # Resolve the stop-word set lazily so callers can supply their own.
    active_stop_words = stop_words if stopword_set is None else stopword_set
    newString=text.lower()
    #remove hyperlinks
    newString=re.sub(r'(https|http)?:\/\/(\w|\.|\/|\?|\=|\&|\%)*\b', '', newString)
    #removing text inside ()
    newString = re.sub(r'\([^)]*\)', '', newString)
    #removing text inside [] (the previous pattern targeted {...} and never
    #matched square brackets)
    newString = re.sub(r'\[[^\]]*\]', '', newString)
    #keeping alphabetic characters only; everything else becomes a space
    newString = re.sub("[^a-zA-Z]", " ", newString)
    #removing stop words and short words in a single pass
    long_words=[w for w in newString.split()
                if w not in active_stop_words and len(w)>=min_word_len]
    return (" ".join(long_words)).strip()

In [None]:
# Apply the cleaner to every training tweet, preserving row order.
cleaned_text_train = [text_cleaner(raw_tweet) for raw_tweet in train_data['text']]

In [None]:
# Show the first training tweet before and after cleaning.
print(
    "Before cleaning:\n",
    train_data['text'][0] + "\n",
    "After cleaning:\n",
    cleaned_text_train[0],
    sep="\n",
)

In [None]:
from wordcloud import WordCloud

# Build the cleaned corpus for the word cloud.
# Cleaned tweets carry no trailing whitespace, so they must be joined with a
# space; plain concatenation fused the last word of each tweet with the first
# word of the next and distorted the word frequencies.
sent_1=' '.join(cleaned_text_train)

word_cloud2 = WordCloud(collocations = False, background_color = 'white').generate(sent_1)
plt.figure(figsize=(10,10))
plt.imshow(word_cloud2, interpolation='bilinear')

plt.axis("off")

plt.show()

**As one can observe, the word cloud built from the cleaned corpus is dominated by meaningful words, so the cleaning step was successful.**

In [None]:
# Positions of training tweets that became empty strings after cleaning.
len_0 = [
    position
    for position, cleaned in enumerate(cleaned_text_train)
    if len(cleaned) == 0
]
len_0

In [None]:
# Apply the same cleaner to every test tweet, preserving row order.
cleaned_text_test = [text_cleaner(raw_tweet) for raw_tweet in test_data['text']]

In [None]:
# Positions of test tweets that became empty strings after cleaning.
len_0 = [
    position
    for position, cleaned in enumerate(cleaned_text_test)
    if len(cleaned) == 0
]
len_0

In [None]:
# Word count of each cleaned training tweet.
# `split()` returns [] for an empty string, so tweets that were emptied by the
# cleaner count as 0 words; `split(' ')` returned [''] and reported 1.
len_sent_train=[len(cleaned.split()) for cleaned in cleaned_text_train]

In [None]:
# Distribution of per-tweet word counts for the cleaned training text.
plt.figure(figsize=(12,6))
# Pass the values as `x=`; recent seaborn releases treat the first positional
# argument as `data`, so a bare positional list is no longer plotted this way.
sns.countplot(x=len_sent_train)
plt.xlabel("Word lengths:")
plt.ylabel('Counts:')
plt.title('Train Data \n Max length='+str(max(len_sent_train)))
plt.show()

In [None]:
# Word count of each cleaned test tweet.
# `split()` returns [] for an empty string, so tweets that were emptied by the
# cleaner count as 0 words; `split(' ')` returned [''] and reported 1.
len_sent_test=[len(cleaned.split()) for cleaned in cleaned_text_test]

In [None]:
# Distribution of per-tweet word counts for the cleaned test text.
plt.figure(figsize=(12,6))
# Pass the values as `x=`; recent seaborn releases treat the first positional
# argument as `data`, so a bare positional list is no longer plotted this way.
sns.countplot(x=len_sent_test)
plt.xlabel("Word lengths:")
plt.ylabel('Counts:')
# The figure shows the test split, so the title says so (it previously read
# 'Train Data').
plt.title('Test Data \n Max length='+str(max(len_sent_test)))
plt.show()

**Model Training**
1. Splitting the dataset into train, validation data
2. Tokenizing the train, validation and test data
3. Padding the tokens
4. Creating four different models RNN, LSTM, GRU and Stacked LSTM

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the cleaned training tweets for validation (fixed seed).
X_train, X_val, y_train, y_val = train_test_split(
    cleaned_text_train,
    train_data['target'],
    test_size=0.3,
    random_state=40,
)
print(len(X_train), len(y_train))
print(len(X_val), len(y_val))

In [None]:
# Fixed sequence length: used as `maxlen` when padding and as the Embedding
# layers' `input_length` in the cells below.
max_len = 20

In [None]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

# Fit the vocabulary on the training split only; words unseen at fit time map
# to the '<OOV>' token.
tokenizer = Tokenizer(oov_token='<OOV>')
tokenizer.fit_on_texts(X_train)

# Convert each split to integer sequences and post-pad them to `max_len`.
X_train = pad_sequences(
    tokenizer.texts_to_sequences(X_train), maxlen=max_len, padding='post'
)
X_val = pad_sequences(
    tokenizer.texts_to_sequences(X_val), maxlen=max_len, padding='post'
)
X_test = pad_sequences(
    tokenizer.texts_to_sequences(cleaned_text_test), maxlen=max_len, padding='post'
)

In [None]:
# Embedding input size: one slot per entry in the tokenizer's word index, plus one.
vocab = 1 + len(tokenizer.word_index)
print("Vocab Size", vocab)

In [None]:
# Import from the public `keras.utils` namespace: the private
# `keras.utils.np_utils` module path is not available in newer Keras releases.
from keras.utils import to_categorical

# One-hot encode the binary labels to match the 2-unit softmax output layer.
y_train=to_categorical(y_train,num_classes=2)
y_val=to_categorical(y_val,num_classes=2)

In [None]:
# Shapes of the one-hot label arrays, one per line.
print(y_train.shape, y_val.shape, sep='\n')

In [None]:
from keras.models import Sequential
from keras.layers import (
    Dense,
    Embedding,
    GRU,
    LSTM,
    RNN,
)
from keras.callbacks import EarlyStopping, ModelCheckpoint
import keras.backend as K

# Reset Keras global state before building the first model.
K.clear_session()

# Model 1: Embedding -> single LSTM -> Dense(64) -> 2-way softmax.
model = Sequential([
    Embedding(vocab, 100, input_length=max_len, trainable=True, mask_zero=True),
    LSTM(300, dropout=0.1, recurrent_dropout=0.2),
    Dense(64, activation='relu'),
    Dense(2, activation='softmax'),
])
model.summary()

In [None]:
# Adam optimiser with categorical cross-entropy on the one-hot labels; track accuracy.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['acc'],
)

In [None]:
# Train model 1 and keep the per-epoch metrics for the plots below.
history = model.fit(
    x=np.array(X_train),
    y=np.array(y_train),
    batch_size=1200,
    epochs=30,
    validation_data=(np.array(X_val), np.array(y_val)),
)

In [None]:
# Model 1: validation (red) vs. training (blue) loss per epoch.
for series_key, line_fmt, legend_label in (
    ('val_loss', 'r', 'val_loss'),
    ('loss', 'b', 'train_loss'),
):
    plt.plot(history.history[series_key], line_fmt, label=legend_label)
plt.legend()

In [None]:
# Model 1: validation (red) vs. training (blue) accuracy per epoch.
for series_key, line_fmt, legend_label in (
    ('val_acc', 'r', 'val_acc'),
    ('acc', 'b', 'train_acc'),
):
    plt.plot(history.history[series_key], line_fmt, label=legend_label)
plt.legend()

In [None]:
# Model 2: Embedding -> single GRU -> Dense(64) -> 2-way softmax.
model2 = Sequential([
    Embedding(vocab, 100, input_length=max_len, trainable=True, mask_zero=True),
    GRU(300, dropout=0.1, recurrent_dropout=0.2),
    Dense(64, activation='relu'),
    Dense(2, activation='softmax'),
])
model2.summary()

In [None]:
# Adam optimiser with categorical cross-entropy on the one-hot labels; track accuracy.
model2.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['acc'],
)

In [None]:
# Train model 2 and keep the per-epoch metrics for the plots below.
history2 = model2.fit(
    x=np.array(X_train),
    y=np.array(y_train),
    batch_size=1200,
    epochs=30,
    validation_data=(np.array(X_val), np.array(y_val)),
)

In [None]:
# Model 2: validation (red) vs. training (blue) loss per epoch.
for series_key, line_fmt, legend_label in (
    ('val_loss', 'r', 'val_loss'),
    ('loss', 'b', 'train_loss'),
):
    plt.plot(history2.history[series_key], line_fmt, label=legend_label)
plt.legend()

In [None]:
# Model 2: validation (red) vs. training (blue) accuracy per epoch.
for series_key, line_fmt, legend_label in (
    ('val_acc', 'r', 'val_acc'),
    ('acc', 'b', 'train_acc'),
):
    plt.plot(history2.history[series_key], line_fmt, label=legend_label)
plt.legend()

In [None]:
# Local import: SimpleRNN is not among the layers imported in the first model cell.
from keras.layers import SimpleRNN

# Model 3: Embedding -> plain (vanilla) RNN -> Dense(64) -> 2-way softmax.
# This cell previously rebuilt the exact LSTM architecture of the first model,
# so the comparison had two identical LSTMs and no plain RNN, although the
# notebook announces RNN, LSTM, GRU and stacked-LSTM models.
model3=Sequential()
model3.add(Embedding(vocab,100,input_length=max_len,trainable=True,mask_zero=True))
model3.add(SimpleRNN(300,dropout=0.1,recurrent_dropout=0.2))
model3.add(Dense(64,activation='relu'))
model3.add(Dense(2,activation='softmax'))
model3.summary()

In [None]:
# Adam optimiser with categorical cross-entropy on the one-hot labels; track accuracy.
model3.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['acc'],
)

In [None]:
# Train model 3 and keep the per-epoch metrics for the plots below.
history3 = model3.fit(
    x=np.array(X_train),
    y=np.array(y_train),
    batch_size=1200,
    epochs=30,
    validation_data=(np.array(X_val), np.array(y_val)),
)

In [None]:
# Model 3: validation (red) vs. training (blue) loss per epoch.
for series_key, line_fmt, legend_label in (
    ('val_loss', 'r', 'val_loss'),
    ('loss', 'b', 'train_loss'),
):
    plt.plot(history3.history[series_key], line_fmt, label=legend_label)
plt.legend()

In [None]:
# Model 3: validation (red) vs. training (blue) accuracy per epoch.
for series_key, line_fmt, legend_label in (
    ('val_acc', 'r', 'val_acc'),
    ('acc', 'b', 'train_acc'),
):
    plt.plot(history3.history[series_key], line_fmt, label=legend_label)
plt.legend()

In [None]:
# Model 4: Embedding -> three stacked LSTMs (300 -> 100 -> 50) -> Dense(64) -> softmax.
# The first two LSTMs return full sequences so the next LSTM receives one
# vector per timestep.
model4 = Sequential([
    Embedding(vocab, 100, input_length=max_len, trainable=True, mask_zero=True),
    LSTM(300, dropout=0.1, recurrent_dropout=0.2, return_sequences=True),
    LSTM(100, dropout=0.1, recurrent_dropout=0.2, return_sequences=True),
    LSTM(50, dropout=0.1, recurrent_dropout=0.2),
    Dense(64, activation='relu'),
    Dense(2, activation='softmax'),
])
model4.summary()

In [None]:
# Compile and train model 4; note the batch size here is 120, not 1200.
model4.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['acc'],
)
history4 = model4.fit(
    x=np.array(X_train),
    y=np.array(y_train),
    batch_size=120,
    epochs=30,
    validation_data=(np.array(X_val), np.array(y_val)),
)

In [None]:
# Model 4: validation (red) vs. training (blue) loss per epoch.
for series_key, line_fmt, legend_label in (
    ('val_loss', 'r', 'val_loss'),
    ('loss', 'b', 'train_loss'),
):
    plt.plot(history4.history[series_key], line_fmt, label=legend_label)
plt.legend()

In [None]:
# Model 4: validation (red) vs. training (blue) accuracy per epoch.
# Uses `history4`; this cell previously re-plotted `history3`, so the stacked
# model's accuracy curves were never shown.
plt.plot(history4.history['val_acc'],'r',label='val_acc')
plt.plot(history4.history['acc'],'b',label='train_acc')
plt.legend()

In [None]:
# Class probabilities for the padded test sequences from model 4.
predict = model4.predict(X_test)

In [None]:
# Turn each row of class probabilities into a hard label: 0 when the first
# column is at least 0.5, otherwise 1. Store the labels on the test frame.
predict_final = [
    0 if class_probs[0] >= 0.5 else 1
    for class_probs in predict
]
test_data['target'] = predict_final

In [None]:
# Write the id/target pairs to the submission file, without the index column.
submission = test_data.loc[:, ['id', 'target']]
submission.to_csv('Submission.csv', index=False)