# Step 1: Import required libraries.

In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

# Step 2: Read Train data (Descriptive and Exploratory analysis)

In [None]:
# Load the labelled training tweets from the competition input folder.
train_data = pd.read_csv(filepath_or_buffer='../input/nlp-getting-started/train.csv')

In [None]:
# Print a summary of the training frame (columns, dtypes, non-null counts).
train_data.info()

In [None]:
# Bare expression: the notebook renders the training DataFrame as the cell output.
train_data

## Visualizing the number of words in sentences


In [None]:
# Number of whitespace-separated words in each training tweet.
# str.split() with no argument splits on any run of whitespace, so repeated
# spaces, tabs and newlines inside a tweet do not produce empty "words" or
# glued-together tokens the way split(' ') does.
word_len = [len(text.split()) for text in train_data.text.values]

In [None]:
# Bar chart of how many tweets have each word count.
plt.figure(figsize=(12, 6))
# Pass the values through the `x` keyword: recent seaborn releases treat the
# first positional argument of countplot as `data`, not `x`.
sns.countplot(x=word_len)
plt.xlabel("Word lengths:")
plt.ylabel('Counts:')
plt.title('Train Data \n Max length=' + str(max(word_len)))
plt.show()

## Visualizing the number of instances in target field

In [None]:
# Class balance of the `target` column.
# Count the labels once and reuse the result for both lines of the title.
target_counts = train_data.target.value_counts()
plt.figure(figsize=(10, 8))
# Keyword `x=` is required: recent seaborn releases treat the first
# positional argument of countplot as `data`, not `x`.
sns.countplot(x=train_data.target)
plt.title('Count for Zeros:' + str(target_counts[0]) + '\n' +
          'Count for Ones:' + str(target_counts[1]))
plt.show()

# Step 3: Pre-processing

## Two columns contain null values (keyword and location). We drop both, since only the text and target columns are used.

In [None]:
# Remove the keyword and location columns in a single call.
train_data = train_data.drop(columns=['keyword', 'location'])

## Define X_train and Y_train data.

In [None]:
# Model input is the tweet text column; the label is the target column.
X_train = train_data['text']
Y_train = train_data['target']

## Reshaping Y_train so that it's easier to process it when using LSTM.

In [None]:
# Turn the flat label vector into a single-column tensor (-1 infers the row count).
Y_train = tf.reshape(Y_train, shape=[-1, 1])

In [None]:
# Bare expression: the notebook shows the reshaped label tensor as the cell output.
Y_train

## Tokenizing the train text data

In [None]:
# Tokenizer vocabulary cap and the fixed sequence length fed to the network.
max_words, max_len = 100000, 100

# Build the word index from the training tweets.
tok = Tokenizer(num_words=max_words)
tok.fit_on_texts(X_train)

# Encode each tweet as a list of integer word ids, then pad/truncate every
# encoded tweet to exactly max_len entries.
sequences = tok.texts_to_sequences(X_train)
sequences_matrix = sequence.pad_sequences(sequences, maxlen=max_len)

In [None]:
# Inspect the padded id sequence of one training tweet (row 1729).
sequences_matrix[1729]

# Step 4: Creating RNN Model 

In [None]:
# Stacked-LSTM binary classifier, assembled in one Sequential constructor call.
model = tf.keras.Sequential([
    tf.keras.layers.Input(shape=[max_len]),
    # Map each word id to a 128-dimensional vector.
    tf.keras.layers.Embedding(max_words, 128, input_length=max_len),

    # The first two LSTMs return the full sequence so the next LSTM can consume it.
    tf.keras.layers.LSTM(200, return_sequences=True),
    tf.keras.layers.Dropout(0.5),

    tf.keras.layers.LSTM(200, return_sequences=True),
    tf.keras.layers.Dropout(0.5),

    # The last LSTM returns only its final output.
    tf.keras.layers.LSTM(200),
    tf.keras.layers.Dropout(0.5),

    # NOTE(review): no activation is given, so this Dense layer is linear — confirm that is intended.
    tf.keras.layers.Dense(256),
    tf.keras.layers.Dropout(0.5),

    tf.keras.layers.Dense(1, activation='sigmoid'),  # output layer
])

In [None]:
# Print the layer-by-layer summary of the model built above.
model.summary()

In [None]:
# Binary cross-entropy pairs with the single sigmoid output; accuracy is
# tracked under the history key 'acc'.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['acc'],
)

# Step 5: Training the model

In [None]:
# Train on the full padded training set; `hist` keeps the per-epoch metrics.
hist = model.fit(
    x=sequences_matrix,
    y=Y_train,
    batch_size=64,
    epochs=30,
)

# Step 6: Metrics and results

## Train Accuracy

In [None]:
# Loss and accuracy measured on the same data the model was trained on.
model.evaluate(x=sequences_matrix, y=Y_train)

## Visualizing Train loss and accuracy epoch wise

In [None]:
# Training loss per epoch, drawn in green.
plt.plot(hist.history['loss'], 'g')
plt.ylabel('Loss:')
plt.xlabel('Epochs:')
plt.show()

In [None]:
# Training accuracy per epoch (history key 'acc'), drawn in red.
plt.plot(hist.history['acc'], 'r')
plt.ylabel('Accuracy:')
plt.xlabel('Epochs:')
plt.show()

# Step 7: Prediction on test data

In [None]:
# Load the unlabelled test tweets from the competition input folder.
test_data = pd.read_csv(filepath_or_buffer='../input/nlp-getting-started/test.csv')

In [None]:
# Tweet text of the test set, used as the prediction input.
X_test = test_data['text']

In [None]:
# Encode the test tweets with the tokenizer that was fitted on the training
# text (`tok`). Fitting a fresh Tokenizer on the test set would assign
# different integer ids to the same words, so the trained embedding layer
# would receive ids that no longer match the vocabulary it learned.
sequences_test = tok.texts_to_sequences(X_test)
# Pad/truncate to the same fixed length the model was trained with.
sequences_matrix_test = sequence.pad_sequences(sequences_test, maxlen=max_len)

## Predict

In [None]:
# Sigmoid outputs of the trained model for every padded test tweet.
pred = model.predict(x=sequences_matrix_test)

## This step is required because the output layer uses a sigmoid activation: the model outputs values between 0 and 1, which are thresholded at 0.5 to obtain 0/1 class labels.

In [None]:
# Threshold the sigmoid outputs at 0.5 and store the result as integer 0/1 labels.
pred = (pred > 0.5).astype(int)

## Submission

In [None]:
# Submission frame: the test ids alongside the predicted labels.
p = test_data[['id']].copy()
p['target'] = pred

In [None]:
# Write the submission file without the DataFrame index column.
p.to_csv(path_or_buf='./Submission_sachin.csv', index=False)