In [None]:


import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _, files_here in os.walk('/kaggle/input'):
    for file_name in files_here:
        full_path = os.path.join(folder, file_name)
        print(full_path)



# Loading Data 

In [None]:
# Read the competition's training CSV into a DataFrame.
df = pd.read_csv(
    '../input/nlp-getting-started/train.csv',
)

In [None]:
# Preview the first five rows of the training frame.
df.head(n=5)


In [None]:
# Count missing values per column (`isna` is the same check as `isnull`).
df.isna().sum()

In [None]:
# (number of rows, number of columns) of the training frame.
df.shape

In [None]:
# Number of distinct values in the `keyword` column (missing values count as one).
len(df['keyword'].unique())

In [None]:
# Number of distinct values in the `location` column (missing values count as one).
len(df['location'].unique())

In [None]:
# Keep every column from position 3 onward; `text` and `target` are read from
# this frame further down.
data = df[df.columns[3:]]

In [None]:
# Preview the first five rows of the reduced frame.
data.head(n=5)

# Preparing data for model training

In [None]:
# Plain Python list holding the contents of the `text` column.
sentences = data['text'].to_list()

In [None]:
# Plain Python list holding the contents of the `target` column.
labels = data['target'].to_list()

In [None]:
import tensorflow as tf
import tensorflow.keras as k

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer

In [None]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
# Find the largest len() over all sentences (0 when the list is empty).
# NOTE(review): this name shadows the builtin `max`; it is kept because later
# cells read it. If the entries are strings, len() counts characters, not tokens.
max = 0
for text in sentences:
    text_length = len(text)
    max = text_length if text_length > max else max

In [None]:
# Show the longest len() found among the entries of `sentences`.
print(max)

In [None]:
# Tokenizer / embedding settings used for the train-validation experiment.
vocab, embedding = 80000, 32   # tokenizer `num_words` cap, embedding width
oov = '<OOV>'                  # placeholder token for out-of-vocabulary words
padding = truncate = 'post'    # pad and truncate at the end of each sequence
maxlength = max                # pad every sequence to the longest length found

In [None]:
# Ordered 80/20 split: the first 80% of rows train, the remainder validates.
ratio = int(0.8 * len(sentences))
train, val = sentences[:ratio], sentences[ratio:]
train_label, val_labels = labels[:ratio], labels[ratio:]

In [None]:
# Build the vocabulary from the training split only, then encode both splits.
tokenizer = Tokenizer(num_words=vocab, oov_token=oov)
tokenizer.fit_on_texts(train)
word_index = tokenizer.word_index

training = tokenizer.texts_to_sequences(train)
validation = tokenizer.texts_to_sequences(val)

# Pad / truncate both splits to the same fixed length.
training_pad = pad_sequences(
    training,
    maxlen=maxlength,
    padding=padding,
    truncating=truncate,
)
validation_pad = pad_sequences(
    validation,
    maxlen=maxlength,
    padding=padding,
    truncating=truncate,
)

In [None]:
# Load the test CSV and encode its text with the tokenizer fitted above,
# using the same padding settings as the training data.
test = pd.read_csv('../input/nlp-getting-started/test.csv')
test_sentences = test['text'].to_list()
test_sen_token = tokenizer.texts_to_sequences(test_sentences)
test_pad = pad_sequences(
    test_sen_token,
    maxlen=maxlength,
    padding=padding,
    truncating=truncate,
)


# Training the Model

In [None]:
# Start from a clean Keras session, then stack the layers one by one:
# embedding -> average over the sequence axis -> small dense layer -> sigmoid.
tf.keras.backend.clear_session()

model = tf.keras.Sequential()
model.add(tf.keras.layers.Embedding(vocab, embedding, input_length=maxlength))
model.add(tf.keras.layers.GlobalAveragePooling1D())
model.add(tf.keras.layers.Dense(10, activation='relu'))
model.add(tf.keras.layers.Dense(1, activation='sigmoid'))

model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
model.summary()

In [None]:
# Convert both label lists to NumPy arrays before passing them to `fit`.
train_label, val_labels = np.array(train_label), np.array(val_labels)

In [None]:
# Checkpoint callback writing to model.h5, keeping only the best model so far.
model_check = tf.keras.callbacks.ModelCheckpoint(
    filepath='model.h5',
    save_best_only=True,
)

In [None]:

# Train on the training split, validate on the held-out split each epoch and
# let the checkpoint callback write model.h5.
num_epochs = 20
history = model.fit(
    x=training_pad,
    y=train_label,
    epochs=num_epochs,
    validation_data=(validation_pad, val_labels),
    callbacks=[model_check],
)

In [None]:
# Plotting Learning Curves
import matplotlib.pyplot as plt


def plot_graphs(history, string):
  """Plot one training metric and its validation counterpart against epochs.

  Args:
    history: object exposing a `history` mapping of metric name to per-epoch
      values (as returned by `model.fit`).
    string: metric name; `'val_' + string` must also be a key of the mapping.
  """
  series_names = [string, 'val_' + string]
  for series_name in series_names:
    plt.plot(history.history[series_name])
  plt.xlabel("Epochs")
  plt.ylabel(string)
  plt.legend(series_names)
  plt.show()
  
# Draw the accuracy curves first, then the loss curves.
for metric_name in ("accuracy", "loss"):
    plot_graphs(history, metric_name)

In [None]:
# After selecting the hyperparameters on the train/validation split, retrain
# from scratch on the entire training set.
#
# Fixes relative to the previous version of this cell:
#   * the model is rebuilt, so its embedding matches the new vocabulary size
#     and the new tokenizer's word indices (previously the already-trained
#     model kept training on re-indexed tokens);
#   * the checkpoint monitors the training loss, because no validation data is
#     passed to `fit` here -- with the default `val_loss` monitor and
#     `save_best_only=True` nothing would be saved and the stale model.h5
#     would be loaded afterwards;
#   * `sentences` is no longer overwritten with integer sequences, so the cell
#     can be re-run;
#   * the test set is re-encoded with the new tokenizer so the predictions
#     use the same word indices the final model was trained on.

tf.keras.backend.clear_session()
vocab = 120000
oov = '<OOV>'
embedding = 32
padding = 'post'
truncate = 'post'
maxlength = max

# Fit a fresh tokenizer on every training sentence and encode them.
tokenizer = Tokenizer(num_words=vocab, oov_token=oov)
tokenizer.fit_on_texts(sentences)
word_index = tokenizer.word_index
sentence_sequences = tokenizer.texts_to_sequences(sentences)
sentences_pad = pad_sequences(sentence_sequences, maxlen=maxlength, padding=padding, truncating=truncate)
labels = np.array(labels)

# The word -> index mapping changed, so the test set must be encoded again.
test_sen_token = tokenizer.texts_to_sequences(test_sentences)
test_pad = pad_sequences(test_sen_token, maxlen=maxlength, padding=padding, truncating=truncate)

# Same architecture as the validated model, re-initialised for the new vocabulary.
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab, embedding, input_length=maxlength),
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(10, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid'),
])
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# No validation split in this run: track the training loss instead of val_loss.
model_check = tf.keras.callbacks.ModelCheckpoint('model.h5', monitor='loss', save_best_only=True)

num_epochs = 10
history = model.fit(sentences_pad, labels, epochs=num_epochs, verbose=0, callbacks=[model_check])

In [None]:
# Reload the model stored in the checkpoint file.
model = tf.keras.models.load_model(
    filepath='model.h5',
)

In [None]:
# Load the sample submission file shipped with the competition data.
sample = pd.read_csv(
    '../input/nlp-getting-started/sample_submission.csv',
)

In [None]:
# Preview the first five rows of the sample submission.
sample.head(n=5)

# Generating Predictions

In [None]:
# Run the loaded model on the padded test sequences.
predictions = model.predict(x=test_pad)

In [None]:
# Threshold the first output of every prediction row at 0.5:
# strictly greater than 0.5 -> 1, otherwise 0.
pred = [1 if row[0] > 0.5 else 0 for row in predictions]

In [None]:
# Attach the thresholded predictions to the test frame as a `target` column.
test = test.assign(target=pd.Series(pred))

In [None]:
# Preview the first five rows of the test frame, now including `target`.
test.head(n=5)

In [None]:
# Start the submission frame with the `id` column of the test set.
submit = pd.DataFrame({'id': test['id']})

In [None]:
# Add the predicted labels next to the ids.
submit = submit.assign(target=test['target'])

In [None]:
# Preview the first five rows of the submission frame.
submit.head(n=5)

In [None]:
# Write the submission to disk without the DataFrame index column.
submit.to_csv(
    'Final_Submit.csv',
    index=False,
)