In [None]:
# Input data files are available in the read-only "../input/" directory
import os
# Print the full path of every file found under the Kaggle input directory.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for path in (os.path.join(dirname, name) for name in filenames):
        print(path)


In [None]:
import numpy as np 
import pandas as pd

import re
import spacy 
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import LSTM, Dense, Dropout, Activation
from tensorflow.keras.callbacks import EarlyStopping 

## Data Acquisition

In [None]:
# Load the training CSV, print its (rows, columns) shape, and preview the first rows.
df_train = pd.read_csv('/kaggle/input/nlp-getting-started/train.csv')
print(df_train.shape)
df_train.head()

In [None]:
# Load the test CSV, print its (rows, columns) shape, and preview the first rows.
df_test = pd.read_csv('/kaggle/input/nlp-getting-started/test.csv')
print(df_test.shape)
df_test.head()

In [None]:
# Load the sample submission file; its 'target' column is overwritten with the
# model's predictions in the Submission section.
df_submission = pd.read_csv('/kaggle/input/nlp-getting-started/sample_submission.csv')
print(df_submission.shape)
df_submission.head()

## Data Preprocessing

In [None]:
# Count how many training rows fall into each 'target' value (class balance check).
df_train['target'].value_counts()

In [None]:
# Per-column count of missing values: train first, then test.
print(df_train.isna().sum(), '\n')
print(df_test.isna().sum())

In [None]:
# Replace missing keywords with an empty string so that the later
# text + ' ' + keyword concatenation never produces NaN.
#
# The result is assigned back to the column rather than calling
# fillna(inplace=True) on df['keyword']: that form operates on a column
# selection (chained assignment), which raises a FutureWarning on recent
# pandas 2.x and does not modify the parent frame under Copy-on-Write.
df_train['keyword'] = df_train['keyword'].fillna('')
df_test['keyword'] = df_test['keyword'].fillna('')

In [None]:
# Append each row's keyword to its text, separated by one space, in both frames.
for frame in (df_train, df_test):
    frame['text'] = frame['text'] + ' ' + frame['keyword']

In [None]:
# 'keyword' and 'location' are not used past this point; remove them from both frames in place.
df_train.drop(columns=['keyword', 'location'], inplace=True)
df_test.drop(columns=['keyword', 'location'], inplace=True)

In [None]:
# Collect every training row whose text occurs more than once
# (keep=False marks all copies, not just the repeats), then preview them.
duplicate_records = df_train.loc[df_train.duplicated(subset='text', keep=False)]
print(duplicate_records.shape)
duplicate_records.head()

In [None]:
# Keep only the first occurrence of each distinct text and drop the repeats, in place.
# NOTE(review): repeated texts may carry different 'target' values; keep='first'
# retains whichever label happens to come first — confirm that is acceptable.
df_train.drop_duplicates(subset=['text'], keep='first', inplace=True)

In [None]:
# Built once when this cell runs. The original created a new PorterStemmer and
# re-read the NLTK stop-word list (a Python list, scanned linearly) for every
# single token, which is needlessly slow. A frozenset gives constant-time lookups.
_STEMMER = PorterStemmer()
_STOP_WORDS = frozenset(stopwords.words('english'))


def preprocess(text):
    """Normalise raw text into a space-joined string of stemmed tokens.

    Steps, in order:
      1. replace every character outside a-z / A-Z with a space,
      2. lowercase and split on whitespace,
      3. drop tokens found in NLTK's English stop-word list,
      4. Porter-stem the remaining tokens.

    Parameters
    ----------
    text : str
        Raw text to clean.

    Returns
    -------
    str
        The surviving stemmed tokens joined by single spaces; an empty
        string when no token survives.
    """
    tokens = re.sub('[^a-zA-Z]', ' ', text).lower().split()
    return ' '.join(_STEMMER.stem(token) for token in tokens if token not in _STOP_WORDS)

In [None]:
# Clean the text column of both frames with preprocess(), then preview the training frame.
df_train['text'] = df_train['text'].apply(preprocess)
df_test['text'] = df_test['text'].apply(preprocess)
df_train.head()

In [None]:
# Encoding hyper-parameters; embedding_dim is consumed by the model cell.
voc_size = 50000
sent_length = 30
embedding_dim = 300

# Hash each cleaned text into integer word ids with one_hot, then pad the id
# lists at the front (padding='pre') to a fixed length of sent_length.
X_train = pad_sequences(
    [one_hot(words, voc_size) for words in df_train['text']],
    padding='pre',
    maxlen=sent_length,
)
Y_train = df_train['target']
X_test = pad_sequences(
    [one_hot(words, voc_size) for words in df_test['text']],
    padding='pre',
    maxlen=sent_length,
)

# Sanity check on the resulting shapes.
print(X_train.shape, Y_train.shape)
print(X_test.shape)

## Model Development

In [None]:
# Binary classifier: embedding -> two stacked LSTMs -> two dense layers -> sigmoid output.
model = Sequential()
model.add(Embedding(voc_size, embedding_dim, input_length=sent_length))
model.add(Dropout(0.2))

# return_sequences=True so the second LSTM receives one vector per time step.
model.add(LSTM(64, return_sequences=True))
model.add(Dropout(0.2))

model.add(LSTM(64))
model.add(Dropout(0.2))

model.add(Dense(64, activation='relu'))
model.add(Dense(64, activation='relu'))
# The original inserted Activation('softmax') here. Softmax over the 64 hidden
# units forces them to sum to 1, squashing the features fed to the output unit.
# The output already uses a sigmoid paired with binary cross-entropy, so the
# softmax was removed.
model.add(Dense(1, activation='sigmoid'))

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.summary()

In [None]:
# Stop when validation loss has not improved for 5 consecutive epochs.
# restore_best_weights=True makes the model keep the weights from the epoch with
# the lowest val_loss instead of whatever the last executed epoch produced, so
# the predictions made afterwards do not come from an overfit final epoch.
# NOTE(review): on some older Keras releases the weights are restored only when
# early stopping actually triggers — confirm against the installed version.
early_stop = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=5,
                           restore_best_weights=True)
# The last 10% of the training data is held out for validation (validation_split=0.1).
model_history = model.fit(X_train, Y_train, validation_split=0.1, batch_size=64, epochs=10, 
                          shuffle=True, callbacks=[early_stop])

In [None]:
# Score the test set, then turn each output into a hard label:
# 1 when the value is strictly above 0.5, otherwise 0.
prediction = model.predict(X_test)
prediction = (prediction > 0.5).astype(int).ravel().tolist()
prediction[0:10]

## Submission

In [None]:
# Overwrite the sample submission's 'target' column with the predicted labels
# and write the result to prediction.csv without the index column.
# NOTE(review): this assumes the rows of sample_submission.csv are in the same
# order as test.csv, since predictions are assigned by position — confirm.
df_submission['target'] = prediction
df_submission.to_csv('prediction.csv', index=False)
df_submission.head()

In [None]:
# Distribution of the predicted labels; a single-class result here would signal a degenerate model.
df_submission['target'].value_counts()