In [None]:
import pandas as pd
import numpy as np 

In [None]:
# Read the competition's train and test CSV files into two DataFrames.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/nlp-getting-started/train.csv",
        "/kaggle/input/nlp-getting-started/test.csv",
    )
)

In [None]:
# Keep the id columns aside: they are dropped from the frames in the next cell
# and reused later as the index of the prediction tables.
train_id, test_id = train['id'], test['id']

In [None]:
# The ids were saved separately above, so remove the column from both frames.
for frame in (train, test):
    frame.drop(columns=['id'], inplace=True)

The `location` column has many missing values, and the `keyword` column only repeats information that is already present in the `text` column, which we will extract from the text later on. Both columns are therefore dropped below.

In [None]:
# Missing-value count for every column of the training frame.
train.isna().sum()

In [None]:
# Missing-value count for every column of the test frame.
test.isna().sum()

In [None]:
# Remove the two columns that the note above decided not to use.
unused_columns = ['keyword', 'location']
for frame in (train, test):
    frame.drop(columns=unused_columns, inplace=True)

In [None]:
# Preview the first rows of what is left after dropping columns.
train.head(n=5)

## Data Cleaning

In [None]:
# Lower-case every text entry in both frames.
for frame in (train, test):
    frame['text'] = [raw.lower() for raw in frame['text']]

In [None]:
# Strip punctuation characters from every text entry
import re
import string

# One character class covering everything in string.punctuation; re.escape
# makes characters that are special inside a regex match literally.
punctuation_pattern = re.compile('[%s]' % re.escape(string.punctuation))
for frame in (train, test):
    frame['text'] = [punctuation_pattern.sub('', raw) for raw in frame['text']]

In [None]:
# Removing numeric characters
# The pattern is a raw string so that \d reaches the regex engine unchanged;
# in a plain string literal "\d" is an invalid escape sequence, which newer
# Python versions warn about.
digit_pattern = re.compile(r'\d')
train['text'] = [digit_pattern.sub('', n) for n in train['text']]
test['text'] = [digit_pattern.sub('', n) for n in test['text']]

## Preprocessing Text Data

### Tokenization and Stop Words

In [None]:
import nltk
from nltk.tokenize import word_tokenize

In [None]:
# Split every text entry into a list of word tokens

for frame in (train, test):
    frame['text'] = [word_tokenize(sentence) for sentence in frame['text']]

In [None]:
# Peek at the first few token lists produced by the tokenizer.
train['text'].head(n=5)

In [None]:
# Drop English stop words from every token list

from nltk.corpus import stopwords

# A set gives constant-time membership checks inside the nested comprehension.
stop_words = set(stopwords.words('english'))
for frame in (train, test):
    frame['text'] = [
        [token for token in tokens if token not in stop_words]
        for tokens in frame['text']
    ]

In [None]:
# First rows of the training frame after stop-word removal.
train.head(n=5)

### Lemmatization

In [None]:
from collections import defaultdict
from nltk.tag import pos_tag
from nltk.corpus import wordnet as wn

# Lookup from the first letter of a POS tag to a WordNet part of speech.
# Any letter not listed explicitly falls back to noun via the default factory.
tag_map = defaultdict(lambda: wn.NOUN)
tag_map.update({'J': wn.ADJ, 'V': wn.VERB, 'R': wn.ADV})

tag_map

In [None]:
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()


def _lemmatize_tokens(tokens):
    """Lemmatize one token list, choosing the WordNet category from each token's POS tag."""
    return [
        lemmatizer.lemmatize(word, tag_map[tag[0]])
        for word, tag in pos_tag(tokens)
    ]


train['text'] = [_lemmatize_tokens(tokens) for tokens in train['text']]
test['text'] = [_lemmatize_tokens(tokens) for tokens in test['text']]

In [None]:
# First rows of the training frame after lemmatization.
train.head(n=5)

In [None]:
# Glue each token list back into one space-separated string.
train['lemmatized_text'] = train['text'].map(' '.join)
test['lemmatized_text'] = test['text'].map(' '.join)

In [None]:
# First rows, now including the new lemmatized_text column.
train.head(n=5)

In [None]:
# The token lists now live on as lemmatized_text, so drop the text column.
for frame in (train, test):
    frame.drop(columns=['text'], inplace=True)

### Word Embedding using TF-IDF Vectorizer

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(max_features = 5000)

# Learn the vocabulary and IDF weights from the training text only.
train_emb = tfidf.fit_transform(train['lemmatized_text']).toarray()
# Encode the test text with the vocabulary fitted on train. Calling
# fit_transform here would rebuild the vocabulary from the test text, so a
# given column would stand for a different word than in train_emb and a model
# trained on train_emb would receive meaningless features.
test_emb = tfidf.transform(test['lemmatized_text']).toarray()

In [None]:
# Width of the TF-IDF matrix, shown as a 1-tuple.
train_emb.shape[-1:]

In [None]:
# Labels the classifiers are trained on.
y = train.loc[:, 'target']

## Model Training

### Naive Bayes

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score

In [None]:
# Multinomial Naive Bayes classifier created with no arguments, i.e. the
# library's default settings.
MNB = MultinomialNB()

We split the training set into training and validation subsets to see how well Naive Bayes performs on our data.

In [None]:
# Hold out 30% of the rows for validation; the fixed random_state keeps the
# split repeatable between runs.
x_train, x_valid, y_train, y_valid = train_test_split(
    train_emb,
    y,
    test_size=0.3,
    random_state=100,
)

In [None]:
# Fit on the training split, then predict labels for the validation split.
# fit() returns the estimator itself, which allows the chained call.
pred_MNB = MNB.fit(x_train, y_train).predict(x_valid)

In [None]:
# Share of validation rows whose predicted label matches the true label.
validation_accuracy = accuracy_score(y_valid, pred_MNB)
print("Accuracy score : {:.2f}".format(validation_accuracy))

In [None]:
# ROC-AUC measures how well a score ranks positives above negatives, so it is
# given the predicted probability instead of the hard predicted labels (which
# reduce the ROC curve to a single operating point).
# Column 1 of predict_proba corresponds to MNB.classes_[1]; this assumes the
# target is encoded as 0/1 so that column is the positive class.
valid_proba_MNB = MNB.predict_proba(x_valid)[:, 1]
print("ROC-AUC score : {:.2f}".format(roc_auc_score(y_valid, valid_proba_MNB)))

In [None]:
# Text report of per-class metrics on the validation split.
validation_report = classification_report(y_valid, pred_MNB)
print(validation_report)

### Training on the Full Training Set and Predicting on the Test Set

In [None]:
# Refit the classifier on every training row (no validation hold-out this time).
MNB.fit(X=train_emb, y=y)

In [None]:
# Predicted label for every row of the test feature matrix.
MNB_predictions = MNB.predict(X=test_emb)

In [None]:
# One row per test id, holding the Naive Bayes prediction in a "target" column.
Prediction_results = pd.DataFrame(
    data={"target": MNB_predictions},
    index=test_id,
)

In [None]:
#submission_file = Prediction_results.to_csv('submission.csv')

#### Multinomial Naive Bayes got a 0.515 score which is decent but can be significantly improved.

### Support Vector Machines

In [None]:
from sklearn import svm
# Create a support vector classifier with no arguments. The fit and predict
# calls below are commented out, so this cell does not train or evaluate it.
SVC = svm.SVC()
#SVC.fit(x_train,y_train)
#pred_SVC = SVC.predict(x_valid)

In [None]:
#print("Accuracy score : {:.2f}".format(accuracy_score(y_valid, pred_SVC)))

In [None]:
#print("ROC-AUC score : {:.2f}".format(roc_auc_score(y_valid, pred_SVC)))

## Alternative Approach using Sequencing, Padding, and LSTM

### Sequencing and Sentence Padding

In [None]:
from collections import Counter

# Count how often each word occurs across the whole corpus
def word_counter(text):
    """Return a Counter of whitespace-separated words over all entries of ``text``.

    ``text`` must expose a ``.values`` iterable of strings (for example a
    pandas Series); every entry is split on whitespace and each resulting
    word is tallied.
    """
    return Counter(
        word
        for sentence in text.values
        for word in sentence.split()
    )

In [None]:
# Reminder of the columns available for the sequence-based approach.
train.head(n=5)

In [None]:
train_text = train['lemmatized_text']
counter = word_counter(train_text)
# Show only the most frequent words: echoing the whole Counter would print one
# entry per distinct word in the corpus and flood the cell output.
counter.most_common(10)

In [None]:
# len(counter) is an integer count, so print it as one instead of with two
# decimal places.
print("Number of unique words in the corpus : {}".format(len(counter)))

In [None]:
# maximum number of words in a sequence
max_length = 20
# Vocabulary size: number of distinct words counted in the training text.
words = len(counter)

In [None]:
# Short aliases for the model inputs (sentences) and the training labels.
train_sent, train_labels = train['lemmatized_text'], train['target']
test_sent = test['lemmatized_text']

In [None]:
from keras.preprocessing.text import Tokenizer

# Build the word -> integer index from the training sentences only; the test
# sentences are encoded later with this same fitted tokenizer.
# NOTE(review): Keras documents num_words as keeping the num_words-1 most
# frequent words, so the rarest word may be dropped here — confirm intended.
tokenizer = Tokenizer(num_words=words)
tokenizer.fit_on_texts(train_sent)

In [None]:
word_index = tokenizer.word_index
# Show a small sample of the mapping: echoing the full dict would print one
# entry per vocabulary word.
dict(list(word_index.items())[:10])

In [None]:
# Encode every training sentence as a list of integer word ids.
train_sequence = tokenizer.texts_to_sequences(texts=train_sent)

In [None]:
# Inspect the integer encoding of the first training sentence.
train_sequence[0]

In [None]:
# Encode the test sentences with the tokenizer that was fitted on the training sentences.
test_sequence = tokenizer.texts_to_sequences(texts=test_sent)

In [None]:
# Show only the first few encoded sentences; echoing the whole list would
# print one entry per test row.
test_sequence[:5]

#### Sequence Padding

In [None]:
from keras.preprocessing.sequence import pad_sequences

# Bring every training sequence to exactly max_length entries: shorter ones
# are padded at the end, longer ones are cut at the end.
train_padded = pad_sequences(
    train_sequence,
    maxlen=max_length,
    padding="post",
    truncating="post",
)

In [None]:
# Display the padded training sequences.
train_padded

In [None]:
# Same padding and truncation settings as for the training sequences, so both
# arrays have max_length columns.
test_padded = pad_sequences(
    test_sequence,
    maxlen=max_length,
    padding="post",
    truncating="post",
)

In [None]:
# Display the padded test sequences.
test_padded

### Model Building using LSTM

In [None]:
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense, Dropout
from keras.initializers import Constant
from keras.optimizers import Adam
import tensorflow as tf

def leaky_relu(z, name = None):
    """Leaky ReLU activation: the element-wise maximum of ``z`` and ``0.01 * z``.

    ``name`` is forwarded unchanged to ``tf.maximum`` as the operation name.
    """
    leaked = 0.01 * z
    return tf.maximum(leaked, z, name=name)

# Embedding -> single LSTM -> small dense layer -> one-unit output.
model = Sequential()

model.add(Embedding(words,32,input_length = max_length))
#model.add(LSTM(128, return_sequences = True, dropout = 0.1))
model.add(LSTM(64, dropout = 0.1))
model.add(Dense(units = 32 , activation = leaky_relu))
# The output must be a probability in [0, 1] for binary_crossentropy to be a
# valid loss. The previous ELU activation can produce values below 0 and above
# 1, so the single output unit uses a sigmoid instead.
model.add(Dense(1, activation = 'sigmoid'))

optimizer = Adam(learning_rate = 3e-4)

model.compile(loss = 'binary_crossentropy', optimizer = optimizer, metrics = ['accuracy'])

In [None]:
# Print the model summary for the network built above.
model.summary()

In [None]:
# Train on the full padded training set for 40 epochs; no validation data is supplied.
model.fit(
    x=train_padded,
    y=train_labels,
    epochs=40,
)

In [None]:
import h5py
#model.save('baseline_lstm_model.h5')

In [None]:
from keras.models import load_model
#model = load_model('baseline_lstm_model.h5')

In [None]:
# Sequential.predict_classes is not available in newer TensorFlow/Keras
# releases. For a single-unit output it thresholded the predicted score at
# 0.5, which is reproduced here explicitly and works on old and new versions.
lstm_base_pred = (model.predict(test_padded, verbose = 0) > 0.5).astype("int32")

In [None]:
# Flatten the predictions into a 1-D array so they fit a single DataFrame column.
lstm_base_pred = lstm_base_pred.reshape(-1)

In [None]:
# Number of predictions produced.
lstm_base_pred.shape[0]

In [None]:
# One row per test id, holding the LSTM's predicted class in a "target" column.
Prediction_results_lstm = pd.DataFrame(
    data={"target": lstm_base_pred},
    index=test_id,
)
Prediction_results_lstm

In [None]:
#submission_lstm_elu_leaky_relu = Prediction_results_lstm.to_csv('submission_lstm_elu_leaky_relu.csv')