In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import re
from tqdm.notebook import tqdm

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer, PorterStemmer

from sklearn.model_selection import train_test_split
from numpy.random import seed
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Bidirectional
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import Dropout
from tensorflow.keras import backend
from tensorflow.keras.optimizers import RMSprop
from tensorflow.keras.callbacks import LearningRateScheduler
from tensorflow.random import set_seed
from matplotlib import pyplot as plt

# Hook tqdm into pandas so the `progress_*` methods (e.g. `progress_map`,
# used below when preprocessing the training text) become available.
tqdm.pandas()
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the whole /kaggle/input tree and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Reading data: load the competition's train and test CSVs plus the sample
# submission file from the Kaggle input directory.
train_df = pd.read_csv('/kaggle/input/nlp-getting-started/train.csv')
test_df = pd.read_csv('/kaggle/input/nlp-getting-started/test.csv')
sample_submission = pd.read_csv('/kaggle/input/nlp-getting-started/sample_submission.csv')

In [None]:
# Compiled once at definition time instead of on every call.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F1E0-\U0001F1FF"  # flags
    "\U0001F900-\U0001F9FF"  # supplemental symbols & pictographs
    "\u2600-\u26FF"          # miscellaneous symbols
    "\u2700-\u27BF"          # dingbats
    "]+"
)


def remove_emojies(text):
    """Strip emoji characters from ``text``.

    Every run of characters falling in the Unicode ranges listed in
    ``_EMOJI_PATTERN`` is deleted (replaced by the empty string); all other
    characters, including surrounding whitespace, are left untouched.

    Args:
        text: The string to clean.

    Returns:
        A copy of ``text`` with the matched emoji characters removed.
    """
    return _EMOJI_PATTERN.sub('', text)

# Case-insensitive so that "HTTP://..." / "Https://..." links are removed too.
_URL_PATTERN = re.compile(r"http\S+", flags=re.IGNORECASE)


def remove_urls(text):
    """Delete URLs from ``text``.

    A URL is taken to be "http" (in any letter case) followed by every
    non-whitespace character up to the next whitespace, which also covers
    "https" links. The whitespace around a removed URL is kept.

    Args:
        text: The string to clean.

    Returns:
        A copy of ``text`` with every matched URL removed.
    """
    return _URL_PATTERN.sub("", text)

# Any whitespace character other than the plain space (newline, tab, ...).
_NON_SPACE_WHITESPACE = re.compile(r'[^\S ]')
# Runs of anything that is neither an ASCII letter nor a space.
_NON_ALPHA_OR_SPACE = re.compile(r'[^a-zA-Z ]+')


def remove_all_non_alpha(text):
    """Keep only ASCII letters and spaces in ``text``.

    Newlines, tabs and other non-space whitespace are first turned into a
    plain space so that the words they separate are not glued together
    (``"fire\\nnear"`` becomes ``"fire near"``, not ``"firenear"``). Every
    remaining character that is not ``a-z``, ``A-Z`` or a space is then
    deleted.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string, containing only ASCII letters and spaces.
    """
    text = _NON_SPACE_WHITESPACE.sub(' ', text)
    return _NON_ALPHA_OR_SPACE.sub('', text)

# Shared NLTK resources, built once and reused by every call below.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()


def lem_stem_rm_stopword(text):
    """Tokenize ``text``, drop stop words, then lemmatize and stem each token.

    Tokens found in ``stop_words`` are skipped (exact, case-sensitive match).
    Each remaining token is lemmatized twice -- first with the lemmatizer's
    default part of speech, then with ``pos="v"`` -- and finally stemmed.

    Args:
        text: The sentence to normalise.

    Returns:
        The processed tokens joined by single spaces.
    """
    def _normalise(token):
        # Lemmatize with the default POS, then as a verb, then stem the result.
        base = lemmatizer.lemmatize(token)
        base = lemmatizer.lemmatize(base, pos="v")
        return stemmer.stem(base)

    kept_tokens = (tok for tok in word_tokenize(text) if tok not in stop_words)
    return " ".join(_normalise(tok) for tok in kept_tokens)


def preprocess_sent(sent):
    """Run one raw sentence through the full text-cleaning pipeline.

    Steps, in order: remove emojis, remove URLs, lowercase, drop every
    character that is not a letter or space, then remove stop words and
    lemmatize/stem what is left.

    Args:
        sent: The raw sentence.

    Returns:
        The cleaned sentence as a single string.
    """
    # Emoji and URL removal run on the raw text; lowercasing comes after.
    lowered = remove_urls(remove_emojies(sent)).lower()
    letters_only = remove_all_non_alpha(lowered)
    return lem_stem_rm_stopword(letters_only)

In [None]:
# Quick sanity check of the pipeline on a sentence that mixes stop words,
# symbols ($, #) and two URL-like strings; the cell displays the cleaned result.
preprocess_sent('Anith is a $good #man vist his website http:\\www.anith.com and https:\\www.anith.com')

In [None]:
# preprocessing text: clean every training tweet with preprocess_sent and store
# the result in a new 'preprocessed_text' column (progress_map shows a progress bar).
train_df['preprocessed_text'] = train_df['text'].progress_map(preprocess_sent)

In [None]:
# extracting the model inputs and labels from the training data:
# `texts` is the cleaned-text column, `Y` the 'target' column as an array.
texts = train_df['preprocessed_text']
Y = train_df['target'].values

In [None]:
## Setting hyperparameters
# vocab_size: passed as `num_words` to the Tokenizer and as the Embedding input size.
vocab_size = 15000
# max_sent_len: `maxlen` every token sequence is padded/truncated to.
max_sent_len = 80
# random_seed: seeds numpy, tensorflow and the train/validation split.
random_seed = 1

In [None]:
# setting random seed for numpy and tensorflow for reproducibility
seed(random_seed)
set_seed(random_seed)

In [None]:
# splitting data into training and validation sets; stratify=Y keeps the class
# balance the same in both parts. No test_size is given, so scikit-learn's
# default split proportion applies.
texts_train, texts_valid, y_train, y_valid = train_test_split(texts, Y, random_state = random_seed, stratify=Y)

In [None]:
# tokenizing text using keras tokenizer on training data
# The vocabulary is fitted on the training split only; words outside it are
# mapped to the '<OOV>' token when sequences are generated.
tknzr = Tokenizer(oov_token='<OOV>', lower=True, num_words=vocab_size)
tknzr.fit_on_texts(texts_train)

In [None]:
# generating integer token sequences for the training and validation texts,
# both using the tokenizer fitted on the training split
train_seqs = tknzr.texts_to_sequences(texts_train)
valid_seqs = tknzr.texts_to_sequences(texts_valid)

In [None]:
# padding and truncating sequences to exactly max_sent_len tokens; both padding
# and truncation are applied at the end of each sequence ('post')
train_seqs_pad = pad_sequences(train_seqs, padding='post', truncating='post', maxlen=max_sent_len)
valid_seqs_pad = pad_sequences(valid_seqs, padding='post', truncating='post', maxlen=max_sent_len)

In [None]:
# Model creation
# Embedding -> two stacked bidirectional LSTMs -> dropout-regularised dense
# head ending in a single sigmoid unit for binary classification.
model = Sequential(
[
    # mask_zero=True: the input sequences are post-padded with 0 up to
    # max_sent_len, so without masking the LSTMs would process long runs of
    # padding as if they were real tokens. Index 0 is reserved for padding by
    # the Keras Tokenizer, so no real word is masked.
    Embedding(vocab_size, 32, mask_zero=True),
    # First recurrent layer returns the full sequence so it can feed the second.
    Bidirectional(LSTM(16, return_sequences=True)),
    Bidirectional(LSTM(16, activity_regularizer='l2')),
    Dropout(0.6),
    Dense(32, activation='relu',activity_regularizer='l2'),
    Dropout(0.6),
    Dense(16, activation='relu'),
    Dense(1, activation='sigmoid')
]
)

In [None]:
# compiling model: binary cross-entropy loss, with accuracy tracked under the
# key 'acc' (the 'acc'/'val_acc' history keys are read when plotting).
# NOTE(review): no optimizer is passed, so the Keras default is used --
# presumably RMSprop, which is imported above but never referenced; confirm.
model.compile(loss='binary_crossentropy', metrics=['acc'])

# Scheduling learning rate to decay as epochs progress
def scheduler(epoch, lr, decay=0.65):
    """Return the learning rate to use next, shrunk by a constant factor.

    Args:
        epoch: Index of the epoch about to run. Unused: the decay is the same
            for every epoch.
        lr: The current learning rate.
        decay: Multiplicative factor applied to ``lr``. Defaults to 0.65.

    Returns:
        ``lr * decay``.
    """
    return lr * decay

# Wrap the decay function in a Keras callback and train for 20 epochs with
# mini-batches of 32, evaluating on the validation split after every epoch.
lr_schedule = LearningRateScheduler(scheduler)

history = model.fit(
    x=train_seqs_pad,
    y=y_train,
    validation_data=(valid_seqs_pad, y_valid),
    epochs=20,
    batch_size=32,
    callbacks=[lr_schedule],
)

In [None]:
# plotting training and validation accuracy per epoch
fig, ax = plt.subplots()
ax.plot(history.history['acc'], label='acc')
ax.plot(history.history['val_acc'], label='val_acc')
ax.legend()
ax.set_xlabel('epochs')
ax.set_ylabel('accuracy')
ax.set_title('Accuracy trend')
plt.show()

In [None]:
# preprocessing test data: same cleaning, tokenizer and padding settings as
# were applied to the training data
test_df['preprocessed_text'] = test_df['text'].map(preprocess_sent)
test_text = test_df['preprocessed_text'].values
test_seqs = tknzr.texts_to_sequences(test_text)
test_seqs_pad = pad_sequences(
    test_seqs,
    maxlen=max_sent_len,
    padding='post',
    truncating='post',
)

In [None]:
# predicting and creating submission file
# `Sequential.predict_classes` is deprecated and was removed in TensorFlow 2.6.
# For a single sigmoid output the equivalent is to threshold the predicted
# probabilities at 0.5, which works on old and new versions alike.
test_probs = model.predict(test_seqs_pad)
# Flatten the (n_samples, 1) output to a 1-D array of 0/1 labels.
test_preds = (test_probs > 0.5).astype('int32').reshape(-1)
test_df['target'] = test_preds
test_df[['id', 'target']].to_csv('submission.csv', index=False)