## Importing Basic Packages

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense,Dropout,LSTM,SimpleRNN,Embedding,Bidirectional,LSTM,GlobalMaxPool1D
from keras.models import Sequential

## Extracting the data

In [None]:
# Load the training and test CSV files.
train = pd.read_csv('../input/nlp-getting-started/train.csv')
test = pd.read_csv('../input/nlp-getting-started/test.csv')


In [None]:
# Column dtypes and non-null counts of the training frame.
train.info()

In [None]:
# Preview the first 50 training rows.
train.head(50)

In [None]:
# Number of missing values per column.
train.isnull().sum()

We found missing values in the `keyword` and `location` columns. Along with `id`, these columns do not appear to be significant for predicting the target,
so we drop all three.

In [None]:
# Drop the columns that are not used for modelling. The cell reassigns `train`,
# so running it a second time raises KeyError because the columns are already gone.
train = train.drop(['id','keyword','location'],axis = 1)
train.head(10)

In [None]:
# Same column drop for the test frame.
test = test.drop(['id','keyword','location'],axis=1)
test

In [None]:
# Bar chart of the number of rows in each target class.
train['target'].value_counts().plot(kind = 'bar')
#sns.barplot(y = 'target',train)

In [None]:
# The same class counts as numbers.
train['target'].value_counts()

Generally, we employ the following steps while preprocessing text:

1. Tokenising the string
2. Converting characters to lowercase
3. Removing stop words and punctuation
4. Stemming or lemmatization



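Based on the class counts above, the dataset looks roughly balanced. If it were not, the class imbalance would have to be handled (for example with class weights or resampling).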

In [None]:
import nltk
import re
import string
from nltk.tokenize import RegexpTokenizer,TweetTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
# Fetch the NLTK resources used below: WordNet (for WordNetLemmatizer) and the stop-word lists.
nltk.download('wordnet')
nltk.download('stopwords')


In [None]:

def RemoveUnneccasaryChar(sentence, out=None):
    """Lower-case each text and strip URLs, '{html}', '#', @handles and digits.

    Parameters
    ----------
    sentence : iterable
        Raw texts (for example a pandas Series). Every item is passed through
        ``str`` first, so non-string values such as NaN are tolerated.
    out : list, optional
        List that receives one cleaned string per input item. When omitted,
        the module-level ``sent`` list is used, which is how the notebook
        cells call this function.

    Returns
    -------
    None
        Results are appended to ``out`` (or ``sent``) in input order.
    """
    cleaned = sent if out is None else out
    for raw in sentence:
        text = str(raw).lower()
        # Remove only the URL token itself. The previous pattern
        # (``https?://.*``) was greedy and also deleted every word that
        # followed a link on the same line.
        text = re.sub(r'https?://\S*', '', text)
        text = text.replace('{html}', '')
        text = text.replace('#', '')  # keep the hashtag word, drop the symbol
        # Handles may contain digits and underscores, so strip them before
        # the digit removal below and match the whole \w run.
        text = re.sub(r'@\w*', '', text)
        text = re.sub(r'[0-9]', '', text)
        cleaned.append(text)

In [None]:
def TokenizeSentence(sentence, tokenizer=None, out=None):
    """Split every text in ``sentence`` into a list of tokens.

    Parameters
    ----------
    sentence : iterable of str
        Texts to tokenize.
    tokenizer : object with a ``tokenize(str)`` method, optional
        Defaults to ``TweetTokenizer(preserve_case=False, strip_handles=True,
        reduce_len=True)``.
    out : list, optional
        List that receives one token list per text. When omitted, the
        module-level ``tokenized_sent`` list is used.

    Returns
    -------
    None
        Results are appended to ``out`` (or ``tokenized_sent``) in input order.
    """
    if tokenizer is None:
        tokenizer = TweetTokenizer(preserve_case = False,strip_handles = True,reduce_len=True)
    tokens_per_text = tokenized_sent if out is None else out
    # Iterate over the argument itself. The original loop read the global
    # ``sent`` list and reused ``sentence`` as its loop variable, so the value
    # passed by the caller was silently ignored.
    for text in sentence:
        tokens_per_text.append(tokenizer.tokenize(text))



In [None]:

def stopwordsSentence(sent, stop_words=None, out=None):
    """Drop stop words, punctuation tokens and tokens shorter than three characters.

    Parameters
    ----------
    sent : iterable of list of str
        One token list per text.
    stop_words : iterable of str, optional
        Words to remove. When omitted, the module-level ``stopwords_eng`` is used.
    out : list, optional
        List that receives one filtered token list per text. When omitted,
        the module-level ``formatted_sent`` list is used.

    Returns
    -------
    None
        Results are appended to ``out`` (or ``formatted_sent``) in input order.
    """
    # A set gives constant-time membership tests; the stop words are
    # consulted once per token.
    blocked = set(stopwords_eng if stop_words is None else stop_words)
    kept_per_text = formatted_sent if out is None else out
    for tokens in sent:
        kept_per_text.append([
            word for word in tokens
            if len(word) > 2
            and word not in blocked
            # Substring test: a token is dropped only when it occurs verbatim
            # inside string.punctuation.
            and word not in string.punctuation
        ])
    

In [None]:
def lemmatizeSentence(sent, lemmatizer=None, out=None):
    """Lemmatize every token of every token list in ``sent``.

    Parameters
    ----------
    sent : iterable of list of str
        One token list per text.
    lemmatizer : object with a ``lemmatize(str)`` method, optional
        Defaults to a fresh ``WordNetLemmatizer()``.
    out : list, optional
        List that receives one lemmatized token list per text. When omitted,
        the module-level ``lemma_sent`` list is used.

    Returns
    -------
    None
        Results are appended to ``out`` (or ``lemma_sent``) in input order.
    """
    lemma = WordNetLemmatizer() if lemmatizer is None else lemmatizer
    lemmas_per_text = lemma_sent if out is None else out
    for tokens in sent:
        lemmas_per_text.append([lemma.lemmatize(word) for word in tokens])

In [None]:
def finalSentence(sentence1, out=None):
    """Join each token list back into one space-separated string.

    Parameters
    ----------
    sentence1 : iterable of list
        One token list per text; every token is converted with ``str``.
    out : list, optional
        List that receives one joined string per token list. When omitted,
        the module-level ``final_sentence_list`` is used.

    Returns
    -------
    None
        Results are appended to ``out`` (or ``final_sentence_list``) in input
        order. An empty token list yields an empty string.
    """
    joined_texts = final_sentence_list if out is None else out
    for tokens in sentence1:
        # The local is named `tokens`/`joined` rather than `sent` so it cannot
        # be confused with the notebook's global `sent` accumulator.
        joined = ' '.join(str(word) for word in tokens)
        joined_texts.append(joined)


In [None]:
# Reset the list that RemoveUnneccasaryChar appends to, then clean the training texts.
sent = []
RemoveUnneccasaryChar(train['text'])

In [None]:
# Spot-check one cleaned text.
sentence = sent[31]
print(sentence)

In [None]:
tokenized_sent = [] # one list of tokens per text, filled by TokenizeSentence
TokenizeSentence(sent)

In [None]:
# Spot-check one tokenized text.
tokenized_sent[1]

In [None]:
# English stop-word list; stopwordsSentence reads this global.
stopwords_eng = stopwords.words('english')
print('English Stop Words :\n')
print(stopwords_eng)
print('\nPunctuations  :\n')
print(string.punctuation)

In [None]:
# Reset the list that stopwordsSentence appends to, then filter the tokenized texts.
formatted_sent = []
stopwordsSentence(tokenized_sent)

In [None]:
# Spot-check one filtered token list.
formatted_sent[10]




In [None]:
# Reset the list that lemmatizeSentence appends to, then lemmatize the filtered tokens.
lemma_sent = []
lemmatizeSentence(formatted_sent)

In [None]:
# Spot-check one lemmatized token list.
lemma_sent[10]

In [None]:
# Reset the list that finalSentence appends to, then join tokens back into strings.
final_sentence_list = []
finalSentence(lemma_sent)

In [None]:
# Spot-check one rebuilt text.
final_sentence_list[10]

In [None]:
# Attach the preprocessed texts as a new column (one entry per training row).
train['FormattedText'] = final_sentence_list
train.head()

In [None]:
# The raw text column is no longer needed once FormattedText exists.
train = train.drop(['text'],axis = 1)

## Converting text to a numerical vector format using TensorFlow's `TextVectorization` layer


In [None]:

# Model input (preprocessed text) and label columns.
x_train = train['FormattedText']
y_train = train['target']

In [None]:
x_train

In [None]:
# NumPy arrays, used below to build the tf.data dataset.
x_train_array = x_train.to_numpy()
y_train_array = y_train.to_numpy()

In [None]:
x_train_array

In [None]:
y_train_array

In [None]:
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras import losses
# Make tf.function-wrapped code run eagerly instead of as traced graphs.
# NOTE(review): this is normally a debugging aid and slows training — confirm it is still needed.
tf.config.run_functions_eagerly(True)

In [None]:
# One dataset element per (text, label) pair.
train_dataset = tf.data.Dataset.from_tensor_slices((x_train_array,y_train_array))

In [None]:
train_dataset

In [None]:
# Peek at the first element. `text` and `label` stay bound after the loop;
# a later cell reuses `text` as its sample input for the encoder.
for text,label in train_dataset.take(1):
    print('Text: ',text.numpy())
    print('Label: ',label.numpy())

In [None]:
BUFFER_SIZE = 3000
BATCH_SIZE = 128

In [None]:
train_dataset = train_dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)

In [None]:
# Upper bound on the vocabulary size kept by the TextVectorization layer.
VOCAB_SIZE = 12000


# This layer turns raw text into integer token ids for the recurrent (LSTM/GRU) models.
# A BERT model would use its own BERT-specific vectorization instead.

encoder = tf.keras.layers.TextVectorization(max_tokens = VOCAB_SIZE)
# Build the vocabulary from the text half of each (text, target) batch.
encoder.adapt(train_dataset.map(lambda text,target: text))


In [None]:
# Vocabulary learned by the encoder; show entries 1 through 9.
vocabulary = np.array(encoder.get_vocabulary())
vocabulary[1:10]

In [None]:
# `text` is the tensor left bound by the earlier
# `for text,label in train_dataset.take(1)` loop.
print('Original Text :' +str(text))
encoded_text = encoder(text).numpy()
print('Numeric Representaion :' +str(encoded_text))

In [None]:
# Vocabulary size; the model cell uses the same value as the Embedding input_dim.
len(encoder.get_vocabulary())

## Building LSTM Model

In [None]:
# Text classifier: raw string -> token ids -> 16-d embeddings -> bidirectional
# LSTM -> max over time steps -> small dense head. The last layer has no
# activation, so the model outputs a single unbounded score per text.
model = Sequential([
    encoder,
    Embedding(input_dim=len(encoder.get_vocabulary()), output_dim=16, mask_zero=True),
    Bidirectional(LSTM(16, return_sequences=True)),
    Dropout(0.20),
    GlobalMaxPool1D(),
    Dropout(0.20),
    Dense(10, activation='relu'),
    Dropout(0.10),
    Dense(1),
])
model.summary()

In [None]:
# Multiply the learning rate by 0.25 after 2 epochs without improvement in the
# training loss. min_lr has to sit below the optimizer's starting rate: the
# model is compiled with optimizer='adam' (default learning rate 0.001), so
# the former min_lr=0.001 left the callback no room to ever lower the rate.
reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(monitor='loss',factor=0.25,patience=2,min_lr=1e-5)

In [None]:
# The final Dense(1) layer has no activation, so the model emits logits.
# The loss is told so via from_logits=True, and accuracy must threshold the
# output at 0.0 (sigmoid(0) = 0.5). The plain 'accuracy' string would compare
# the raw logits against 0.5 instead and misreport training accuracy.
# name='accuracy' keeps the metric key in the training history unchanged.
model.compile(loss=losses.BinaryCrossentropy(from_logits=True),
              optimizer='adam',
              metrics=[tf.keras.metrics.BinaryAccuracy(name='accuracy', threshold=0.0)])

In [None]:
# Train for 5 epochs on the shuffled, batched dataset with the LR-reduction callback.
history = model.fit(train_dataset,epochs = 5,callbacks = [reduce_lr])

## Preparing Test Datasets

In [None]:
# Repeat the training preprocessing on the test texts. Each helper appends to
# a global list, so every list is reset to empty before its helper is called.
sent = []
RemoveUnneccasaryChar(test['text'])

In [None]:
tokenized_sent = []
TokenizeSentence(sent)

In [None]:
formatted_sent = []
stopwordsSentence(tokenized_sent)

In [None]:
lemma_sent = []
lemmatizeSentence(formatted_sent)

In [None]:
final_sentence_list = []
finalSentence(lemma_sent)

In [None]:
# Overwrite the raw test text with its preprocessed version.
test['text'] = final_sentence_list
test

In [None]:
x_test = test['text']
x_test_array = x_test.to_numpy()

In [None]:
# Unlabelled dataset: each element is a single text tensor.
test_dataset = tf.data.Dataset.from_tensor_slices((x_test_array))
for test_text in test_dataset.take(2):
    print('Text: ', test_text.numpy())

In [None]:
# Batch without shuffling so predictions keep the row order of `test`.
test_dataset = test_dataset.batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)

In [None]:
# Raw model outputs (logits, since the last layer has no activation).
y_pred = model.predict(test_dataset)

In [None]:
y_pred

In [None]:
# Turn each raw model output into a class label. The model emits logits
# (Dense(1) without activation, loss built with from_logits=True), so a value
# of 0 is the decision boundary: >= 0 -> class 1, otherwise class 0.
result = [1 if score >= 0 else 0 for score in y_pred]

## Submission

In [None]:
# Start from the sample submission file and overwrite its target column below.
submission = pd.read_csv("../input/nlp-getting-started/sample_submission.csv")
submission

In [None]:


# NOTE(review): assumes the rows of sample_submission.csv are in the same order
# as test.csv — the `id` column was dropped from `test`, so nothing checks this.
submission['target'] = result
submission



In [None]:
# Predicted class distribution.
submission['target'].value_counts()

In [None]:
# Write the submission file without the DataFrame index.
submission.to_csv('submission1.csv', index=False)

## If you found this notebook helpful, please upvote it. Thanks!