### Import Libraries

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import nltk

import re
import string
import emoji

import warnings
warnings.filterwarnings('ignore')

### Load Data

In [None]:
# Read the competition's train and test CSV files into DataFrames.
train_path = '../input/nlp-getting-started/train.csv'
test_path = '../input/nlp-getting-started/test.csv'
train_data = pd.read_csv(train_path)
test_data = pd.read_csv(test_path)


In [None]:
# Preview the first rows of the training frame.
train_data.head()

In [None]:
# Report the dimensions of both frames, train first, then test.
train_rows, train_cols = train_data.shape
test_rows, test_cols = test_data.shape

print("Total rows in train data: ", train_rows)
print("Total columns in train data: ", train_cols)
print("-"*30)
print("Total rows in test data: ", test_rows)
print("Total columns in test data: ", test_cols)




**Checking for null data. Since the keyword and location columns contain null values, I am going to drop them.**

In [None]:
# Per-column count of missing values, train first, then test,
# with a dashed separator line between the two reports.
for position, frame in enumerate((train_data, test_data)):
    if position > 0:
        print("-"*30)
    print(frame.isnull().sum())


### Counting and visualizing total positive and negative target in our training data

In [None]:
# Number of non-null tweets for each value of `target`.
train_data.groupby(train_data.target).count().text

In [None]:
# Bar chart of how many rows fall in each target class.
# The column is named through the `x=` keyword: recent seaborn releases take
# `data` as the first positional parameter, so passing the Series positionally
# together with `data=` raises a TypeError there.
sns.countplot(x='target', data=train_data)
plt.show()

#### Here, I am going to use only the text column for analysis, so I am dropping the other columns

In [None]:
# Keep only the columns the rest of the notebook uses, selected by name.
# Name-based selection does not depend on column order and gives the same
# result when the cell is re-run (a positional `iloc[:, 3:]` would keep
# dropping columns on every re-run).
# NOTE(review): assumes the columns from position 3 onward were exactly
# `text` (+ `target` in train) — confirm against the CSV headers.
# `.copy()` makes the frames independent so later column assignments are safe.
train_data = train_data[['text', 'target']].copy()
test_data = test_data[['text']].copy()


## Cleaning tweets
The cleaning process involves removing emojis, hyperlinks, punctuation, and more.

In [None]:

    
def cleanTweet(txt):
    """Strip Twitter-specific noise from a single tweet.

    Removes, in order: @mentions, '#' characters (the tag word itself is
    kept), the literal 'RT : ' residue, line breaks, emojis, URLs and
    HTML-like tags. Case and punctuation are left untouched.

    Parameters
    ----------
    txt : str
        Raw tweet text.

    Returns
    -------
    str
        The tweet with the elements listed above removed.
    """
    txt = re.sub(r'@[A-Za-z0-9_]+', '', txt)
    txt = re.sub(r'#', '', txt)
    txt = re.sub(r'RT : ', '', txt)
    # Turn line breaks into a space so the words on either side of the break
    # are not glued into a single token.
    txt = re.sub(r'\n', ' ', txt)
    # Remove emojis. Newer releases of the `emoji` package no longer provide
    # get_emoji_regexp(), so prefer replace_emoji() whenever it is available.
    if hasattr(emoji, 'replace_emoji'):
        txt = emoji.replace_emoji(txt, replace='')
    else:
        txt = re.sub(emoji.get_emoji_regexp(), r"", txt)
    # A single URL pattern that consumes the whole link up to the next
    # whitespace. A narrower alphanumeric-only pattern run first would strip
    # just the URL prefix and strand the remainder (e.g. '?x=1') in the text.
    txt = re.sub(r"https?://\S+|www\.\S+", "", txt)
    txt = re.sub(r"<.*?>", "", txt)
    return txt

def remove_punct(text):
    """Return `text` with every character of `string.punctuation` deleted."""
    # Mapping a code point to None tells str.translate to drop that character.
    drop_map = dict.fromkeys(map(ord, string.punctuation))
    return text.translate(drop_map)

def make_Lower(text):
    """Return a lower-cased copy of the string `text`."""
    lowered = str.lower(text)
    return lowered

In [None]:
# Show one sample tweet before and after cleaning. Only that single tweet is
# cleaned here, instead of cleaning the entire column just to read one row.
sample_tweet = train_data.text[200]
print("Tweet before cleaning:: ", sample_tweet)
print("Tweet after cleaning:: ", cleanTweet(sample_tweet))

In [None]:
# Quick check of remove_punct on a string containing '#', '@' and '&'.
text = "#@Hello &World"
print(remove_punct(text))

In [None]:
# Quick check of make_Lower on an all-caps string.
text = 'HELLO WORLD '
print(make_Lower(text))

In [None]:
# Run both frames through the same three-stage text pipeline, in order:
# tweet cleaning, punctuation removal, lower-casing.
for frame in (train_data, test_data):
    for step in (cleanTweet, remove_punct, make_Lower):
        frame['text'] = frame['text'].apply(step)

In [None]:
!pip install pyspellchecker

In [None]:

from spellchecker import SpellChecker

spell = SpellChecker()
def correct_spellings(text):
    """Replace words the spell checker does not know with its best suggestion.

    Parameters
    ----------
    text : str
        Whitespace-separated words.

    Returns
    -------
    str
        The words re-joined with single spaces; words flagged by
        `spell.unknown` are replaced by `spell.correction(word)` when a
        suggestion exists and kept unchanged otherwise.
    """
    words = text.split()
    misspelled_words = spell.unknown(words)
    corrected_text = []
    for word in words:
        if word in misspelled_words:
            # correction() may yield None when it has no candidate; fall back
            # to the original word so the join below never receives None.
            corrected_text.append(spell.correction(word) or word)
        else:
            corrected_text.append(word)
    return " ".join(corrected_text)
        
# Quick check of the spelling corrector on two misspelled words.
text = "speling mistke"
correct_spellings(text)

In [None]:
# You can use this function to correct spellings 
# train_data.text = train_data.text.apply(correct_spellings)
# test_data.text = test_data.text.apply(correct_spellings)

**Removing Stopwords from data**

In [None]:
from nltk.corpus import stopwords

stop = set(stopwords.words('english'))
def remove_stopwords(text):
    """Lower-case the words of `text` and drop those found in `stop`.

    The remaining words are returned joined by single spaces.
    """
    kept = []
    for token in text.split():
        lowered = token.lower()
        if lowered in stop:
            continue
        kept.append(lowered)
    return " ".join(kept)

In [None]:
# Strip stop words from the text column of both frames.
for frame in (train_data, test_data):
    frame['text'] = frame['text'].apply(remove_stopwords)


**Defining hyperparameters**

In [None]:
# Settings shared by the tokenizer, the padding step and the model.
vocab = 20000        # passed as Tokenizer `num_words` and as the Embedding layer's first argument
oov = '<OOV>'        # passed as the Tokenizer `oov_token`
embedding = 32       # passed as the Embedding layer's second argument
padding = 'post'     # `padding` argument of pad_sequences
truncate = 'post'    # `truncating` argument of pad_sequences


In [None]:
# import necessary libraries for text preprocessing
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences


In [None]:
# preparing training data
# Separate the label column from the remaining feature column(s).
train_y = train_data['target']
train_ = train_data.drop(columns='target')

**Here I have split the training data into training and validation sets to check the performance of the model**

In [None]:

# Sequential 80/20 split: the first `size` rows become the training set and
# the remaining rows the validation set.
# NOTE(review): rows are not shuffled before splitting — confirm the CSV is
# not ordered in a way that biases the validation set.
train_size = 0.8
size = int(train_size * train_data.shape[0])

train_x_data = train_.iloc[:size, :]
train_y_data = train_y.iloc[:size]
print("Shape of X train data: ", train_x_data.shape)
# Report the shape of the labels themselves, not of the feature frame.
print("Shape of Y train data", train_y_data.shape)

validation_x_data = train_.iloc[size:, :]
validation_y_data = train_y.iloc[size:]
print("Shape of X validation data: ", validation_x_data.shape)
print("Shape of Y validation data", validation_y_data.shape)


In [None]:
# Build the word index from the training split only, then report how many
# distinct words it contains.
tokenizer = Tokenizer(oov_token=oov, num_words=vocab)
tokenizer.fit_on_texts(train_x_data['text'])
word_index = tokenizer.word_index
print("Length: ", len(word_index))


In [None]:
# Convert tweets to integer sequences and pad/truncate each one to 25 tokens.
pad_kwargs = dict(maxlen=25, padding=padding, truncating=truncate)

training_x = tokenizer.texts_to_sequences(train_x_data['text'])
validation_x = tokenizer.texts_to_sequences(validation_x_data['text'])

training_x_pad = pad_sequences(training_x, **pad_kwargs)
validation_x_pad = pad_sequences(validation_x, **pad_kwargs)

# Labels as plain NumPy arrays for Keras.
validation_y = validation_y_data.values
training_y = train_y_data.values

## Defining our model

In [None]:
import tensorflow as tf
import keras
from keras import layers 

def give_model():
    """Build the (uncompiled) binary tweet classifier.

    Layer stack, in order: Embedding(vocab, embedding) over 25-token inputs,
    a bidirectional LSTM(128) returning full sequences, LSTM(64), Dropout(0.2),
    Flatten, Dense(10, relu), Dropout(0.2) and a single sigmoid output unit.

    Returns
    -------
    keras.models.Sequential
        The assembled model; compiling and fitting are left to the caller.
    """
    stack = [
        layers.Embedding(vocab, embedding, input_length=25),
        layers.Bidirectional(layers.LSTM(128, return_sequences=True)),
        layers.LSTM(64),
        layers.Dropout(0.2),
        layers.Flatten(),
        layers.Dense(units=10, activation='relu'),
        layers.Dropout(0.2),
        layers.Dense(units=1, activation='sigmoid'),
    ]
    model = keras.models.Sequential()
    for layer in stack:
        model.add(layer)
    return model
# Instantiate the network and print its layer-by-layer summary.
model = give_model()
model.summary()

In [None]:
# Training configuration for the validation run.
EPOCHS = 30
VALIDATION_DATA = (validation_x_pad, validation_y)
LOSS = tf.keras.losses.BinaryCrossentropy()
OPTIMIZER = tf.keras.optimizers.Adam(learning_rate=0.001)

# Stop once val_loss has failed to improve by at least 0.01 for 2 epochs.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    mode='min',
    min_delta=0.01,
    patience=2,
)

model.compile(optimizer=OPTIMIZER, loss=LOSS, metrics=['accuracy'])

In [None]:
# Train on the training split, evaluating on the validation split after each
# epoch; the early-stopping callback may end the run before EPOCHS is reached.
history = model.fit(training_x_pad, training_y, epochs = EPOCHS, validation_data = VALIDATION_DATA,
                   callbacks = [early_stopping])

#### Visualizing our model performance

In [None]:

# Two stacked panels: loss on top, accuracy below, each showing the training
# curve and its validation counterpart.
fig, (loss_ax, acc_ax) = plt.subplots(2, 1)
for axis, metric in ((loss_ax, 'loss'), (acc_ax, 'accuracy')):
    val_metric = 'val_' + metric
    axis.plot(history.history[metric], label=metric)
    axis.plot(history.history[val_metric], label=val_metric)
    axis.legend(loc='best')
plt.show()

## Training our model on all the training data

In [None]:

# Re-fit the tokenizer on every training tweet, then encode and pad both the
# full training set and the test set to 25 tokens.
tokenizer = Tokenizer(oov_token=oov, num_words=vocab)
tokenizer.fit_on_texts(train_['text'])
word_index = tokenizer.word_index
print("Length: ", len(word_index))

pad_kwargs = dict(maxlen=25, padding=padding, truncating=truncate)

training_x = tokenizer.texts_to_sequences(train_['text'])
testing_x_data = tokenizer.texts_to_sequences(test_data['text'])

training_x_pad = pad_sequences(training_x, **pad_kwargs)
testing_x_pad = pad_sequences(testing_x_data, **pad_kwargs)

training_y = train_y.values



### Fitting our data 

In [None]:
# The tokenizer has been re-fitted on the full training set, so a word's
# integer index may differ from the one the validation-run model saw while
# learning its embedding. Continuing to train that model would feed it
# re-numbered tokens, so start from a freshly initialised network with its
# own optimizer instead.
model = give_model()
model.compile(
    loss=tf.keras.losses.BinaryCrossentropy(),
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
    metrics=['accuracy'],
)
model.fit(training_x_pad, training_y, epochs = 3)

## For submission

In [None]:
# Score the test tweets and threshold the sigmoid outputs at 0.5 to get
# integer 0/1 labels.
prediction = model.predict(testing_x_pad)
above_threshold = prediction > 0.5
predicted_value = above_threshold.astype(int)

In [None]:
# Load the sample submission file to reuse its layout, and preview it.
submission = pd.read_csv("../input/nlp-getting-started/sample_submission.csv")
submission.head()

In [None]:
# Flatten the predictions to one label per row before assigning them to the
# column (presumably the model output is shaped (n, 1) — ravel() is a no-op
# if it is already 1-D).
submission['target'] = predicted_value.ravel()
# Persist the result; without this the predictions exist only in memory.
submission.to_csv('submission.csv', index=False)
submission.head()

# Thank You 
## Feel free to  comment