## **Importing dependencies**

In [None]:
!pip install ekphrasis

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import string #string operation

#For Visualization
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_theme(style="whitegrid")

#For NLP Task
import re
from ekphrasis.dicts.noslang.slangdict import slangdict
from ekphrasis.classes.preprocessor import TextPreProcessor
from ekphrasis.classes.tokenizer import SocialTokenizer
from ekphrasis.dicts.emoticons import emoticons

#For Model Building
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional,SpatialDropout1D
from tensorflow.keras.optimizers import Adam

#For Wordcloud
from wordcloud import WordCloud,ImageColorGenerator,STOPWORDS
from PIL import Image

# Ignore warnings
import warnings
warnings.filterwarnings('ignore')

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _subdirs, files_in_folder in os.walk('/kaggle/input'):
    full_paths = (os.path.join(folder, fname) for fname in files_in_folder)
    for path in full_paths:
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Load the training and test CSVs from the read-only Kaggle input directory.
df = pd.read_csv('../input/nlp-getting-started/train.csv')
test_df = pd.read_csv('../input/nlp-getting-started/test.csv')

**Reading and preparation of data**

In [None]:
# Preview the first rows of the training data.
df.head()

In [None]:
# Distinct values present in the label column.
df.target.unique()

In [None]:
# Column dtypes and non-null counts for the training data.
df.info()

In [None]:
# Number of rows whose tweet text is missing.
df['text'].isnull().sum()

## **Data Visualization**

In [None]:
# Bar chart of how many rows fall into each target class.
target_ax = sns.countplot(data=df, x="target", palette="Set3")
target_ax.set_title('Count of target feature')

In [None]:
# Number of whitespace-separated words in each tweet, computed per class.
disaster_len = df.loc[df['target'] == 1, 'text'].str.split().map(len)
non_disaster_len = df.loc[df['target'] == 0, 'text'].str.split().map(len)

# Put both series side by side, one column per class.
data = {
    'Disaster_Tweets_Length': disaster_len,
    'Non_Disaster_Tweets_Length': non_disaster_len,
}
ndf = pd.concat(data, axis=1)

**Tweet Lengths**

In [None]:
# One histogram per class, drawn side by side on a single figure.
fig, axes = plt.subplots(1, 2, figsize=(10, 6))
for ax, column in zip(axes, ndf.columns):
    sns.histplot(ndf[column], ax=ax)

## **Data Cleaning**

In [None]:
# Ordered (pattern, replacement) rules for expanding English contractions.
# Order matters: the specific "won't"/"can't" rules must run before the
# generic "n't" rule. Word boundaries keep possessives such as "reddit's"
# from being read as "it's".
_CONTRACTION_RULES = (
    (re.compile(r"\bi'm\b"), "i am"),
    (re.compile(r"\b(he|she|that|what|where|it)'s\b"), r"\1 is"),
    (re.compile(r"'ll\b"), " will"),
    (re.compile(r"'ve\b"), " have"),
    (re.compile(r"'re\b"), " are"),
    (re.compile(r"'d\b"), " would"),
    (re.compile(r"\bwon't\b"), "will not"),
    (re.compile(r"\bcan't\b"), "cannot"),
    (re.compile(r"n't\b"), " not"),
    (re.compile(r"(?<!\w)'bout\b"), "about"),
)


def clean_text(text):
    """Lower-case `text`, expand common contractions and drop punctuation.

    Parameters
    ----------
    text : str
        Raw text to clean.

    Returns
    -------
    str
        The lower-cased text with contractions such as "i'm", "he's",
        "won't", "n't" expanded and the characters
        ``- @ # $ % ^ & * ( ) _ ' + \\ / < > ? : ; | = ~ , .`` removed.
        Other characters (for example ``!``) are left in place.
    """
    text = text.lower()

    # Tweets frequently use typographic apostrophes; map them to the ASCII
    # apostrophe so the contraction rules below can match.
    text = text.replace("\u2019", "'").replace("\u2018", "'")

    for pattern, replacement in _CONTRACTION_RULES:
        text = pattern.sub(replacement, text)

    # Remove the remaining punctuation (including any leftover apostrophes).
    text = re.sub(r"[-@#$%^&*()_'+\/<>?:;|=~,.]", "", text)

    return text

Slang dictionary from ekphrasis

In [None]:
# Lower-case every slang term and its expansion so look-ups are case-insensitive.
slang_dict = {term.lower(): meaning.lower() for term, meaning in slangdict.items()}
print('Total Slang words count:', len(slang_dict))

# Show a small sample of the dictionary.
temp = list(slang_dict.items())
print("\nFirst 15 Slang word and it's definitions \n")
for position in range(15):
    print(temp[position])

In [None]:
def replace_slang(txt, slang):
    """Swap every whitespace-separated token of `txt` found in `slang`.

    Tokens are looked up in lower case; tokens without an entry are kept
    unchanged. The result is joined with single spaces and trimmed.

    Parameters
    ----------
    txt : str
        Text to process.
    slang : dict
        Mapping from lower-case slang token to its replacement text.

    Returns
    -------
    str
        The text with known slang tokens replaced.
    """
    collapsed = re.sub(r'\s+', ' ', txt)
    translated = (slang.get(token.lower(), token) for token in collapsed.split())
    return ' '.join(translated).strip()

## **Extra Data Exploration and Analysis with Cleaned Text**

Ekphrasis is a text-processing tool geared towards text from social networks such as Twitter or Facebook. It performs tokenization, word normalization, word segmentation (for splitting hashtags) and spell correction, using word statistics from two large corpora: English Wikipedia and Twitter (330 million English tweets).

In [None]:
# Build the ekphrasis pre-processing pipeline that `preprocess` applies to
# every tweet.
text_processor = TextPreProcessor(
    # terms that will be normalized
    # NOTE(review): 'url' appears twice in this list; the second entry looks
    # redundant.
    normalize=['url', 'email', 'percent', 'money', 'phone', 'user',
        'time', 'url', 'date', 'number'],
    # terms that will be annotated
    annotate={"hashtag", "allcaps", "elongated", "repeated",
        'emphasis', 'censored'},
    fix_html=True,  # fix HTML tokens
    
    # corpus from which the word statistics are going to be used 
    # for word segmentation 
    segmenter="twitter", 
    
    # corpus from which the word statistics are going to be used 
    # for spell correction
    corrector="twitter", 
    
    unpack_hashtags=True,  # perform word segmentation on hashtags
    unpack_contractions=True,  # Unpack contractions (can't -> can not)
    spell_correct_elong=False,  # spell correction for elongated words
    
    # select a tokenizer. You can use SocialTokenizer, or pass your own
    # the tokenizer, should take as input a string and return a list of tokens
    # (here: SocialTokenizer configured to lower-case its output)
    tokenizer=SocialTokenizer(lowercase=True).tokenize,
    
    # list of dictionaries, for replacing tokens extracted from the text,
    # with other expressions. You can pass more than one dictionaries.
    dicts=[emoticons]
)

In [None]:
def preprocess(txt):
    """Clean one raw tweet and return it as lower-case, punctuation-free text.

    Steps, in order: slang expansion, `clean_text`, the ekphrasis
    `text_processor` pipeline, removal of the `<elongated>` / `<repeated>`
    annotation tags, a second slang pass, replacement of ASCII punctuation
    with spaces, lower-casing and trimming.

    NOTE(review): `clean_text` removes characters such as `#`, `@`, `:`, `/`
    and `.` before the ekphrasis step runs, so the hashtag / URL / user /
    emoticon handling configured on `text_processor` presumably rarely
    triggers -- confirm whether this ordering is intended.

    Parameters
    ----------
    txt : str
        Raw tweet text.

    Returns
    -------
    str
        The cleaned tweet.
    """
    # First slang pass on the raw text, then contraction / punctuation clean-up.
    expanded = clean_text(replace_slang(txt, slang_dict))

    # ekphrasis returns a list of tokens; stitch it back into one string and
    # drop the two annotation tags.
    processed = ' '.join(text_processor.pre_process_doc(expanded)).strip()
    for tag in ("<elongated>", "<repeated>"):
        processed = processed.replace(tag, '')

    # Second slang pass, now that ekphrasis has re-tokenised the text.
    processed = replace_slang(processed, slang_dict)

    # Every ASCII punctuation character becomes a single space.
    punctuation_class = r'[%s]' % re.escape(string.punctuation)
    processed = re.sub(punctuation_class, r' ', processed)

    return processed.lower().strip()
    

In [None]:
# Keep only the text and label columns for training, then add the cleaned
# tweet text as a new `Tweets` column on both the training and test frames.
train_df = df[['text', 'target']].copy()
for frame in (train_df, test_df):
    frame['Tweets'] = frame['text'].apply(preprocess)

**Cleaned Tweets**

In [None]:
# Print ten raw tweets, each followed by its cleaned version, for a visual check.
sample_pairs = train_df[['text', 'Tweets']].iloc[10:20]
for raw_tweet, cleaned_tweet in sample_pairs.itertuples(index=False, name=None):
    print('%s\n%s\n' % (raw_tweet, cleaned_tweet))

**Word Cloud**

In [None]:
def red_color_func(word, font_size, position,orientation,random_state=None, **kwargs):
    """Colour callback for WordCloud.recolor: every word is drawn in pure red.

    All arguments are accepted for signature compatibility and ignored.
    """
    return '#ff0000'

def green_color_func(word, font_size, position,orientation,random_state=None, **kwargs):
    """Colour callback for WordCloud.recolor: every word is drawn in pure green.

    All arguments are accepted for signature compatibility and ignored.
    """
    return '#00ff00'

# Read the Twitter-logo image (used below as the word-cloud mask) inside a
# context manager so the file handle is released once the pixels have been
# copied into the array.
with Image.open('../input/twitter-logo/twitter-logo-clipart-black-5.png') as logo_image:
    logo = np.array(logo_image)

In [None]:
def _tweet_word_cloud(target_value, color_func):
    """Build a logo-shaped word cloud from the cleaned tweets of one class.

    Parameters
    ----------
    target_value : int
        Value of `train_df.target` selecting which tweets to include.
    color_func : callable
        WordCloud colour callback applied to every word.

    Returns
    -------
    WordCloud
        The generated and recoloured cloud.
    """
    corpus = ' '.join(train_df[train_df.target == target_value]['Tweets'])
    cloud = WordCloud(stopwords=STOPWORDS,
                      background_color="Black", mask=logo).generate(corpus)
    cloud.recolor(color_func=color_func, random_state=3)
    return cloud


# Disaster tweets in red, non-disaster tweets in green.
dis_wc = _tweet_word_cloud(1, red_color_func)
non_dis_wc = _tweet_word_cloud(0, green_color_func)

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(20, 12))

panels = (
    (ax1, dis_wc, "Word cloud for disaster tweets"),
    (ax2, non_dis_wc, "Word cloud for non disaster tweets"),
)
for ax, cloud, title in panels:
    ax.imshow(cloud)
    ax.set_title(title, fontsize=20)
    ax.axis("off")

# plt.show() renders under the notebook's inline backend; Figure.show() is
# meant for GUI backends.
plt.show()

In [None]:
# Hyperparameters shared by the tokenisation, split and model cells below.
embedding_dim = 16  # output dimension passed to the Embedding layer
max_length = 25  # `maxlen` passed to pad_sequences for every tweet
training_size = 0.8  # fraction of rows (taken from the top) used for training

In [None]:
# Model input (cleaned tweet text) and label series.
sentences = train_df['Tweets']
target = train_df['target']

In [None]:
# Positional split: the first `size` rows train the model, the rest are held out.
size = int(training_size * train_df.shape[0])
training_text, testing_text = sentences.iloc[:size], sentences.iloc[size:]
training_target, testing_target = target.iloc[:size], target.iloc[size:]

In [None]:
# Report how many tweets ended up in each split.
for label, split in (("Shape of Training set: ", training_text),
                     ("Shape of Testing set", testing_text)):
    print(label, split.shape)

In [None]:
# Fit the vocabulary on the training split only; words not seen during
# fitting are encoded with the "<OOV>" token.
tokenizer = Tokenizer(oov_token="<OOV>")
tokenizer.fit_on_texts(training_text)
word_index = tokenizer.word_index
vocab_size = len(word_index) + 1

# Both splits are padded at the end of the sequence to the same fixed length.
pad_options = dict(padding='post', maxlen=max_length)

training_sequences = tokenizer.texts_to_sequences(training_text)
training_padded = pad_sequences(training_sequences, **pad_options)

testing_sequences = tokenizer.texts_to_sequences(testing_text)
testing_padded = pad_sequences(testing_sequences, **pad_options)

In [None]:
# Materialise the padded inputs and the labels as NumPy arrays.
training_padded, testing_padded = np.array(training_padded), np.array(testing_padded)
training_labels, testing_labels = np.array(training_target), np.array(testing_target)

## **Modeling**

In [None]:
# Seed TensorFlow's global random generator before any layer is created so
# that the initial weights are repeatable across "Restart & Run All".
tf.random.set_seed(42)

# Binary classifier: embed the tokens, average the embeddings over the
# sequence, then one hidden dense layer and a sigmoid output.
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(24, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])

In [None]:
num_epochs = 12

# Train on the training split and use the held-out split as validation data.
history = model.fit(
    training_padded,
    training_target,
    epochs=num_epochs,
    batch_size=16,
    validation_data=(testing_padded, testing_target),
    verbose=1,
)

In [None]:
def plot_graphs(history, string):
    """Plot a training metric and its validation counterpart per epoch.

    Parameters
    ----------
    history : object
        Object whose `.history` mapping holds the series `string` and
        `'val_' + string` (as returned by `model.fit` above).
    string : str
        Name of the metric to plot, e.g. "accuracy" or "loss".
    """
    validation_key = 'val_' + string
    for series_name in (string, validation_key):
        plt.plot(history.history[series_name])
    plt.xlabel("Epochs")
    plt.ylabel(string)
    plt.legend([string, validation_key])
    plt.show()
  
# Draw one chart per tracked metric.
for metric_name in ("accuracy", "loss"):
    plot_graphs(history, metric_name)

In [None]:
# model.evaluate returns [loss, accuracy]; report accuracy as a percentage.
train_metrics = model.evaluate(training_padded, training_target)
print("Accuracy of the model on Training Data is - " , train_metrics[1] * 100)

test_metrics = model.evaluate(testing_padded, testing_target)
print("Accuracy of the model on Testing Data is - " , test_metrics[1] * 100)