**IMPORTING THE REQUIRED LIBRARIES**


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle 

In [None]:
#libraries for text preprocessing
import string
import re
import nltk
# NLTK data packages requested by this notebook.
nltk.download('stopwords')  # presumably backs stopwords.words('english') used further down
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')  # NOTE(review): no tagger call is visible in this notebook — confirm it is needed
nltk.download('wordnet')  # presumably required by WordNetLemmatizer — TODO confirm
from nltk.stem import SnowballStemmer

In [None]:
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.tokenize import RegexpTokenizer,TweetTokenizer

In [None]:
# Load the train and test CSV files from ../input/nlp-getting-started
# (the path is relative to the notebook's working directory).
train = pd.read_csv('../input/nlp-getting-started/train.csv')
test = pd.read_csv('../input/nlp-getting-started/test.csv')

In [None]:

from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter
import folium 
from folium import plugins 

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Conv1D, Bidirectional, LSTM, Dense, Dropout, Input
from tensorflow.keras.optimizers import Adam

In [None]:
#for plot model
from keras.utils.vis_utils import plot_model
from PIL import Image

**BASIC INFORMATION OF THE DATASET**

In [None]:
# Summary of the training frame's columns.
train.info()

In [None]:
# Peek at the first 10 training rows.
train.head(10)

In [None]:
# Column labels of both frames, displayed together as a tuple.
train.columns,test.columns

In [None]:
# Count the missing values per column once and reuse the result for the grand total.
missing_per_column = train.isna().sum()
separator = '-------------------------------'
print(missing_per_column)
print(separator)
print('Total Missing Values: ', missing_per_column.sum())
print(separator)

In [None]:
# First ten entries of the per-location tweet counts (where most of the tweets come from).
train['location'].value_counts()[:10]

**EXPLORATORY DATA ANALYSIS**

In [None]:
# Colours shared by every chart in this notebook.
custom_colors = ['#000000', '#E31E33', '#4A53E1', '#F5AD02', '#94D5EA', '#F6F8F7']
# sns.set_palette() returns None, so build the palette first and keep a usable
# reference to it instead of storing the (empty) return value.
custom_palette = sns.color_palette(custom_colors)
sns.set_palette(custom_palette)
# Preview the palette as a strip of swatches without tick labels.
sns.palplot(custom_palette, size = 1)
plt.tick_params(axis = 'both', labelsize = 0, length = 0)

In [None]:
#bar chart of the ten locations that tweets most often come from
plt.figure(figsize = (15, 13))
ax = plt.axes()
ax.set_facecolor('black')
top_locations = train.location.value_counts()[:10]
ax = top_locations.plot(kind = 'bar', color = custom_colors[2], linewidth = 2, edgecolor = 'white')
plt.title('Location Count', fontsize = 30)
plt.xlabel('Location', fontsize = 25)
plt.ylabel('Count', fontsize = 25)
ax.xaxis.set_tick_params(labelsize = 15, rotation = 30)
ax.yaxis.set_tick_params(labelsize = 15)
bbox_args = dict(boxstyle = 'round', fc = '0.9')
# Write each bar's count in a small rounded box just above the bar.
for bar in ax.patches:
    bar_height = bar.get_height()
    ax.annotate(
        f'{bar_height:.0f}',
        (bar.get_x() + 0.15, bar_height + 2),
        bbox = bbox_args,
        color = custom_colors[2],
        fontsize = 18,
    )

In [None]:
#Disaster count (how many tweets fall into each value of 'target')
plt.figure(figsize = (15, 12))
ax = plt.axes()
ax.set_facecolor('black')
class_colors = [custom_colors[2], custom_colors[1]]
ax = sns.countplot(x = 'target', data = train, palette = class_colors, edgecolor = 'white', linewidth = 1.2)
plt.title('Disaster Count', fontsize = 25)
plt.xlabel('Disaster', fontsize = 20)
plt.ylabel('Count', fontsize = 20)
ax.xaxis.set_tick_params(labelsize = 15)
ax.yaxis.set_tick_params(labelsize = 15)
bbox_args = dict(boxstyle = 'round', fc = '0.9')
total_rows = len(train['target'])
# Label every bar with its absolute count and its share of all rows.
for bar in ax.patches:
    bar_height = bar.get_height()
    ax.annotate(
        f'{bar_height:.0f} = {(bar_height / total_rows) * 100:.2f}%',
        (bar.get_x() + 0.25, bar_height + 60),
        color = 'black',
        bbox = bbox_args,
        fontsize = 18,
    )
plt.show()

In [None]:
#geomap for the top 10 tweet locations
# Count the locations once and keep the ten most frequent ones.
top_locations = train['location'].value_counts()[:10]
new_train = pd.DataFrame({'location': top_locations.index, 'count': top_locations.values})

# Nominatim's usage policy allows at most one request per second.
geolocator = Nominatim(user_agent = 'Rahil')
geocode = RateLimiter(geolocator.geocode, min_delay_seconds = 1)
lat = {}
long = {}
for place in new_train['location']:
    location = geocode(place)
    # Free-text locations may not resolve, in which case the geocoder yields
    # None; skip those instead of crashing on `.latitude`.
    if location is None:
        continue
    lat[place] = location.latitude
    long[place] = location.longitude
# Locations that could not be geocoded end up as NaN in both columns.
new_train['latitude'] = new_train['location'].map(lat)
new_train['longitude'] = new_train['location'].map(long)

# Named `tweet_map` rather than `map` so the Python builtin is not shadowed
# for the rest of the notebook.
tweet_map = folium.Map(location = [10.0, 10.0], tiles = 'CartoDB dark_matter', zoom_start = 1.5)
title = '''<h1 align = "center" style = "font-size: 35px"><b>Top 10 Tweet Locations</b></h1>'''
for _, r in new_train.iterrows():
    # No coordinates means nothing to draw for this row.
    if pd.isna(r['latitude']) or pd.isna(r['longitude']):
        continue
    if r['count'] > 0:
        # The marker radius grows with the tweet count.
        folium.CircleMarker(
            [float(r['latitude']), float(r['longitude'])],
            radius = float(r['count'] * 0.4),
            color = custom_colors[1],
            fill = True,
        ).add_to(tweet_map)
tweet_map.get_root().html.add_child(folium.Element(title))
tweet_map

**PREPROCESSING OF DATA**

In [None]:
#preprocessing the text by removing emojis,symbols,map symbols and flags
def remove_URL(text):
    """Return ``text`` with every ``http(s)://...`` and ``www....`` link removed.

    A link runs from its prefix up to the next whitespace character.
    """
    return re.sub(r'https?://\S+|www\.\S+', '', text)


def remove_emoji(text):
    """Return ``text`` with emoji and pictographic symbols removed.

    Only symbol blocks are listed in the character class, so letters of
    non-Latin scripts (CJK, Hangul, ...) are left untouched. A single wide
    range from U+24C2 to U+1F251 would swallow those scripts as well.
    """
    emoji_pattern = re.compile(
        '['
        '\U0001F600-\U0001F64F'  # emoticons
        '\U0001F300-\U0001F5FF'  # symbols & pictographs
        '\U0001F680-\U0001F6FF'  # transport & map symbols
        '\U0001F1E0-\U0001F1FF'  # flags (iOS)
        '\U0001F000-\U0001F251'  # tiles, cards, enclosed alphanumerics/ideographs
        '\U00002600-\U000026FF'  # miscellaneous symbols
        '\U00002702-\U000027B0'  # dingbats
        '\U00002B00-\U00002BFF'  # miscellaneous symbols and arrows
        '\U000024C2'             # circled letter M
        '\U0000FE0F'             # variation selector that follows many emoji
        ']+',
        flags=re.UNICODE)
    return emoji_pattern.sub(r'', text)


def remove_html(text):
    """Return ``text`` without ``<...>`` tags and without ``&name;`` / ``&#123;`` / ``&#x1f;`` entities."""
    tag_or_entity = re.compile(r'<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});')
    return tag_or_entity.sub('', text)


def remove_punct(text):
    """Return ``text`` with every character listed in ``string.punctuation`` deleted."""
    # Mapping a code point to None tells str.translate to drop that character.
    drop_table = {ord(symbol): None for symbol in string.punctuation}
    return text.translate(drop_table)

# Run the four cleaners over the raw tweets, one after another, and store
# the end result in the 'text_clean' column.

cleaned_text = train['text']
for cleaner in (remove_URL, remove_emoji, remove_html, remove_punct):
    cleaned_text = cleaned_text.apply(cleaner)
train['text_clean'] = cleaned_text

In [None]:
#Removing unnecessary characters
def RemoveUnneccasaryChar(sentence):
    """Clean every raw tweet in ``sentence`` and append it to the global ``sent`` list.

    Each item is converted to ``str``, lower-cased, and stripped of links,
    the literal ``{html}`` marker, ``#`` signs, digits and ``@handles``.

    Parameters
    ----------
    sentence : iterable
        The raw tweets (any objects; ``str()`` is applied to each one).

    Returns
    -------
    None
        Results are appended to the module-level list ``sent``, which must
        exist (and normally be empty) before the call.
    """
    # Stop at the first whitespace so the words following a link are kept;
    # a greedy ".*" would delete the rest of the tweet.
    link_pattern = re.compile(r'https?://\S+|www\.\S+')
    # \w also covers the underscores and digits that handles may contain.
    handle_pattern = re.compile(r'@\w*')
    for raw_tweet in sentence:
        cleaned = str(raw_tweet).lower()
        cleaned = link_pattern.sub('', cleaned)     # remove hyperlinks
        cleaned = cleaned.replace('{html}', "")
        cleaned = cleaned.replace('#', '')          # remove hashtag signs, keep the word
        cleaned = re.sub(r'[0-9]', '', cleaned)     # remove digits
        cleaned = handle_pattern.sub('', cleaned)   # remove @ mentions

        sent.append(cleaned)

In [None]:
#tokenizing the sentences
def TokenizeSentence(sentence):
    """Tokenize every string in ``sentence`` and append the tokens to the global ``tokenized_sent``.

    Parameters
    ----------
    sentence : iterable of str
        The cleaned tweets to split into tokens.

    Returns
    -------
    None
        One token list per input string is appended to the module-level list
        ``tokenized_sent``, which must exist before the call.
    """
    tokenizer = TweetTokenizer(preserve_case = False, strip_handles = True, reduce_len = True)
    # Iterate over the argument itself; reading the global `sent` here would
    # silently ignore whatever the caller passed in.
    for text in sentence:
        tokenized_sent.append(tokenizer.tokenize(text))

In [None]:
#dropping stop words, punctuation-only tokens and tokens shorter than three characters
def stopwordsSentence(sent):
    """Filter every token list in ``sent`` and append the result to the global ``formatted_sent``.

    A token is kept when it is longer than two characters, is not listed in
    the global ``stopwords_eng`` and does not consist solely of punctuation.

    Parameters
    ----------
    sent : iterable of iterable of str
        Tokenized sentences.

    Returns
    -------
    None
        One filtered token list per sentence is appended to the module-level
        list ``formatted_sent``, which must exist before the call.
    """
    # Sets give constant-time lookups; both are built once per call.
    stop_words = set(stopwords_eng)
    punctuation_chars = set(string.punctuation)
    for sentence in sent:
        formatted_sent.append([
            word for word in sentence
            if len(word) > 2
            and word not in stop_words
            # A substring test against string.punctuation would let tokens
            # such as '...' through, so test every character instead.
            and not punctuation_chars.issuperset(word)
        ])
    

In [None]:
#lemmatize the sentences (map every word to the form returned by the lemmatizer)
def lemmatizeSentence(sent):
    """Lemmatize each token list in ``sent`` and append it to the global ``lemma_sent``.

    Every word is passed through ``WordNetLemmatizer.lemmatize``; one list of
    lemmas per sentence is appended to the module-level list ``lemma_sent``,
    which must exist before the call. Nothing is returned.
    """
    lemmatizer = WordNetLemmatizer()
    for tokens in sent:
        lemma_sent.append([lemmatizer.lemmatize(token) for token in tokens])

In [None]:
#final sentence
def finalSentence(sentence1):
    """Join each token list in ``sentence1`` with single spaces.

    Tokens are converted with ``str()`` first. One joined string per token
    list is appended to the module-level list ``final_sentence_list``, which
    must exist before the call. Nothing is returned.
    """
    for tokens in sentence1:
        final_sentence_list.append(' '.join(str(token) for token in tokens))

In [None]:
# RemoveUnneccasaryChar appends to this global list, so reset it first;
# otherwise re-running the cell would add the tweets a second time.
sent = []
RemoveUnneccasaryChar(train['text'])

In [None]:
# Spot-check one cleaned tweet (index 83 appears to be an arbitrary sample).
sentence = sent[83]
print(sentence)

In [None]:
# First 20 rows, now including the 'text_clean' column added above.
train.head(20)

In [None]:
# English stop-word list; stopwordsSentence reads this global when it is called,
# so this cell has to run before that function is used.
stopwords_eng = stopwords.words('english')
print('English Stop Words :\n')
print(stopwords_eng)
print('\nPunctuations  :\n')
print(string.punctuation)

In [None]:
tokenized_sent = [] # accumulator filled by TokenizeSentence: one list of tokens per tweet
TokenizeSentence(sent)

In [None]:
# Tokens of the second tweet, as a quick sanity check.
tokenized_sent[1]

CONVERTING THE SENTENCES INTO FORMATTED TEXT

In [None]:
# Reset the global accumulator that stopwordsSentence appends to, then filter the tokens.
formatted_sent = []
stopwordsSentence(tokenized_sent)

In [None]:
# Filtered tokens of one tweet (index 38), as a quick sanity check.
formatted_sent[38]

In [None]:
# NOTE(review): nltk is already imported near the top of the notebook; this
# download could live next to the other nltk.download calls.
import nltk
nltk.download('omw-1.4')  # presumably needed by the lemmatizer in this environment — TODO confirm

In [None]:
# Reset the global accumulator that lemmatizeSentence appends to, then lemmatize.
lemma_sent = []
lemmatizeSentence(formatted_sent)

In [None]:
# Lemmatized tokens of the tweet at index 83 (the same index printed after cleaning).
lemma_sent[83]

In [None]:
# Reset the global accumulator that finalSentence appends to, then join the tokens back into strings.
final_sentence_list = []
finalSentence(lemma_sent)

In [None]:
# One fully processed tweet (index 8), as a quick sanity check.
final_sentence_list[8]

In [None]:
# Attach the processed sentences as a new column. This assumes
# final_sentence_list holds exactly one entry per row of train, in row order —
# verify if any preprocessing cell was run more than once.
train['FormattedText'] = final_sentence_list
train.head()

In [None]:
# Last 10 rows, to check the new column at the end of the frame as well.
train.tail(10)

NOW LET'S TRAIN THE MODEL

In [None]:
# 'text_clean' is not used for training. errors='ignore' keeps this cell
# re-runnable: once the column is gone, a second run would otherwise raise KeyError.
train = train.drop(columns = ['text_clean'], errors = 'ignore')

In [None]:
# Model input: the processed text. Model target: the 'target' column.
x_train = train['FormattedText']
y_train = train['target']

In [None]:
# Plain NumPy arrays, later handed to tf.data.Dataset.from_tensor_slices.
x_train_array = x_train.to_numpy()
y_train_array = y_train.to_numpy()

In [None]:
# Display the text array.
x_train_array

In [None]:
# Display the label array.
y_train_array

In [None]:
# NOTE(review): tensorflow is already imported as tf in an earlier cell; this cell re-imports it.
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras import losses
# NOTE(review): forcing eager execution looks like a debugging aid and may
# slow training down — confirm it is still needed.
tf.config.run_functions_eagerly(True)

In [None]:
# Dataset of (text, label) pairs built from the two arrays.
train1 = tf.data.Dataset.from_tensor_slices((x_train_array,y_train_array))

In [None]:
# Display the dataset object.
train1

In [None]:
# Input-pipeline settings used by the shuffle/batch cell below.
BUFFER_SIZE = 3200  # size of the shuffle buffer
BATCH_SIZE = 130    # examples per batch

In [None]:
# Shuffle, batch and prefetch. NOTE(review): this rebinds train1, so running the
# cell twice would batch an already batched dataset — rebuild train1 first when re-running.
train1 = train1.shuffle(BUFFER_SIZE).batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)


In [None]:
# Upper bound on the vocabulary size, passed to TextVectorization as max_tokens.
VOCAB_SIZE = 12000

**USING TEXTVECTORIZATION TO CONVERT THE TEXT INTO NUMBERS**

In [None]:
# Text-vectorization layer whose vocabulary is capped at VOCAB_SIZE tokens.
encoder = tf.keras.layers.TextVectorization(max_tokens = VOCAB_SIZE)
# Fit the vocabulary on the tweet text only; the lambda drops the labels.
encoder.adapt(train1.map(lambda text,target: text))

In [None]:
#print the texts and labels of one batch (nothing is converted here)
# NOTE: `text` and `label` stay defined after the loop; a later cell reuses `text`.
for text,label in train1.take(1):
    print('Text: ',text.numpy())
    print('Label: ',label.numpy())

In [None]:
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense,Dropout,LSTM,SimpleRNN,Embedding,Bidirectional,LSTM,GlobalMaxPool1D
from keras.models import Sequential

In [None]:
#numerical representation of text
# Relies on `text` left over from the batch-printing loop in an earlier cell,
# so that cell has to run first.
print('Original Text :' +str(text))
encoded_text = encoder(text).numpy()
print('Numeric Representaion :' +str(encoded_text))

**USING LSTM MODEL AND TRAINING THE MODEL**

In [None]:
#LSTM MODEL
from tensorflow.keras.layers import LSTM as lstm
# The layers are listed in the order the data flows through them:
# raw strings -> token ids -> embeddings -> BiLSTM -> max-pool -> dense head.
model = Sequential([
    encoder,
    Embedding(input_dim = len(encoder.get_vocabulary()), output_dim = 16, mask_zero = True),
    Bidirectional(LSTM(16, return_sequences = True)),
    Dropout(0.20),
    GlobalMaxPool1D(),
    Dropout(0.20),
    Dense(10, activation = 'relu'),
    Dropout(0.10),
    # No activation on the last layer: the model emits one raw score per tweet.
    Dense(1),
])
model.summary()

In [None]:
# Lower the learning rate when the training loss stops improving. min_lr has
# to be below the optimizer's starting rate (Adam defaults to 0.001);
# with min_lr equal to that rate no reduction could ever take place.
reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(monitor='loss',factor=0.25,patience=2,min_lr=1e-5)
# The last Dense(1) layer has no activation, so the model outputs raw logits:
# the loss is told so via from_logits=True, and accuracy must cut at logit 0
# (probability 0.5) rather than at 0.5. The metric keeps the name 'accuracy'.
model.compile(loss=losses.BinaryCrossentropy(from_logits=True),
              optimizer='adam',
              metrics=[tf.keras.metrics.BinaryAccuracy(name='accuracy', threshold=0.0)])


In [None]:
# train1 is already batched by the input pipeline, so batch_size must not be
# passed here as well. The ReduceLROnPlateau callback defined with the
# compile step is wired in so that it actually takes part in training.
history = model.fit(train1, epochs = 10, callbacks = [reduce_lr])

In [None]:
# Write the architecture diagram to model.png, then show the saved image inline.
plot_model(model, to_file='model.png', show_shapes=True, show_layer_names=True)
display(Image.open('model.png'))

**PREPROCESSING THE TEST DATA**

In [None]:
# Same pipeline as for the training text. Every helper appends to a global
# list, so each accumulator is reset right before the helper that fills it.
sent = []
RemoveUnneccasaryChar(test['text'])
tokenized_sent = []
TokenizeSentence(sent)
formatted_sent = []
stopwordsSentence(tokenized_sent)
lemma_sent = []
lemmatizeSentence(formatted_sent)
final_sentence_list = []
finalSentence(lemma_sent)
# NOTE(review): this overwrites the raw test text in place, so the original
# tweets are no longer available from `test` afterwards.
test['text'] = final_sentence_list
test

In [None]:
# Processed test text as a Series and as a NumPy array.
# NOTE(review): x_test_array is not referenced by the prediction cell below.
x_test = test['text']
x_test_array = x_test.to_numpy()

**PREDICTING WITH THE MODEL**

In [None]:
# The network ends in Dense(1) without an activation and is trained with
# from_logits=True, so predict() returns logits. Squash them through a sigmoid
# before comparing against a probability threshold.
logits = model.predict(test["text"])
probs = tf.sigmoid(logits).numpy()
threshold = 0.4
# 1 where the predicted probability exceeds the threshold, 0 otherwise.
pred = np.where(probs > threshold, 1, 0)
print(pred)