## Importing needed libraries

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from nltk.corpus import stopwords
from wordcloud import STOPWORDS
from nltk.util import ngrams
from sklearn.feature_extraction.text import CountVectorizer
from collections import defaultdict
from collections import  Counter
plt.style.use('ggplot')
stop=set(stopwords.words('english'))
import re
from nltk.tokenize import word_tokenize
import gensim
import string
import gc
import operator
import nltk

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from tqdm import tqdm
from keras.models import Sequential
from keras.layers import Embedding,LSTM,Dense,SpatialDropout1D
from keras.initializers import Constant
from sklearn.model_selection import train_test_split
from keras.optimizers import Adam
import tensorflow as tf
import tensorflow_hub as hub
from tensorflow import keras
from tensorflow.keras.optimizers import SGD, Adam
from tensorflow.keras.layers import Dense, Input, Dropout, GlobalAveragePooling1D
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, Callback

from sklearn.model_selection import StratifiedKFold, StratifiedShuffleSplit
from sklearn.metrics import precision_score, recall_score, f1_score

## Importing the dataset

In [None]:
# Read the train and test CSV files from ../input/nlp-getting-started into DataFrames.
train = pd.read_csv("../input/nlp-getting-started/train.csv")
test = pd.read_csv("../input/nlp-getting-started/test.csv")

In [None]:
# Preview the first rows of the training frame.
train.head()

In [None]:
# Preview the first rows of the test frame.
test.head()

In [None]:
# NOTE: DataFrame.size is rows x columns (total cell count), not the number of rows.
train.size

In [None]:
# NOTE: DataFrame.size is rows x columns (total cell count), not the number of rows.
test.size

# EDA

In [None]:
# Class balance of the training labels. The column is passed by keyword
# because newer seaborn releases treat the first positional argument as
# `data`, not `x`.
sns.countplot(x='target', data=train)

## There are more tweets with target 0 (no disaster) than with target 1 (disaster)

In [None]:
# Bar height is the mean of the boolean null mask, i.e. the fraction of
# missing keywords in the training set. The mask is passed as `x=` because
# newer seaborn releases treat the first positional argument as `data`.
sns.barplot(x=train['keyword'].isnull().values)
print("The number of null values in keyword are", train['keyword'].isnull().sum())

In [None]:
# Bar height is the mean of the boolean null mask, i.e. the fraction of
# missing locations in the training set. The mask is passed as `x=` because
# newer seaborn releases treat the first positional argument as `data`.
sns.barplot(x=train['location'].isnull().values)
print("The number of null values in location are", train['location'].isnull().sum())

## So, around 8% of the keyword values and 33% of the location values are null in the training sample

In [None]:
# Bar height is the mean of the boolean null mask, i.e. the fraction of
# missing keywords in the test set. The mask is passed as `x=` because
# newer seaborn releases treat the first positional argument as `data`.
sns.barplot(x=test['keyword'].isnull().values)
print("The number of null values in keyword are", test['keyword'].isnull().sum())

In [None]:
# Bar height is the mean of the boolean null mask, i.e. the fraction of
# missing locations in the test set. The mask is passed as `x=` because
# newer seaborn releases treat the first positional argument as `data`.
sns.barplot(x=test['location'].isnull().values)
# The message names the column actually being counted (location).
print("The number of null values in location are", test['location'].isnull().sum())

## So, both the training and test sets have the same ratio of missing values in keyword and location

## Filling the null values with "Unknown" for EDA purpose

In [None]:
# Replace missing keyword/location entries with the placeholder "Unknown"
# in both frames, so that missing values show up as their own category.
placeholder_cols = ['keyword', 'location']
for frame in (train, test):
    frame[placeholder_cols] = frame[placeholder_cols].fillna('Unknown')
train.head()

In [None]:
# Number of distinct keywords; the 'Unknown' placeholder filled in for
# missing values counts as one of them.
train['keyword'].nunique()

## Top keywords that suggest a disaster tweet

In [None]:
# Per-keyword tweet count and share of disaster tweets, highest share first.
# Aggregations are given as string names: passing NumPy callables
# (np.size, np.mean) to `agg` is deprecated in recent pandas releases.
ag = (
    train.groupby('keyword')
    .agg({'text': 'size', 'target': 'mean'})
    .rename(columns={'text': 'Count', 'target': 'Disaster Probability'})
)
ag.sort_values('Disaster Probability', ascending=False).head(20)

## Top keywords that suggest not being a disaster tweet

In [None]:
# Per-keyword tweet count and share of disaster tweets, lowest share first.
# Aggregations are given as string names: passing NumPy callables
# (np.size, np.mean) to `agg` is deprecated in recent pandas releases.
ag = (
    train.groupby('keyword')
    .agg({'text': 'size', 'target': 'mean'})
    .rename(columns={'text': 'Count', 'target': 'Disaster Probability'})
)
ag.sort_values('Disaster Probability', ascending=True).head(20)

In [None]:
# Side-by-side histograms of tweet length in characters, split by label.
tweet_len1 = train.loc[train['target'] == 1, 'text'].str.len()
tweet_len2 = train.loc[train['target'] == 0, 'text'].str.len()

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 5))

# Left panel: disaster tweets.
ax1.hist(tweet_len1, color='pink')
ax1.set_title('disaster tweets')

# Right panel: non-disaster tweets.
ax2.hist(tweet_len2, color='blue')
ax2.set_title('Not disaster tweets')

fig.suptitle('Characters in tweets', size=20)
plt.show()

## The character distribution is almost the same for both

In [None]:
# Compare train (pink) and test (green) on words per tweet (left panel)
# and stop words per tweet (right panel).
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 5))
for frame, colour in ((train, 'pink'), (test, 'green')):
    # Whitespace-split word count of each tweet.
    word_counts = frame['text'].apply(lambda x: len(str(x).split()))
    # Number of lower-cased tokens that appear in the wordcloud STOPWORDS set.
    stop_counts = frame['text'].apply(
        lambda x: len([w for w in str(x).lower().split() if w in STOPWORDS])
    )
    ax1.hist(word_counts, color=colour)
    ax2.hist(stop_counts, color=colour)
ax1.set_title('Word Count distribution in Training, Test Set')
ax2.set_title('Stop words distribution in Training, Test Set')

## Emojis convey a lot, so we will replace them with words

In [None]:
# Install the `emot` package; the next cell imports UNICODE_EMO and EMOTICONS
# from emot.emo_unicode.
# NOTE(review): the install is unpinned -- confirm the installed release still
# exports those names, and pin the version if it does not.
!pip install emot

In [None]:
from emot.emo_unicode import UNICODE_EMO, EMOTICONS
# Function for converting emojis into word
def convert_emojis(text):
    """Replace each emoji found in *text* with an underscore-joined label.

    Every key of ``UNICODE_EMO`` is substituted by its mapped description,
    with commas and colons removed and the remaining whitespace-separated
    parts joined by ``_``.

    Args:
        text: String to process.

    Returns:
        The string with all known emojis replaced by their labels.
    """
    for symbol, description in UNICODE_EMO.items():
        label = "_".join(description.replace(",", "").replace(":", "").split())
        text = text.replace(symbol, label)
    return text

# Rewrite the tweet text of both frames with emojis spelled out as words.
train['text'] = train['text'].apply(convert_emojis)
test['text'] = test['text'].apply(convert_emojis)

## URLs, mentions, etc. play no significant role in the sentiment of a tweet, so we will remove them

In [None]:
from nltk.corpus import stopwords
# Load NLTK's English stop words; `clean` below filters tokens against this
# collection. The same word list is also kept as a set named `stop`, built
# next to the imports at the top of the notebook.
stop_word = stopwords.words('english')

def clean(text):
    """Strip tweet noise from *text* and drop stop words.

    Removes, in this order, URLs, @mentions, #hashtags, digit runs and HTML
    tags (each replaced by a single space). The text is then split on
    whitespace, tokens found in the module-level ``stop_word`` collection
    are discarded (case-sensitive comparison), and the remaining tokens are
    joined back with single spaces.

    Args:
        text: Raw tweet string.

    Returns:
        The cleaned string.
    """
    # remove urls
    text = re.sub(r'http\S+', ' ', text)
    # remove mentions
    text = re.sub(r'@\w+', ' ', text)
    # remove hashtags
    text = re.sub(r'#\w+', ' ', text)
    # remove digits
    text = re.sub(r'\d+', ' ', text)
    # remove html tags -- the raw-string prefix must sit outside the quotes;
    # inside them the "r" is matched as a literal character and ordinary
    # tags such as <b> are never removed
    text = re.sub(r'<.*?>', ' ', text)
    # remove stop words; splitting and re-joining also collapses whitespace
    return " ".join(word for word in text.split() if word not in stop_word)

In [None]:
# Run the cleaning function over the tweet text of both frames.
train['text'] = train['text'].apply(clean)
test['text'] = test['text'].apply(clean)

In [None]:
# Preview the training frame after the text column has been cleaned.
train.head()

## Word Embedding using Glove

In [None]:
from tqdm import tqdm
def create_corpus(train):
    """Tokenise every tweet in ``train['text']`` into lower-cased words.

    Args:
        train: Object whose ``'text'`` entry iterates over tweet strings
            (the training DataFrame in this notebook).

    Returns:
        A list with one token list per tweet. Tokens are purely alphabetic,
        lower-cased, and not members of the module-level ``stop`` set.
    """
    corpus = []
    for tweet in tqdm(train['text']):
        # Lower-case BEFORE the stop-word test: the stop set is compared
        # against the final token, so capitalised stop words such as
        # "The" or "I" are filtered out as well.
        lowered = (token.lower() for token in word_tokenize(tweet) if token.isalpha())
        corpus.append([token for token in lowered if token not in stop])
    return corpus
        

In [None]:
# Tokenise the training tweets only; the test frame is not part of this corpus.
corpus=create_corpus(train)

In [None]:
# Read the GloVe text file into a {word: float32 vector} lookup. Each line
# holds a word followed by its whitespace-separated coefficients.
embedding_dict = {}
with open('../input/glove6b/glove.6B.100d.txt', 'r', encoding='utf8') as f:
    for line in f:
        parts = line.split()
        embedding_dict[parts[0]] = np.asarray(parts[1:], 'float32')
# The `with` block has already closed the file; no explicit close is needed.

In [None]:
# Length every encoded tweet is padded or truncated to.
MAX_LEN=50
# Fit the Keras tokenizer on the tokenised tweets and encode each tweet as a
# sequence of integer word indices.
tokenizer_obj=Tokenizer()
tokenizer_obj.fit_on_texts(corpus)
sequences=tokenizer_obj.texts_to_sequences(corpus)

# Pad (and truncate) at the end of each sequence so all rows have MAX_LEN entries.
tweet_pad=pad_sequences(sequences,maxlen=MAX_LEN,truncating='post',padding='post')

In [None]:
# Mapping from each word seen by the tokenizer to its integer index.
word_index=tokenizer_obj.word_index
print('Number of unique words:',len(word_index))

In [None]:
# Build the embedding matrix: row i holds the GloVe vector of the word whose
# tokenizer index is i. Rows for words missing from GloVe stay all-zero, as
# does any row that no word index points to.
num_words = len(word_index) + 1
embedding_matrix = np.zeros((num_words, 100))

for word, i in tqdm(word_index.items()):
    # Valid rows are 0 .. num_words - 1, so an index equal to num_words is
    # already out of range and must be skipped as well.
    if i >= num_words:
        continue

    emb_vec = embedding_dict.get(word)
    if emb_vec is not None:
        embedding_matrix[i] = emb_vec

## Basic LSTM model

In [None]:
# Architecture: frozen pre-trained embedding -> spatial dropout ->
# single LSTM layer -> one sigmoid output unit for binary classification.
embedding = Embedding(
    num_words,
    100,
    embeddings_initializer=Constant(embedding_matrix),
    input_length=MAX_LEN,
    trainable=False,  # keep the pre-trained vectors fixed during training
)

model = Sequential([
    embedding,
    SpatialDropout1D(0.2),
    LSTM(64, dropout=0.2, recurrent_dropout=0.2),
    Dense(1, activation='sigmoid'),
])

optimzer = Adam(learning_rate=1e-5)

model.compile(optimizer=optimzer, loss='binary_crossentropy', metrics=['accuracy'])

In [None]:
# Print the layer-by-layer summary of the compiled model.
model.summary()

In [None]:
# Keep the first len(train) padded rows. `tweet_pad` was built from the
# train-only corpus, so this slice covers every row of it.
train1=tweet_pad[:train.shape[0]]

In [None]:
# Hold out 20% of the training rows for validation. The split is seeded so
# runs are reproducible, and stratified on the label so both parts keep the
# same class ratio (the classes are imbalanced).
X_train, X_test, y_train, y_test = train_test_split(
    train1,
    train['target'].values,
    test_size=0.2,
    random_state=42,
    stratify=train['target'].values,
)
print('Shape of train',X_train.shape)
print("Shape of Validation ",X_test.shape)

In [None]:
# Train for 15 epochs with mini-batches of 4 samples, evaluating on the
# held-out split (X_test, y_test) after each epoch; the returned object is
# kept in `history`.
history=model.fit(X_train,y_train,batch_size=4,epochs=15,validation_data=(X_test,y_test),verbose=2)

## If you want to score even higher, change the prediction to 1 for all tweets that have the keywords wreckage, debris, or derailment, as we saw above in the EDA that they have an almost 100% chance of being disaster tweets. Similarly, check for keywords that indicate non-disaster tweets

## Upvote if you found it helpful!