### Import necessary libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, precision_recall_curve

import re
import string
import tqdm
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist
from spacy.lang.en import English
from spacy.lang.en.stop_words import STOP_WORDS
lemma = WordNetLemmatizer()

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Bidirectional, LSTM, Dropout, BatchNormalization,SpatialDropout1D
from keras.layers.embeddings import Embedding
from keras.initializers import Constant
from keras.optimizers import Adam

### Load data

In [None]:
# Read the competition's train/test CSVs and the sample submission from the
# Kaggle input directory.
train= pd.read_csv('../input/nlp-getting-started/train.csv')
test=pd.read_csv('../input/nlp-getting-started/test.csv')
# NOTE(review): `submission` is rebuilt from scratch in the final cells of this
# notebook, so the frame loaded here is only available for inspection.
submission = pd.read_csv("../input/nlp-getting-started/sample_submission.csv")

In [None]:
# Report the dimensions of both splits as a quick sanity check on the load.
# The training frame is named `train` in this notebook.
print('There are {} rows and {} columns in train set'.format(train.shape[0], train.shape[1]))
print('There are {} rows and {} columns in test set'.format(test.shape[0], test.shape[1]))

In [None]:
# Preview the first ten training rows.
train.head(10)

In [None]:
# Per-column count of missing values in the training set, with the column
# names moved out of the index into a regular column for display.
null_counts = train.isnull().sum()
missing_val = pd.DataFrame(null_counts).reset_index()
missing_val

In [None]:
# Column dtypes and non-null counts for the training frame.
train.info()

In [None]:
# Summary statistics, transposed so that each described column becomes a row.
train.describe().T

### Data cleaning and preprocessing

Before analysing the data, we clean the tweets by removing URLs, emojis, HTML tags, and punctuation.

In [None]:
def remove_URL(text):
    """Return ``text`` with every URL-like substring deleted.

    A substring is removed when it starts with ``http://``, ``https://`` or
    ``www.`` and extends up to the next whitespace character.
    """
    url_pattern = r'https?://\S+|www\.\S+'
    return re.sub(url_pattern, '', text)


def remove_emoji(text):
    """Return ``text`` with every character in the listed Unicode ranges removed.

    Note that the last range runs from U+24C2 all the way up to U+1F251, so it
    matches far more than pictographs (CJK ideographs, for example, fall
    inside it as well).
    """
    code_point_ranges = (
        u'\U0001F600-\U0001F64F'  # emoticons
        u'\U0001F300-\U0001F5FF'  # symbols & pictographs
        u'\U0001F680-\U0001F6FF'  # transport & map symbols
        u'\U0001F1E0-\U0001F1FF'  # flags (iOS)
        u'\U00002702-\U000027B0'
        u'\U000024C2-\U0001F251'
    )
    # One character class, matched greedily so runs of emoji vanish in one go.
    matcher = re.compile('[' + code_point_ranges + ']+', flags=re.UNICODE)
    return matcher.sub('', text)


def remove_html(text):
    """Strip HTML tags and character entities from ``text``.

    Removes anything of the form ``<...>`` (shortest match) as well as named,
    decimal and hexadecimal entities such as ``&amp;``, ``&#39;`` or ``&#x1f;``.
    """
    tag_or_entity = re.compile(r'<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});')
    return tag_or_entity.sub('', text)


def remove_punct(text):
    """Delete every character listed in ``string.punctuation`` from ``text``."""
    # Mapping a code point to None tells str.translate to drop that character.
    drop_table = {ord(symbol): None for symbol in string.punctuation}
    return text.translate(drop_table)

# Run the raw training text through each cleaning helper in turn:
# URLs first, then emoji, HTML tags/entities and finally punctuation.

cleaned = train['text']
for step in (remove_URL, remove_emoji, remove_html, remove_punct):
    cleaned = cleaned.apply(step)
train['clean_text'] = cleaned

In [None]:
# Split every cleaned training tweet into a list of word tokens (NLTK).

cleaned_tweets = train['clean_text']
train['tokenized'] = cleaned_tweets.apply(lambda tweet_text: word_tokenize(tweet_text))


In [None]:
# Inspect the new `clean_text` and `tokenized` columns.
train.head()

In [None]:
# Lower-case every token so the stop-word comparison below is case-insensitive.
train['lower'] = train['tokenized'].apply(
    lambda tokens: [token.lower() for token in tokens])

# Build the English stop-word set once, up front. Constructing it inside the
# filter would re-read the corpus list and rebuild the set for every token.
english_stopwords = set(nltk.corpus.stopwords.words('english'))

# Keep only the tokens that are not stop words.
train['no_stopwords'] = train['lower'].apply(
    lambda tokens: [token for token in tokens if token not in english_stopwords])



In [None]:
# Collapse each filtered token list back into one space-separated string.
train['no_stopwords'] = train['no_stopwords'].apply(
    lambda tokens: ' '.join(str(token) for token in tokens))

In [None]:
# Inspect the `lower` and `no_stopwords` columns added above.
train.head()

In [None]:
# Apply to the test tweets the same preprocessing used for the training set:
# clean -> tokenize -> lower-case -> drop stop words -> re-join into a string.
test_cleaned = test['text']
for cleaner in (remove_URL, remove_emoji, remove_html, remove_punct):
    test_cleaned = test_cleaned.apply(cleaner)
test['clean_text'] = test_cleaned

test['tokenized'] = test['clean_text'].apply(word_tokenize)

test['lower'] = test['tokenized'].apply(
    lambda tokens: [token.lower() for token in tokens])

# Build the English stop-word set once, up front. Constructing it inside the
# filter would re-read the corpus list and rebuild the set for every token.
english_stopwords = set(nltk.corpus.stopwords.words('english'))

test['no_stopwords'] = test['lower'].apply(
    lambda tokens: [token for token in tokens if token not in english_stopwords])

test['no_stopwords'] = [' '.join(map(str, tokens)) for tokens in test['no_stopwords']]

In [None]:
# All processed texts in one list: training tweets first, test tweets after.
combined = [*train['no_stopwords'], *test['no_stopwords']]

In [None]:
# Total number of texts (train + test) in the combined list.
len(combined)

## Exploratory Data Analysis

In [None]:
# Target distribution: class counts as bars (left) and as shares (right).

# Sort by label so the counts are ordered 0, 1 and line up with the pie labels
# below; value_counts() alone orders by frequency, not by label.
target_counts = train['target'].value_counts().sort_index()

fig, axes = plt.subplots(ncols=2, nrows=1, figsize=(18, 6), dpi=100)
# Pass the column as `x` explicitly: newer seaborn releases treat the first
# positional argument as `data`, not as the variable to count.
sns.countplot(x=train['target'], ax=axes[0])
axes[1].pie(target_counts,
            labels=['Not Disaster', 'Disaster'],
            autopct='%1.2f%%',
            shadow=True,
            explode=(0.05, 0),
            startangle=60)
fig.suptitle('Distribution of the Tweets', fontsize=24)
plt.show()

### Word cloud for all disaster tweets

In [None]:
import nltk
# NOTE(review): this rebinds `stopwords`, which the import cell bound to the
# nltk.corpus reader, to a plain list of English stop words. The word-cloud
# cells that follow pass this list to WordCloud.
stopwords = nltk.corpus.stopwords.words('english')

In [None]:
from wordcloud import WordCloud

# Word cloud built from the stop-word-filtered text of tweets with target == 1.
is_disaster = train['target'] == 1
text = ' '.join(train.loc[is_disaster, 'no_stopwords'])
wc = WordCloud(background_color='white', stopwords=stopwords).generate(text)

plt.figure(figsize=(12, 6))
plt.imshow(wc)

### Word cloud for all non-disaster tweets

In [None]:
from wordcloud import WordCloud

# Word cloud built from the stop-word-filtered text of tweets with target == 0.
is_not_disaster = train['target'] == 0
text = ' '.join(train.loc[is_not_disaster, 'no_stopwords'])
wc = WordCloud(background_color='white', stopwords=stopwords).generate(text)

plt.figure(figsize=(12, 6))
plt.imshow(wc)

### Words in a processed tweet

In [None]:
# Histogram of whitespace-separated word counts per tweet, one panel per class.
# NOTE(review): the counts come from the raw `text` column, although the
# figure title says "processed" - confirm which column was intended.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 5))
panels = (
    (ax1, 1, 'blue', 'disaster tweets'),
    (ax2, 0, 'yellow', 'Non disaster tweets'),
)
for ax, label, colour, title in panels:
    tweet_len = train.loc[train['target'] == label, 'text'].str.split().map(len)
    ax.hist(tweet_len, color=colour)
    ax.set_title(title)
fig.suptitle('Words in a processed tweet')
plt.show()

### Average word length in a processed tweet

In [None]:
# Distribution of the mean word length (in characters) per tweet, one panel
# per class. Words are the whitespace-separated pieces of the `text` column.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 5))
panels = (
    (ax1, 1, 'purple', 'disaster tweets'),
    (ax2, 0, 'orange', 'Non disaster tweets'),
)
for ax, label, colour, title in panels:
    word = train.loc[train['target'] == label, 'text'].str.split().apply(
        lambda pieces: [len(piece) for piece in pieces])
    sns.distplot(word.map(lambda lengths: np.mean(lengths)), ax=ax, color=colour)
    ax.set_title(title)
fig.suptitle('Average word length in each processed tweet')

### Creating embedding of our tweets using GloVe embeddings

In [None]:
# Load pretrained GloVe embeddings into a {word: vector} dictionary.
# Each line of the file holds a token followed by its vector components.

embeddings_index = dict()
# The context manager closes the file even if a line fails to parse, and the
# explicit encoding keeps the read independent of the platform default.
with open('../input/glove-global-vectors-for-word-representation/glove.6B.100d.txt',
          encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

In [None]:
# Encode all tweets as fixed-length integer sequences and build the matching
# matrix of GloVe vectors, one row per tokenizer index.

max_len_tweet = 50

tok = Tokenizer()
tok.fit_on_texts(combined)
encoded_tweet = tok.texts_to_sequences(combined)
padded_tweet = pad_sequences(encoded_tweet, maxlen=max_len_tweet, padding='post')

# One row more than there are known words; only the rows addressed by
# word_index values are filled in below.
vocab_size = len(tok.word_index) + 1

tweet_embedding_matrix = np.zeros((vocab_size, 100))
for token, row in tok.word_index.items():
    glove_vector = embeddings_index.get(token)
    if glove_vector is None:
        # No pretrained vector for this word: its row stays all zeros.
        continue
    tweet_embedding_matrix[row] = glove_vector

### Define LSTM network

In [None]:

# Frozen GloVe embedding -> spatial dropout -> single LSTM -> batch norm ->
# small dense head ending in one sigmoid unit for the binary target.
model = Sequential([
    Embedding(vocab_size, 100, input_length=max_len_tweet,
              embeddings_initializer=Constant(tweet_embedding_matrix),
              trainable=False),
    SpatialDropout1D(0.2),
    LSTM(128, dropout=0.3, recurrent_dropout=0.2),
    BatchNormalization(),
    Dense(64, activation='relu'),
    Dropout(0.3),
    Dense(1, activation='sigmoid'),
])
optimzer = Adam(learning_rate=1e-4)
model.compile(optimizer=optimzer, loss='binary_crossentropy', metrics=['accuracy', 'mae'])

In [None]:
# `padded_tweet` was built from `combined`, which lists the training tweets
# first, so the first len(train) rows are exactly the training inputs.
# Deriving the split point from the frame avoids a hard-coded row count.
model.fit(padded_tweet[:len(train)], train['target'].values, epochs = 11)

### Test set

In [None]:
# Preview the test frame, including the preprocessing columns added earlier.
test.head()

In [None]:
# NOTE(review): this repeats the preview from the previous cell.
test.head()

In [None]:
# `padded_tweet` was built from `combined` (training tweets first, test tweets
# after), so every row from index len(train) onward belongs to the test set.
# Deriving the split point from the frame avoids a hard-coded row count.
preds = model.predict(padded_tweet[len(train):])

In [None]:
# First twenty raw model outputs, before thresholding.
preds[:20]

In [None]:
# Turn the model outputs into 0/1 labels by rounding, then flatten to 1-D.
# reshape(-1) infers the length from `preds` instead of hard-coding the number
# of test rows, so the cell keeps working if the test set size changes.
pred = np.round(preds).astype(int).reshape(-1)

In [None]:
# Display the final 0/1 label array.
pred

In [None]:
# Assemble the submission table: one row per test tweet with its predicted label.
submission = pd.DataFrame({
    'id': test['id'].to_list(),
    'target': pred,
})

In [None]:
# Preview the first ten rows of the submission table.
submission.head(10)

In [None]:
# Write the submission table to submission4.csv, leaving out the DataFrame index.
submission.to_csv('submission4.csv',index=False)