In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from keras.models import Model
from keras.layers import Dense, Embedding, Input
from keras.layers import LSTM, Bidirectional, GlobalMaxPool1D, Dropout
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.optimizers import Adam
from sklearn.metrics import roc_auc_score
import warnings
warnings.filterwarnings("ignore")
import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the competition's training and test CSV files into DataFrames.
train = pd.read_csv("/kaggle/input/nlp-getting-started/train.csv")
test = pd.read_csv("/kaggle/input/nlp-getting-started/test.csv")

In [None]:
# Preview the first rows of the raw training data.
train.head()

In [None]:
# Count missing values per column of the training data.
train.isna().sum()

In [None]:
# (rows, columns) of the training data.
train.shape

In [None]:
from nltk.corpus import stopwords
import nltk, os, re, string
# Tokens to filter out: NLTK's English stopwords plus every character in
# string.punctuation.
punctuation = list(string.punctuation)
stop = set(stopwords.words('english')) | set(punctuation)


def remove_stopwords(text):
    """Return ``text`` with stopword and punctuation tokens removed.

    The input is split on whitespace. A token is dropped when its lower-cased
    form is a member of the module-level ``stop`` set; surviving tokens keep
    their original casing and are re-joined with single spaces.
    """
    kept = [
        token.strip()
        for token in text.split()
        if token.strip().lower() not in stop
    ]
    return " ".join(kept)

# Strip stopwords/punctuation tokens from the text column of both datasets.
for frame in (train, test):
    frame['text'] = frame['text'].apply(remove_stopwords)

In [None]:
# Preview the test data after stopword removal.
test.head()

In [None]:
import nltk
from nltk.stem import WordNetLemmatizer
import re
from nltk.corpus import stopwords

lemma = WordNetLemmatizer()

# Built once at definition time. Previously the stopword list was re-read and
# re-converted to a set for every single token inside the comprehension.
_ENGLISH_STOPWORDS = frozenset(stopwords.words("english"))

# URLs have to be removed *before* punctuation is blanked out; otherwise a link
# is shredded into junk tokens ("http", "t", "co", ...) that no later URL
# regex can recognise any more.
_URL_PATTERN = re.compile(r"https?://\S+|www\.\S+")

# @mentions, and any character that is not a letter, digit, space or tab.
_NOISE_PATTERN = re.compile("(@[A-Za-z0-9_]+)|([^0-9A-Za-z \t])")


def process_text(text):
    """Normalise a tweet into a space-separated string of lemmatised tokens.

    Steps, in order:
      1. lower-case the text;
      2. replace URLs with a space;
      3. replace @mentions and every non-alphanumeric character (other than
         space/tab) with a space;
      4. tokenise with ``nltk.word_tokenize``;
      5. drop English stopwords and lemmatise the remaining tokens.

    Args:
        text: the raw tweet text (``str``).

    Returns:
        The cleaned tokens joined by single spaces (possibly an empty string).
    """
    text = text.lower()
    text = _URL_PATTERN.sub(" ", text)
    text = _NOISE_PATTERN.sub(" ", text)
    words = nltk.word_tokenize(text)
    return " ".join(
        lemma.lemmatize(word) for word in words if word not in _ENGLISH_STOPWORDS
    )

# Normalise and lemmatise the text column of both datasets.
for frame in (train, test):
    frame["text"] = frame["text"].apply(process_text)

In [None]:
import emoji

def cleanTweet(txt):
    """Strip tweet-specific noise from ``txt`` and return the cleaned string.

    Removes URLs, HTML-like ``<...>`` tags, the ``RT : `` retweet marker,
    ``#`` characters and emojis; newlines become spaces.

    Args:
        txt: tweet text (``str``).

    Returns:
        The cleaned text.
    """
    # URLs go first, and only the broad pattern is used. Running the narrow
    # character-class pattern beforehand removed just the head of a link and
    # left its tail (query strings, '-', '_' ...) behind, and removing newlines
    # beforehand glued the following word onto the URL.
    txt = re.sub(r"https?://\S+|www\.\S+", "", txt)
    txt = re.sub(r"<.*?>", "", txt)
    txt = re.sub(r'RT : ', '', txt)
    txt = re.sub(r'#', '', txt)
    # Replace with a space (not ''), so words on adjacent lines are not merged.
    txt = re.sub(r'\n', ' ', txt)
    # to remove emojis: emoji.get_emoji_regexp() was removed in emoji 2.0, so
    # prefer replace_emoji() and only fall back on old installations.
    if hasattr(emoji, "replace_emoji"):
        txt = emoji.replace_emoji(txt, replace="")
    else:
        txt = re.sub(emoji.get_emoji_regexp(), r"", txt)
    return txt

In [None]:
# Run the tweet-specific cleanup over the text column of both datasets.
for frame in (train, test):
    frame["text"] = frame["text"].apply(cleanTweet)

In [None]:
# Preview the training data after text cleaning.
train.head()

In [None]:
# Preview the test data after text cleaning.
test.head()

In [None]:
# Class balance: number of rows per target value.
train.target.value_counts()

In [None]:
# Character count of each cleaned tweet, stored as a new "length" column.
train["length"] = train["text"].map(len)

In [None]:
# Side-by-side histograms (with KDE overlay) of tweet length for each class.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5))
for label, axis, heading in ((1, ax1, "disaster tweets"), (0, ax2, "Not disaster tweets")):
    lengths = train.loc[train["target"] == label, "length"]
    sns.histplot(lengths, bins=30, ax=axis, kde=True).set(title=heading)
plt.show()

In [None]:
# Split the training frame by label so each class can be summarised separately.
is_disaster = train["target"] == 1
is_not_disaster = train["target"] == 0
train_disaster = train[is_disaster]
train_not_disaster = train[is_not_disaster]

In [None]:
# Summary statistics of tweet length for rows with target == 1.
train_disaster.length.describe()

In [None]:
# Summary statistics of tweet length for rows with target == 0.
train_not_disaster.length.describe()

In [None]:
# --- data locations ---
path = '/kaggle/input/'
EMBEDDING_FILE = path + 'glove6b50dtxt/glove.6B.50d.txt'

# --- configuration settings ---
MAX_SEQUENCE_LENGTH = 100   # `maxlen` handed to pad_sequences
MAX_VOCAB_SIZE = 20000      # `num_words` handed to the Tokenizer
EMBEDDING_DIM = 50          # width of each row of the embedding matrix
VALIDATION_SPLIT = 0.2      # `validation_split` handed to model.fit
BATCH_SIZE = 64             # `batch_size` handed to model.fit
EPOCHS = 5                  # training epochs (configuration value)

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the labelled data (fixed seed) for the final evaluation.
X = train['text']
y = train['target']
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# load in pre-trained word vectors
print('Loading word vectors...')
word2vec = {}
# The encoding is given explicitly so parsing does not depend on the
# platform's default text encoding (GloVe files are UTF-8).
with open(EMBEDDING_FILE, encoding='utf-8') as f:
  # is just a space-separated text file in the format:
  # word vec[0] vec[1] vec[2] ...
  for line in f:
    values = line.split()
    if not values:
      # tolerate blank lines instead of failing on values[0]
      continue
    word = values[0]
    vec = np.asarray(values[1:], dtype='float32')
    word2vec[word] = vec
print('Found %s word vectors.' % len(word2vec))

In [None]:
# convert the sentences (strings) into integers
tokenizer = Tokenizer(num_words=MAX_VOCAB_SIZE)
train_texts = list(X_train)
tokenizer.fit_on_texts(train_texts)
train_sequences = tokenizer.texts_to_sequences(train_texts)

# pad sequences so that we get a N x T matrix
# NOTE(review): X_train is rebound from raw text to the padded matrix here, so
# this cell is not safe to run twice without re-running the split cell.
X_train = pad_sequences(train_sequences, maxlen=MAX_SEQUENCE_LENGTH)
print('Shape of data tensor:', X_train.shape)

In [None]:
# Encode the held-out texts with the tokenizer fitted on the training texts.
holdout_sequences = tokenizer.texts_to_sequences(list(X_test))

# pad sequences so that we get a N x T matrix
X_test = pad_sequences(holdout_sequences, maxlen=MAX_SEQUENCE_LENGTH)
print('Shape of data tensor:', X_test.shape)

In [None]:
# Encode the competition test texts with the same fitted tokenizer.
# NOTE(review): `test` is rebound from the DataFrame to the padded matrix, so
# the DataFrame is no longer available afterwards and this cell cannot be
# re-run without reloading the data.
submission_sequences = tokenizer.texts_to_sequences(list(test['text']))

# pad sequences so that we get a N x T matrix
test = pad_sequences(submission_sequences, maxlen=MAX_SEQUENCE_LENGTH)
print('Shape of data tensor:', test.shape)

In [None]:
# get word -> integer mapping
# (word_index is read from the tokenizer fitted on the training texts)
word2idx = tokenizer.word_index
print('Found %s unique tokens.' % len(word2idx))

In [None]:
# prepare embedding matrix
print('Filling pre-trained embeddings...')
num_words = min(MAX_VOCAB_SIZE, len(word2idx) + 1)
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))
for token, row in word2idx.items():
  if row >= MAX_VOCAB_SIZE:
    # index falls outside the capped vocabulary
    continue
  pretrained = word2vec.get(token)
  if pretrained is None:
    # words not found in embedding index will be all zeros.
    continue
  embedding_matrix[row] = pretrained

In [None]:
# Wrap the pre-trained matrix in an Embedding layer.
# trainable=False keeps the embeddings fixed during training.
embedding_layer = Embedding(
  input_dim=num_words,
  output_dim=EMBEDDING_DIM,
  weights=[embedding_matrix],
  input_length=MAX_SEQUENCE_LENGTH,
  trainable=False,
)

In [None]:
print('Building model...')

# Network: embedding layer -> bidirectional LSTM (15 units per direction,
# returning the full sequence) -> global max-pool over time -> one sigmoid
# unit for the binary label.
input_ = Input(shape=(MAX_SEQUENCE_LENGTH,))
x = embedding_layer(input_)
x = Bidirectional(LSTM(15, return_sequences=True))(x)
x = GlobalMaxPool1D()(x)
output = Dense(1, activation="sigmoid")(x)

model = Model(input_, output)
model.compile(
  loss='binary_crossentropy',
  # `learning_rate` is the supported keyword; the legacy `lr` alias is
  # deprecated and rejected by newer Keras releases.
  optimizer=Adam(learning_rate=0.01),
  metrics=['accuracy']
)
model.summary()

In [None]:
print('Training model...')
# All training hyper-parameters come from the configuration constants, so
# editing the config cell actually changes the run (the epoch count used to
# be a hard-coded literal that silently ignored EPOCHS).
r = model.fit(
  X_train,
  y_train,
  batch_size=BATCH_SIZE,
  epochs=EPOCHS,
  validation_split=VALIDATION_SPLIT
)

In [None]:
# Training vs. validation loss per epoch
for series_name in ('loss', 'val_loss'):
    plt.plot(r.history[series_name], label=series_name)
plt.legend()
plt.show()

In [None]:
# Training vs. validation accuracy per epoch
for history_key, legend_label in (('accuracy', 'acc'), ('val_accuracy', 'val_acc')):
    plt.plot(r.history[history_key], label=legend_label)
plt.legend()
plt.show()

In [None]:
# model.evaluate returns [loss, accuracy]; report the accuracy as a percentage.
train_accuracy_pct = model.evaluate(X_train, y_train)[1] * 100
print("Accuracy of the model on Training Data is - " , train_accuracy_pct)
test_accuracy_pct = model.evaluate(X_test, y_test)[1] * 100
print("Accuracy of the model on Testing Data is - " , test_accuracy_pct)

In [None]:
# Model outputs (sigmoid activations) for the held-out split; show the first five.
pred = model.predict(X_test)
pred[:5]

In [None]:
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score

# Confusion matrix of the rounded predictions on the held-out split, drawn as
# an annotated heatmap.
class_names = ['Not_disaster','Disaster']
cm = pd.DataFrame(
    confusion_matrix(y_test, pred.round()),
    index=class_names,
    columns=class_names,
)
plt.figure(figsize=(6, 4))
sns.heatmap(
    cm,
    cmap="Blues",
    linecolor='black',
    linewidth=1,
    annot=True,
    fmt='',
    xticklabels=class_names,
    yticklabels=class_names,
)
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.show()

In [None]:
# Per-class metrics report for the rounded predictions on the held-out split.
print(classification_report(y_test,pred.round()))

In [None]:
# Model outputs for the padded competition test set; show the first five.
predictions_test = model.predict(test)
predictions_test[:5]

In [None]:
# Load the sample submission file; it is used as the template for the output.
submission_sample = pd.read_csv("/kaggle/input/nlp-getting-started/sample_submission.csv")
# NOTE(review): only a cell's last expression is rendered, so this head()
# result is not displayed.
submission_sample.head()
submission_sample.shape

In [None]:
# Threshold the model outputs at 0.5 to get 0/1 labels. The model ends in a
# single-unit Dense layer, so predictions come back as one column per row;
# flatten to 1-D so the values line up unambiguously with the target column.
predicted_value = (predictions_test > 0.5).astype(int).ravel()
# Bracket assignment is the explicit way to set a column (attribute assignment
# only works when the column already exists).
submission_sample["target"] = predicted_value
submission_sample.head()

In [None]:
# Write the submission to the working directory without the index column.
submission_sample.to_csv('submission_3.csv',index=False)