# **Import Required packages**

In [None]:
# for load the data
import pandas as pd
import numpy as np 

#for data visualisation
import matplotlib.pyplot as plt
# The bare 'seaborn' style name was deprecated in matplotlib 3.6 in favour of
# 'seaborn-v0_8' and is no longer available in recent releases; fall back to the
# old name only on versions that do not know the new one.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')
import seaborn as sns
from wordcloud import WordCloud,STOPWORDS

#for text cleaning
import re
import string
from nltk.corpus import stopwords,wordnet
#from nltk.stem import PorterStemmer
import nltk 
from nltk.stem import WordNetLemmatizer
#for tokenization
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

#for split the data into train & test
from sklearn.model_selection import train_test_split

#for model building
import tensorflow as tf
import keras 
from keras.models import Sequential
from keras.layers import LSTM,Embedding,Dense, Flatten, Dropout , Bidirectional , Dropout,GlobalAveragePooling1D

#for model evaluation
from sklearn.metrics import classification_report,confusion_matrix

# **Import Data**

In [None]:
# Load the competition CSV files, reading only the columns used in this notebook.
# The training file additionally provides the 'target' label column.
data_train = pd.read_csv("../input/nlp-getting-started/train.csv",usecols=['id','text','target'])
data_test =  pd.read_csv("../input/nlp-getting-started/test.csv",usecols=['id','text'])

In [None]:
data_train.head()

In [None]:
data_test.head()

# **EDA & Text Cleaning**

### **EDA**

In [None]:
#print the shape of train and test data
print('shape train dataframe:',data_train.shape)
print('shape test dataframe:',data_test.shape)

In [None]:
# Report the number of missing values per column, first for train then for test
for title, frame in (("null values for train data : ", data_train),
                     ("null values for test data : ", data_test)):
    print(title)
    print(frame.isna().sum())

In [None]:
# Class balance of the training labels: absolute counts, percentages and a bar chart
s=data_train.target.value_counts()
print(s)
print('0 :',round(s[0]/len(data_train)*100,2),'%')
print('1 :',round(s[1]/len(data_train)*100,2),'%')
# Pass the column as `x=`: recent seaborn releases no longer accept the plotted
# variable as the first positional argument (that slot is `data`).
sns.countplot(x=data_train['target'])

In [None]:
# Add a 'text_length' column holding the number of whitespace-separated words of each tweet
for frame in (data_train, data_test):
    frame["text_length"] = frame['text'].apply(lambda tweet: len(tweet.split()))

In [None]:
print('the max length tweet for train data is:',data_train['text_length'].max()) 
print('the min length tweet for train data is:',data_train['text_length'].min())
print('the max length tweet for test data is:',data_test['text_length'].max()) 
print('the min length tweet for test data is:',data_test['text_length'].min())

In [None]:
data_train['text_length'].plot.hist()

In [None]:
print(len(data_train[data_train.text_length<4]))
print(len(data_train[data_train.text_length>25]))

In [None]:
# Keep only the training tweets that have between 4 and 25 words (inclusive).
# .copy() makes the filtered frame independent of the original one, so the
# columns assigned to it later (e.g. 'clean_text') do not raise
# SettingWithCopyWarning or write into a temporary slice.
data_train = data_train[data_train.text_length.between(4, 25)].copy()

In [None]:
print('the max length tweet is:',data_train['text_length'].max()) 
print('the min length tweet is:',data_train['text_length'].min())

In [None]:
data_train['text_length'].plot.hist()

### **Text cleaning**

In [None]:
def clean_text(data):
    """Return a copy of ``data`` with a normalised ``clean_text`` column.

    The ``text`` column is lower-cased, then stripped of URLs, punctuation,
    digits and stand-alone single letters; runs of whitespace (including
    newlines) are collapsed to one space and the result is trimmed.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a string column named ``text``. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A copy of ``data`` with the additional ``clean_text`` column.
    """
    # Work on a copy so the caller's frame (possibly a slice of another frame)
    # is never mutated behind its back.
    data = data.copy()
    cleaned = data['text'].str.lower()
    substitutions = (
        (r"http\S+", ""),          # URLs
        (r"[^\w\s]", ""),          # punctuation
        (r"\d+", ""),              # digits
        # Stand-alone single letters. Word boundaries also catch letters at the
        # start/end of the tweet and several single letters in a row, which a
        # whitespace-delimited pattern misses.
        (r"\b[a-zA-Z]\b", " "),
        # Collapse whitespace last so the gaps left by the removals above
        # disappear as well; this also takes care of newlines.
        (r"\s+", " "),
    )
    for pattern, replacement in substitutions:
        cleaned = cleaned.str.replace(pattern, replacement, regex=True)
    data['clean_text'] = cleaned.str.strip()
    return data

In [None]:
data_train=clean_text(data_train)
#data_test=clean_text(data_test)

### **Remove StopWords**

In [None]:
def remove_Stopwords(text, stopW=None):
    """Return ``text`` without stop words, tokens re-joined by single spaces.

    Parameters
    ----------
    text : str
        Whitespace-separated text.
    stopW : collection of str, optional
        Words to drop. When omitted, the NLTK English stop-word list is used.

    Returns
    -------
    str
        The remaining tokens joined by one space.
    """
    if stopW is None:
        # Load the NLTK list only once and keep it as a frozenset on the
        # function: the original re-read it for every tweet and searched a list.
        cached = getattr(remove_Stopwords, "_english_stopwords", None)
        if cached is None:
            cached = frozenset(stopwords.words('english'))
            remove_Stopwords._english_stopwords = cached
        stopW = cached
    return " ".join([i for i in text.split() if i not in stopW])

# Drop the stop words from every cleaned training tweet
data_train['clean_text'] = data_train['clean_text'].apply(remove_Stopwords)

In [None]:
#data_test['clean_text']=data_test['clean_text'].apply(lambda x:remove_Stopwords(x))

### **Text Normalization (Lemmatization)**

In [None]:
 def get_wordnet_pos(word):
    """Map POS tag to first character lemmatize() accepts"""
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}

    return tag_dict.get(tag, wordnet.NOUN)

def lemmatize(text):
    """Lemmatize every whitespace-separated token of ``text``.

    Each token is lemmatized with the part of speech returned by
    ``get_wordnet_pos`` for that token; the lemmas are joined by single spaces.
    """
    wnl = WordNetLemmatizer()
    lemmas = []
    for token in text.split():
        lemmas.append(wnl.lemmatize(token, get_wordnet_pos(token)))
    return " ".join(lemmas)


# Lemmatize the cleaned text of every training row
data_train['clean_text'] = data_train['clean_text'].apply(lemmatize)
#data_test['clean_text'] = data_test['clean_text'].apply(lambda x:lemmatize(x) )

In [None]:
#display exemple of data before and after cleaning
print(data_train['text'][2])
print(data_train['clean_text'][2])

### **Word Cloud**

In [None]:
# Word cloud of the cleaned tweets labelled 0.
# The tweets are joined with spaces: str(list) would hand WordCloud the list's
# repr, whose brackets, quotes and commas end up glued to the neighbouring words.
text = " ".join(data_train.loc[data_train["target"] == 0, "clean_text"])
wordcloud = WordCloud(stopwords=STOPWORDS).generate(text)
plt.figure(figsize = (15, 7))
plt.imshow(wordcloud)
plt.axis("off")
plt.title('Wordcloud for normal tweets')
plt.show()

In [None]:
# Word cloud of the cleaned tweets labelled 1.
# The tweets are joined with spaces: str(list) would hand WordCloud the list's
# repr, whose brackets, quotes and commas end up glued to the neighbouring words.
text = " ".join(data_train.loc[data_train["target"] == 1, "clean_text"])
wordcloud = WordCloud(stopwords=STOPWORDS).generate(text)
plt.figure(figsize = (15, 7))
plt.imshow(wordcloud)
plt.axis("off")
plt.title('Wordcloud for disaster tweets')
plt.show()

# **Tokenization**

In [None]:
# Upper bound on the number of distinct words the tokenizer keeps when encoding
max_fatures = 100000
# Keras tokenizer, splitting on single spaces; the vocabulary is learned from
# the cleaned training tweets only
tokenizer = Tokenizer(num_words=max_fatures,split=' ')
tokenizer.fit_on_texts(data_train['clean_text'].values)
# Encode every tweet as a list of word indices ...
X = tokenizer.texts_to_sequences(data_train['clean_text'].values)
# ... and pad with trailing zeros to a fixed length of 25, the maximum word
# count kept by the length filter applied to the training tweets
X = pad_sequences(X,maxlen=25,padding='post')

In [None]:
print(X[5]) 
print(' '.join([tokenizer.index_word[i] for i in X[5] if i!=0]))

# **Split Data to train & Test**

In [None]:
# Labels aligned row-by-row with the padded sequences in X
Y = data_train['target'].values
# Hold out 30% of the labelled tweets; the fixed random_state keeps the split
# reproducible. X_test/Y_test are passed as validation_data when fitting the
# models and are also the data the evaluation section reports on.
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size = 0.3, random_state = 42)
print(X_train.shape,Y_train.shape)
print(X_test.shape,Y_test.shape)

# **Build the model**

### **simple model**

In [None]:
# Baseline model: trainable embedding -> single LSTM -> sigmoid output
embed_dim = 100
# +1 because index 0 is reserved for padding and never assigned to a word
vocab_size = len(tokenizer.index_word)+1
model1 = Sequential()

## embedding layer, learned from scratch (sequences are padded to length 25)
model1.add(Embedding(vocab_size,embed_dim,input_length=25))
#model1.add(Dropout(0.4))
## one unidirectional LSTM layer with 16 units
model1.add(LSTM(16))
#model1.add(Dropout(0.4))
## single sigmoid unit: output used as the probability of target == 1
model1.add(Dense(1, activation='sigmoid'))

opt = tf.keras.optimizers.Adam()
model1.compile(loss="BinaryCrossentropy", optimizer=opt,metrics = ['accuracy'])
model1.summary()

In [None]:
from tensorflow.keras.utils import plot_model
plot_model(model1, show_shapes = True)

In [None]:
keras.backend.clear_session()
# Stop when val_loss has not improved for 3 consecutive epochs.
# restore_best_weights=True rolls the model back to the best epoch; without it
# the model keeps the weights of the last (already degraded) epoch.
early_stop = tf.keras.callbacks.EarlyStopping(monitor='val_loss',mode = 'min',patience=3,verbose=1,restore_best_weights=True)

In [None]:
keras.backend.clear_session()
batch_size = 32
history1 = model1.fit(X_train, Y_train, epochs = 10,batch_size=batch_size,validation_data=(X_test,Y_test), verbose = 1,callbacks=[early_stop])

### **Pretrained word embeddings (fastText)**

In [None]:
import os
import requests, zipfile, io
# Pretrained fastText vectors (wiki-news, 300 dimensions, 1M words)
zip_file_url = "https://dl.fbaipublicfiles.com/fasttext/vectors-english/wiki-news-300d-1M.vec.zip"
# The archive is large: skip the download when the extracted file is already
# present so that re-running the notebook stays cheap.
if not os.path.exists("./wiki-news-300d-1M.vec"):
    r = requests.get(zip_file_url)
    # Fail loudly on an HTTP error instead of handing an error page to ZipFile
    r.raise_for_status()
    with zipfile.ZipFile(io.BytesIO(r.content)) as z:
        z.extractall()

In [None]:

import codecs
# Map every word of the pretrained file to its vector (float32 numpy array)
embeddings_index = {}
# for Glove
# f = codecs.open('glove.840B.300d.txt', encoding='utf-8')
# `with` guarantees the file is closed even if parsing a line fails
with codecs.open("./wiki-news-300d-1M.vec", encoding="utf-8") as f:
    for line in f:
        values = line.rstrip().rsplit(" ")
        # The .vec format starts with a "<word count> <dimension>" header line;
        # skip it (and any other line without a real vector) so it is not
        # stored as a bogus one-element embedding.
        if len(values) <= 2:
            continue
        word = values[0]
        coefs = np.asarray(values[1:], dtype="float32")
        embeddings_index[word] = coefs

In [None]:
# Build the (vocab_size x embed_dim) weight matrix for the Embedding layer:
# row i holds the pretrained vector of the word with tokenizer index i.
words_not_found = []
embed_dim = 300
# +1 because index 0 is reserved for padding and never assigned to a word
vocab_size = len(tokenizer.index_word)+1
embedding_matrix = np.zeros((vocab_size, embed_dim))
for word, i in tokenizer.word_index.items():
    if i >= vocab_size:
        continue
    embedding_vector = embeddings_index.get(word)
    # Accept only vectors of exactly embed_dim values: a shorter vector
    # (e.g. a length-1 one) would otherwise be broadcast over the whole row.
    if embedding_vector is not None and len(embedding_vector) == embed_dim:
        embedding_matrix[i] = embedding_vector
    else:
        # Words without a pretrained vector keep an all-zero row
        words_not_found.append(word)
# Rows that are entirely zero (this includes row 0, the padding index)
print('number of null word embeddings: %d' % np.sum(~embedding_matrix.any(axis=1)))

In [None]:
# Second model: frozen pretrained embeddings -> bidirectional LSTM -> dropout -> sigmoid
model2 = tf.keras.Sequential()
# Embedding initialised from embedding_matrix and kept fixed (trainable=False)
model2.add(Embedding(vocab_size, embed_dim, input_length=25, weights=[embedding_matrix],trainable=False))
#model2.add(Bidirectional(LSTM(32, return_sequences= True)))
# One bidirectional LSTM (16 units per direction) with input and recurrent dropout
model2.add(Bidirectional(LSTM(16,dropout = 0.2, recurrent_dropout = 0.2)))
#model2.add(Dense(16,activation='relu'))
model2.add(Dropout(0.3))
# Single sigmoid unit: output used as the probability of target == 1
model2.add(Dense(1,activation='sigmoid'))
model2.summary()

In [None]:
# Compile model2 with a fresh Adam optimizer and binary cross-entropy loss
opt = tf.keras.optimizers.Adam()
model2.compile(loss="BinaryCrossentropy", optimizer=opt,metrics = ['accuracy'])
keras.backend.clear_session()
# Stop when val_loss has not improved for 3 consecutive epochs.
# restore_best_weights=True rolls the model back to the best epoch, so the
# evaluation and predictions that follow do not use the weights of the last
# (already degraded) epoch.
early_stop = tf.keras.callbacks.EarlyStopping(monitor='val_loss',mode = 'min',patience=3,verbose=1,restore_best_weights=True)

In [None]:
keras.backend.clear_session()
batch_size = 64
history2 = model2.fit(X_train, Y_train, epochs = 20,batch_size=batch_size,validation_data=(X_test,Y_test), verbose = 1,callbacks=[early_stop])

# **Evaluate the model**

In [None]:
plt.plot(history2.history['accuracy'])
plt.plot(history2.history['val_accuracy'])
plt.title('model accuracy')
plt.ylabel('accuracy')
plt.xlabel('epoch')
plt.legend(['train', 'test'], loc='upper left')
plt.show()

In [None]:
plt.plot(history2.history['loss'])
plt.plot(history2.history['val_loss'])
plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['train', 'test'], loc='upper left')
plt.show()

### **Test Score**

In [None]:
scores = model2.evaluate(X_test, Y_test, verbose=0)
print('Test loss:', round(scores[0],2))
print('Test accuracy:', round(scores[1],2))

### **Confusion Matrix & Classification Report**

In [None]:
# Predicted probabilities of model2 on the held-out split
pred = model2.predict(X_test)
# Threshold at 0.5 and flatten into a plain list of 0/1 labels
pred = np.where(pred.ravel() > 0.5, 1, 0).tolist()
len(pred)

In [None]:
conf = confusion_matrix(Y_test,pred)
fig, ax = plt.subplots()
# create heatmap
sns.heatmap(pd.DataFrame(conf), annot=True, cmap="YlGnBu" ,fmt='g')
ax.xaxis.set_label_position("top")
plt.tight_layout()
plt.title('Confusion matrix', y=1.1)
plt.ylabel('Actual label')
plt.xlabel('Predicted label')

In [None]:
print(classification_report(Y_test,pred))

# **Make Prediction**

In [None]:
data_sample = pd.read_csv('../input/nlp-getting-started/sample_submission.csv')

In [None]:
data_sample.sample(3)

In [None]:
# Encode the test tweets exactly like the training tweets.
# - Run the same cleaning / stop-word / lemmatization pipeline, because the
#   tokenizer vocabulary was built from cleaned text.
# - Do NOT call tokenizer.fit_on_texts again: re-fitting on the test data
#   re-ranks the word indices and adds indices beyond the embedding size, so
#   the sequences would no longer match what the models were trained on.
# - Pad to 25, the sequence length the models were built and trained with
#   (longer tweets are truncated by pad_sequences).
test_clean = clean_text(data_test)['clean_text'].apply(remove_Stopwords).apply(lemmatize)
X1 = tokenizer.texts_to_sequences(test_clean.values)
X1 = pad_sequences(X1,maxlen=25,padding='post')

In [None]:
# Predict on the encoded test tweets and threshold the probabilities at 0.5.
# NOTE(review): the predictions come from model1, whereas the evaluation
# section of this notebook reports on model2 — confirm which model is meant
# to produce the submission.
pred = model1.predict(X1)
pred_F = np.where(pred>0.5,1,0)

In [None]:
pred_F.shape

In [None]:
# Flatten the (n_samples, 1) predictions to 1-D. reshape(-1) infers the length,
# so the cell no longer breaks when the test set does not have exactly 3263 rows.
pred_F = pred_F.reshape(-1)

In [None]:
pred_F.shape

In [None]:
data_sample['target']=pred_F

In [None]:
data_sample.to_csv('submission2.csv',index = False)

In [None]:
df = pd.read_csv('submission2.csv')
df.target.value_counts()