### Import libraries

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np


import nltk
from nltk.corpus import stopwords
from nltk.util import ngrams
from nltk.tokenize import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer


In [None]:
from sklearn import linear_model, model_selection
from sklearn.model_selection import train_test_split

In [None]:
import re
import os
from tqdm import tqdm
# import gensim
import string
from collections import defaultdict
from collections import  Counter

In [None]:
import tensorflow as tf
print(tf.__version__)

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional, SpatialDropout1D
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras import regularizers

from keras.initializers import Constant


In [None]:
from tensorflow.keras.optimizers import SGD

In [None]:
# Use the 'ggplot' matplotlib theme for every figure in the notebook.
plt.style.use('ggplot')
# English stop-word list from the NLTK corpus, held as a set for O(1) membership tests.
stop=set(stopwords.words('english'))

In [None]:
import random
import warnings

# Seed every random number generator the notebook relies on so that repeated
# runs are comparable: Python's `random`, NumPy, and TensorFlow (which draws
# the initial layer weights and dropout masks from its own generator).
SEED = 31415
random.seed(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)

# Keep library warnings out of the notebook output.
warnings.filterwarnings('ignore')

### Basic EDA and preprocess

In [None]:
# Load the training and test tweets from the local data/ directory,
# then preview the first three training rows.
tweets_train = pd.read_csv('data/train.csv')
tweets_test = pd.read_csv('data/test.csv')
tweets_train.head(3)

In [None]:
# Report the dimensions of each split; the second line describes the test set.
print('There are {} rows and {} columns in train'.format(*tweets_train.shape))
print('There are {} rows and {} columns in test'.format(*tweets_test.shape))

In [None]:
# Class balance: number of training tweets per target label.
x = tweets_train.target.value_counts()
# Pass the data by keyword: seaborn 0.12+ no longer accepts x/y positionally.
sns.barplot(x=x.index, y=x)
plt.gca().set_ylabel('samples')

#### Cleaning Tweets using "tweet-preprocessor" 

In [None]:

### Install package for first time 
# ! pip install tweet-preprocessor


In [None]:
### Preprocess text
import preprocessor as p


In [None]:
# Token classes that p.clean() should strip from each tweet (each option listed once).
p.set_options(
    p.OPT.URL,
    p.OPT.EMOJI,
    p.OPT.MENTION,
    p.OPT.HASHTAG,
    p.OPT.RESERVED,
    p.OPT.SMILEY,
    p.OPT.NUMBER,
)


In [None]:
# Run tweet-preprocessor's clean() (configured via p.set_options) over the raw training text.
tweets_train["clean_text"] = tweets_train["text"].apply(p.clean)

In [None]:
tweets_train

In [None]:
# Run tweet-preprocessor's clean() (configured via p.set_options) over the raw test text.
tweets_test["clean_text"] = tweets_test["text"].apply(p.clean)

In [None]:
tweets_test

In [None]:
def preprocess_text(text):
    """Strip noise from a tweet and return the cleaned string.

    Removal happens in a fixed order: digit runs, URLs, HTML tags, emoji and
    other pictographic code points, and finally ASCII punctuation. Whitespace
    is left untouched, so a removed token can leave doubled spaces behind.
    """
    # Character ranges treated as emoji / pictographs.
    emoji_ranges = (
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
        u"\U00002702-\U000027B0"
        u"\U000024C2-\U0001F251"
    )

    # Regex passes applied in sequence; each match is replaced by ''.
    removal_passes = (
        r'[0-9]+',                 # numbers
        r'https?://\S+|www\.\S+',  # URLs
        r'<.*?>',                  # HTML tags
    )

    cleaned = text
    for pattern in removal_passes:
        cleaned = re.sub(pattern, '', cleaned)

    cleaned = re.sub("[" + emoji_ranges + "]+", '', cleaned, flags=re.UNICODE)

    # Drop every ASCII punctuation character in a single translate pass.
    return cleaned.translate(str.maketrans('', '', string.punctuation))

In [None]:
# Second cleaning pass over the training tweets (digits, URLs, HTML, emoji, punctuation).
# NOTE(review): this cell only processes tweets_train; confirm the test frame gets the same pass.
tweets_train["clean_text"] = tweets_train["clean_text"].apply(preprocess_text)

In [None]:
tweets_train

#### Split the data into training and validation sets

In [None]:

# Hold out 20% of the labelled tweets for validation; the fixed random_state
# keeps the split identical between runs.
train_sentences, val_sentences, train_labels, val_labels = train_test_split(
    tweets_train["clean_text"],
    tweets_train["target"],
    test_size=0.2,
    random_state=10,
)



In [None]:
# vocab_size = 1000

# embedding_dim = 16

# Sequences are both truncated and padded at the end ('post').
trunc_type = padding_type = 'post'
# Placeholder token passed to the Tokenizer for out-of-vocabulary words.
oov_tok = "<OOV>"

# training_portion = 

In [None]:
# Fit the vocabulary on the training sentences only; oov_tok is the token
# the Tokenizer substitutes for words it was not fitted on.
tokenizer = Tokenizer(oov_token=oov_tok)
tokenizer.fit_on_texts(train_sentences)
# word -> integer id mapping learned by the tokenizer.
word_index = tokenizer.word_index

# Training sentences encoded as lists of integer ids.
train_sequences = tokenizer.texts_to_sequences(train_sentences)

In [None]:
# Length of the longest tokenised training tweet; used as the common
# padded length for every sequence fed to the model.
max_sequence_len = max(map(len, train_sequences))
# max_length = 120
max_length = max_sequence_len

In [None]:


# Bring every training sequence to max_length, padding/truncating per padding_type / trunc_type.
train_padded = pad_sequences(train_sequences, padding=padding_type, maxlen=max_length ,truncating = trunc_type)


In [None]:
print(len(word_index))

In [None]:
# Encode the validation sentences with the tokenizer fitted on the training
# split, then pad/truncate them to the same max_length.
val_sequences = tokenizer.texts_to_sequences(val_sentences)
val_padded = pad_sequences(val_sequences,maxlen=max_length, padding=padding_type, truncating=trunc_type)

In [None]:
# Inverse of the tokenizer vocabulary: integer id -> word.
reverse_word_index = {index: word for word, index in word_index.items()}


def decode_sentence(text):
    """Turn a sequence of token ids back into a space-joined string.

    Ids that have no entry in ``reverse_word_index`` are rendered as '?'.
    """
    words = (reverse_word_index.get(token_id, '?') for token_id in text)
    return ' '.join(words)


# Sanity check: decode the first padded training example.
print(decode_sentence(train_padded[0]))

!wget --no-check-certificate \
    https://storage.googleapis.com/laurencemoroney-blog.appspot.com/glove.6B.100d.txt \
    -O data/glove.6B.100d.txt

In [None]:

# word -> pre-trained GloVe vector (float32 array of length embedding_dim).
embeddings_index = {}

vocab_size = len(word_index)

embedding_dim = 100

# Read the vectors from data/, which is where the download cell saves them,
# and decode explicitly as UTF-8 rather than the platform default encoding.
with open('data/glove.6B.100d.txt', encoding='utf-8') as f:
    for line in f:
        # Each line is "<word> <v1> <v2> ... <v100>".
        word, *coefs = line.split()
        embeddings_index[word] = np.asarray(coefs, dtype='float32')

# Row i holds the vector of the word whose tokenizer id is i. Rows for
# words missing from GloVe (and the extra row 0) stay all-zero.
embeddings_matrix = np.zeros((vocab_size + 1, embedding_dim))

for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embeddings_matrix[i] = embedding_vector

In [None]:
embeddings_matrix.shape

## Tensorflow model

In [None]:
regularization_strength = 0.01

In [None]:
# Embedding layer initialised from the pre-trained matrix and kept frozen.
embedding = Embedding(
    vocab_size + 1,
    embedding_dim,
    weights=[embeddings_matrix],
    input_length=max_length,
    trainable=False,
)

# Frozen embeddings -> spatial dropout -> two stacked bidirectional LSTMs
# -> L2-regularised dense layer -> sigmoid output for the binary target.
model = Sequential([
    embedding,
    SpatialDropout1D(0.2),
    Bidirectional(LSTM(64, return_sequences=True)),
    Bidirectional(LSTM(64)),
    Dense(32, activation='relu', kernel_regularizer=regularizers.l2(regularization_strength)),
    Dense(1, activation='sigmoid'),
])

In [None]:
# Take the backend from tensorflow.keras, the same package the model, layers
# and optimizers in this notebook come from, rather than standalone `keras`.
from tensorflow.keras import backend as K


def f1_metric(y_true, y_pred):
    """Compute the F1 score of a batch, for use as a Keras metric.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted scores; clipped to [0, 1] and rounded to the
            nearest integer before counting.

    Returns:
        A scalar tensor, 2 * precision * recall / (precision + recall),
        computed over the values passed in.
    """
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    # K.epsilon() guards each division against a zero denominator.
    precision = true_positives / (predicted_positives + K.epsilon())
    recall = true_positives / (possible_positives + K.epsilon())
    f1_val = 2 * (precision * recall) / (precision + recall + K.epsilon())
    return f1_val


In [None]:
# `learning_rate` is the supported keyword; `lr` is a deprecated alias.
adam = Adam(learning_rate=0.01)
sgd = SGD(learning_rate=0.001)
# Train with SGD; `adam` is kept available as an alternative optimizer.
model.compile(loss='binary_crossentropy', optimizer=sgd, metrics=['accuracy', f1_metric])
model.summary()

In [None]:
# Train for num_epochs passes over the padded training data, evaluating on the
# held-out validation split; the returned History object is kept in `history`.
num_epochs = 15
history = model.fit(train_padded, train_labels, epochs=num_epochs, validation_data=(val_padded, val_labels), verbose=2)



In [None]:
# Tabulate the metrics recorded by the model's most recent fit and preview the first rows.
model_history = pd.DataFrame(model.history.history)
model_history.head()

#### Plot history of training

### Submit results

In [None]:
# Encode the test tweets after the same two-stage cleaning applied to the
# training data (tweet-preprocessor output passed through preprocess_text).
# Tokenising the raw `text` column would not match the training-time preprocessing.
test_texts = tweets_test["clean_text"].apply(preprocess_text)
test_sequences = tokenizer.texts_to_sequences(test_texts)
test_padded = pad_sequences(test_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)

In [None]:
probabilities = model.predict(test_padded)

In [None]:
# Threshold the predicted probabilities at 0.5 and flatten to a 1-D array of 0/1 labels.
is_positive = probabilities > 0.5
predictions = is_positive.astype(int).flatten()
# Show how many tweets fall into each predicted class.
pd.value_counts(predictions)

In [None]:
# Write the model's predictions in the competition's submission format.
# The template is read from the local data/ directory, the same place the
# train/test CSVs are loaded from, rather than a Kaggle-only absolute path.
sample_submission = pd.read_csv("data/sample_submission.csv")
sample_submission["target"] = predictions
sample_submission.to_csv("submission.csv", index=False)

In [None]:
# sample_submission = pd.read_csv("/kaggle/input/nlp-getting-started/sample_submission.csv")
sample_submission = pd.read_csv("data/sample_submission.csv")

In [None]:
# Fill the target column with the neural network's thresholded predictions.
# This notebook defines no `clf` or `test_vectors`, so predicting with them
# would fail with a NameError.
sample_submission["target"] = predictions


In [None]:
sample_submission

In [None]:
sample_submission.to_csv("submission.csv", index=False)