In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found in it
for root, _, names in os.walk('/kaggle/input'):
    for path in (os.path.join(root, name) for name in names):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the competition's train, test and sample-submission CSV files, in that order
train, test, submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/nlp-getting-started/train.csv',
        '/kaggle/input/nlp-getting-started/test.csv',
        '/kaggle/input/nlp-getting-started/sample_submission.csv',
    )
)

In [None]:
# Print (rows, columns) of each loaded frame: train, then test, then submission
for frame in (train, test, submission):
    print(frame.shape)

In [None]:
# Preview the first rows of each frame.
# `head` has to be *called*: passing the bound method itself (`train.head`) makes
# `display` show the method's repr instead of a short preview table.
display(train.head())
display(test.head())
display(submission.head())

### Cleaning Part

In [None]:
## remove #tags,@words and links from the text field
import re,string

def strip_links(text):
    """Return `text` with every http:// or https:// URL removed.

    A URL is taken to run from the scheme up to the next whitespace character.
    Only the URL itself is deleted, so the whitespace around it is left untouched.

    Args:
        text: the string to clean.

    Returns:
        The string with all matched URLs replaced by the empty string.
    """
    # Raw string: in a normal string literal `\S` is an invalid escape sequence and
    # triggers a warning on recent Python versions. `https?` covers both schemes.
    return re.sub(r'https?://\S+', '', text)
    

def strip_all_entities(text):
    """Drop @mentions and #hashtags from `text` and blank out other punctuation.

    Every punctuation character except '@' and '#' is first turned into a space.
    The text is then split on whitespace, and any token that begins with '@' or '#'
    is discarded. The surviving tokens are joined back with single spaces.
    """
    entity_prefixes = ('@', '#')
    # Build one translation table mapping each removable punctuation mark to a space
    removable = ''.join(ch for ch in string.punctuation if ch not in entity_prefixes)
    spaced = text.translate(str.maketrans(removable, ' ' * len(removable)))
    # split() never yields empty tokens, so token[0] is always safe to inspect
    kept = [token for token in spaced.split() if token[0] not in entity_prefixes]
    return ' '.join(kept)

### Remove hyperlinks

In [None]:
# Store the link-free version of each tweet in a new `text_1` column (train, then test)
for frame in (train, test):
    frame['text_1'] = frame['text'].apply(strip_links)

### Remove hashtags and mentions if any

In [None]:
# Store the mention/hashtag-free version of `text_1` in a new `text_2` column (train, then test)
for frame in (train, test):
    frame['text_2'] = frame['text_1'].apply(strip_all_entities)

In [None]:
# Show one test tweet (label 3260) at each cleaning stage: raw, links removed, entities removed
for column in ('text', 'text_1', 'text_2'):
    print(test[column][3260])

### Remove special characters

In [None]:
# Lower-case the train text, then keep only ASCII letters, digits and whitespace.
# The character class uses `A-Z`: the previous `A-z` range also spans the ASCII
# characters that sit between 'Z' and 'a' ([ \ ] ^ _ and the backtick), so those
# characters were treated as letters and never removed.
# The pattern is a raw string so that `\s` is not read as a string escape.
train['text_2'] = train['text_2'].str.lower()
train['text_3'] = train['text_2'].str.replace(r'[^a-zA-Z0-9\s]', '', regex=True)

In [None]:
# Lower-case the test text, then keep only ASCII letters, digits and whitespace.
# The character class uses `A-Z`: the previous `A-z` range also spans the ASCII
# characters that sit between 'Z' and 'a' ([ \ ] ^ _ and the backtick), so those
# characters were treated as letters and never removed.
# The pattern is a raw string so that `\s` is not read as a string escape.
test['text_2'] = test['text_2'].str.lower()
test['text_3'] = test['text_2'].str.replace(r'[^a-zA-Z0-9\s]', '', regex=True)

In [None]:
# Compare the raw and the fully cleaned version of the same test tweet (label 3260)
for column in ('text', 'text_3'):
    print(test[column][3260])

### Check the number of 0's and 1's in tweets

In [None]:
# Number of tweets per class (target == 1 first, then target == 0).
# `DataFrame.size` is rows x columns, so it overstated the counts by a factor equal to
# the number of columns; summing the boolean mask counts matching rows instead.
print((train['target'] == 1).sum())
print((train['target'] == 0).sum())

### Check the number of tokens per tweet

In [None]:
## For train: number of pieces obtained by splitting each cleaned tweet on single spaces
l = [len(train.at[idx, 'text_3'].split(' ')) for idx in range(len(train))]

In [None]:
## For test: number of pieces obtained by splitting each cleaned tweet on single spaces
l1 = [len(test.at[idx, 'text_3'].split(' ')) for idx in range(len(test))]

In [None]:
### Attach the per-tweet token counts (lists `l` and `l1`) as a `token_cnt` column
train['token_cnt'], test['token_cnt'] = l, l1

In [None]:
# Report the longest token count in each split. Both values are printed explicitly because
# a notebook cell only echoes its final expression, which previously hid the train maximum.
print('train max tokens:', train['token_cnt'].max())  # original author noted 34
print('test max tokens:', test['token_cnt'].max())  # original author noted 32

### Drop unnecessary columns

In [None]:
# Remove, in place, the raw/intermediate text columns and metadata that are no longer used
train.drop(
    columns=['keyword', 'location', 'text', 'text_1', 'text_2', 'token_cnt'],
    inplace=True,
)

In [None]:
# Remove, in place, the raw/intermediate text columns and metadata that are no longer used
test.drop(
    columns=['keyword', 'location', 'text', 'text_1', 'text_2', 'token_cnt'],
    inplace=True,
)

### Split the training data into train and validation sets

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the cleaned tweets for validation; the fixed seed makes the split repeatable
X_train, X_val, y_train, y_val = train_test_split(
    train['text_3'], train['target'], test_size=0.2, random_state=123
)

### Build the Tokenizer

In [None]:
from tensorflow.python.keras.preprocessing.text import Tokenizer

In [None]:
### Vocabulary size used for the word embedding
top_words = 1000
# `num_words` is the vocabulary size handed to the tokenizer
t = Tokenizer(num_words=top_words)
# Learn the word index from the training split only
t.fit_on_texts(list(X_train))

### Convert the train and validation texts to sequences of word indices

In [None]:
# Replace each split's texts by their lists of word indices (train first, then validation)
X_train, X_val = (
    t.texts_to_sequences(split.tolist())
    for split in (X_train, X_val)
)

### Pad the sequence of tokens

In [None]:
from tensorflow.python.keras.preprocessing import sequence
# Fixed length used as `maxlen` when padding sequences and as the embedding layer's `input_length`
max_review_length = 50

In [None]:
# Pad at the end ('post') so that both splits share the same fixed sequence length
X_train, X_val = (
    sequence.pad_sequences(seqs, maxlen=max_review_length, padding='post')
    for seqs in (X_train, X_val)
)

### Download the pretrained GloVe embeddings and convert them for loading with gensim

In [None]:
!wget http://nlp.stanford.edu/data/glove.6B.zip

In [None]:
# List the working directory to check that the embeddings archive has been downloaded
!ls -l

In [None]:
#unzip the file, we get multiple embedding files. We can use either one of them
!unzip glove.6B.zip

In [None]:
# List the working directory again to see the files extracted from the archive
!ls -l

In [None]:
from gensim.scripts.glove2word2vec import glove2word2vec

In [None]:
# Input: the 50-dimensional GloVe file. Output: name of the word2vec-format file to create.
glove_input_file, word2vec_output_file = 'glove.6B.50d.txt', 'glove.6B.50d.txt.word2vec'

# Convert the GloVe embeddings file to word2vec format
glove2word2vec(glove_input_file, word2vec_output_file)

In [None]:
# List the working directory again to see the converted word2vec file
!ls -l

### Get embedding from loaded pretrained model 

In [None]:
### We will extract word embedding for which we are interested in; the pre trained has 400k words each with 50 embedding vector size.
from gensim.models import Word2Vec, KeyedVectors

In [None]:
# Read the converted GloVe vectors (text, i.e. non-binary, word2vec format)
glove_model = KeyedVectors.load_word2vec_format(
    word2vec_output_file,
    binary=False,
)

In [None]:
# Embedding length based on the selected model - we are using the 50d GloVe file here.
# Used as the column count of the embedding matrix and as the embedding layer's output size.
embedding_vector_length = 50

In [None]:
# Start from an all-zero matrix: one row per word index 0..top_words, one column per dimension
matrix_shape = (top_words + 1, embedding_vector_length)
embedding_matrix = np.zeros(matrix_shape)
print(embedding_matrix.shape)

In [None]:
# Copy the pretrained GloVe vector of each word into the matrix row given by its word index,
# visiting words in ascending index order. Words GloVe does not contain keep their zero row.
for word, i in sorted(t.word_index.items(), key=lambda x: x[1]):
    # The matrix has rows 0..top_words, so stop once the index passes the last row.
    # (The earlier `top_words + 1` bound let one out-of-range index through and relied on
    # a bare `except` to hide the resulting IndexError.)
    if i > top_words:
        break
    # Skip only words that are missing from GloVe; any other error now surfaces
    # instead of being silently swallowed.
    if word in glove_model:
        embedding_matrix[i] = glove_model[word]

In [None]:
# Peek at the embedding-matrix row stored for word index 3
embedding_matrix[3]

### Initialize the model

In [None]:
#Initialize model
import tensorflow as tf
# Reset Keras backend state before a new model is created
tf.keras.backend.clear_session()
# Empty Sequential container; layers are appended to it in the following cells
model = tf.keras.Sequential()

In [None]:
# Lookup layer backed by the pretrained vectors collected in `embedding_matrix`
embedding_layer = tf.keras.layers.Embedding(
    input_dim=top_words + 1,              # vocabulary size
    output_dim=embedding_vector_length,   # embedding size
    weights=[embedding_matrix],           # embeddings taken from the pre-trained model
    trainable=False,                      # vectors are already available, so the layer is frozen
    input_length=max_review_length,       # number of word indices per input sequence
)
model.add(embedding_layer)

In [None]:
# Display the model's current output tensor (after the embedding layer was added)
model.output

### Add LSTM layers with 256 cell and hidden state size

In [None]:
# Append dropout -> LSTM -> dropout, in that order
for layer in (
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.LSTM(256),  # RNN state - size of cell state and hidden state
    tf.keras.layers.Dropout(0.2),
):
    model.add(layer)

In [None]:
# Display the model's current output tensor (after the LSTM and dropout layers were added)
model.output

In [None]:
# Single sigmoid unit as the output layer for the binary target
model.add(tf.keras.layers.Dense(units=1, activation='sigmoid'))
# Compile the model
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [None]:
# Print the layer-by-layer summary of the assembled model
model.summary()

### Train the Model

In [None]:
# Fit on the training split for 10 epochs in batches of 32, passing the held-out
# split as validation data
model.fit(X_train,y_train,
          epochs=10,
          batch_size=32,          
          validation_data=(X_val, y_val))

### Store the output of lstm

In [None]:
# Second model sharing the same input but ending at the output of the layer named 'lstm'
x = model.get_layer(name='lstm').output
model2 = tf.keras.Model(inputs=model.input, outputs=x)

### Train the model on the whole train set

In [None]:
### Encode the full train set and the test set for the final fit and the prediction.
# Reuse the tokenizer `t` fitted earlier rather than fitting a fresh one: the frozen
# embedding matrix and the weights `model` has already learned are indexed by that
# tokenizer's word ids. Refitting on different text re-ranks the words, so each word
# would silently be looked up in another word's embedding row.
train_seq = t.texts_to_sequences(train['text_3'].tolist())
test_seq = t.texts_to_sequences(test['text_3'].tolist())

# Pad at the end ('post') to the same fixed length the model was built with
train_seq = sequence.pad_sequences(train_seq, maxlen=max_review_length, padding='post')
test_seq = sequence.pad_sequences(test_seq, maxlen=max_review_length, padding='post')

In [None]:
# Display the padded training sequences
train_seq

In [None]:
# Continue fitting the same model on every training tweet (no validation data is passed)
model.fit(train_seq,train['target'],
          epochs=10,
          batch_size=32)

In [None]:
## Train for 10 more rounds: epoch numbering resumes at 10 and stops at 20
model.fit(train_seq,train['target'],
          epochs=20,
          initial_epoch=10,
          batch_size=32)

In [None]:
# Run the fitted model on the padded test sequences
test_pred=model.predict(test_seq)

In [None]:
# Flatten the predictions to one dimension. `-1` lets NumPy infer the length, so the
# cell no longer depends on the test set having exactly 3263 rows.
test_pred = test_pred.reshape(-1)

### Append to test data

In [None]:
# Store the model outputs as a `prediction` column of the test frame
test['prediction']=test_pred

In [None]:
# Label a tweet 1 when its prediction is strictly above 0.5, otherwise 0
test['target'] = (test['prediction'] > 0.5).astype(int)

In [None]:
# Show how many test tweets fall into each predicted class
test['target'].value_counts()

In [None]:
# Remove, in place, the cleaned text and the raw prediction before the frame is exported
test.drop(columns=['text_3', 'prediction'], inplace=True)

In [None]:
# Display the frame that is about to be written out
test

In [None]:
# Write the test frame to 'sample_submission.csv' in the working directory, without the index
test.to_csv('sample_submission.csv',index=False)