In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Importing Libraries

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from nltk.corpus import stopwords
from nltk.util import ngrams
from sklearn.feature_extraction.text import CountVectorizer
from collections import defaultdict
from collections import  Counter
plt.style.use('ggplot')
stop=set(stopwords.words('english'))
import re
from nltk.tokenize import word_tokenize
import gensim
import string
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from tqdm import tqdm
from keras.models import Sequential
from keras.layers import Embedding,LSTM,Dense,SpatialDropout1D,Convolution1D,Dropout,MaxPooling1D,Conv1D,Input,Flatten
from keras.initializers import Constant
from sklearn.model_selection import train_test_split
from keras.optimizers import Adam

In [None]:
import os


## Loading & Exploring dataset

In [None]:
# Read the training CSV into `tweet` and the test CSV into `test`.
tweet = pd.read_csv('../input/nlp-getting-started/train.csv')
test = pd.read_csv('../input/nlp-getting-started/test.csv')



In [None]:
# Report (rows, columns) for both frames.
for caption, frame in (('training data shape', tweet), ('test data shape', test)):
    print(caption, frame.shape)

In [None]:
def explore_data(df):

    '''Print a quick overview of a dataframe and return its first rows.

       Input- df= pandas dataframe to be explored
       Output- prints the shape, the number of records and the df.info()
               summary, then returns df.head() (the first 5 records) so the
               notebook renders it as the cell output
    '''

    separator = "-"*50
    print(separator)
    print('Shape of the dataframe:',df.shape)
    # The function is called on both the train and the test frame, so the
    # label must not claim the data is the train set.
    print("Number of records in the data set:",df.shape[0])
    print("Information of the dataset:")
    df.info()
    print(separator)
    print("First 5 records of the dataset:")
    # Nothing may follow the return: a trailing print here would never run.
    return df.head()

In [None]:
# Lets use explore_data() function to explore train data
explore_data(tweet)

In [None]:
# Lets use explore_data() function to explore test data
explore_data(test)

### Class Distribution

In [None]:

# Plot how many tweets fall into each target class and print each class's share.
feature='target'
# Pass the column through the keyword `x`; recent seaborn releases no longer
# accept the data variable as a positional argument.
sns.countplot(x=feature, data=tweet)
# Count once and reuse for both percentages.
class_counts = tweet[feature].value_counts()
total = len(tweet[feature])
print('Target of 0 is {} % of total'.format(round(class_counts[0]/total*100)))
print('Target of 1 is {} % of total'.format(round(class_counts[1]/total*100)))


## Data Pre-processing

Before starting any NLP project, text data needs to be pre-processed to convert it into a consistent format. Text will be cleaned, tokenized and converted into a matrix.

Some of the basic text pre-processing techniques include:
1. **Make text all lower or uppercase**
2. **Removing Noise** - Remove Punctuation and numerical Values
3. **Tokenization**  - Process of converting the normal text strings into a list of tokens i.e. words.
4. **Stopword Removal**-Some extremely common words which would appear to be of little value in helping select documents matching a user need are excluded from the vocabulary entirely.
5. **Stemming** - Stemming is the process of reducing inflected (or sometimes derived) words to their stem, base or root form — generally a written word form. For example, if we were to stem the words “Stems”, “Stemming”, “Stemmed” and “Stemtization”, the result would be a single word, “stem”.
6. **Lemmatization**-A slight variant of stemming is lemmatization. The major difference between these is, that, stemming can often create non-existent words, whereas lemmas are actual words. So, your root stem, meaning the word you end up with, is not something you can just look up in a dictionary, but you can look up a lemma. Examples of Lemmatization are that “run” is a base form for words like “running” or “ran” or that the word “better” and “good” are in the same lemma so they are considered the same.




### Data Cleaning

In [None]:
#Create a function to clean the text
def clean_text(text):

    '''
    Input- 'text' to be cleaned (a string)

       Output- Convert input 'text' to lowercase, remove text in square
       brackets, links, angle-bracketed tags, punctuation, newlines
       and words containing numbers. Return clean text.

       Note- newlines are deleted rather than replaced by a space, so words
       separated only by a line break end up joined together.

    '''
    # Regex patterns are raw string literals so that escapes such as \S, \w
    # and \d reach the regex engine instead of being parsed (and warned
    # about) as invalid Python string escapes.
    text = text.lower()
    # drop anything wrapped in square brackets
    text = re.sub(r'\[.*?\]', '', text)
    # drop http(s):// and www. links
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    # drop angle-bracketed tags
    text = re.sub(r'<.*?>+', '', text)
    # drop every ASCII punctuation character
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub(r'\n', '', text)
    # drop whole words that contain at least one digit
    text = re.sub(r'\w*\d\w*', '', text)
    return text

In [None]:
# Work on copies so the raw frames stay untouched, then clean the text column of each copy.
tweet_df1 = tweet.copy()
test_df1 = test.copy()
tweet_df1['text'] = tweet_df1['text'].apply(clean_text)
test_df1['text'] = test_df1['text'].apply(clean_text)

In [None]:
#Lets look at cleaned data
def text_after_preprocess(before_text,after_text):

    '''
    Print the first five entries of a text column before and after cleanup.

    Input- before_text= text column before cleanup
           after_text= text column after cleanup
    Output- nothing is returned; both previews are printed, each under a
            heading framed by dashed rules, so they can be compared

    '''
    rule = '-'*60
    sections = (
        ('Text before cleanup', before_text),
        ('Text after cleanup', after_text),
    )
    for heading, column in sections:
        print(rule)
        print(heading)
        print(rule)
        print(column.head(5))

In [None]:
text_after_preprocess(tweet.text,tweet_df1.text)


In [None]:
text_after_preprocess(test.text,test_df1.text)


### Tokenization

In [None]:
import nltk
# Example how tokenization of text works
# One sample sentence is run through four NLTK tokenizers so their outputs can be compared.
text = "Heard about #earthquake is different cities, stay safe everyone."
tokenizer1 = nltk.tokenize.WhitespaceTokenizer()
tokenizer2 = nltk.tokenize.TreebankWordTokenizer()
tokenizer3 = nltk.tokenize.WordPunctTokenizer()
tokenizer4 = nltk.tokenize.RegexpTokenizer(r'\w+')

rule = "-"*100
print(rule)
print("Example Text: ",text)
print(rule)

# Caption/tokenizer pairs, printed in the same order as they were created above.
demo_tokenizers = (
    ("Tokenization by whitespace:- ", tokenizer1),
    ("Tokenization by words using Treebank Word Tokenizer:- ", tokenizer2),
    ("Tokenization by punctuation:- ", tokenizer3),
    ("Tokenization by regular expression:- ", tokenizer4),
)
for caption, demo_tokenizer in demo_tokenizers:
    print(caption, demo_tokenizer.tokenize(text))

In [None]:
#before tokenization
tweet_df1.head()

In [None]:
# Lets Tokenize the training and the test dataset copies with RegEx tokenizer
# Split each cleaned tweet into a list of word tokens (runs of \w characters).
tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')
tweet_df1['text'] = tweet_df1['text'].apply(tokenizer.tokenize)
test_df1['text'] = test_df1['text'].apply(tokenizer.tokenize)

In [None]:
#lets check tokenized text
tweet_df1['text']

### Stop words Removal

In [None]:
#Create a function to remove stopwords
def remove_stopwords(text):

    """
    Input- text=iterable of word tokens from which english stopwords will be removed
    Output- return a list of the tokens that are not english stopwords

    """
    # Use the module-level `stop` set (built once from stopwords.words('english')
    # in the imports cell) instead of re-reading the stop-word list and scanning
    # it linearly for every single token.
    words = [w for w in text if w not in stop]
    return words



In [None]:
#Before stopwords removal
tweet_df1.head()

In [None]:
# Filter the stop words out of every token list in both frames.
tweet_df1['text'] = tweet_df1['text'].apply(remove_stopwords)
test_df1['text'] = test_df1['text'].apply(remove_stopwords)

In [None]:
#after stopwords removal
tweet_df1.head()

## Stemming and Lemmatization

Stemming and lemmatization do not necessarily improve results, because sometimes we want to preserve words in their original form rather than trim them. Their usefulness varies from problem to problem, and for this problem it would not be a good idea to use them.

In [None]:
# Stemming and Lemmatization examples
text =  [ 'deduced', 'dogs', 'talking', 'studies']
def Stemming_Lemmatizing(text):
    """Lemmatize every token in `text`, then stem the lemmatized tokens.

    Input- text= iterable of word tokens
    Output- list with one lemmatized-then-stemmed string per input token
    """
    # Lemmatizer
    lemmatizer=nltk.stem.WordNetLemmatizer()
    words=[lemmatizer.lemmatize(token) for token in text]

    # Stemmer
    # Stem the lemmatized tokens; stemming `text` again would throw the
    # lemmatizer's output away.
    stemmer = nltk.stem.PorterStemmer()
    words=[stemmer.stem(token) for token in words]

    return words

In [None]:
Stemming_Lemmatizing(text)

In [None]:
#tweet_df1['text'] = tweet_df1['text'].apply(lambda x : Stemming_Lemmatizing(x))
#test_df1['text'] = test_df1['text'].apply(lambda x : Stemming_Lemmatizing(x))

### Convert the text list into string

In [None]:
def listToString(s):
    """Return the items of `s` joined into one string, separated by single spaces."""
    return " ".join(s)


# Driver code
s = ['Geeks', 'for', 'Geeks']
print(listToString(s))

In [None]:
#tweet_df1['text'] = tweet_df1['text'].apply(lambda x : listToString(x))
#test_df1['text'] = test_df1['text'].apply(lambda x : listToString(x))

### Spell Checker

In [None]:
!pip install pyspellchecker


In [None]:
from spellchecker import SpellChecker

spell = SpellChecker()
def correct_spellings(text):
    """Return `text` with every word the spell checker flags replaced by its correction.

    Input- text= whitespace-separated string
    Output- the words re-joined with single spaces; a flagged word is kept
            unchanged when the checker offers no correction
    """
    words = text.split()
    misspelled_words = spell.unknown(words)
    corrected_text = []
    for word in words:
        if word in misspelled_words:
            suggestion = spell.correction(word)
            # correction() can come back as None when nothing suitable is
            # found; keep the original word so the join below cannot fail.
            corrected_text.append(word if suggestion is None else suggestion)
        else:
            corrected_text.append(word)
    return " ".join(corrected_text)
        
text = "corect me plese"
correct_spellings(text)

In [None]:
tweet_df1['text']

In [None]:
df=pd.concat([tweet_df1,test_df1])
df.shape

## GloVe for Vectorization

In [None]:
def create_corpus(df):
    """Build one list of lowercase, purely alphabetic tokens per row of df['text'].

    Rows are iterated under a tqdm progress bar; tokens for which
    str.isalpha() is false are dropped.
    """
    return [
        [token.lower() for token in tokens if token.isalpha()]
        for tokens in tqdm(df['text'])
    ]

In [None]:
corpus=create_corpus(df)


In [None]:
# Preview only the first few entries; displaying the whole corpus floods the cell output.
corpus[:5]

In [None]:
# Map each token in the GloVe file to its vector. Every line is parsed as
# "<token> <v1> <v2> ...", with the values converted to float32.
embedding_dict={}
# Read with an explicit encoding so decoding does not depend on the platform's
# locale default. The `with` block closes the file on exit, so no separate
# close() call is needed.
with open('../input/glove-global-vectors-for-word-representation/glove.6B.200d.txt','r',encoding='utf-8') as f:
    for line in f:
        values=line.split()
        word=values[0]
        vectors=np.asarray(values[1:],'float32')
        embedding_dict[word]=vectors

In [None]:
# Fixed sequence length (in tokens) that every tweet is padded or truncated to.
MAX_LEN = 50

# Learn the vocabulary from the combined corpus, then encode each tweet as a list of word indices.
tokenizer_obj = Tokenizer()
tokenizer_obj.fit_on_texts(corpus)
sequences = tokenizer_obj.texts_to_sequences(corpus)

# Both padding and truncation happen at the end of the sequence.
tweet_pad = pad_sequences(
    sequences,
    maxlen=MAX_LEN,
    padding='post',
    truncating='post',
)

In [None]:
tweet_pad.shape

In [None]:
word_index=tokenizer_obj.word_index
print('Number of unique words:',len(word_index))

## Create an Embedding Matrix

In [None]:
# One row per tokenizer index plus one extra row, since the matrix is sized len(word_index)+1.
num_words=len(word_index)+1

# Row i receives the GloVe vector of the word whose tokenizer index is i;
# rows for words missing from embedding_dict stay all-zero.
embedding_matrix=np.zeros((num_words,200))

for word,i in tqdm(word_index.items()):
    # Valid rows are 0..num_words-1, so an index equal to num_words is already
    # out of range (the previous `>` comparison would have let it through).
    if i >= num_words:
        continue

    emb_vec=embedding_dict.get(word)
    if emb_vec is not None:
        embedding_matrix[i]=emb_vec

### Initiating TPU 

In [None]:
import tensorflow as tf

# detect and init the TPU
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
except ValueError:
    # NOTE(review): the resolver is expected to raise ValueError when no TPU is
    # attached to the session — confirm against the installed TensorFlow version.
    tpu = None

if tpu is not None:
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)

    # instantiate a distribution strategy
    # Prefer the non-experimental class when this TensorFlow build provides it,
    # otherwise use the experimental alias the notebook was written against.
    _tpu_strategy_cls = getattr(tf.distribute, 'TPUStrategy', None) or tf.distribute.experimental.TPUStrategy
    tpu_strategy = _tpu_strategy_cls(tpu)
else:
    # No TPU: fall back to the default strategy so the `tpu_strategy.scope()`
    # blocks further down still run on CPU/GPU.
    tpu_strategy = tf.distribute.get_strategy()

## Creating a Baseline Model

In [None]:
with tpu_strategy.scope():
    # Frozen embedding layer whose weights are initialised from embedding_matrix.
    embedding = Embedding(
        num_words,
        200,
        embeddings_initializer=Constant(embedding_matrix),
        input_length=MAX_LEN,
        trainable=False,
    )

    # Embedding -> spatial dropout -> single LSTM -> sigmoid output unit.
    model = Sequential([
        embedding,
        SpatialDropout1D(0.2),
        LSTM(64, dropout=0.2, recurrent_dropout=0.2),
        Dense(1, activation='sigmoid'),
    ])

    optimzer = Adam(learning_rate=1e-5)

    model.compile(
        loss='binary_crossentropy',
        optimizer=optimzer,
        metrics=['accuracy'],
    )

In [None]:
model.summary()


In [None]:
# tweet_pad was built from the train frame followed by the test frame, so the
# first len(tweet) rows are the training tweets and the remainder the test tweets.
# Note: `test` is rebound here from the test DataFrame to the padded array.
n_train_rows = tweet.shape[0]
train = tweet_pad[:n_train_rows]
test = tweet_pad[n_train_rows:]

## Split  training and validation set

In [None]:
# Hold out 15% of the labelled tweets for validation. A fixed random_state makes
# the split identical on every run, so results are comparable after a kernel restart.
# X_test / y_test hold the validation split, not the competition test set.
X_train,X_test,y_train,y_test=train_test_split(train,tweet['target'].values,test_size=0.15,random_state=42)
print('Shape of train',X_train.shape)
print("Shape of Validation ",X_test.shape)

In [None]:
history=model.fit(X_train,y_train,batch_size=16,epochs=15,validation_data=(X_test,y_test),verbose=True)

In [None]:
#Creating a callback Function
from keras.callbacks import Callback, ModelCheckpoint, EarlyStopping, ReduceLROnPlateau

# Multiply the learning rate by 0.2 once validation accuracy has gone 3 epochs without improving.
callbacks = [
    ReduceLROnPlateau(
        monitor='val_accuracy',
        factor=0.2,
        patience=3,
        verbose=1,
    ),
]

# Hyper-parameters, presumably for the convolutional model defined below.
# NOTE(review): only `learning_rate` is referenced by the cells visible here —
# confirm whether the others are still needed.
filter_length1, filter_length2 = 3, 5
dropout = 0.5
nb_filter = 64
learning_rate = 3e-3


## Adding a CNN layer

In [None]:
with tpu_strategy.scope():
    # Same frozen GloVe-initialised embedding as the baseline, followed by a
    # 1-D convolution and max-pooling stage in front of the LSTM.
    model=Sequential()

    embedding=Embedding(num_words,200,embeddings_initializer=Constant(embedding_matrix),
                   input_length=MAX_LEN,trainable=False)

    model.add(embedding)
    model.add(SpatialDropout1D(0.2))

    model.add(Conv1D(64, 5,padding = 'same', activation='relu'))
    model.add(MaxPooling1D(pool_size=4))
    model.add(LSTM(64, dropout=0.2, recurrent_dropout=0.2))
    model.add(Dense(1, activation='sigmoid'))

    # Use the `learning_rate` keyword, as the baseline model does; `lr` is the
    # deprecated spelling of the same argument.
    model.compile(loss='binary_crossentropy', optimizer=Adam(learning_rate=learning_rate),metrics=['accuracy'])
    

In [None]:
history=model.fit(X_train,y_train,batch_size=8,epochs=30,validation_data=(X_test,y_test),verbose=True,callbacks = callbacks)


In [None]:
import keras
model1 = keras.models.Sequential([
    keras.layers.Embedding(num_words,200,embeddings_initializer=Constant(embedding_matrix),input_length=MAX_LEN,trainable=False),
    keras.layers.LSTM(100,return_sequences=True),
    keras.layers.LSTM(200),
    keras.layers.Dropout(0.5),
    keras.layers.Dense(1,activation='sigmoid')
])


In [None]:
model1.summary()


In [None]:
model1.compile(
  loss='binary_crossentropy',
  optimizer='adam',
  metrics=['accuracy'],
)


In [None]:
history1 = model1.fit(X_train,y_train,
                    batch_size=64,
                    epochs=10,
                    validation_split=0.2
)

In [None]:
# Accuracy per epoch for model1. 'val_accuracy' is measured on the
# validation_split held out during fit, so it is labelled as validation
# rather than test.
plt.figure(figsize=(10,6))
plt.plot(history1.history['accuracy'], label='train')
plt.plot(history1.history['val_accuracy'], label='validation')
plt.xlabel('epoch')
plt.ylabel('accuracy')
plt.title('model1 accuracy per epoch')
plt.legend()
plt.grid()
plt.show()

In [None]:
model2 = keras.models.Sequential([
    keras.layers.Embedding(num_words,200,embeddings_initializer=Constant(embedding_matrix),input_length=MAX_LEN,trainable=False),
    keras.layers.GRU(100,return_sequences=True),
    keras.layers.GRU(200),
    keras.layers.Dropout(0.5),
    keras.layers.Dense(1,activation='sigmoid')
])

In [None]:
model2.summary()


In [None]:
model2.compile(
  loss='binary_crossentropy',
  optimizer='adam',
  metrics=['accuracy'],
)


In [None]:
history2 = model2.fit(X_train,y_train,
                    batch_size=64,
                    epochs=10,
                    validation_split=0.2
)

In [None]:
model3 = keras.models.Sequential([
    keras.layers.Embedding(num_words,200,embeddings_initializer=Constant(embedding_matrix),input_length=MAX_LEN,trainable=False),
    keras.layers.Bidirectional(keras.layers.LSTM(100,return_sequences=True)),
    keras.layers.Bidirectional(keras.layers.LSTM(200)),
    keras.layers.Dropout(0.5),
    keras.layers.Dense(1,activation='sigmoid')
])



In [None]:
model3.compile(
  loss='binary_crossentropy',
  optimizer='adam',
  metrics=['accuracy'],
)


In [None]:
history3 = model3.fit(X_train,y_train,
                    batch_size=64,
                    epochs=10,
                    validation_split=0.2
)

## Making Submission

In [None]:
sample_sub=pd.read_csv('../input/nlp-getting-started/sample_submission.csv')


In [None]:
# Predict with model1, round the sigmoid outputs to 0/1 labels and write the submission file.
y_pre=model1.predict(test)
# reshape(-1) flattens the predictions to 1-D whatever the number of test rows,
# instead of hard-coding the row count.
y_pre=np.round(y_pre).astype(int).reshape(-1)
sub1=pd.DataFrame({'id':sample_sub['id'].values.tolist(),'target':y_pre})
sub1.to_csv('submission1.csv',index=False)

In [None]:
# Preview the model1 submission frame; the frame built above is named `sub1`.
sub1.head()


In [None]:
# Predict with model3, round the sigmoid outputs to 0/1 labels and write the submission file.
y_pre=model3.predict(test)
# reshape(-1) flattens the predictions to 1-D whatever the number of test rows,
# instead of hard-coding the row count.
y_pre=np.round(y_pre).astype(int).reshape(-1)
sub3=pd.DataFrame({'id':sample_sub['id'].values.tolist(),'target':y_pre})
sub3.to_csv('submission3.csv',index=False)