In [1]:
import numpy as np
import pandas as pd 
import re
import os
import string
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.optimizers import Adam
from tensorflow.keras.initializers import Constant
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
lemmatizer = WordNetLemmatizer()
from sklearn.model_selection import train_test_split

Using TensorFlow backend.


In [2]:
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/glove6b100d-2/glove.6B.100d.txt
/kaggle/input/nlp-getting-started/train.csv
/kaggle/input/nlp-getting-started/test.csv
/kaggle/input/nlp-getting-started/sample_submission.csv


In [3]:
train_csv = pd.read_csv("/kaggle/input/nlp-getting-started/train.csv")
test_csv = pd.read_csv("/kaggle/input/nlp-getting-started/test.csv")
sample_submission_csv = pd.read_csv("/kaggle/input/nlp-getting-started/sample_submission.csv")
train_csv.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


I think this Location column is not going to help much for model accuracy in both training or prediction.<br>
Hence i am going to drop that column.

In [4]:
train_df = train_csv
train_df = train_df.drop('location', axis = 1)
train_df.head()

Unnamed: 0,id,keyword,text,target
0,1,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,Forest fire near La Ronge Sask. Canada,1
2,5,,All residents asked to 'shelter in place' are ...,1
3,6,,"13,000 people receive #wildfires evacuation or...",1
4,7,,Just got sent this photo from Ruby #Alaska as ...,1


Now starts helper functions for data preprocessing<br>
I have focused on the following:
1. Removing Web links
2. Removing @'s
3. Removing Hashtags
4. Rewrite words with single qoutes
5. Removing punctuations
6. Removing Stop Words
7. Applying Lemmatization

In [5]:
#re.findall(r'[\w\.-]+@[\w\.-]+', tweet) ----> email
def remove_hashtag(tweet) :
    hash_part = re.findall(r'#(\w+)',tweet)
    for index,words in enumerate(hash_part) :
        sub = re.findall(r'[A-Z][A-Z0-9]+', words)
        if sub :
            split_word = words.split(sub[0])
            string = sub[0] + " " + split_word[1]
            if split_word[1] :
                tweet = re.sub(r'#'+sub[0]+'[a-z]+',string,tweet)
            else :
                tweet = re.sub(r'#'+sub[0],string,tweet)
        elif re.findall(r'[A-Z][a-z]+',words) :
            sub = re.findall(r'[A-Z][a-z]+',words)
            string = ""
            if len(sub) !=1 :
                for sub_word in sub :
                    string = string + " "  + sub_word
                tweet = re.sub(r'#'+sub[0]+'[a-zA-Z]+', string, tweet)
            else :
                string = string + sub[0] + " "
                tweet = re.sub(r'#'+sub[0], string, tweet)
        else :
            tweet = re.sub(r'#'+hash_part[index], hash_part[index], tweet)
    return tweet

def rewrite_single_qoute(tweet) :
    tweet = re.sub(r'i\'m', 'i am', tweet)
    tweet = re.sub(r'I\'m', 'i am', tweet)
    tweet = re.sub(r'can\'t', 'can not', tweet)
    tweet = re.sub(r'omg|Omg', 'oh my god', tweet)
    #tweet = re.sub(r'', '', tweet)
    tweet = re.sub(r'(\w+)\'re', '\g<1>are', tweet)
    tweet = re.sub(r'(\w+)\'s', '\g<1>is', tweet)
    tweet = re.sub(r'dammit', 'damn it', tweet)
    tweet = re.sub(r'won\'t', 'will not', tweet)
    tweet = re.sub(r'wont', 'will not', tweet)
    tweet = re.sub(r'(\w+)\'d', '\g<1>would', tweet)
    return tweet

def remove_additional(tweet) :
    tweet = re.sub('[^a-zA-z]+',' ',tweet)
    return tweet


def data_preprocessing(tweet) :
    tweet = re.sub(r'http\S+', '', tweet)
    tweet = re.sub(r'@(\w*)', '', tweet)
    tweet = remove_hashtag(tweet)
    tweet = re.sub(r'\n',' ', tweet) 
    tweet = re.sub('\s+', ' ', tweet).strip()
    # rewriting words containing single qoute 
    tweet = rewrite_single_qoute(tweet)
    tweet = remove_additional(tweet)#removing additional unwanted characters (and also remaining Hashtags)
    tweet = tweet.lower()
    #stop words code to be written below
    tweet = tweet.split()
    tweet = [w for w in tweet if not w in set(stopwords.words('english'))] 
    tweet = ' '.join(tweet)
    #lemmatization
    token_list = word_tokenize(tweet)
    tweet = ' '.join([lemmatizer.lemmatize(w) for w in token_list])
    return tweet

Filling null values in keyword with an empty string and appending to their respective texts(tweets)

In [6]:
train_df["keyword"] = train_df["keyword"].fillna(' ')
train_df['joined'] = train_df[['keyword', 'text']].apply(lambda x: ' '.join(x), axis = 1)
train_df.head()

Unnamed: 0,id,keyword,text,target,joined
0,1,,Our Deeds are the Reason of this #earthquake M...,1,Our Deeds are the Reason of this #earthquake...
1,4,,Forest fire near La Ronge Sask. Canada,1,Forest fire near La Ronge Sask. Canada
2,5,,All residents asked to 'shelter in place' are ...,1,All residents asked to 'shelter in place' ar...
3,6,,"13,000 people receive #wildfires evacuation or...",1,"13,000 people receive #wildfires evacuation ..."
4,7,,Just got sent this photo from Ruby #Alaska as ...,1,Just got sent this photo from Ruby #Alaska a...


Preprocessing is then applied to the joined column

In [7]:
train_df['joined'] = train_df['joined'].apply(lambda x : data_preprocessing(x))
train_df.head()

Unnamed: 0,id,keyword,text,target,joined
0,1,,Our Deeds are the Reason of this #earthquake M...,1,deed reason earthquake may allah forgive u
1,4,,Forest fire near La Ronge Sask. Canada,1,forest fire near la ronge sask canada
2,5,,All residents asked to 'shelter in place' are ...,1,resident asked shelter place notified officer ...
3,6,,"13,000 people receive #wildfires evacuation or...",1,people receive wildfire evacuation order calif...
4,7,,Just got sent this photo from Ruby #Alaska as ...,1,got sent photo ruby alaska smoke wildfire pour...


Here is the set of stop words that can be removed from the tweets

In [8]:
stop_words = set(stopwords.words('english'))
print(stop_words)

{'you', 'we', 'because', "you'd", 'them', 'he', 'mustn', 'below', 'does', 'into', 'the', 'above', "hadn't", 'do', "weren't", 'if', 'up', 'most', 'having', 'her', 'until', 'weren', 's', 'where', 'him', 'hers', "mustn't", "needn't", 'once', 'me', 'being', "should've", 'wasn', "that'll", 'for', 'yourself', 'own', "wasn't", 'am', 'before', 'm', 'again', 'same', 'be', 'aren', "haven't", 'whom', 'as', 'a', 'why', 'now', 'between', 'than', 'then', "aren't", "couldn't", 'an', 'hadn', 've', "isn't", 'nor', 'has', 'under', 'needn', 'to', 'on', 'here', 'in', 'some', 'shan', 'ourselves', 'against', 'don', 'themselves', "didn't", 'shouldn', 'out', 'have', 're', 'haven', 'so', 'my', 'didn', 'down', 'too', 'few', 'been', 'this', 'o', 'his', "won't", 'each', 'they', "hasn't", 'are', 'were', 'ain', 'no', 'himself', 'all', 'of', "she's", 'is', 'during', 'couldn', "mightn't", 'very', "you're", 'such', 'both', 'which', 'ours', 'did', 'by', 'these', 'but', "don't", 'y', 'our', 'theirs', "shouldn't", "would

Finding the length of longest tweet

In [9]:
tweet_max_length = max(len(x) for x in train_df["joined"])
print(tweet_max_length)

156


Creating a corpus of words from tweets

In [10]:
def create_corpus(df):
    corpus=[]
    for tweet in df['joined'] :
        words=[word.lower() for word in word_tokenize(tweet)]
        corpus.append(words)
    return corpus  

corpus=create_corpus(train_df)

Creating an Embedding dictionary, from Standford's glove represtation.<br> I chose glove represtation with 100 dimensions.

In [11]:
embedding_dict={}
with open('/kaggle/input/glove6b100d-2/glove.6B.100d.txt','r') as f:
    for line in f:
        values=line.split()
        word = values[0]  # added
        vectors=np.asarray(values[1:],'float32')
        embedding_dict[word]=vectors
f.close()

Tokenization is done on courpus and then tweets are tokenized.<br>
With the Embedding dictionary created and word_index obtained from tokenizer an Embedding Matrix is created.<br>

In [14]:

tokenizer = Tokenizer()
tokenizer.fit_on_texts(corpus)

word_index=tokenizer.word_index

num_words=len(word_index)+1
embedding_matrix=np.zeros((num_words,100))

for word,i in word_index.items() :
    if i < num_words:
        emb_vec=embedding_dict.get(word)
        if emb_vec is not None:
            embedding_matrix[i]=emb_vec  



In [12]:
#HYPER PARAMETERS
max_length = tweet_max_length
padding_type = 'post'
trunc_type = 'post'
oov_tok = '<OOV>'

In [None]:
training_tweets, validation_tweets, training_targets, validation_targets = train_test_split(train_df['joined'], 
                                                                                            train_df['target'], 
                                                                                            test_size=0.2, 
                                                                                            random_state=23)

#Sequencing and padding training data
training_sequences = tokenizer.texts_to_sequences(training_tweets)
training_padded = pad_sequences(training_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)

#Sequencing and padding validation data
validation_sequences = tokenizer.texts_to_sequences(validation_tweets)
validation_padded = pad_sequences(validation_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)

In [None]:
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(num_words,100,embeddings_initializer=Constant(embedding_matrix), input_length=max_length,trainable=False),
    tf.keras.layers.SpatialDropout1D(0.2),
    tf.keras.layers.Bidirectional(tf.keras.layers.GRU(units = 64, dropout=0.2, recurrent_dropout = 0.2, return_sequences = True)),
    tf.keras.layers.Bidirectional(tf.keras.layers.GRU(units = 32)),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

optimzer=Adam(lr=3e-4)

model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])

model.summary()

history=model.fit(training_padded,training_targets,batch_size=4,epochs=10,validation_data=(validation_padded,validation_targets),verbose=1)

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 156, 100)          1272600   
_________________________________________________________________
spatial_dropout1d (SpatialDr (None, 156, 100)          0         
_________________________________________________________________
bidirectional (Bidirectional (None, 156, 128)          63744     
_________________________________________________________________
bidirectional_1 (Bidirection (None, 64)                31104     
_________________________________________________________________
dense (Dense)                (None, 64)                4160      
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 65        
Total params: 1,371,673
Trainable params: 99,073
Non-trainable params: 1,272,600
_________________________________________

In [None]:
import matplotlib.pyplot as plt

plt.plot(history.history['accuracy'])
plt.plot(history.history['val_accuracy'])
plt.xlabel("epochs")
plt.ylabel("accuracy")
plt.legend(['accuracy', 'val_accuracy'])
plt.show()

plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.xlabel("epochs")
plt.ylabel("loss")
plt.legend(['loss', 'val_loss'])
plt.show()

In [None]:
test_csv.head()

In [None]:
test_csv["keyword"] = test_csv["keyword"].fillna(' ')
test_csv['joined'] = test_csv[['keyword', 'text']].apply(lambda x: ' '.join(x), axis = 1)
test_csv['joined'] = test_csv['joined'].apply(lambda x : data_preprocessing(x))
test_csv.head()

In [None]:
test_sequences = tokenizer.texts_to_sequences(test_csv['joined'])
test_padded = pad_sequences(test_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)

In [None]:
predictions = model.predict(test_padded)

In [None]:
sample_submission_csv['target'] = (predictions > 0.5).astype(int)

In [None]:
sample_submission_csv.to_csv("submission.csv", index=False, header=True)

In [None]:
#https://www.kaggle.com/rtatman/download-a-csv-file-from-a-kernel

from IPython.display import HTML
import base64

def create_download_link(df, title = "Download CSV file", filename = "subm.csv"):  
    csv = df.to_csv()
    b64 = base64.b64encode(csv.encode())
    payload = b64.decode()
    html = '<a download="{filename}" href="data:text/csv;base64,{payload}" target="_blank">{title}</a>'
    html = html.format(payload=payload,title=title,filename=filename)
    return HTML(html)



create_download_link(sample_submission_csv)
