In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
import re
import nltk

from sklearn.model_selection import train_test_split
from tqdm import tqdm
from nltk.tokenize import word_tokenize

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding, LSTM,Dense, SpatialDropout1D, Dropout
from keras.initializers import Constant

In [None]:
# Load the three competition CSVs. `train` carries the `target` label used below,
# and `sub` is the sample submission whose `target` column is overwritten with
# the model's predictions at the end of the notebook.
train = pd.read_csv(r'../input/nlp-getting-started/train.csv')
test = pd.read_csv(r'../input/nlp-getting-started/test.csv')
sub = pd.read_csv(r'../input/nlp-getting-started/sample_submission.csv')

In [None]:
# Widen displayed columns to 200 characters so tweet texts are not cut short in previews.
pd.set_option('display.max_colwidth',200)

In [None]:
# Preview the first rows of the raw training data.
train.head()

In [None]:
# (rows, columns) of the training, test and sample-submission frames.
train.shape, test.shape, sub.shape

In [None]:
# Number of missing values per column in the training data.
train.isnull().sum()

In [None]:
# Number of missing values per column in the test data.
test.isnull().sum()

In [None]:
# Only the tweet text (and, for train, the label) is used from here on, so the
# identifier and metadata columns are removed from both frames in place.
for frame in (train, test):
    frame.drop(columns=['id', 'keyword', 'location'], inplace=True)

Let’s look at a few disaster-related tweets.

In [None]:
# First rows whose target is 1.
train[train['target']==1].head()

Let’s look at a few normal (non-disaster) tweets.

In [None]:
# First rows whose target is 0.
train[train['target']==0].head()

There are quite a few words and characters that are not really required. So, we will try to keep only those words that are important and add value.

Let’s have a glimpse at target-distribution in the train dataset.

In [None]:
# Number of training rows for each target value.
train['target'].value_counts()

Now we will check the distribution of length of the tweets, in terms of words, in both train and test data.

In [None]:
# Tweet length in whitespace-separated words. `.str.len()` on the raw text
# would count characters, which is not what this section describes.
length_train = train['text'].str.split().str.len()
length_test = test['text'].str.split().str.len()

# Semi-transparent bars keep both distributions visible where they overlap.
plt.hist(length_train, bins=20, alpha=0.6, label='train_text')
plt.hist(length_test, bins=20, alpha=0.6, label='test_text')
plt.xlabel('words per tweet')
plt.ylabel('number of tweets')
plt.legend()
plt.show()

**Data Cleaning**

In any natural language processing task, cleaning raw text data is an important step. It helps get rid of unwanted words and characters, which in turn helps in obtaining better features. If we skip this step, there is a higher chance that we end up working with noisy and inconsistent data. The objective of this step is to remove noise that is less relevant for deciding whether a tweet is about a real disaster, such as punctuation, special characters, numbers, and terms that don’t carry much weight in the context of the text.

Given below is a user-defined function to remove unwanted text patterns from the text.

In [None]:
def remove_pattern(input_txt, pattern):
    """Return ``input_txt`` with every match of the regex ``pattern`` removed.

    Parameters
    ----------
    input_txt : str
        Text to clean.
    pattern : str
        Regular expression describing the substrings to delete.

    Returns
    -------
    str
        ``input_txt`` with all non-overlapping matches replaced by ''.

    Notes
    -----
    The substitution is done with ``pattern`` itself. Re-using each matched
    substring as a new regex would misbehave whenever the matched text
    contains metacharacters such as ``(``, ``+`` or ``?``.
    """
    return re.sub(pattern, '', input_txt)

**Removing Twitter handles (@user)** 

In [None]:
# '@' followed by any run of word characters, i.e. a whole Twitter handle.
# The class must be `\w`: `[/w]` only matches the literal characters '/' and 'w',
# which strips the '@' but leaves the handle name in the text.
HANDLE_PATTERN = r'@\w*'

train['text'] = train['text'].apply(lambda tweet: remove_pattern(tweet, HANDLE_PATTERN))
test['text'] = test['text'].apply(lambda tweet: remove_pattern(tweet, HANDLE_PATTERN))

**Removing Punctuations, Numbers, and Special Characters**

Here we will replace everything except letters and the ‘#’ symbol with spaces. The regular expression “[^a-zA-Z#]” matches any character other than an English letter or ‘#’.

In [None]:
# Replace every character that is not an English letter or '#' with a space.
# regex=True is required: recent pandas versions treat the pattern as a literal
# string by default, in which case nothing would be replaced.
train['text'] = train['text'].str.replace('[^a-zA-Z#]', ' ', regex=True)
test['text'] = test['text'].str.replace('[^a-zA-Z#]', ' ', regex=True)

**Removing Short Words**

We have to be a little careful here in selecting the length of the words which we want to remove. So, I have decided to remove all the words having length 2 or less. For example, terms like “oh” and “ok” are of very little use. It is better to get rid of them.

In [None]:
def _keep_long_words(tweet):
    """Re-join the whitespace-separated words of ``tweet`` that are longer than 2 characters."""
    return ' '.join(filter(lambda token: len(token) > 2, tweet.split()))


train['text'] = train['text'].map(_keep_long_words)
test['text'] = test['text'].map(_keep_long_words)
train.head()

**Understanding the common words used in the texts: WordCloud**

Now I want to see how the words are distributed across the two classes in the train dataset. One way to accomplish this task is by understanding the common words by plotting wordclouds.

A wordcloud is a visualization in which the most frequent words appear in a large size and the less frequent words appear in smaller sizes.

Let’s visualize all the words in our data using the wordcloud plot.

In [None]:
from wordcloud import WordCloud

# One long string holding every cleaned training tweet, separated by spaces.
all_words = ' '.join(train['text'])

wordcloud = WordCloud(width=800, height=500, max_font_size=110).generate(all_words)

plt.figure(figsize=(10, 7))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()

In [None]:
from wordcloud import WordCloud

# Cleaned text of the training tweets labelled target == 0, joined into one string.
normal_words = ' '.join(train.loc[train['target'] == 0, 'text'])

wordcloud = WordCloud(width=800, height=500, max_font_size=110).generate(normal_words)

plt.figure(figsize=(10, 7))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()

In [None]:
from wordcloud import WordCloud

# Cleaned text of the training tweets labelled target == 1, joined into one string.
disaster_related_words = ' '.join(train.loc[train['target'] == 1, 'text'])

wordcloud = WordCloud(width=800, height=500, max_font_size=110).generate(disaster_related_words)

plt.figure(figsize=(10, 7))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()

**Understanding the impact of Hashtags on tweet classification**

Hashtags in twitter are synonymous with the ongoing trends on twitter at any particular point in time. We should try to check whether these hashtags add any value in this task.

In [None]:
def hashtag_extract(x):
    """Collect the hashtags found in each tweet of ``x``.

    Parameters
    ----------
    x : iterable of str
        Tweets to scan.

    Returns
    -------
    list of list of str
        One inner list per tweet, holding the word characters that follow
        each '#' (the '#' itself is not included). A tweet without hashtags
        contributes an empty list.
    """
    return [re.findall(r"#(\w+)", tweet) for tweet in x]

# extracting hashtags from non-disaster tweets (target == 0)
HT_regular = hashtag_extract(train['text'][train['target'] == 0])

# extracting hashtags from disaster tweets (target == 1)
HT_negative = hashtag_extract(train['text'][train['target'] == 1])

# unnesting the per-tweet lists into one flat list each; a comprehension does
# this in a single pass, whereas sum(lists, []) re-copies the growing result
# for every tweet
HT_regular = [tag for tags in HT_regular for tag in tags]
HT_negative = [tag for tags in HT_negative for tag in tags]

In [None]:
# Frequency of every hashtag seen in the target == 0 tweets.
a = nltk.FreqDist(HT_regular)
d = pd.DataFrame(list(a.items()), columns=['Hashtag', 'Count'])

# keep the 20 most frequent hashtags
d = d.nlargest(20, 'Count')

plt.figure(figsize=(16, 5))
ax = sns.barplot(x='Hashtag', y='Count', data=d)
ax.set_ylabel('Count')
plt.show()

In [None]:
# Frequency of every hashtag seen in the target == 1 tweets.
b = nltk.FreqDist(HT_negative)
e = pd.DataFrame(list(b.items()), columns=['Hashtag', 'Count'])

# keep the 20 most frequent hashtags
e = e.nlargest(20, 'Count')

plt.figure(figsize=(16, 5))
ax = sns.barplot(x='Hashtag', y='Count', data=e)
ax.set_ylabel('Count')
plt.show()

### GloVe

I will use pretrained GloVe word vectors to represent our words.

In [None]:
# Stack the train rows followed by the test rows so both share one vocabulary.
# The order matters: the padded sequences are later split back by position
# using train.shape[0].
data = pd.concat([train,test])
data.shape

In [None]:
def create_corpus_new(df):
    """Tokenise every tweet in ``df['text']`` into lowercase tokens.

    Parameters
    ----------
    df : DataFrame
        Frame with a ``text`` column of strings.

    Returns
    -------
    list of list of str
        One list of lowercased ``word_tokenize`` tokens per row, in row order.
        Progress is reported through ``tqdm`` while iterating.
    """
    return [
        [token.lower() for token in word_tokenize(tweet)]
        for tweet in tqdm(df['text'])
    ]

In [None]:
# Token lists for every tweet in the combined train + test frame.
corpus = create_corpus_new(data)

In [None]:
# Map each GloVe word to its vector. Every line of the file is parsed as
# "<word> <float> <float> ..." separated by whitespace.
embedding_dict = {}
# The encoding is given explicitly so decoding does not depend on the
# platform's default locale.
with open('../input/glove-global-vectors-for-word-representation/glove.6B.100d.txt',
          'r', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        vectors = np.asarray(values[1:], 'float32')
        embedding_dict[word] = vectors
# No explicit close needed: the `with` block has already closed the file.

In [None]:
# Every tweet is represented by exactly MAX_LEN token ids.
MAX_LEN = 50
# Build the word -> integer id vocabulary from the tokenised corpus
# (train and test together), then turn each token list into a list of ids.
tokenizer_obj = Tokenizer()
tokenizer_obj.fit_on_texts(corpus)
sequences = tokenizer_obj.texts_to_sequences(corpus)

# 'post' for both options: longer tweets are cut at the end and shorter ones
# are padded at the end.
tweet_pad = pad_sequences(sequences,maxlen=MAX_LEN,truncating='post',padding='post')

In [None]:
# word -> integer id mapping learned by the tokenizer.
word_index=tokenizer_obj.word_index
print('Number of unique words:',len(word_index))

In [None]:
# One row per vocabulary id plus one extra row for id 0.
# NOTE(review): assumes the tokenizer ids run from 1 to len(word_index) and
# that 0 is the padding id — confirm against the Keras Tokenizer docs.
num_words = len(word_index) + 1
embedding_matrix = np.zeros((num_words, 100))

# Copy the pretrained vector of every word that GloVe knows; words missing
# from GloVe keep an all-zero row. No `i < num_words` check is needed here:
# num_words is derived from word_index itself, so that test could never fail.
for word, i in tqdm(word_index.items()):
    emb_vec = embedding_dict.get(word)
    if emb_vec is not None:
        embedding_matrix[i] = emb_vec

In [None]:
# Inspect the padded id sequence of the first tweet.
tweet_pad[0][0:]

### Model Building

In [None]:
# Embedding layer initialised from the GloVe matrix and frozen during training.
embedding = Embedding(
    num_words,
    100,
    embeddings_initializer=Constant(embedding_matrix),
    input_length=MAX_LEN,
    trainable=False,
)

# Embedding -> dropout over embedding channels -> LSTM -> single sigmoid unit.
model = Sequential([
    embedding,
    SpatialDropout1D(0.2),
    LSTM(100, dropout=0.2, recurrent_dropout=0.2),
    Dense(1, activation='sigmoid'),
])

model.compile(
    loss='binary_crossentropy',
    optimizer='Adam',
    metrics=['accuracy'],
)
model.summary()

In [None]:
# The corpus was built from train rows followed by test rows, so the first
# n_train padded sequences belong to train and the remainder to test.
n_train = train.shape[0]
train_new, test_new = tweet_pad[:n_train], tweet_pad[n_train:]

In [None]:
# Hold out 20% of the labelled tweets for validation. A fixed random_state
# keeps the split, and therefore the reported metrics, the same across
# kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    train_new,
    train['target'].values,
    test_size=0.2,
    random_state=42,
)
print('Shape of train',X_train.shape)
print("Shape of Validation ",X_test.shape)

In [None]:
# Recommended: 10-20 epochs.
# Fit on the training split and evaluate on the held-out split; the returned
# History object feeds the loss/accuracy plots below.
history=model.fit(X_train,y_train,batch_size=128,epochs=10,validation_data=(X_test,y_test),verbose=2)

In [None]:
# Training vs. validation loss per epoch.
for curve in ('loss', 'val_loss'):
    plt.plot(history.history[curve])
plt.title('model loss')
plt.xlabel('epoch')
plt.ylabel('loss')
plt.legend(['train', 'validation'], loc='upper left')
plt.show()

In [None]:
# Training vs. validation accuracy per epoch.
for curve in ('accuracy', 'val_accuracy'):
    plt.plot(history.history[curve])
plt.title('model accuracy')
plt.xlabel('epoch')
plt.ylabel('accuracy')
plt.legend(['train', 'validation'], loc='upper left')
plt.show()

### Prediction

In [None]:
# Predict on the padded test sequences. The model's last layer is a sigmoid,
# so rounding turns each output into a 0/1 class label.
test_pred= model.predict(test_new)
test_pred_int = test_pred.round().astype('int')
# Overwrite the sample submission's target column and save it without the
# DataFrame index.
sub['target'] = test_pred_int
sub.to_csv('pred.csv',index=False)

In [None]:
# Display the submission frame that was just written to pred.csv.
sub