In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for full_path in (os.path.join(root, name) for name in files):
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

The author Fanglida Yan has used code from these references in the notebook. <br>
https://towardsdatascience.com/basic-tweet-preprocessing-in-python-efd8360d529e <br>
https://www.youtube.com/watch?v=hhjn4HVEdy0 <br>

0. lower case <br>
1. turn key words into lists <br>
2. extract hashtags and create a new feature column <br>
3. remove digits (01234), urls (http://...), mentions (@...) and hashtags (#...) <br>
4. recover abbreviations (change they'll to they will, etc) <br>
5. remove punctuations <br>
6. tokenization <br>
7. remove stop words <br>
8. lemmatization <br>
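9. save the preprocessed files <br>
10. find the maximum tweet, hashtag and keyword lengths <br>
11. install and understand word embedding <br>
12. word embedding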

In [None]:
# Read the training and test CSV files into two DataFrames.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/nlp-getting-started/train.csv',
        '/kaggle/input/nlp-getting-started/test.csv',
    )
)

**0. lower case**

In [None]:
# Lower-case every tweet in both frames.
for frame in (train, test):
    frame['text'] = frame['text'].apply(str.lower)

def lower_keywords(keywords):
    """Return `keywords` lower-cased, passing NaN through untouched.

    NaN is detected by the fact that it is the only value not equal to itself.
    """
    if keywords != keywords:  # NaN: nothing to lower-case
        return keywords
    return keywords.lower()
        
# Lower-case the keyword column of both frames (missing keywords stay NaN).
for frame in (train, test):
    frame['keyword'] = frame['keyword'].apply(lower_keywords)

test.head()

**1. turn keywords into lists**

In [None]:
def keywords_to_list(keywords):
    """Split a '%20'-separated keyword string into a list of words.

    A NaN input (the only value that is not equal to itself) yields an empty list.
    """
    is_nan = keywords != keywords
    return [] if is_nan else keywords.split('%20')
            
# Replace each keyword string (or NaN) with a list of its words.
for frame in (train, test):
    frame['keyword'] = frame['keyword'].apply(keywords_to_list)

test.head()

**2. extract hashtags and create a new feature column**

In [None]:
import re
# Collect the word following every '#' in a tweet into a new 'hashtag' list column.
hashtag_pattern = re.compile(r'#(\w+)')
for frame in (train, test):
    frame['hashtag'] = frame['text'].apply(hashtag_pattern.findall)

test.head(20)

**3. remove digits (01234), urls (http://...), mentions (@...) and hashtags (#...)**

In [None]:
!pip install tweet-preprocessor
import preprocessor

In [None]:
# Run tweet-preprocessor's clean() over every tweet in both frames.
for frame in (train, test):
    frame['text'] = frame['text'].apply(preprocessor.clean)

def clear_list(lista):
    """Apply `preprocessor.clean` to every string of a list, in place.

    Parameters
    ----------
    lista : list of str
        Words (hashtags or keywords) belonging to one row.

    Returns
    -------
    list
        The same list object with every element cleaned. A non-list input
        (e.g. NaN or None from a missing cell) yields a new empty list so the
        column keeps holding lists for the later stages.

    Notes
    -----
    The previous version wrapped the loop in a bare ``except:`` that printed the
    value and implicitly returned None, which silently put None into the column
    and made the following list-based steps fail. Errors raised by
    ``preprocessor.clean`` now propagate instead of being swallowed.
    """
    if not isinstance(lista, list):
        return []
    # Slice assignment keeps the in-place contract and leaves the list untouched
    # if cleaning any element raises.
    lista[:] = [preprocessor.clean(ele) for ele in lista]
    return lista

# Clean every hashtag and keyword word of both frames.
for frame in (train, test):
    for column in ('hashtag', 'keyword'):
        frame[column] = frame[column].apply(clear_list)

test.head()

**4. recover abbreviations (change they'll to they will, etc)**

I copied the code from the following URL (answer by Yann Dubois) <br>
https://stackoverflow.com/questions/43018030/replace-apostrophe-short-words-in-python

In [None]:
def decontracted(phrase):
    """Expand English contractions in `phrase` (e.g. "they'll" -> "they will").

    The rules are applied in order: the two irregular forms first, then the
    generic suffixes. The order matters because "n't" must be handled before
    the shorter "'t" rule.
    """
    rules = (
        # specific
        (r"won\'t", "will not"),
        (r"can\'t", "can not"),
        # general
        (r"n\'t", " not"),
        (r"\'re", " are"),
        (r"\'s", " is"),
        (r"\'d", " would"),
        (r"\'ll", " will"),
        (r"\'t", " not"),
        (r"\'ve", " have"),
        (r"\'m", " am"),
    )
    for pattern, expansion in rules:
        phrase = re.sub(pattern, expansion, phrase)
    return phrase

# Expand contractions in every tweet of both frames.
for frame in (train, test):
    frame['text'] = frame['text'].apply(decontracted)

**5. remove punctuations**

In [None]:
import re

def remove_punc(lista):
    """Strip punctuation from every string of `lista`, in place, and return it.

    Characters that are neither word characters nor whitespace are deleted;
    underscores (which count as word characters) are then turned into spaces.
    """
    lista[:] = [
        re.sub('_', ' ', re.sub(r'[^\w\s]', '', word))
        for word in lista
    ]
    return lista

# Remove punctuation from the tweet text and from each hashtag / keyword word.
for frame in (train, test):
    # Two passes on the text: drop non-word/non-space characters, then turn
    # underscores (not matched by the first pattern) into spaces.
    frame['text'] = frame['text'].apply(
        lambda x: re.sub('_', ' ', re.sub(r'[^\w\s]', '', x))
    )
    for column in ('hashtag', 'keyword'):
        frame[column] = frame[column].apply(remove_punc)

test.head()

**6. tokenization**

In [None]:
import nltk

# Split each tweet string into a list of tokens with NLTK.
for frame in (train, test):
    frame['text'] = frame['text'].apply(nltk.word_tokenize)

test.head(10)

**7. remove stop words**

In [None]:
from nltk.corpus import stopwords
# Start from NLTK's English stop words and add a few project-specific ones.
stop_words = stopwords.words('english')
stop_words.extend([
    'u',  # 'i love u' is semantically the same as 'i love you'
    # spelled-out numbers are removed as well
    'one', 'two', 'three', 'four', 'five',
    'six', 'seven', 'eight', 'nine', 'ten',
])

def remove_stop_words(lista):
    """Drop every element of `lista` found in `stop_words`, in place.

    The same list object is returned, with the remaining elements in their
    original order.
    """
    # Slice assignment rewrites the existing list instead of creating a new one.
    lista[:] = [token for token in lista if token not in stop_words]
    return lista

# Filter stop words out of all three list columns of both frames.
for frame in (train, test):
    for column in ('text', 'hashtag', 'keyword'):
        frame[column] = frame[column].apply(remove_stop_words)

**8. lemmatization. ('us' is lemmatized to 'u')** 

In [None]:
from nltk import WordNetLemmatizer 

In [None]:
# Quick check of what the WordNet lemmatizer returns for 'us' (see the heading above).
WordNetLemmatizer().lemmatize('us')

In [None]:
def lemmatize_list(lista):
    """Lemmatize every word of `lista` in place with NLTK's WordNetLemmatizer.

    Parameters
    ----------
    lista : list of str
        Tokens of one row.

    Returns
    -------
    list
        The same list object, each element replaced by its lemma.
    """
    if len(lista) == 0:
        return lista
    # Build the lemmatizer once per call instead of once per token.
    lemmatizer = WordNetLemmatizer()
    for i, ele in enumerate(lista):
        lista[i] = lemmatizer.lemmatize(ele)
    return lista

# Lemmatize all three list columns of both frames.
for frame in (train, test):
    for column in ('text', 'hashtag', 'keyword'):
        frame[column] = frame[column].apply(lemmatize_list)

test.head()

**9. save the preprocessed files**

In [None]:
# Write both preprocessed frames to CSV files in the working directory.
for frame, csv_name in ((train, 'preprocess_train.csv'), (test, 'preprocess_test.csv')):
    frame.to_csv(csv_name)

**10. find maximum tweet length, maximum hashtag length, maximum keywords length**

In [None]:
# Longest token list over all train and test tweets (0 when there are none).
maxi = max(
    (len(tokens) for column in (train['text'], test['text']) for tokens in column),
    default=0,
)

maxi_text = maxi
maxi_text

In [None]:
# Largest number of hashtags on any train or test tweet (0 when there are none).
maxi = max(
    (len(tags) for column in (train['hashtag'], test['hashtag']) for tags in column),
    default=0,
)

maxi_hashtag = maxi
maxi_hashtag

In [None]:
# Largest number of keyword words on any train or test row (0 when there are none).
maxi = max(
    (len(words) for column in (train['keyword'], test['keyword']) for words in column),
    default=0,
)

maxi_keyword = maxi
maxi_keyword

**11. install and understand word embedding**

In [None]:
!pip3 install spacy
!python3 -m spacy download en_core_web_lg

In [None]:
import spacy
# Load the spaCy pipeline downloaded in the previous cell; `nlp` is used below
# to look up a vector for each word.
nlp = spacy.load("en_core_web_lg")

In [None]:
import math
def distance(vec1,vec2):
    """Return the Euclidean distance between two equal-length vectors.

    Elements are compared index by index over the length of `vec1`.
    """
    squared_total = sum((vec1[i] - vec2[i]) ** 2 for i in range(len(vec1)))
    return math.sqrt(squared_total)

In [None]:
# Two space-separated words whose vectors are compared in the cells below;
# edit the string to try another pair.
doc = nlp("father grandfather") # change your words here 

In [None]:
# Shape of the first token's embedding vector.
doc[0].vector.shape

In [None]:
# Compare the two tokens of `doc` with two measures.
first_token, second_token = doc[0], doc[1]
print(distance(first_token.vector, second_token.vector))  # the smaller the more similar
print(first_token.similarity(second_token))  # the larger the more similar

**12. use word embedding and create training set, the training set has dimension (m,23+13+2,300)**

In [None]:
# Build the training tensor: one row per tweet, one slot per word, one 300-d
# vector per slot. Slots along axis 1 are laid out as
#   [ text tokens | hashtags | keyword words ]
# and each group is zero-padded to the maximum length computed earlier
# (maxi_text, maxi_hashtag, maxi_keyword). These used to be hard-coded as
# 23 + 13 + 2 with offsets 23 and 36, which could drift from the computed maxima
# that the later slicing cell relies on.
m = train.shape[0]
hashtag_offset = maxi_text
keyword_offset = maxi_text + maxi_hashtag
store_train = np.zeros((m, maxi_text + maxi_hashtag + maxi_keyword, 300))
for i in range(m):
    if i % 100 == 99:
        print(i)  # progress indicator
    for offset, column in ((0, 'text'), (hashtag_offset, 'hashtag'), (keyword_offset, 'keyword')):
        for j, word in enumerate(train[column].iloc[i]):
            parsed = nlp(word)
            # An empty string (e.g. hashtag [''] after cleaning) gives a Doc with no
            # tokens; fall back to the Doc-level vector instead of indexing token 0.
            # This replaces the former bare `except:` and now covers all three columns.
            store_train[i, offset + j, :] = parsed[0].vector if len(parsed) else parsed.vector

In [None]:
# Persist the training tensor; a later cell reloads it from this file.
np.save('store_train.npy',store_train)

In [None]:
# Build the test tensor with the same layout as the training tensor:
#   [ text tokens | hashtags | keyword words ] along axis 1,
# each group zero-padded to maxi_text / maxi_hashtag / maxi_keyword (computed
# earlier over train and test together). These used to be hard-coded as
# 23 + 13 + 2 with offsets 23 and 36.
m = test.shape[0]
hashtag_offset = maxi_text
keyword_offset = maxi_text + maxi_hashtag
store_test = np.zeros((m, maxi_text + maxi_hashtag + maxi_keyword, 300))
for i in range(m):
    if i % 100 == 99:
        print(i)  # progress indicator
    for offset, column in ((0, 'text'), (hashtag_offset, 'hashtag'), (keyword_offset, 'keyword')):
        for j, word in enumerate(test[column].iloc[i]):
            parsed = nlp(word)
            # An empty string (e.g. hashtag [''] after cleaning) gives a Doc with no
            # tokens; fall back to the Doc-level vector instead of indexing token 0.
            # This replaces the former bare `except:` and now covers all three columns.
            store_test[i, offset + j, :] = parsed[0].vector if len(parsed) else parsed.vector

In [None]:
# Persist the test tensor to the working directory.
np.save('store_test.npy',store_test)

**13. slice the train and test sets**

In [None]:
# Split the training tensor along axis 1 into its text / hashtag / keyword groups.
hashtag_end = maxi_text + maxi_hashtag
store_train_text = store_train[:, :maxi_text]
store_train_hashtag = store_train[:, maxi_text:hashtag_end]
store_train_keyword = store_train[:, -maxi_keyword:]

In [None]:
# Show the shape of each training slice.
for train_slice in (store_train_text, store_train_hashtag, store_train_keyword):
    print(train_slice.shape)

In [None]:
# Split the TEST tensor along axis 1 into its text / hashtag / keyword groups.
# These were previously sliced from `store_train` (copy-paste slip), so the
# "test" slices actually held training data.
store_test_text = store_test[:, :maxi_text, :]
store_test_hashtag = store_test[:, maxi_text:maxi_text + maxi_hashtag, :]
store_test_keyword = store_test[:, -maxi_keyword:, :]

**14. build the model**

In [None]:
import tensorflow as tf
import keras.backend as K
from keras.layers import Input, Dropout, GRU, BatchNormalization, TimeDistributed, Reshape, Dense, Conv1D, Concatenate
from keras import Model
import keras

In [None]:
# input_text=Input(shape=(store_train_text.shape[1],store_train_text.shape[2]))
# input_hashtag=Input(shape=(store_train_hashtag.shape[1],store_train_hashtag.shape[2]))
# input_keyword=Input(shape=(store_train_keyword.shape[1],store_train_keyword.shape[2]))

# mid1=GRU(units=128, return_sequences=True)(input_text)
# mid1=Dropout(0.8)(mid1)
# mid1=BatchNormalization()(mid1)  

# mid1=GRU(units=16, return_sequences=True)(mid1)
# mid1=Dropout(0.8)(mid1)
# mid1=BatchNormalization()(mid1)  

# mid1=GRU(units=1, return_sequences=False)(mid1)
# #mid1=Dropout(0.8)(mid1)
# #mid1=BatchNormalization()(mid1)
# print(mid1.shape)

# # mid1=Dropout(0.8)(mid1)
# # mid1=TimeDistributed(Dense(1, activation = "relu"))(mid1)
# # mid1=Reshape((mid1.shape[1],))(mid1)
# # # mid1 has shape (m,23)

# mid2=TimeDistributed(Dense(128, activation = "relu"))(input_hashtag)
# # kernel_size=1 makes the conv1d the same as TimeDistributed(Dense)
# print(mid2.shape)
# mid2=Conv1D(1, kernel_size=1, strides=1, padding='valid')(mid2)
# mid2=Reshape((mid2.shape[1],))(mid2)
# # now mid2 has shape (m,13)
# #print(mid2.shape)

# # mid3=Conv1D(30, kernel_size=1, strides=1, padding='valid')(input_keyword)
# # # kernel_size=1 makes the conv1d the same as TimeDistributed(Dense)
# # mid3=Conv1D(1, kernel_size=1, strides=1, padding='valid')(mid3)
# # mid3=Reshape((mid3.shape[1],))(mid3)

# # mid=Concatenate(axis=-1)([mid1,mid2,mid3])
# # #print(mid1.shape,mid2.shape,mid3.shape)
# # output=Dense(2, activation="softmax")(mid)
# # #print(output.shape)

# # model=Model(inputs=[input_text,input_hashtag, input_keyword], outputs=outputs)
# # model.compile(optimizer=keras.optimizers.Adam(learning_rate=0.01), loss='categorical_crossentropy')

In [None]:
# The network consumes the whole (slots, embedding_dim) tensor of one tweet.
inp = Input(shape=(store_train.shape[1], store_train_text.shape[2]))

# Three identical recurrent stages: GRU -> Dropout -> BatchNormalization.
mid = inp
for stage in range(3):
    mid = GRU(units=300, return_sequences=True)(mid)
    mid = Dropout(0.6)(mid)
    mid = BatchNormalization()(mid)

# Reduce every timestep to a single ReLU unit, flatten to one value per slot,
# then classify into two classes with a softmax layer.
mid = Dropout(0.6)(mid)
mid = TimeDistributed(Dense(1, activation='relu'))(mid)
mid = Reshape((mid.shape[1],))(mid)
mid = Dropout(0.6)(mid)
mid = BatchNormalization()(mid)
outp = Dense(2, activation='softmax')(mid)

model = Model(inputs=inp, outputs=outp)

**15. create labels for the training sets**

understand  np.random.shuffle and np.random.seed

In [None]:
# Demo: re-seeding with the same value before each shuffle gives two arrays of
# equal length the same permutation, so rows stay aligned.
lis1 = np.array([[k, k] for k in range(1, 10)])
lis2 = np.arange(1, 10)
for demo_array in (lis1, lis2):
    np.random.seed(3)
    np.random.shuffle(demo_array)
print(lis1)
print(lis2)

In [None]:
# Reload the (unshuffled) training tensor saved earlier.
store_train = np.load('store_train.npy')

# One-hot encode the target: row i gets a 1 in column train['target'][i].
m = store_train.shape[0]
train_Y = np.zeros((m, 2))
train_Y[np.arange(m), train['target'].to_numpy()[:m]] = 1

In [None]:
# Shuffle features and labels in place with the same seed so that the rows of
# both arrays receive the same permutation.
sed = 13
for aligned_array in (store_train, train_Y):
    np.random.seed(sed)
    np.random.shuffle(aligned_array)

In [None]:
# Adam optimizer with categorical cross-entropy on the two-column one-hot labels.
# `metrics` is passed as a list: a bare string is not the documented form and is
# rejected by newer Keras releases.
model.compile(
    optimizer=keras.optimizers.Adam(learning_rate=0.001),
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Train on everything except the last 500 (shuffled) rows, which are held out.
model.fit(
    store_train[:-500],
    train_Y[:-500],
    batch_size=64,
    epochs=50,
    verbose=1,
)

**16. evaluate on the held-out validation set (the last 500 shuffled training examples)**

In [None]:
# Score the model on the 500 rows that were excluded from fitting.
model.evaluate(store_train[-500:], train_Y[-500:])

**17. predict the test set**

In [None]:
# Predicted class probabilities for the test tensor, one row per tweet.
test_Y = model.predict(store_test)

# Label a tweet 1 when the second softmax output reaches 0.5, otherwise 0.
test_label = [1 if prob_class1 >= 0.5 else 0 for prob_class1 in test_Y[:, 1]]

**18. submit**

In [None]:
# Assemble the id/target table, preview it, and write the submission file.
filename = 'submission_nlp_tweets.csv'

submission = pd.DataFrame({'id': test['id'], 'target': test_label})
print(submission.head(10))

submission.to_csv(filename, index=False)