In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for path in (os.path.join(root, name) for name in names):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

import tensorflow as tf
from tensorflow import keras

from sklearn.model_selection import train_test_split

import re
from nltk.corpus import stopwords

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

[1] Load dataset, both train & test

In [None]:
# Read the competition CSVs; the two reads are independent of each other.
df_train = pd.read_csv("../input/nlp-getting-started/train.csv")
df_test = pd.read_csv("../input/nlp-getting-started/test.csv")

# Preview the first rows of the training frame.
df_train.head()

In [None]:
df_test.head()

[2] Preprocessing

[2-1] Drop unnecessary columns

In [None]:
# Only `id`, `text` and (for train) `target` are used in the rest of the notebook.
# Rebinding with errors='ignore' keeps this cell idempotent: the original
# in-place drop raised KeyError when the cell was executed a second time.
unused_columns = ['keyword','location']
df_train = df_train.drop(columns=unused_columns, errors='ignore')
df_test = df_test.drop(columns=unused_columns, errors='ignore')

print(f"train shape >> {df_train.shape}")
print(f"test shape >> {df_test.shape}")


df_train.head()

[2-2] Check if imbalanced

In [None]:
# Bar chart of how many training rows fall into each target class.
sns.set_style('darkgrid')
sns.countplot(x=df_train['target'])
plt.show()

-> The two classes look reasonably balanced, so no resampling is applied.

[2-3] Text Preprocessing

(1) Remove `#` characters, @-mentions, URLs, short words (1 or 2 characters long), and stopwords

In [None]:
df_train.text.head()

In [None]:
# Patterns used by the tweet cleaning step.
# Handles may contain underscores, so `_` is part of the mention pattern;
# without it "@user_name" would leave "_name" behind.
at_chunk = re.compile(r"@[a-zA-Z0-9_]*")
# Consume the URL up to the next whitespace so characters such as '-', '_',
# '?', '=' or '%' do not leave URL fragments in the text.
url = re.compile(r"https?:/+\S*")
# Whole words made of one or two word characters.
shortword = re.compile(r"\b\w{1,2}\b")

stop_words = set(stopwords.words('english'))

def clean(text):
    """Strip tweet noise from `text` and return the remaining words joined by single spaces.

    Removes, in this order: '#' characters, @-mentions, URLs, words of one or
    two characters, and English stop words.

    Parameters
    ----------
    text : str
        Raw tweet text.

    Returns
    -------
    str
        The cleaned text; an empty string if nothing is left.
    """
    text = text.replace('#', '')
    text = at_chunk.sub('', text)
    text = url.sub('', text)
    text = shortword.sub('', text)

    # The NLTK stop-word list is lower-case, so compare on the lower-cased
    # word; otherwise capitalised stop words ("The", "And") slip through.
    words = [w for w in text.split() if w.lower() not in stop_words]

    # split() + join() already collapses whitespace, so no strip() is needed.
    return " ".join(words)

# Clean the tweet text of both frames with the same function.
df_train['text'] = df_train['text'].apply(clean)
df_test['text'] = df_test['text'].apply(clean)

In [None]:
df_train.text.head()

(2) Check NULL

In [None]:
df_train.text.isnull().any()

(3) Tokenizer

In [None]:
# Columns that feed the models.
train_label = df_train.target
train_text = df_train.text
test_text = df_test.text

# The vocabulary is fitted on the training tweets only; the test tweets are
# encoded with that same word index.
tok = Tokenizer()
tok.fit_on_texts(train_text)

# One extra slot on top of the word count: index 0 is never assigned to a word.
word_size = len(tok.word_index)
vocab_size = word_size + 1

print(f"{word_size} words are used!")

print("Tokenizing train texts\n")
train_text = tok.texts_to_sequences(train_text)
print("Tokenizing train texts finished!\n")

print("Tokenizng test texts with the same tokenizer\n")
test_text = tok.texts_to_sequences(test_text)
print("Tokenizing test texts finished!\n")

(4) Padding

In [None]:
# Token count of every encoded training tweet.
lengths = list(map(len, train_text))

longest = np.max(lengths)
average = int(np.round(np.mean(lengths)))
print(f"Max of sequence size >> {longest}")
print(f"Average of sequence size >> {average}")

# Distribution of the sequence lengths.
plt.hist(lengths, bins=100)
plt.show()

# Fixed length every sequence is padded / truncated to in the next cell.
sequence_size = 21

In [None]:
# Same padding settings for both splits: pad and cut at the end of the sequence.
pad_options = dict(maxlen=sequence_size, padding='post', truncating='post')

train_text = pad_sequences(train_text, **pad_options)
test_text = pad_sequences(test_text, **pad_options)

print(f"train text shape >> {train_text.shape}")
print(f"test text shape >> {test_text.shape}")

In [None]:
# Names used by the training and submission cells for the padded sequences.
train_data, test_data = train_text, test_text

[3] Modelling

(1-1) Bidirectional stacked LSTM without pre-trained GloVe embedding vectors: use only the last hidden state of the final LSTM layer

In [None]:
from keras.layers import Input,Embedding,Bidirectional,LSTM,TimeDistributed,Dense,Dropout,BatchNormalization,GlobalMaxPool1D,GlobalAveragePooling1D
from keras.utils import plot_model
from keras.models import Model

# vocab_size and sequence_size come from the tokenizing / padding cells.
word_vec_size = 128  # size of the trainable word embeddings
hidden_size = 128    # units of each LSTM wrapped by Bidirectional

def create_lstm1():
    """Build and compile a two-layer bidirectional LSTM binary classifier.

    The second Bi-LSTM uses return_sequences=False, so only its final output
    is passed to the dense head. The embedding is trained from scratch.

    Returns
    -------
    keras Model compiled with binary cross-entropy, Adam and accuracy.
    """
    inputs = Input(shape=[sequence_size])

    layer_stack = [
        Embedding(vocab_size, word_vec_size, input_length=sequence_size, mask_zero=True),
        Dropout(0.1),
        Bidirectional(LSTM(hidden_size, return_sequences=True)),
        Dropout(0.1),
        Bidirectional(LSTM(hidden_size, return_sequences=False)),
        BatchNormalization(),
        Dense(32, activation='relu'),
        BatchNormalization(),
        Dense(1, activation='sigmoid'),
    ]

    # Chain the layers in the order listed above.
    outputs = inputs
    for layer in layer_stack:
        outputs = layer(outputs)

    model = Model(inputs, outputs)
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

    return model

In [None]:
# Train on the padded tweets; validation_split=0.1 holds out a tenth of the rows.
lstm1 = create_lstm1()
hist = lstm1.fit(
    train_data,
    train_label,
    batch_size=32,
    epochs=10,
    validation_split=0.1,
)

In [None]:
plot_model(lstm1)

(1-2) Bidirectional stacked LSTM without pre-trained embedding vectors: return the hidden states of every time step and apply global average pooling

In [None]:
def create_lstm2():
    """Build and compile a two-layer bidirectional LSTM classifier with average pooling.

    Both Bi-LSTM layers return their full output sequence; the sequence is
    reduced with GlobalAveragePooling1D before the dense head. The embedding
    is trained from scratch.

    Returns
    -------
    keras Model compiled with binary cross-entropy, Adam and accuracy.
    """
    inputs = Input(shape=[sequence_size])

    # Embedding + two stacked Bi-LSTMs, with light dropout before each LSTM.
    embedded = Embedding(vocab_size, word_vec_size, input_length=sequence_size, mask_zero=True)(inputs)
    embedded = Dropout(0.1)(embedded)
    first_pass = Bidirectional(LSTM(hidden_size, return_sequences=True))(embedded)
    first_pass = Dropout(0.1)(first_pass)
    second_pass = Bidirectional(LSTM(hidden_size, return_sequences=True))(first_pass)

    # Collapse the time axis, then the dense head.
    pooled = GlobalAveragePooling1D()(second_pass)
    pooled = BatchNormalization()(pooled)
    dense = Dense(32, activation='relu')(pooled)
    dense = BatchNormalization()(dense)

    outputs = Dense(1, activation='sigmoid')(dense)

    model = Model(inputs, outputs)
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

    return model

In [None]:
# Train the average-pooling Bi-LSTM with the same settings as lstm1.
lstm2 = create_lstm2()
hist = lstm2.fit(
    train_data,
    train_label,
    batch_size=32,
    epochs=10,
    validation_split=0.1,
)

In [None]:
plot_model(lstm2)

(1-3) Bidirectional stacked LSTM without pre-trained embedding vectors: return the hidden states of every time step and apply global max pooling

In [None]:
def create_lstm3():
    """Build and compile a two-layer bidirectional LSTM classifier with max pooling.

    Both Bi-LSTM layers return their full output sequence; the sequence is
    reduced with GlobalMaxPool1D before the dense head. The embedding is
    trained from scratch.

    Returns
    -------
    keras Model compiled with binary cross-entropy, Adam and accuracy.
    """
    inputs = Input(shape=[sequence_size])

    # Embedding + two stacked Bi-LSTMs, with light dropout before each LSTM.
    embedded = Embedding(vocab_size, word_vec_size, input_length=sequence_size, mask_zero=True)(inputs)
    embedded = Dropout(0.1)(embedded)
    first_pass = Bidirectional(LSTM(hidden_size, return_sequences=True))(embedded)
    first_pass = Dropout(0.1)(first_pass)
    second_pass = Bidirectional(LSTM(hidden_size, return_sequences=True))(first_pass)

    # Keep the per-feature maximum over time, then the dense head.
    pooled = GlobalMaxPool1D()(second_pass)
    pooled = BatchNormalization()(pooled)
    dense = Dense(32, activation='relu')(pooled)
    dense = BatchNormalization()(dense)

    outputs = Dense(1, activation='sigmoid')(dense)

    model = Model(inputs, outputs)
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

    return model

In [None]:
# Train the max-pooling Bi-LSTM (7 epochs here).
lstm3 = create_lstm3()
hist = lstm3.fit(
    train_data,
    train_label,
    batch_size=32,
    epochs=7,
    validation_split=0.1,
)

In [None]:
plot_model(lstm3)

(2-1) Bidirectional Stacked LSTM model with pre-trained Embedding vectors

In [None]:
import os

!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip glove*.zip

In [None]:
# Parse the 200-dimensional GloVe file into {word: float32 vector}.
embedding_dict = dict()

# `with` closes the file even if a line fails to parse.
with open('glove.6B.200d.txt', encoding='utf-8') as f:
    for line in f:
        tokens = line.split()
        # First token is the word, the rest are the vector components.
        embedding_dict[tokens[0]] = np.asarray(tokens[1:], dtype='float32')

# Read the dimensionality from any stored vector instead of relying on one
# particular word being present in the file.
embedding_size = len(next(iter(embedding_dict.values())))
print(f"There are {len(embedding_dict)} embedding vectors in total")
print(f"The size of embedding vector here >> {embedding_size}")

# Row i holds the GloVe vector of the word the tokenizer mapped to index i.
# Rows for words that GloVe does not contain stay all-zero.
embedding_matrix = np.zeros((vocab_size, embedding_size))
for word, idx in tok.word_index.items():
    vector = embedding_dict.get(word)
    if vector is not None:
        embedding_matrix[idx] = vector

In [None]:
# embedding_size and embedding_matrix come from the GloVe loading cell.
def create_lstm_glove1():
    """Build and compile a three-layer bidirectional LSTM classifier on frozen GloVe vectors.

    The embedding layer is initialised from `embedding_matrix` and is not
    trained. All three Bi-LSTM layers return sequences; the result is reduced
    with GlobalMaxPool1D before the dense head.

    Returns
    -------
    keras Model compiled with binary cross-entropy, Adam and accuracy.
    """
    inputs = Input(shape=[sequence_size])

    layer_stack = [
        Embedding(vocab_size, embedding_size, input_length=sequence_size,
                  weights=[embedding_matrix], trainable=False, mask_zero=True),
        Dropout(0.2),
        Bidirectional(LSTM(hidden_size, return_sequences=True)),
        Dropout(0.1),
        Bidirectional(LSTM(hidden_size, return_sequences=True)),
        Dropout(0.1),
        Bidirectional(LSTM(hidden_size, return_sequences=True)),
        GlobalMaxPool1D(),
        BatchNormalization(),
        Dense(32, activation='relu'),
        BatchNormalization(),
        Dense(1, activation='sigmoid'),
    ]

    # Chain the layers in the order listed above.
    outputs = inputs
    for layer in layer_stack:
        outputs = layer(outputs)

    model = Model(inputs, outputs)
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

    return model

In [None]:
# Train the GloVe + max-pooling Bi-LSTM (6 epochs here).
lstm_glove1 = create_lstm_glove1()
hist = lstm_glove1.fit(
    train_data,
    train_label,
    batch_size=32,
    epochs=6,
    validation_split=0.1,
)

In [None]:
plot_model(lstm_glove1)

In [None]:
# embedding_size and embedding_matrix come from the GloVe loading cell.
def create_lstm_glove2():
    """Build and compile a three-layer bidirectional LSTM classifier on frozen GloVe vectors.

    The embedding layer is initialised from `embedding_matrix` and is not
    trained. All three Bi-LSTM layers return sequences; the result is reduced
    with GlobalAveragePooling1D before the dense head.

    Returns
    -------
    keras Model compiled with binary cross-entropy, Adam and accuracy.
    """
    inputs = Input(shape=[sequence_size])

    layer_stack = [
        Embedding(vocab_size, embedding_size, input_length=sequence_size,
                  weights=[embedding_matrix], trainable=False, mask_zero=True),
        Dropout(0.2),
        Bidirectional(LSTM(hidden_size, return_sequences=True)),
        Dropout(0.1),
        Bidirectional(LSTM(hidden_size, return_sequences=True)),
        Dropout(0.1),
        Bidirectional(LSTM(hidden_size, return_sequences=True)),
        GlobalAveragePooling1D(),
        BatchNormalization(),
        Dense(32, activation='relu'),
        BatchNormalization(),
        Dense(1, activation='sigmoid'),
    ]

    # Chain the layers in the order listed above.
    outputs = inputs
    for layer in layer_stack:
        outputs = layer(outputs)

    model = Model(inputs, outputs)
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

    return model

In [None]:
# Train the GloVe + average-pooling Bi-LSTM (7 epochs here).
lstm_glove2 = create_lstm_glove2()
hist = lstm_glove2.fit(
    train_data,
    train_label,
    batch_size=32,
    epochs=7,
    validation_split=0.1,
)

In [None]:
plot_model(lstm_glove2)

(3-1) Multi-kernel Conv1D model using pre-trained Embedding vectors

In [None]:
from keras.layers import Conv1D,Concatenate,LeakyReLU,Flatten

def create_conv1():
    """Build and compile a multi-kernel Conv1D binary classifier on GloVe vectors.

    The embedding is initialised from `embedding_matrix` and fine-tuned
    (trainable=True). Four parallel Conv1D branches (kernel sizes 3-6) are
    each max-pooled over time, concatenated, and fed to a dense head.

    Returns
    -------
    keras Model compiled with binary cross-entropy, Adam and accuracy.
    """
    X = Input(shape=[sequence_size])

    # mask_zero is intentionally not set: Conv1D does not support masking, and
    # the Keras Embedding docs require every downstream layer to support it
    # when mask_zero=True.
    H = Embedding(vocab_size,embedding_size,input_length=sequence_size,weights=[embedding_matrix],trainable=True)(X)
    H = Dropout(0.1)(H)

    kernel_filters=[256,256,128,128]
    kernel_sizes=[3,4,5,6]

    conv_blocks=[]
    for n_filters,kernel_size in zip(kernel_filters,kernel_sizes):
        conv = Conv1D(filters=n_filters,kernel_size=kernel_size)(H)
        # Global pooling already removes the time axis, so no Flatten is needed.
        conv_blocks.append(GlobalMaxPool1D()(conv))

    H = Concatenate()(conv_blocks)
    H = Dropout(0.1)(H)

    H = Dense(256)(H)
    H = BatchNormalization()(H)
    H = LeakyReLU()(H)

    H = Dense(32)(H)
    H = BatchNormalization()(H)
    H = LeakyReLU()(H)

    Y = Dense(1,activation='sigmoid')(H)

    model = Model(X,Y)
    model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])

    return model

In [None]:
# Train the multi-kernel Conv1D model (7 epochs here).
conv1 = create_conv1()
hist = conv1.fit(
    train_data,
    train_label,
    batch_size=32,
    epochs=7,
    validation_split=0.1,
)

In [None]:
plot_model(conv1)

(3-2) Multi-kernel Stacked Conv1D model using pre-trained Embedding vectors

In [None]:
def create_conv2():
    """Build and compile a multi-kernel, two-deep Conv1D binary classifier on GloVe vectors.

    Same layout as `create_conv1`, except that every branch applies two
    Conv1D layers with the same filter count and kernel size before pooling.

    Returns
    -------
    keras Model compiled with binary cross-entropy, Adam and accuracy.
    """
    X = Input(shape=[sequence_size])

    # mask_zero is intentionally not set: Conv1D does not support masking, and
    # the Keras Embedding docs require every downstream layer to support it
    # when mask_zero=True.
    H = Embedding(vocab_size,embedding_size,input_length=sequence_size,weights=[embedding_matrix],trainable=True)(X)
    H = Dropout(0.1)(H)

    kernel_filters=[256,256,128,128]
    kernel_sizes=[3,4,5,6]

    conv_blocks=[]
    for n_filters,kernel_size in zip(kernel_filters,kernel_sizes):
        # NOTE(review): neither Conv1D sets an activation, so the two stacked
        # convolutions have no non-linearity between them - confirm intended.
        conv = Conv1D(filters=n_filters,kernel_size=kernel_size)(H)
        conv = Conv1D(filters=n_filters,kernel_size=kernel_size)(conv)
        # Global pooling already removes the time axis, so no Flatten is needed.
        conv_blocks.append(GlobalMaxPool1D()(conv))

    H = Concatenate()(conv_blocks)
    H = Dropout(0.1)(H)

    H = Dense(256)(H)
    H = BatchNormalization()(H)
    H = LeakyReLU()(H)

    H = Dense(32)(H)
    H = BatchNormalization()(H)
    H = LeakyReLU()(H)

    Y = Dense(1,activation='sigmoid')(H)

    model = Model(X,Y)
    model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])

    return model

In [None]:
# Train the stacked multi-kernel Conv1D model (7 epochs here).
conv2 = create_conv2()
hist = conv2.fit(
    train_data,
    train_label,
    batch_size=32,
    epochs=7,
    validation_split=0.1,
)

In [None]:
plot_model(conv2)

(4) BERT

(5) Submission

In [None]:
#lstm2, lstm_glove1, conv1
#test_data

# Ids of the test rows, written next to the predictions in each submission.
test_id = df_test['id']

def get_submission(model,filename,data=None,ids=None,threshold=0.5):
    """Predict with `model`, binarise the scores and write `<filename>.csv`.

    Parameters
    ----------
    model : object with a `predict(data)` method returning one score per row.
    filename : str
        Output path without the ".csv" suffix.
    data : array-like, optional
        Model input. Defaults to the notebook-level `test_data`.
    ids : sequence, optional
        Row ids for the `id` column. Defaults to the notebook-level `test_id`.
    threshold : float, optional
        Scores strictly greater than this become 1, all others 0.

    Returns
    -------
    pandas.DataFrame
        The submission frame with columns `id` and `target`.
    """
    # Fall back to the notebook globals so existing two-argument calls keep working.
    if data is None:
        data = test_data
    if ids is None:
        ids = test_id

    pred = np.asarray(model.predict(data)).reshape(-1)
    submission = pd.DataFrame({
        'id':ids,
        # Vectorised threshold instead of a per-row Python lambda.
        'target':(pred > threshold).astype(int)
    })
    print("Making submission DataFrame Finished!")
    submission.to_csv(filename+".csv",index=False)
    print("Making CSV file Finished!\n\n")
    return submission

In [None]:
# Write one submission file per selected model.
for _model, _name in ((lstm2,"lstm2"), (lstm_glove1,"lstm_glove1"), (conv1,"conv1")):
    get_submission(_model, _name)

print(0)