In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory recursively and print the full path of every file in it.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Load the competition's training and test CSV files into DataFrames.
train_data = pd.read_csv('/kaggle/input/nlp-getting-started/train.csv')
test_data = pd.read_csv('/kaggle/input/nlp-getting-started/test.csv')

## 1. Investigate the train dataset to understand what kind of data it contains and which parts of it can be useful

### a) train and test analysis

In [None]:
# (rows, columns) of the train frame and of the test frame
train_data.shape, test_data.shape

In [None]:
# first rows of the training data
train_data.head()

In [None]:
# last rows of the training data
train_data.tail()

In [None]:
# summary statistics of the training data
train_data.describe()

In [None]:
# data type of every column
train_data.dtypes

In [None]:
# number of rows that are exact duplicates of an earlier row
train_data.duplicated().sum()

In [None]:
# show a few rows whose keyword is not missing
train_data[train_data.keyword.notna()].head()

In [None]:
# what kinds of keywords are present?
# list all unique keyword values

train_data.keyword.unique()

In [None]:
# what percentage of tweets in the train dataset has a non-empty keyword?
print((train_data[train_data.keyword.notna()].shape[0]/train_data.shape[0]) * 100, '%')

# almost every record has a keyword, so it may help to make a better prediction

In [None]:
# what percentage of tweets in the train dataset has a non-empty location?
print((train_data[train_data.location.notna()].shape[0]/train_data.shape[0]) * 100, '%')

In [None]:
# the 'location' feature has a significant amount of missing values
# let's check what kind of values this feature contains (first 40 unique values)

train_data.location.unique()[:40]

The 'location' feature does not seem very informative, so for now I'm going to focus on the 'text' and 'keyword' features.

### b) Target value

In [None]:
# check whether the target classes are imbalanced: bar chart of the count of each target value

train_data.target.value_counts().plot.bar()

# author's observation from the chart: the target value is not skewed

## 2. Clean the data

In [None]:
# to lower
def to_lower(text):
    """Return `text` converted to lowercase (delegates to ``text.lower()``)."""
    return text.lower()

# quick demonstration on a sample sentence
example="Hi! My name is Pete."
to_lower(example)

In [None]:
#remove stopwords
from nltk.corpus import stopwords
stop = set(stopwords.words('english'))

def remove_stopwords(text):
    """Drop every whitespace-separated token of `text` that is in the `stop` set.

    The comparison is case-sensitive and on whole tokens; the surviving tokens
    are re-joined with single spaces.
    """
    kept_tokens = []
    for token in text.split():
        if token in stop:
            continue
        kept_tokens.append(token)
    return ' '.join(kept_tokens)

# quick demonstration; matching is case-sensitive against the stop-word set
example = 'I love my cat and dog. You should see them'
print(remove_stopwords(example))

In [None]:
# URL

import re
from urllib.parse import urlparse

def remove_url(text):
    """Return `text` with every URL removed.

    Uses the same pattern as `find_url` (``http://``/``https://`` links and
    bare ``www.`` links), so anything counted as a URL feature is also stripped
    from the cleaned text instead of surviving as stray 'www'/'com' tokens.
    Each URL extends up to the next whitespace character.
    """
    return re.sub(r'https?://\S+|www\.\S+', '', text)

def find_url(text):
    """Return the host names of all URLs in `text`, space-separated, or 'no' if none.

    URLs are ``http://``/``https://`` links or bare ``www.`` links, each running
    up to the next whitespace character.
    """
    hosts = []
    for match in re.finditer(r"https?://\S+|www\.\S+", text):
        url = match.group(0)
        # urlparse only fills `netloc` when the URL has a scheme or starts with
        # '//'; without this a bare 'www.site.com/x' yielded an empty host.
        if not url.startswith(('http://', 'https://')):
            url = '//' + url
        hosts.append(urlparse(url).netloc)
    return " ".join(hosts) or 'no'

    
# quick demonstration: strip the links, then list the hosts they point to
example = "New competition launched here :https://www.kaggle.com/c/nlp-getting-started or here: http://www.kaggle.com/c/nlp-getting-started"
print(remove_url(example))
print(find_url(example))

In [None]:
# HASHTAGS

def remove_hashtags(text):
    """Return `text` with every hashtag ('#' followed by word characters) deleted.

    The whole tag is removed, including the word after the '#'.
    """
    return re.sub(r'#\w+', r'', text)

def find_hashtags(text):
    """Return the hashtags in `text` without the leading '#', space-separated.

    Returns the string 'no' when the text contains no hashtag.
    """
    tags = re.findall(r"#(\w+)", text)
    if not tags:
        return 'no'
    return " ".join(tags)

# quick demonstration: a hashtag is matched even when glued to the preceding word
example = 'Hi#hhhh #ttttt'
print(remove_hashtags(example))
print(find_hashtags(example))

In [None]:
# MENTIONS

def remove_mentions(text):
    """Return `text` with every mention ('@' followed by word characters) deleted."""
    return re.sub(r'@\w+', r'', text)

def find_mentions(text):
    """Return the mentioned names in `text` without the leading '@', space-separated.

    Returns the string 'no' when the text contains no mention.
    """
    names = re.findall(r"@(\w+)", text)
    if not names:
        return 'no'
    return " ".join(names)

# quick demonstration on a tweet with a single mention
example = 'Hi @POTUS yeeeeeh'
print(remove_mentions(example))
print(find_mentions(example))

In [None]:
# replace every non-alphabetic character (punctuation, digits, emojis, whitespace ...) with a space
def remove_non_alphabetic(text):
    """Return `text` with each character outside a-z / A-Z replaced by one space.

    The replacement is one-for-one, so the length of the string is unchanged.
    """
    return re.sub(r'[^a-zA-Z]', r' ', text)

In [None]:
def process_text(text):
    """Run the full cleaning pipeline on one piece of text and return the result.

    Steps, in order: lowercase, strip URLs, strip hashtags, strip mentions,
    replace non-alphabetic characters with spaces, drop stop words. The order
    matters: URLs, hashtags and mentions must be removed while their marker
    characters are still present, before the non-alphabetic pass.
    """
    pipeline = (
        to_lower,
        remove_url,
        remove_hashtags,
        remove_mentions,
        remove_non_alphabetic,
        remove_stopwords,
    )
    for step in pipeline:
        text = step(text)
    return text

In [None]:
# Separate the label column from the features. `pop` removes 'target' from
# train_data in place, so re-running this cell on the same kernel raises KeyError.
y = train_data.pop('target')
y

In [None]:
# Stack train rows followed by test rows so both get the same preprocessing.
# NOTE: the original row labels are kept (no ignore_index), so index labels repeat
# between the two parts; later cells split the parts by position, not by label.
data = pd.concat([train_data, test_data])
print(data.shape)

In [None]:
# fill na keywords with 'na' label
# Assign the result back instead of calling fillna(..., inplace=True) on the
# attribute-accessed column: that form modifies an intermediate Series, emits a
# FutureWarning in pandas 2.x and leaves `data` unchanged under Copy-on-Write.
data['keyword'] = data['keyword'].fillna('na')
data.head()

In [None]:
# make sure both text columns hold plain strings before the string-cleaning functions are applied
data.keyword = data.keyword.astype(str)
data.text = data.text.astype(str)
data.head()

In [None]:
# run the cleaning pipeline on both text columns -> 'keyword_clean', 'text_clean'
columns = ['keyword', 'text']
for c in columns:
    data[c + '_clean'] = data[c].apply(process_text)

#add new features, extracted from the raw (uncleaned) tweet text
data['mentions'] = data.text.apply(find_mentions)
data['hashtags'] = data.text.apply(find_hashtags)
data['url'] = data.text.apply(find_url)
data.head(10)

In [None]:
#let's check some random tweets
import random

# generate 20 random row positions
# randrange excludes its upper bound, so every position is valid for .iloc;
# randint(0, n) is inclusive and could return n, which raises IndexError.
rand_idx = [random.randrange(data.shape[0]) for i in range(20)]
for i in rand_idx:
    print(i, data.text_clean.iloc[i])

### Word frequency

In [None]:
# get word frequency to check whether some strange outliers are presented
def create_corpus(column_series):
    """Return the values of `column_series` as a plain Python list (via ``.tolist()``)."""
    return column_series.tolist()

In [None]:
def _word_counts(sentences):
    """Return a dict mapping each whitespace-separated word to its number of occurrences."""
    counts = {}
    for sentence in sentences:
        for word in sentence.split():
            counts[word] = counts.get(word, 0) + 1
    return counts

# `data` holds the train rows followed by the test rows, and its index labels
# repeat between the two parts. Indexing it with the label-aligned mask `y == 1`
# therefore also picked test rows, labelled by whichever train row shares their
# index label. Restrict to the train part by position and use positional masks.
train_text_clean = data.text_clean.iloc[:len(y)]

corpus_disaster = create_corpus(train_text_clean[(y == 1).to_numpy()])
d_dis = _word_counts(corpus_disaster)

corpus_non_disaster = create_corpus(train_text_clean[(y == 0).to_numpy()])
d_non_dis = _word_counts(corpus_non_disaster)

# vocabulary size of each class
len(d_dis), len(d_non_dis)

In [None]:
# (word, count) pairs ordered by ascending count, so the most frequent words are at the end
sorted_d_dis = sorted(d_dis.items(), key = lambda x:x[1])
sorted_d_non_dis = sorted(d_non_dis.items(), key = lambda x:x[1])

In [None]:
# most frequent words: take the 30 highest counts per class and tag each frame with its class
df1 = pd.DataFrame(sorted_d_dis[-30:], columns = ['word', 'freq'])
df1['target'] = 1

df2 = pd.DataFrame(sorted_d_non_dis[-30:], columns = ['word', 'freq'])
df2['target'] = 0

In [None]:
import seaborn as sns
# one frame with the top words of both classes
df = pd.concat([df1, df2])

# horizontal bars: one row per word, coloured by class, ordered by descending frequency
plt.figure(figsize = (10, 20))
sns.barplot(y = "word", hue = "target", x = "freq", data=df.sort_values(by = 'freq', ascending=False), orient = 'h')
plt.show()

## 3. Preparing text corpus: sequences, OHE matrix, stemming, lemmatization

### a) Get data corpus/OHE matrix/sequences

In [None]:
# cleaned tweet texts (train rows followed by test rows) as a list of strings
corpus_text = create_corpus(data.text_clean)
corpus_text[0]

In [None]:
# remove all spaces in the cleaned keyword so that every keyword becomes a single token
corpus_keyword = create_corpus(data.keyword_clean.str.replace(" ", ""))
corpus_keyword[100]

In [None]:
# keyword_clean mentions hashtags url - CountVectorizer
# text_clean - TF-idf Transformer

from sklearn.feature_extraction.text import CountVectorizer

# Each block below fits a CountVectorizer on the combined train+test column and
# stores the dense count matrix as a DataFrame with one column per vocabulary term.

# keyword_clean
vec_kw = CountVectorizer()
matrix_kw = vec_kw.fit_transform(corpus_keyword)
data_kw = pd.DataFrame(matrix_kw.toarray(), columns=vec_kw.get_feature_names_out())

# URL (keep only terms that appear in at least 3 rows)
vec_url = CountVectorizer(min_df = 3)
matrix_url = vec_url.fit_transform(data.url)
data_url = pd.DataFrame(matrix_url.toarray(), columns=vec_url.get_feature_names_out())

# mentions (keep only terms that appear in at least 5 rows)
vec_mentions = CountVectorizer(min_df = 5)
matrix_mentions = vec_mentions.fit_transform(data.mentions)
data_mentions = pd.DataFrame(matrix_mentions.toarray(), columns = vec_mentions.get_feature_names_out())

# hashtags (keep only terms that appear in at least 5 rows)
vec_hashtags = CountVectorizer(min_df = 5)
matrix_hashtags = vec_hashtags.fit_transform(data.hashtags)
data_hashtags = pd.DataFrame(matrix_hashtags.toarray(), columns = vec_hashtags.get_feature_names_out())

In [None]:
# Tf-idf for text_clean
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF over unigrams and bigrams that occur in at least 5 documents of the cleaned corpus
vec_text = TfidfVectorizer(min_df = 5, ngram_range = (1,2)) 
text_vec = vec_text.fit_transform(corpus_text)
data_text_clean = pd.DataFrame(text_vec.toarray(), columns=vec_text.get_feature_names_out())

In [None]:
# data_kw is deliberately left out of the feature matrix:
# models were run with different feature lists and gave better results without data_kw
df_clean = pd.concat([data_url, data_mentions, data_hashtags, data_text_clean], axis = 1)
df_clean.shape

### b) Lemmatization

In [None]:
from nltk.stem import WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()

def corpus_lem(corpus):
    """Lemmatize every word of every sentence in `corpus`, treating words as verbs.

    Each sentence is split on whitespace, each token is passed to the WordNet
    lemmatizer with ``pos='v'``, and the results are re-joined with single
    spaces. Returns a new list with one string per input sentence.
    """
    lemmatized_sentences = []
    for sentence in corpus:
        lemmas = [wordnet_lemmatizer.lemmatize(token, pos = 'v') for token in sentence.split()]
        lemmatized_sentences.append(' '.join(lemmas))
    return lemmatized_sentences

In [None]:
# lemmatize the text feature and compare two sentences before/after
corpus_text_lem = corpus_lem(corpus_text)
corpus_text[2], corpus_text_lem[2], corpus_text[8], corpus_text_lem[8]

In [None]:
# text_clean feature: same TF-IDF settings as before, fitted on the lemmatized corpus
vec_text_lem = TfidfVectorizer(min_df = 5, ngram_range = (1,2)) 
text_lem_vec = vec_text_lem.fit_transform(corpus_text_lem)
data_text_clean_lem = pd.DataFrame(text_lem_vec.toarray(), columns = vec_text_lem.get_feature_names_out())

In [None]:
# feature matrix for the lemmatized variant: url + mentions + hashtags + lemmatized TF-IDF
df_lem = pd.concat([data_url, data_mentions, data_hashtags, data_text_clean_lem], axis = 1)
df_lem.shape

### c) Stemming

In [None]:
from nltk.stem import PorterStemmer
ps = PorterStemmer()

def corpus_stemming(corpus):
    """Apply the Porter stemmer to every word of every sentence in `corpus`.

    Each sentence is split on whitespace, each token is stemmed, and the stems
    are re-joined with single spaces. Returns a new list with one string per
    input sentence.
    """
    stemmed_sentences = []
    for sentence in corpus:
        stems = [ps.stem(token) for token in sentence.split()]
        stemmed_sentences.append(' '.join(stems))
    return stemmed_sentences

In [None]:
# stem the text feature and compare two sentences before/after
corpus_text_stem = corpus_stemming(corpus_text)
corpus_text[0], corpus_text_stem[0], corpus_text[1], corpus_text_stem[1]

In [None]:
# text_clean feature: same TF-IDF settings as before, fitted on the stemmed corpus
vec_text_stem = TfidfVectorizer(min_df = 5, ngram_range = (1,2)) 
text_stem_vec = vec_text_stem.fit_transform(corpus_text_stem)
data_text_clean_stem = pd.DataFrame(text_stem_vec.toarray(), columns = vec_text_stem.get_feature_names_out())

In [None]:
# feature matrix for the stemmed variant: url + mentions + hashtags + stemmed TF-IDF
df_stem = pd.concat([data_url, data_mentions, data_hashtags, data_text_clean_stem], axis = 1)
df_stem.shape

## 4. Brute Force: Bag of Words

#### Sequence of Dense layers for OHE train data

In [None]:
from keras.models import Sequential, Model
from keras import Input
from keras.layers import Embedding, LSTM, Dense, Dropout, Flatten, SpatialDropout1D, Concatenate
from keras.initializers import Constant
from sklearn.model_selection import train_test_split
from tensorflow.keras.optimizers import Adam
from keras import regularizers

In [None]:
def print_loss_acc(history):
    """Plot training vs. validation accuracy and loss per epoch, side by side.

    Parameters
    ----------
    history : object with a ``history`` dict (as returned by Keras ``fit``)
        The dict must contain the keys 'loss', 'accuracy', 'val_loss' and
        'val_accuracy', each holding one value per epoch.

    Raises
    ------
    KeyError
        If one of the four keys is missing (e.g. the model was fitted without
        validation data or without the accuracy metric).
    """
    # Read all four series up front so a missing key fails before any figure is created.
    loss = history.history['loss']
    acc = history.history['accuracy']
    val_loss = history.history['val_loss']
    val_acc = history.history['val_accuracy']

    epochs = range(1, len(loss) + 1)

    fig, (ax_acc, ax_loss) = plt.subplots(1, 2, figsize=(16, 5))

    #accuracy
    ax_acc.plot(epochs, acc, 'bo', label = 'Training accuracy')
    ax_acc.plot(epochs, val_acc, 'r', label = 'Validation accuracy')
    ax_acc.set(title = 'Accuracy', xlabel = 'Epoch', ylabel = 'Accuracy')
    ax_acc.legend()

    #loss
    ax_loss.plot(epochs, loss, 'bo', label = 'Training loss')
    ax_loss.plot(epochs, val_loss, 'r', label = 'Validation loss')
    ax_loss.set(title = 'Loss', xlabel = 'Epoch', ylabel = 'Loss')
    ax_loss.legend()

    plt.show()

In [None]:
def submission(y, file_name):
    """Write predictions for the test set to a submission CSV and return a preview.

    Parameters
    ----------
    y : array-like
        Predicted scores, one per row of ``test_data`` (any shape that can be
        reshaped to that length, e.g. ``(n, 1)``). Values are rounded to the
        nearest integer to obtain the class label.
    file_name : str
        Path of the CSV file to write; it gets the columns 'id' and 'target'.

    Returns
    -------
    pandas.DataFrame
        The first rows of the written frame, so that calling this function as
        the last expression of a cell displays a preview. (The preview used to
        be computed and silently discarded.)

    Raises
    ------
    ValueError
        If the number of predictions does not match the number of test rows.
    """
    y=np.round(y).astype(int).reshape(test_data.shape[0])
    sub=pd.DataFrame({'id': test_data.id,'target':y})
    sub.to_csv(file_name,index=False)
    return sub.head()

In [None]:
# Split each feature matrix back into its train and test parts by position:
# the first train_data.shape[0] rows are train rows, the rest are test rows.
df_clean_train = df_clean.iloc[:train_data.shape[0]]
df_clean_test = df_clean.iloc[train_data.shape[0]:]

df_lem_train = df_lem.iloc[:train_data.shape[0]]
df_lem_test = df_lem.iloc[train_data.shape[0]:]

df_stem_train = df_stem.iloc[:train_data.shape[0]]
df_stem_test = df_stem.iloc[train_data.shape[0]:]

### Logistic regression models

In [None]:
# build simple logistic regression (+ Random Forest Classifier)
# to use their scores as basic
# if simple classifier show some good results probably it won't be necessary to build more complicated NN model (spoiler: no)

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

# 5-fold cross-validation of a logistic-regression baseline on each feature set
# (cross_val_score uses the estimator's default score, i.e. accuracy for classifiers).
log_reg_model = LogisticRegression(max_iter = 500, random_state = 42)
log_reg_score = cross_val_score(log_reg_model, df_clean_train, y, cv = 5)
log_reg_score_lem = cross_val_score(log_reg_model, df_lem_train, y, cv = 5)
log_reg_score_stem = cross_val_score(log_reg_model, df_stem_train, y, cv = 5)
# Apply the format with `%`: passing the value as a second print() argument
# printed the literal '%.2f' placeholder followed by the unrounded mean.
print('Data clean: %.2f' % log_reg_score.mean())
print('Data lemm: %.2f' % log_reg_score_lem.mean())
print('Data stem: %.2f' % log_reg_score_stem.mean())

In [None]:
# 3-fold cross-validation of a random-forest baseline on each feature set
# (fewer folds than for logistic regression; 1000 trees per fit).
rand_forest_cl = RandomForestClassifier(n_estimators = 1000, max_depth = 30, random_state = 42)

rand_forest_score = cross_val_score(rand_forest_cl, df_clean_train, y, cv = 3)
rand_forest_score_lem = cross_val_score(rand_forest_cl, df_lem_train, y, cv = 3)
rand_forest_score_stem = cross_val_score(rand_forest_cl, df_stem_train, y, cv = 3)

# Apply the format with `%`: passing the value as a second print() argument
# printed the literal '%.2f' placeholder followed by the unrounded mean.
print('Data clean: %.2f' % rand_forest_score.mean())
print('Data lemm: %.2f' % rand_forest_score_lem.mean())
print('Data stem: %.2f' % rand_forest_score_stem.mean())

### NN models

In [None]:
# build simple NN model

def get_model(shape):
    """Build and compile a small fully connected binary classifier.

    Architecture: Dense(64, relu, L2 0.011) -> Dropout(0.1) ->
    Dense(16, relu, L2 0.004) -> Dense(1, sigmoid). Compiled with Adam
    (learning rate 1e-3), binary cross-entropy loss and the accuracy metric.

    Parameters
    ----------
    shape : int
        Number of input features (length of one input row).

    Returns
    -------
    The compiled Keras ``Sequential`` model.
    """
    hidden_wide = Dense(64, activation = 'relu', input_shape = (shape,),
                        kernel_regularizer = regularizers.l2(0.011))
    hidden_narrow = Dense(16, activation = 'relu',
                          kernel_regularizer = regularizers.l2(0.004))
    classifier = Sequential([
        hidden_wide,
        Dropout(0.1),
        hidden_narrow,
        Dense(1, activation = 'sigmoid'),
    ])

    classifier.compile(loss = 'binary_crossentropy',
                       optimizer = Adam(learning_rate = 1e-3),
                       metrics = ['accuracy'])
    return classifier

In [None]:
# acc ~0.75
# dense model on the plain cleaned features; the last 20% of the rows are held out for validation
model = get_model(df_clean_train.shape[1])
history = model.fit(df_clean_train.to_numpy(), y, batch_size = 128, epochs = 6, validation_split = 0.2, verbose = 0)
print_loss_acc(history)

In [None]:
# same dense model on the lemmatized features
model_lem = get_model(df_lem_train.shape[1])
history_lem = model_lem.fit(df_lem_train.to_numpy(), y, batch_size = 128, epochs = 10, validation_split = 0.2, verbose = 0)
print_loss_acc(history_lem)

In [None]:
# same dense model on the stemmed features
model_stem = get_model(df_stem_train.shape[1])
history_stem = model_stem.fit(df_stem_train.to_numpy(), y, batch_size = 128, epochs = 10, validation_split = 0.2, verbose = 0)
print_loss_acc(history_stem)

In [None]:
# write the test-set predictions of the lemmatized bag-of-words model to a submission file
submission(model_lem.predict(df_lem_test.to_numpy()), '1 BoW_lem.csv')

#### Embedding layer + Recurrent layer (LSTM) and fitting on sequences

In [None]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

In [None]:
# text feature
# Map every word of the cleaned corpus to an integer index and encode each tweet
# as a sequence of those indices.

tokenizer_text = Tokenizer()
tokenizer_text.fit_on_texts(corpus_text)
seq_text = tokenizer_text.texts_to_sequences(corpus_text)
vocab_size_text = len(tokenizer_text.word_index)

# fix every sequence to 50 entries: longer ones are cut at the end, shorter ones padded at the end
max_len_text = 50
seq_text = pad_sequences(seq_text, maxlen = max_len_text, truncating = 'post', padding = 'post')
vocab_size_text

#### Lemmatized text corpus

In [None]:
# Same tokenization as for the plain corpus, fitted on the lemmatized corpus.
tokenizer_text_lem = Tokenizer()
tokenizer_text_lem.fit_on_texts(corpus_text_lem)
# NOTE(review): matrix_text_lem is not referenced again in the visible notebook — confirm before removing
matrix_text_lem = tokenizer_text_lem.texts_to_matrix(corpus_text_lem)
seq_text_lem = tokenizer_text_lem.texts_to_sequences(corpus_text_lem)
vocab_size_text_lem = len(tokenizer_text_lem.word_index)

# reuse max_len_text defined for the plain corpus: cut or pad at the end
seq_text_lem = pad_sequences(seq_text_lem, maxlen = max_len_text, truncating = 'post', padding = 'post')
vocab_size_text_lem

In [None]:
# original sequences: the first train_data.shape[0] rows are train rows, the rest are test rows
seq_text_train = seq_text[:train_data.shape[0]]
seq_text_test = seq_text[train_data.shape[0]:]

#lemmatized text 
seq_text_lem_train = seq_text_lem[:train_data.shape[0]]
seq_text_lem_test = seq_text_lem[train_data.shape[0]:]

## 5. GloVe

In [None]:
# load the 100-dimensional GloVe vectors into a dict: word -> float32 vector

embedding_dict={}
# Each line is a token followed by its vector components, separated by spaces.
# The encoding is given explicitly so that reading does not depend on the
# platform's default; the `with` block closes the file, so no f.close() is needed.
with open('../input/glove-global-vectors-for-word-representation/glove.6B.100d.txt','r', encoding='utf-8') as f:
    for line in f:
        values=line.split()
        word=values[0]
        vectors=np.asarray(values[1:],'float32')
        embedding_dict[word]=vectors

In [None]:
# peek at the first (word, vector) pair that was loaded
list(embedding_dict.items())[0]

In [None]:
# this part here just to play with GloVe
from scipy import spatial

def find_similar_word(emmbedes):
    """Return all GloVe vocabulary words ordered by closeness to a vector.

    Parameters
    ----------
    emmbedes : array-like
        Query embedding, with the same length as the vectors in `embedding_dict`.

    Returns
    -------
    list of str
        Every key of `embedding_dict`, sorted by ascending Euclidean distance
        between its vector and the query (nearest first). The whole vocabulary
        is scanned and sorted on every call.
    """
    def distance_to_query(word):
        return spatial.distance.euclidean(embedding_dict[word], emmbedes)

    return sorted(embedding_dict, key=distance_to_query)

In [None]:
# this code is here just to play with GloVe: the 10 words closest to 'pilot' (the word itself comes first)
find_similar_word(embedding_dict['pilot'])[:10]

In [None]:
# text feature
# Row i of the matrix holds the GloVe vector of the word with tokenizer index i.
# One extra row is allocated because tokenizer indices start at 1; row 0 (the
# padding value) and rows of words missing from GloVe stay all-zero.
vocab_size = vocab_size_text + 1
embedding_matrix_text = np.zeros((vocab_size, 100))

for word,i in tokenizer_text.word_index.items():
    # Valid rows are 0..vocab_size-1, so skip any index >= vocab_size. The
    # previous `>` test would have let i == vocab_size through and raised
    # IndexError on the assignment below.
    if i >= vocab_size:
        continue

    emb_vec = embedding_dict.get(word)
    if emb_vec is not None:
        embedding_matrix_text[i] = emb_vec

In [None]:
# lemmatized text feature
# Row i of the matrix holds the GloVe vector of the word with tokenizer index i.
# One extra row is allocated because tokenizer indices start at 1; row 0 (the
# padding value) and rows of words missing from GloVe stay all-zero.
vocab_size_lem = vocab_size_text_lem + 1
embedding_matrix_text_lem = np.zeros((vocab_size_lem, 100))

for word,i in tokenizer_text_lem.word_index.items():
    # Valid rows are 0..vocab_size_lem-1, so skip any index >= vocab_size_lem.
    # The previous `>` test would have let i == vocab_size_lem through and
    # raised IndexError on the assignment below.
    if i >= vocab_size_lem:
        continue

    emb_vec = embedding_dict.get(word)
    if emb_vec is not None:
        embedding_matrix_text_lem[i] = emb_vec

### Model

In [None]:
# Embedding(GloVe) + Recurrent layer (LSTM)
# The embedding layer is initialised with the GloVe matrix and frozen (trainable = False),
# so only the LSTM and the dense layers are trained.

model = Sequential(
        [
            Embedding(vocab_size, 100, input_length = max_len_text, 
                      embeddings_initializer = Constant(embedding_matrix_text), trainable = False),
            LSTM(70),
            Dense(64, activation = 'relu'),
            Dense(1, activation = 'sigmoid')
        ])

optimzer = Adam(learning_rate = 1e-4)
model.compile(loss = 'binary_crossentropy', optimizer = optimzer, metrics = ['accuracy'])

In [None]:
# acc ~0.80
# train on the padded index sequences; the last 20% of the rows are held out for validation
history = model.fit(x = seq_text_train, y = y, batch_size = 128, epochs = 15, validation_split = 0.2, verbose = 0)
print_loss_acc(history)

In [None]:
# Embedding(GloVe) + Recurrent layer (LSTM) + Lemmatization
# Same idea as the previous model, built on the lemmatized vocabulary, with a
# SpatialDropout1D(0.1) layer after the frozen embedding and a slightly larger LSTM.

model_lem = Sequential(
        [
            Embedding(vocab_size_lem, 100, input_length = max_len_text, 
                      embeddings_initializer = Constant(embedding_matrix_text_lem), trainable = False),
            SpatialDropout1D(0.1),
            LSTM(80),
            Dense(64, activation = 'relu'),
            Dense(1, activation = 'sigmoid')
        ])

optimzer = Adam(learning_rate = 1e-4)
model_lem.compile(loss = 'binary_crossentropy', optimizer = optimzer, metrics = ['accuracy'])

In [None]:
# acc ~0.80
# train on the lemmatized sequences; note that this reuses (overwrites) the name `history`
history = model_lem.fit(x = seq_text_lem_train, y = y, batch_size = 128, epochs = 15, validation_split = 0.2, verbose = 0)
print_loss_acc(history)

In [None]:
# write the test-set predictions of both GloVe models to their own submission files
submission(model.predict(seq_text_test), '2 Glove.csv')
submission(model_lem.predict(seq_text_lem_test), '3 Glove + Lemm.csv')

In [None]:
# extra count features for the two-input model: keyword + mentions + hashtags (data_url is left out)
matrix_feat = pd.concat([data_kw, data_mentions, data_hashtags], axis = 1).to_numpy() # remove data_url

# split by position: train rows first, test rows after
matrix_train = matrix_feat[:train_data.shape[0]]
matrix_test = matrix_feat[train_data.shape[0]:]

In [None]:
# both inputs of the two-input model must have the same number of rows
seq_text_train.shape, matrix_train.shape

In [None]:
# let's try to improve the score by adding the other features
# Two-input functional model: a text branch (frozen GloVe embedding -> LSTM -> Dense)
# and a feature branch (Dense on the count features), concatenated before the output.

# text branch
input_text = Input(shape = (max_len_text, ))
emb_layer = Embedding(vocab_size, 100, embeddings_initializer = Constant(embedding_matrix_text), trainable = False)(input_text)
lstm_layer = LSTM(70)(emb_layer)
dense_layer_1 = Dense(64, activation = 'relu')(lstm_layer)

# feature branch
input_feat = Input(shape = (matrix_feat.shape[1], ))
dense_layer_2 = Dense(64, activation = 'relu')(input_feat)

# merge both branches and classify
concat_layer = Concatenate()([dense_layer_1, dense_layer_2])
dense_layer_3 = Dense(16, activation = 'relu')(concat_layer)
output = Dense(1, activation = 'sigmoid')(dense_layer_3)

# note: this rebinds the name `model`, replacing the earlier GloVe + LSTM model
model = Model(inputs = [input_text, input_feat], outputs = output)

optimzer = Adam(learning_rate = 1e-4)
model.compile(loss = 'binary_crossentropy', optimizer = optimzer, metrics = ['accuracy'])

In [None]:
# acc ~0.80
# the two inputs are passed in the same order as in Model(inputs = [...])
history = model.fit(x = [seq_text_train, matrix_train], y = y, batch_size = 128, epochs = 15, validation_split = 0.2, verbose = 0)
print_loss_acc(history)

In [None]:
# write the test-set predictions of the two-input model to a submission file
submission(model.predict([seq_text_test, matrix_test]), '4 Glove + Feat.csv')