In [None]:
# Octopus ML pakage - github.com/gershonc/octopus-ml
!pip install octopus-ml

In [None]:
import warnings
warnings.simplefilter("ignore")
import seaborn as sns 
import matplotlib.pyplot as plt
import time
import pandas as pd
import numpy as np
import lightgbm as lgb
import tracemalloc
from pandas_summary import DataFrameSummary
from sklearn.metrics import classification_report

from sklearn import feature_extraction, linear_model, model_selection, preprocessing

%matplotlib inline
sns.set_style("whitegrid")

# Show every column and row, and never truncate cell text, when displaying frames.
pd.set_option('display.max_columns', None)  # or 1000
pd.set_option('display.max_rows', None)  # or 1000
# None means "no limit"; the old -1 sentinel was deprecated in pandas 1.0 and is
# rejected by newer releases.
pd.set_option('display.max_colwidth', None)  # or 199

#check out https://github.com/gershonc/octopus-ml
import octopus_ml as oc

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [None]:
# Directory holding the competition CSVs; change it to run the notebook elsewhere.
DATA_DIR = "/kaggle/input/nlp-getting-started"
train_df = pd.read_csv(f"{DATA_DIR}/train.csv")
test_df = pd.read_csv(f"{DATA_DIR}/test.csv")

## EDA

In [None]:
# Peek at the first five rows of the training frame.
train_df.head(5)

In [None]:
# DataFrame summary from the pandas_summary package (an extension of the pandas describe method)
dfs = DataFrameSummary(train_df)
dfs.summary()

In [None]:
# Target distribution analysis: class counts (left panel) and class shares (right panel).
# Style and context are set before the figure is created so they apply to it.
plt.style.use('fivethirtyeight')
sns.set_context("paper", font_scale=1.2)

fig, ax = plt.subplots(1, 2)
# `x` must be passed by keyword: newer seaborn releases reject positional data variables.
sns.countplot(x='target', data=train_df, ax=ax[0])
train_df['target'].value_counts().plot.pie(explode=[0, 0.2], autopct='%1.2f%%', ax=ax[1])
# plt.show() works with the inline backend, whereas fig.show() expects a GUI backend.
plt.show()

In [None]:
def wordcount(x):
    """Return the number of whitespace-separated tokens in ``str(x)``."""
    return len(str(x).split())
def charcount(x):
    """Return the number of non-whitespace characters in the string ``x``."""
    return sum(len(chunk) for chunk in x.split())

def hashtag_count(x):
    """Count the whitespace-separated tokens of ``x`` that begin with '#'."""
    return sum(1 for token in x.split() if token.startswith('#'))

def mentions_count(x):
    """Count the whitespace-separated tokens of ``x`` that begin with '@'."""
    return sum(1 for token in x.split() if token.startswith('@'))


# Hand-crafted count/length features, computed the same way for train and test.
for frame in (train_df, test_df):
    frame['char_count'] = frame['text'].apply(charcount)
    frame['word_count'] = frame['text'].apply(wordcount)
    frame['hashtag_count'] = frame['text'].apply(hashtag_count)
    frame['mention_count'] = frame['text'].apply(mentions_count)
    # Raw string length, whitespace included (char_count excludes whitespace).
    frame['length'] = frame['text'].apply(len)

train_df.head(2)

In [None]:
# Stacked histogram of raw tweet length (characters), coloured by target class.
sns.displot(data = train_df, kind = 'hist', x = 'length', hue = 'target', multiple = 'stack',bins=50,height = 5, aspect = 1.9)

# The distribution of tweet text length vs. target - there is a correlation between tweet length and target

In [None]:
# Stacked histogram of the per-tweet hashtag count, coloured by target class.
sns.displot(data = train_df, kind = 'hist', x = 'hashtag_count', hue = 'target', multiple = 'stack',bins=50,height = 5, aspect = 1.9)

In [None]:
# Stacked histogram of the per-tweet word count, coloured by target class.
sns.displot(data = train_df, kind = 'hist', x = 'word_count', hue = 'target', multiple = 'stack',bins=50,height = 5, aspect = 1.9)


In [None]:
# Every row whose tweet text occurs more than once, ordered by text with the
# original row order kept inside each group of identical texts.
# Boolean masking yields an empty frame when nothing is duplicated, whereas
# pd.concat over zero groups raises ValueError.
duplicates = (
    train_df[train_df["text"].notna() & train_df.duplicated(subset="text", keep=False)]
    .sort_values("text", kind="stable")
)

#with pd.option_context("display.max_rows", None, "max_colwidth", 80):
#    display(duplicates[["id", "target", "text"]])

In [None]:
# Taken from - Craig Thomas https://www.kaggle.com/craigmthomas/logistic-regression-lightgbm-fe
# Remove hand-picked rows by index label (list taken from the kernel credited above).
# errors="ignore" keeps the cell re-runnable: on a second run the labels are
# already gone and a plain drop() would raise KeyError.
# NOTE: the index is not renumbered afterwards, so train_df has label gaps from here on.
train_df.drop(
    [
        6449, 7034, 3589, 3591, 3597, 3600, 3603, 3604, 3610, 3613, 3614, 119, 106, 115,
        2666, 2679, 1356, 7609, 3382, 1335, 2655, 2674, 1343, 4291, 4303, 1345, 48, 3374,
        7600, 164, 5292, 2352, 4308, 4306, 4310, 1332, 1156, 7610, 2441, 2449, 2454, 2477,
        2452, 2456, 3390, 7611, 6656, 1360, 5771, 4351, 5073, 4601, 5665, 7135, 5720, 5723,
        5734, 1623, 7533, 7537, 7026, 4834, 4631, 3461, 6366, 6373, 6377, 6378, 6392, 2828,
        2841, 1725, 3795, 1251, 7607
    ], inplace=True, errors="ignore"
)

train_df.drop(
    [
        4290, 4299, 4312, 4221, 4239, 4244, 2830, 2831, 2832, 2833, 4597, 4605, 4618, 4232, 4235, 3240,
        3243, 3248, 3251, 3261, 3266, 4285, 4305, 4313, 1214, 1365, 6614, 6616, 1197, 1331, 4379, 4381,
        4284, 4286, 4292, 4304, 4309, 4318, 610, 624, 630, 634, 3985, 4013, 4019, 1221, 1349, 6091, 6094,
        6103, 6123, 5620, 5641
    ], inplace=True, errors="ignore"
)

## Data pre-processing 

In [None]:
## for data
import json
import pandas as pd
import numpy as np
## for plotting
import matplotlib.pyplot as plt
import seaborn as sns
## for processing
import re
import nltk
## for bag-of-words
from sklearn import feature_extraction, model_selection, naive_bayes, pipeline, manifold, preprocessing
## for explainer
from lime import lime_text
## for word embedding
import gensim
import gensim.downloader as gensim_api
## for deep learning
from tensorflow.keras import models, layers, preprocessing as kprocessing
from tensorflow.keras import backend as K
## for bert language model
import transformers
import unicodedata

In [None]:
def preprocess_text(text, flg_stemm=False, flg_lemm=True, lst_stopwords=None):
    """Normalise a piece of text into a space-separated string of tokens.

    The input is cast to ``str``, lower-cased and stripped; every character
    that is neither a word character nor whitespace is deleted; the result is
    split on whitespace and the tokens are optionally filtered and reduced.

    Parameters
    ----------
    text : object
        Value to clean; converted with ``str(text)`` first.
    flg_stemm : bool, default False
        When truthy, apply NLTK's Porter stemmer to every token.
    flg_lemm : bool, default True
        When truthy, apply NLTK's WordNet lemmatizer to every token
        (runs after stemming if both flags are set).
    lst_stopwords : iterable of str or None, default None
        Tokens to drop. ``None`` disables stopword removal.

    Returns
    -------
    str
        The remaining tokens joined by single spaces.
    """
    text = re.sub(r'[^\w\s]', '', str(text).lower().strip())
    lst_text = text.split()

    if lst_stopwords is not None:
        # A set gives O(1) membership tests instead of scanning the list per token.
        stopwords = set(lst_stopwords)
        lst_text = [word for word in lst_text if word not in stopwords]

    ## Stemming (remove -ing, -ly, ...)
    if flg_stemm:
        ps = nltk.stem.porter.PorterStemmer()
        lst_text = [ps.stem(word) for word in lst_text]

    ## Lemmatisation (convert the word into its root form)
    if flg_lemm:
        lem = nltk.stem.wordnet.WordNetLemmatizer()
        lst_text = [lem.lemmatize(word) for word in lst_text]

    ## back to string from list
    return " ".join(lst_text)

In [None]:
# English stopword list used by preprocess_text.
try:
    lst_stopwords = nltk.corpus.stopwords.words("english")
except LookupError:
    # The stopwords corpus is not installed in this environment: fetch it once.
    nltk.download("stopwords", quiet=True)
    lst_stopwords = nltk.corpus.stopwords.words("english")
#lst_stopwords


In [None]:
# Contraction / chat-slang -> expansion table. Keys are lowercase; entries that
# are padded with spaces (" u ") are slang that must stand alone between whitespace.
contractions = { 
"ain't": "am not",
"aren't": "are not",
"can't": "cannot",
"can't've": "cannot have",
"'cause": "because",
"could've": "could have",
"couldn't": "could not",
"couldn't've": "could not have",
"didn't": "did not",
"doesn't": "does not",
"don't": "do not",
"hadn't": "had not",
"hadn't've": "had not have",
"hasn't": "has not",
"haven't": "have not",
"he'd": "he would",
"he'd've": "he would have",
"he'll": "he will",
"he'll've": "he will have",
"he's": "he is",
"how'd": "how did",
"how'd'y": "how do you",
"how'll": "how will",
"how's": "how does",
"i'd": "i would",
"i'd've": "i would have",
"i'll": "i will",
"i'll've": "i will have",
"i'm": "i am",
"i've": "i have",
"isn't": "is not",
"it'd": "it would",
"it'd've": "it would have",
"it'll": "it will",
"it'll've": "it will have",
"it's": "it is",
"let's": "let us",
"ma'am": "madam",
"mayn't": "may not",
"might've": "might have",
"mightn't": "might not",
"mightn't've": "might not have",
"must've": "must have",
"mustn't": "must not",
"mustn't've": "must not have",
"needn't": "need not",
"needn't've": "need not have",
"o'clock": "of the clock",
"oughtn't": "ought not",
"oughtn't've": "ought not have",
"shan't": "shall not",
"sha'n't": "shall not",
"shan't've": "shall not have",
"she'd": "she would",
"she'd've": "she would have",
"she'll": "she will",
"she'll've": "she will have",
"she's": "she is",
"should've": "should have",
"shouldn't": "should not",
"shouldn't've": "should not have",
"so've": "so have",
"so's": "so is",
"that'd": "that would",
"that'd've": "that would have",
"that's": "that is",
"there'd": "there would",
"there'd've": "there would have",
"there's": "there is",
"they'd": "they would",
"they'd've": "they would have",
"they'll": "they will",
"they'll've": "they will have",
"they're": "they are",
"they've": "they have",
"to've": "to have",
"wasn't": "was not",
" u ": " you ",
" ur ": " your ",
" n ": " and ",
"won't": "will not",  # was "would not", which is the expansion of "wouldn't"
'dis': 'this',
'bak': 'back',
'brng': 'bring'}


def _contraction_pattern(key):
    """Return the regex alternative that matches ``key`` only as a whole token."""
    core = re.escape(key.strip())
    if key != key.strip():
        # Space-padded slang: require whitespace (or a string edge) on both sides.
        return r"(?<!\S)" + core + r"(?!\S)"
    # Regular entry: must not be glued to other word characters, so "dis" no
    # longer rewrites "disaster" and "bak" no longer rewrites "bake".
    return r"(?<!\w)" + core + r"(?!\w)"


# Lookup keyed by the stripped, lowercase form of each entry.
_CONTRACTION_LOOKUP = {key.strip().lower(): value.strip() for key, value in contractions.items()}

# Longest keys first so "can't've" is tried before its prefix "can't".
_CONTRACTION_RE = re.compile(
    "|".join(
        _contraction_pattern(key)
        for key in sorted(contractions, key=lambda k: len(k.strip()), reverse=True)
    ),
    flags=re.IGNORECASE,
)


def cont_to_exp(x):
    """Expand the contractions and slang listed in ``contractions`` inside ``x``.

    Matching is done on whole tokens in a single pass and is case-insensitive
    (the table is lowercase while raw tweets are not); replacements are the
    lowercase expansions from the table.

    Parameters
    ----------
    x : object
        Text to process. Anything that is not a ``str`` is returned unchanged.

    Returns
    -------
    object
        The expanded string, or ``x`` itself when it is not a string.
    """
    if not isinstance(x, str):
        return x
    # .get() falls back to the matched text for the few non-ASCII characters that
    # match case-insensitively but do not lowercase to a table key.
    return _CONTRACTION_RE.sub(
        lambda m: _CONTRACTION_LOOKUP.get(m.group(0).lower(), m.group(0)), x
    )
    
# Start the `text_clean` column on both frames from the contraction-expanded raw text.
for frame in (train_df, test_df):
    frame['text_clean'] = frame['text'].apply(cont_to_exp)


def remove_emails(x):
    """Delete e-mail addresses from the string ``x`` and return the remainder.

    Matching is case-insensitive because this cleaner runs on text that has
    not been lower-cased yet; with lowercase-only character classes an address
    such as ``John.Doe@Example.com`` was left in place.
    """
    return re.sub(r'([a-z0-9+._-]+@[a-z0-9+._-]+\.[a-z0-9+_-]+)', "", x, flags=re.IGNORECASE)


def remove_urls(x):
    """Delete http/https/ftp/ssh URLs from the string ``x`` and return the remainder."""
    url_pattern = r'(http|https|ftp|ssh)://([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?'
    return re.sub(url_pattern, '', x)

def remove_rt(x):
    """Delete the stand-alone retweet marker ("RT"/"rt") from ``x`` and strip the result.

    Matching is case-insensitive because this cleaner runs before the text is
    lower-cased; matching only lowercase "rt" left the upper-case "RT" marker in place.
    Word boundaries keep words that merely contain "rt" (e.g. "art") intact.
    """
    return re.sub(r'\brt\b', '', x, flags=re.IGNORECASE).strip()

def remove_special_chars(x):
    """Drop every character that is neither a word character nor a space, then
    collapse runs of whitespace into single spaces."""
    kept = re.sub(r'[^\w ]+', "", x)
    return ' '.join(kept.split())


def remove_accented_chars(x):
    """Return ``x`` reduced to ASCII: characters are NFKD-decomposed and anything
    that still cannot be encoded as ASCII is dropped."""
    decomposed = unicodedata.normalize('NFKD', x)
    ascii_bytes = decomposed.encode('ascii', 'ignore')
    return ascii_bytes.decode('utf-8', 'ignore')



# Run the regex/unicode cleaners over the training text, in this fixed order.
for clean_step in (remove_emails, remove_urls, remove_rt, remove_special_chars, remove_accented_chars):
    train_df['text_clean'] = train_df['text_clean'].apply(clean_step)

In [None]:
# Final normalisation of the training text: lowercase, strip punctuation,
# remove English stopwords and apply Porter stemming (lemmatisation disabled).
train_df["text_clean"] = train_df["text_clean"].apply(lambda x: preprocess_text(x, flg_stemm=True, flg_lemm=False, lst_stopwords=lst_stopwords))
train_df.head()

## TF-IDF

In [None]:
# TF-IDF over 1- to 4-grams, capped at 10,000 features; fitted on the training text only.
vec=TfidfVectorizer(max_features = 10000,ngram_range=(1,4))
vec.fit(train_df['text_clean'])

In [None]:
# Dense TF-IDF matrix for the training rows, wrapped in a DataFrame with one
# column per vocabulary entry (rows are in train_df's positional order).
matrix = vec.transform(train_df['text_clean']).toarray()
# get_feature_names() was removed in scikit-learn 1.2; use its replacement when
# available and fall back to the old name on older installs.
features = vec.get_feature_names_out() if hasattr(vec, "get_feature_names_out") else vec.get_feature_names()
matrix_df = pd.DataFrame(data=matrix, columns=features)


In [None]:
# Inspect the first two rows of the TF-IDF feature frame.
matrix_df.head(2)

In [None]:
# (rows, columns) of the TF-IDF feature frame.
matrix_df.shape

In [None]:
# Append the hand-crafted count features and build the label vector.
# matrix_df has a fresh 0..n-1 index, while train_df keeps its original labels
# with gaps left by the dropped rows. Assigning a Series would align on those
# labels (shifting values to the wrong rows and introducing NaN), so the values
# are copied positionally instead.
assert len(matrix_df) == len(train_df), "feature matrix and train_df are out of sync"
for col in ['length', 'char_count', 'word_count', 'hashtag_count', 'mention_count']:
    matrix_df[col] = train_df[col].to_numpy()
# Same reasoning for the target: give it the same 0..n-1 index as matrix_df.
y = train_df['target'].reset_index(drop=True)

## OCTOPUS-ML functions
[https://github.com/gershonc/octopus-ml](https://github.com/gershonc/octopus-ml)

In [None]:
# LightGBM hyper-parameters handed to oc.cv_adv below.
params = {
        'boosting_type': 'gbdt',
        'objective': 'binary',
        'metric': 'auc',
        'learning_rate': 0.01,
        'num_leaves':32,
        'subsample': 1,
        #'colsample_bytree': 0.25,
        #'reg_alpha': 0,
        #'reg_lambda': 1,
        #'scale_pos_weight': 5,
        'n_estimators': 10000,
        'verbose': -1,
        'max_depth': -1,
        'seed':100, 
        'colsample_bytree':0.4,
        'force_col_wise': True


}
# The bare string below is not used by the code; it appears to record an
# earlier classifier configuration for reference.
"""
    boosting_type='gbdt', class_weight=None, colsample_bytree=0.4,
               importance_type='split', learning_rate=0.04, max_depth=-1,
               metric='auc', min_child_samples=20, min_child_weight=0.001,
               min_split_gain=0.0, n_estimators=1500, n_jobs=-1, num_leaves=31,
               objective=None, random_state=None, reg_alpha=0.0, reg_lambda=0.0,
               silent=True, subsample=1.0, subsample_for_bin=200000,
               subsample_freq=0 
"""
# Cross-validated training via octopus-ml; the returned dict is indexed by the cells below.
# NOTE(review): the positional 0.5 and 2000 are presumably a decision threshold and
# an early-stopping / rounds setting — confirm against the octopus_ml.cv_adv signature.
metrics = oc.cv_adv(matrix_df,y,0.5,2000,shuffle=True,params=params)

In [None]:
# Per-fold F1 scores from cross-validation. The last argument was a leftover
# "Titanic" label from another notebook; it now names this competition.
oc.cv_plot(metrics['f1_weighted'],metrics['f1_macro'],metrics['f1_positive'],'Disaster Tweets Kaggle competition')

In [None]:
# Precision / recall / F1 per class, comparing the labels and predictions stored by oc.cv_adv.
print(classification_report(metrics['y'], metrics['predictions_folds']))

In [None]:
# ROC curve from the predicted probabilities stored by oc.cv_adv.
oc.roc_curve_plot(metrics['y'], metrics['predictions_proba'])

In [None]:
# Confusion matrix of the class predictions stored by oc.cv_adv.
oc.confusion_matrix_plot(metrics['y'], metrics['predictions_folds'])

In [None]:
# Feature-importance plot for the final classifier (num=40). The label was a leftover
# "Mortality" string from another notebook; it now names this competition.
feature_imp_list=oc.plot_imp(metrics['final_clf'],matrix_df,'LightGBM Disaster Tweets Kaggle',num=40)

In [None]:
# Distribution of the predicted probabilities (40 bins).
oc.preds_distribution(metrics['y'], metrics['predictions_proba'], bins=40)

In [None]:
# The 20 rows of the importance table with the largest 'Value'.
top_features=feature_imp_list.sort_values(by='Value', ascending=False).head(20)
top_features

In [None]:
# Correlations among the top-20 features plus the target.
# NOTE(review): 'target' is appended to the column list, but no earlier cell adds a
# 'target' column to matrix_df (the label lives in `y`) — confirm oc.correlations
# copes with that, or attach the label to the frame first.
list_for_correlations=top_features['Feature'].to_list()
list_for_correlations.append('target')
oc.correlations(matrix_df,list_for_correlations)

In [None]:
def Kaggle_submission(file_name,model,test_data,ids_list):
    """Predict on ``test_data`` and write an ``id``/``target`` submission CSV.

    Parameters
    ----------
    file_name : path of the CSV file to write (written without the index).
    model : object exposing ``predict(test_data)``.
    test_data : feature matrix passed to ``model.predict``.
    ids_list : values for the ``id`` column, one per prediction.

    Returns
    -------
    pandas.DataFrame
        The frame that was written, with columns ``id`` and ``target``.
    """
    raw_scores = model.predict(test_data)
    # Convert the model output to class labels with octopus-ml, using 0.5 as the cut-off argument.
    labels = oc.adjusted_classes(raw_scores, 0.5)

    submit = pd.DataFrame()
    submit['id'] = ids_list
    submit['target'] = labels
    submit.to_csv(file_name, index=False)
    return submit

In [None]:
# Clean the test text with exactly the same steps, in the same order, as the
# training text: contraction expansion -> regex/unicode cleaners -> preprocess_text.
test_df["text_clean"] = test_df["text"].apply(cont_to_exp)
for clean_step in (remove_emails, remove_urls, remove_rt, remove_special_chars, remove_accented_chars):
    test_df["text_clean"] = test_df["text_clean"].apply(clean_step)

# Operate on the cleaned column: reading the raw `text` column here threw away
# every cleaning step above and made test features differ from training ones.
test_df["text_clean"] = test_df["text_clean"].apply(lambda x: preprocess_text(x, flg_stemm=True, flg_lemm=False, lst_stopwords=lst_stopwords))
test_df['length']=test_df['text'].apply(len)

test_df.head()

#vec=TfidfVectorizer(max_features = 20000,ngram_range=(1,4))
#vec.fit(test_df['text_clean'])



# TF-IDF features for the test rows, using the vectorizer fitted on the training text.
# NOTE: from here on `matrix_df` holds the TEST features (the training frame is overwritten).
matrix = vec.transform(test_df['text_clean']).toarray()
# get_feature_names() was removed in scikit-learn 1.2; use its replacement when
# available and fall back to the old name on older installs.
features = vec.get_feature_names_out() if hasattr(vec, "get_feature_names_out") else vec.get_feature_names()
matrix_df = pd.DataFrame(data=matrix, columns=features)

# Same extra columns, in the same order, as were appended to the training matrix.
# Values are copied positionally so the result does not depend on test_df's index.
for col in ['length', 'char_count', 'word_count', 'hashtag_count', 'mention_count']:
    matrix_df[col] = test_df[col].to_numpy()

In [None]:
# Predict on the test feature matrix with the classifier returned by oc.cv_adv.
test_pred=metrics['final_clf'].predict(matrix_df)
# NOTE(review): `predictions` stays an empty list because the thresholding call
# below is commented out, and `test_pred` is not read by any later cell shown here.
predictions = []
#predictions = oc.adjusted_classes(test_pred, 0.5)

## BERT
Thanks to: https://www.kaggle.com/xhlulu/disaster-nlp-keras-bert-using-tfhub



In [None]:
# Download the `tokenization` helper module that a later cell imports.
# NOTE(review): this fetches from the moving `master` branch — confirm the path
# still exists and consider pinning a tag or commit for reproducibility.
!wget --quiet https://raw.githubusercontent.com/tensorflow/models/master/official/nlp/bert/tokenization.py

In [None]:
import tensorflow as tf
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import ModelCheckpoint
import tensorflow_hub as hub

import tokenization

In [None]:
#Credit: https://www.kaggle.com/xhlulu/disaster-nlp-keras-bert-using-tfhub
def bert_encode(texts, tokenizer, max_len=512):
    """Turn raw texts into fixed-length BERT inputs.

    Each text is tokenized, truncated to ``max_len - 2`` tokens, wrapped in
    ``[CLS]`` / ``[SEP]`` and right-padded with zeros up to ``max_len``.

    Parameters
    ----------
    texts : iterable of str
    tokenizer : object providing ``tokenize(text)`` and ``convert_tokens_to_ids(tokens)``
    max_len : int, default 512
        Length of every encoded row.

    Returns
    -------
    tuple of three numpy arrays (token ids, padding masks, segment ids), one row per text.
    The mask is 1 for real tokens and 0 for padding; segment ids are all zeros.
    """
    token_rows, mask_rows, segment_rows = [], [], []
    body_limit = max_len - 2  # room left once [CLS] and [SEP] are added

    for raw_text in texts:
        pieces = tokenizer.tokenize(raw_text)[:body_limit]
        sequence = ["[CLS]"] + pieces + ["[SEP]"]
        n_real = len(sequence)
        n_pad = max_len - n_real

        ids = tokenizer.convert_tokens_to_ids(sequence)
        ids += [0] * n_pad

        token_rows.append(ids)
        mask_rows.append([1] * n_real + [0] * n_pad)
        segment_rows.append([0] * max_len)

    return np.array(token_rows), np.array(mask_rows), np.array(segment_rows)

In [None]:
def build_model(bert_layer, max_len=512, dropout_rate=None, lr=None):
    """Build and compile a binary classifier on top of a BERT Keras layer.

    Parameters
    ----------
    bert_layer : callable Keras layer
        Called with ``[input_word_ids, input_mask, segment_ids]``; its second
        output is used as the per-token sequence output.
    max_len : int, default 512
        Length of the three integer input sequences.
    dropout_rate : float or None, default None
        Dropout applied to the first-token vector before the output layer.
        ``None`` falls back to the notebook-level ``Dropout_num`` (the original
        behaviour); 0 disables dropout.
    lr : float or None, default None
        Adam learning rate. ``None`` falls back to the notebook-level
        ``learning_rate`` (the original behaviour).

    Returns
    -------
    A compiled Keras ``Model`` with a single sigmoid output, binary
    cross-entropy loss and accuracy metric.
    """
    # Keep the original contract: read the notebook globals unless overridden.
    if dropout_rate is None:
        dropout_rate = Dropout_num
    if lr is None:
        lr = learning_rate

    input_word_ids = Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")
    input_mask = Input(shape=(max_len,), dtype=tf.int32, name="input_mask")
    segment_ids = Input(shape=(max_len,), dtype=tf.int32, name="segment_ids")

    _, sequence_output = bert_layer([input_word_ids, input_mask, segment_ids])
    # Vector at position 0 of the sequence output (the [CLS] slot added by bert_encode).
    clf_output = sequence_output[:, 0, :]

    if dropout_rate == 0:
        # Without Dropout
        out = Dense(1, activation='sigmoid')(clf_output)
    else:
        # With Dropout. Referenced through tf.keras.layers because the notebook
        # never imports a bare `Dropout` name (this branch used to raise NameError).
        x = tf.keras.layers.Dropout(dropout_rate)(clf_output)
        out = Dense(1, activation='sigmoid')(x)

    model = Model(inputs=[input_word_ids, input_mask, segment_ids], outputs=out)
    # `learning_rate=` replaces the deprecated `lr=` keyword of Adam.
    model.compile(Adam(learning_rate=lr), loss='binary_crossentropy', metrics=['accuracy'])

    return model

In [None]:
# Load BERT from the Tensorflow Hub
# TF-Hub handle of the BERT module to load; trainable=True is passed so the layer can be fine-tuned.
module_url = "https://tfhub.dev/tensorflow/bert_en_uncased_L-24_H-1024_A-16/1"
bert_layer = hub.KerasLayer(module_url, trainable=True)

In [None]:
# Load tokenizer from the bert layer
# Build the tokenizer from the vocabulary file and lower-casing flag exposed by the loaded BERT module.
vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
tokenizer = tokenization.FullTokenizer(vocab_file, do_lower_case)

In [None]:
# Encode the text into tokens, masks, and segment flags
# Encode train and test text into (token ids, masks, segment ids), 160 positions per row.
# NOTE(review): `text_clean` has already been stemmed and stripped of stopwords by
# preprocess_text — confirm that feeding this (rather than raw text) to BERT is intended.
train_input = bert_encode(train_df.text_clean.values, tokenizer, max_len=160)
test_input = bert_encode(test_df.text_clean.values, tokenizer, max_len=160)
train_labels = train_df.target.values

In [None]:
# Training configuration.
# Dropout_num and learning_rate are read as globals inside build_model;
# valid, epochs_num and batch_size_num are used by the fit cell.
# random_state_split, target_corrected and target_big_corrected are not
# referenced by any cell shown in this notebook.
random_state_split = 2
Dropout_num = 0
learning_rate = 6e-6
valid = 0.2
epochs_num = 3
batch_size_num = 16
target_corrected = False
target_big_corrected = False

# Build BERT model with my tuning
model_BERT = build_model(bert_layer, max_len=160)
model_BERT.summary()

In [None]:
# Save the model to model_BERT.h5 only when val_loss improves (save_best_only=True).
checkpoint = ModelCheckpoint('model_BERT.h5', monitor='val_loss', save_best_only=True)

# Fine-tune, holding out a `valid` fraction of the training data for validation.
train_history = model_BERT.fit(
    train_input, train_labels,
    validation_split = valid,
    epochs = epochs_num, # recommended 3-5 epochs
    callbacks=[checkpoint],
    batch_size = batch_size_num
)

In [None]:
# Restore the checkpointed weights, predict on the test inputs and round the sigmoid outputs to integer labels.
model_BERT.load_weights('model_BERT.h5')
test_pred_BERT = model_BERT.predict(test_input)
test_pred_BERT_int = test_pred_BERT.round().astype('int')

In [None]:
# Predictions on the training inputs, rounded to integer labels (not used by any later cell shown here).
train_pred_BERT = model_BERT.predict(train_input)
train_pred_BERT_int = train_pred_BERT.round().astype('int')

In [None]:
# Submission frame: one row per test id with the rounded BERT prediction.
submit=pd.DataFrame()
submit['id'] = test_df['id'].tolist()
# The model has a single output unit, so predictions arrive as a 2-D array;
# flatten explicitly to one label per row instead of relying on pandas to squeeze it.
submit['target'] = test_pred_BERT_int.ravel()

In [None]:
# Write the submission file without the index column.
submit.to_csv('BERT_model_v3.csv',index=False)

In [None]:
# Sanity-check the first rows of the submission.
submit.head(3)