In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily enumerate every file below the Kaggle input directory and print its full path.
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Importing Libraries

import numpy as np # for linear algebra
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
from tensorflow.keras import layers
from sklearn.model_selection import KFold,train_test_split,cross_validate, ShuffleSplit
import tensorflow_addons as tfa
from keras import backend as K
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,f1_score
import torch as torch
import transformers as ppb
import spacy


### **Outline** ###

1. Preprocessing Text
2. Transfer Learning From the Pretrained BERT Model
3. Recurrent Network with the Long Short-Term Memory Cells
4. Classifier for Keywords
5. Ensemble Learner
6. Predicting Test Data

In [None]:
# Importing Dataset
df_train = pd.read_csv("/kaggle/input/nlp-getting-started/train.csv")
df_test = pd.read_csv("/kaggle/input/nlp-getting-started/test.csv")

In [None]:
df_train.info()

In [None]:
df_train.target.sum() / df_train.target.count()

In [None]:
plt.style.use('fivethirtyeight')
sns.displot(data=df_train, x="target")

### **Clean Keyword** ###

In [None]:
def format_keyword(df):
    """Normalise the ``keyword`` column of ``df`` in place.

    Missing keywords are replaced by the placeholder ``"."`` and the
    URL-encoded space ``%20`` is turned into a real space.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``keyword`` column; it is modified in place.

    Returns
    -------
    None
    """
    cleaned = df["keyword"].fillna(".")
    # "%20" has no regex metacharacters, so a literal replacement is equivalent
    # and does not depend on the pandas default for ``regex``.
    df["keyword"] = cleaned.str.replace("%20", " ", regex=False)

In [None]:
format_keyword(df_train)

In [None]:
# Frequency of each keyword among tweets labelled 0.
is_negative = df_train["target"] == 0
df_train.loc[is_negative, "keyword"].value_counts()

In [None]:
format_keyword(df_test)

In [None]:
df_test.info()

### **Clean Text** ###

In [None]:
df_count = df_train.text.str.split().str.len()
max(df_count)

In [None]:
import re
import string

# ASCII punctuation to strip; string.punctuation is the same character set that
# was previously spelled out inline (without the invalid "\]" escape).
_PUNCTUATION_RE = re.compile('[%s]' % re.escape(string.punctuation))

# Emoji / pictograph ranges that are replaced by a single space.
# NOTE(review): the last range (U+24C2..U+1F251) is very broad and also covers
# many non-emoji code points (e.g. CJK ideographs); kept unchanged on purpose.
_EMOJI_RE = re.compile("["
                       u"\U0001F600-\U0001F64F"  # emoticons
                       u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                       u"\U0001F680-\U0001F6FF"  # transport & map symbols
                       u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                       u"\U00002702-\U000027B0"
                       u"\U000024C2-\U0001F251"
                       "]+")


def process_text(text):
    """Clean a raw tweet into lower-case, space-separated word tokens.

    Steps, in order: newlines become spaces, ``@mentions``, ``#hashtags`` and
    URLs are dropped, ASCII punctuation and digits are deleted, emoji are
    replaced by a space, the text is lower-cased and whitespace is collapsed.

    Parameters
    ----------
    text : str
        Raw tweet text.

    Returns
    -------
    str
        The cleaned tokens, each followed by a single space (so a non-empty
        result ends with a trailing space; an input with no tokens gives "").
    """
    # A newline separates words; deleting it outright would glue neighbours
    # together ("fire\nnow" -> "firenow") and let the \S+ patterns below
    # swallow the first word of the next line.
    text = text.replace("\n", " ")
    text = re.sub(r'@\S+', '', text)
    text = re.sub(r'#\S+', '', text)
    text = re.sub(r'https?://\S+|www\.\S+|http?://\S+', '', text)
    text = _PUNCTUATION_RE.sub('', text)
    text = re.sub(r'[0-9]', '', text)
    text = _EMOJI_RE.sub(' ', text)
    text = text.lower()
    # str.split() never yields whitespace-only tokens, so no filtering is needed.
    return "".join(token + " " for token in text.split())

In [None]:
sample_doc = "!!! @user @twi #topic \n\nHello,   UTC20150805 World http://t.com"
process_text(sample_doc)

In [None]:
# Run the same cleaning routine over the tweet text of both data sets.
for frame in (df_train, df_test):
    frame["text"] = frame["text"].map(process_text)

### **Duplicate Tweet With Ambiguous Target** ###

In [None]:
df_train["appears"]=df_train.groupby("text").text.transform("count")

In [None]:
# Per-text label statistics: a non-zero std means the same cleaned text occurs
# with conflicting targets. String aliases are used instead of np.std / np.mean
# so the result does not depend on pandas' deprecated callable-to-alias mapping.
targets_by_text = df_train.groupby("text")["target"]
df_train["target_std"] = targets_by_text.transform("std")
df_train["target_mean"] = targets_by_text.transform("mean")

In [None]:
duplicate_ids = df_train.loc[df_train.target_std>0].sort_values(by=["appears","text"],ascending=False).index

In [None]:
duplicate_ids

In [None]:
df_train = df_train.drop(index = duplicate_ids)

In [None]:
df_train = df_train.drop_duplicates(subset=["text"])

In [None]:
df_train.reset_index(drop=True,inplace=True)
df_train

In [None]:
nlp = spacy.load("en_core_web_lg")


def _keyword_vectors(keywords):
    """Return one document vector per entry of ``keywords`` as a 2-D array.

    The spaCy pipeline is run only once per distinct keyword; repeated
    keywords reuse the cached vector, so the output equals calling
    ``nlp(text).vector`` for every row.
    """
    cache = {}
    for keyword in keywords:
        if keyword not in cache:
            cache[keyword] = nlp(keyword).vector
    return np.array([cache[keyword] for keyword in keywords])


keyword_train = _keyword_vectors(df_train.keyword)
keyword_test = _keyword_vectors(df_test.keyword)

In [None]:
def nlp_vectors(text):
    """Return the list of token vectors of ``text``, skipping whitespace tokens.

    ``text`` is parsed with the module-level ``nlp`` pipeline; the result
    contains one ``token.vector`` per non-space token, in document order.
    """
    return [tok.vector for tok in nlp(text) if not tok.is_space]

def build_nlp_vectors(df_text, max_length=None):
    """Turn texts into a zero-padded 3-D array of token vectors.

    Parameters
    ----------
    df_text : iterable of str
        Texts to vectorise with ``nlp_vectors``.
    max_length : int, optional
        Number of time steps in the output. Defaults to the longest text in
        ``df_text`` (the previous behaviour). Pass the training length when
        vectorising other data so both arrays share the same padding; longer
        texts are truncated to ``max_length``.

    Returns
    -------
    numpy.ndarray
        float32 array of shape ``(len(df_text), max_length, vector_dim)``;
        positions past the end of a text are all zeros.
    """
    token_vectors = [nlp_vectors(text) for text in df_text]
    if max_length is None:
        max_length = max((len(doc) for doc in token_vectors), default=0)
    print(f"Maximum Length:{ max_length}")

    # Take the vector width from the data rather than hard-coding it; 300 is
    # only the fallback when no text produced any token.
    vector_dim = next((len(doc[0]) for doc in token_vectors if doc), 300)

    # Pre-allocating the padded array avoids appending Python lists of zeros
    # one by one and mixing int lists with float vectors.
    spacy_vectors = np.zeros((len(token_vectors), max_length, vector_dim), dtype=np.float32)
    for row, doc in enumerate(token_vectors):
        doc = doc[:max_length]
        if doc:
            spacy_vectors[row, :len(doc)] = doc
    print(f"Shape of spacy vector:{spacy_vectors.shape}")
    return spacy_vectors

In [None]:
nlp_train = build_nlp_vectors(df_train.text)

In [None]:
tokenizer = ppb.DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
bert_model = ppb.DistilBertModel.from_pretrained("distilbert-base-uncased")

In [None]:
def process_data(df_text, batch_size=64):
    """Encode texts with the module-level BERT model and return [CLS] features.

    Parameters
    ----------
    df_text : iterable of str
        Texts to encode.
    batch_size : int, default 64
        Number of texts sent through ``bert_model`` per forward pass. Batching
        keeps peak memory independent of the data-set size.

    Returns
    -------
    numpy.ndarray
        Array with one row per text holding the last hidden state of the first
        token position.

    Raises
    ------
    ValueError
        If ``df_text`` is empty.
    """
    token_ids = [tokenizer.encode(text, add_special_tokens=True) for text in df_text]
    if not token_ids:
        raise ValueError("process_data received no texts to encode")

    max_len = max(len(ids) for ids in token_ids)
    print(f"Max Length: {max_len}")

    # Right-pad every sequence with id 0 to the global maximum length and mark
    # the real (non-padding) positions in the attention mask.
    padded = np.zeros((len(token_ids), max_len), dtype=np.int64)
    attention_mask = np.zeros((len(token_ids), max_len), dtype=np.int64)
    for row, ids in enumerate(token_ids):
        padded[row, :len(ids)] = ids
        attention_mask[row, :len(ids)] = 1

    input_ids = torch.tensor(padded)
    attention_mask = torch.tensor(attention_mask)

    cls_features = []
    with torch.no_grad():
        for start in range(0, len(token_ids), batch_size):
            stop = start + batch_size
            last_hidden_states = bert_model(input_ids[start:stop],
                                            attention_mask=attention_mask[start:stop])
            # Keep only the hidden state at position 0 of each sequence.
            cls_features.append(last_hidden_states[0][:, 0, :].numpy())
    X = np.concatenate(cls_features, axis=0)
    print(X.shape)
    return X

In [None]:
X_train = process_data(df_train.text)

In [None]:
y_train = df_train.target

In [None]:
# One shared 75/25 split so every model sees the same rows. The seed makes the
# split (and therefore every reported hold-out metric) reproducible; it matches
# the random_state used for KFold in this notebook.
X_tr, X_val, nlp_tr, nlp_val, kw_tr, kw_val, y_tr, y_val = train_test_split(
    X_train, nlp_train, keyword_train, y_train,
    test_size=0.25, train_size=0.75, shuffle=True, random_state=1,
)

### **Transfer Learning Model** ###

In [None]:
def build_nn():
    """Build and compile the dense classifier that sits on top of the BERT features.

    Architecture: 768-dim input -> Dense(128, tanh) -> Dropout(0.6)
    -> Dense(32, tanh) -> Dropout(0.6) -> Dense(8, tanh) -> Dense(1, sigmoid).

    Returns
    -------
    tf.keras.Sequential
        Compiled model (binary cross-entropy, Adam with lr 1e-4, accuracy metric)
        whose output is a probability in [0, 1].
    """
    model = tf.keras.Sequential()
    model.add(layers.Input(shape=(768,)))
    model.add(layers.Dense(128,activation='tanh'))
    model.add(layers.Dropout(0.6))
    model.add(layers.Dense(32,activation='tanh'))
    model.add(layers.Dropout(0.6))
    model.add(layers.Dense(8,activation='tanh'))
    model.add(layers.Dense(1,activation='sigmoid'))
    # The last layer already applies a sigmoid, so the loss receives
    # probabilities, not logits; from_logits must therefore be False.
    model.compile(loss=tf.keras.losses.BinaryCrossentropy(from_logits=False),
                      optimizer=tf.keras.optimizers.Adam(1e-4),
                      metrics=['accuracy'])
    return model

### **kFold Cross Validation** ###

In [None]:
fold = 4
def plot_history(history):
    """Plot loss and accuracy curves for each cross-validation fold.

    Draws a 2 x n grid: the top row shows training/validation loss and the
    bottom row training/validation accuracy, one column per fold.

    Parameters
    ----------
    history : list or tuple of History, or a single History
        Preferably the list with one Keras ``History`` per fold. For backward
        compatibility with callers that pass only the last fold's ``History``,
        a non-sequence argument falls back to the module-level
        ``history_by_fold`` list when it exists, otherwise the single object
        is plotted on its own.
    """
    if isinstance(history, (list, tuple)):
        histories = list(history)
    else:
        # Legacy call style: the argument used to be ignored in favour of the
        # global list, so keep plotting all folds in that case.
        histories = globals().get("history_by_fold") or [history]

    n_folds = len(histories)
    if n_folds == 0:
        return

    # The grid is sized from the data instead of the mutable global ``fold``.
    plt.figure(figsize=(4*n_folds,4*2))
    for i, fold_history in enumerate(histories):
        plt.subplot(2,n_folds,i+1)
        plt.plot(fold_history.history["loss"])
        plt.plot(fold_history.history["val_loss"])
        plt.xlabel("Epochs")
        plt.ylabel("Loss")
        plt.legend(["Train","Validation"])

        plt.subplot(2,n_folds,n_folds+i+1)
        plt.plot(fold_history.history["accuracy"])
        plt.plot(fold_history.history["val_accuracy"])
        plt.xlabel("Epochs")
        plt.ylabel("Accuracy")
        plt.legend(["Train","Validation"])

In [None]:
kfold = KFold(n_splits=4, shuffle=True, random_state=1)

In [None]:
def eval_f1_score(X_val, y_val, model):
    """Return the F1 score of ``model`` on ``(X_val, y_val)``.

    Predicted probabilities strictly above 0.5 are counted as the positive class.
    """
    probabilities = model.predict(X_val)
    predicted_labels = probabilities > 0.5
    return f1_score(y_val, predicted_labels)

In [None]:
EPOCHS = 100
BATCH_SIZE = 64

In [None]:
# K-fold cross-validation of the dense head on the BERT features.
fold = 0
history_by_fold = []
cv_results = []
for train,val in kfold.split(X_train,y_train):
    # KFold yields positional indices; .iloc keeps the label lookup positional
    # regardless of the Series index.
    X_fold_tr, X_fold_val = X_train[train], X_train[val]
    y_fold_tr, y_fold_val = y_train.iloc[train], y_train.iloc[val]

    nn_model = build_nn()
    history = nn_model.fit(X_fold_tr,y_fold_tr,
                          validation_data=(X_fold_val,y_fold_val),
                          epochs=EPOCHS, batch_size=BATCH_SIZE, verbose=0)
    scores = nn_model.evaluate(X_fold_val,y_fold_val,verbose=0)
    print(f"-- Fold {fold} -- ")
    print(f"{nn_model.metrics_names[0]}: {scores[0]}")
    print(f"{nn_model.metrics_names[1]}: {scores[1]}")
    print(f"F1 Score: {eval_f1_score(X_fold_val,y_fold_val,nn_model)}")

    cv_results.append(scores[1])
    history_by_fold.append(history)
    fold+=1
# Mean and spread of the per-fold validation accuracy.
print(f"{np.mean(cv_results)} +/- {np.std(cv_results)}")
# Pass every fold's history, not just the last one.
plot_history(history_by_fold)

### **Training** ###

In [None]:
# Train one dense head on the hold-out split and report its validation metrics.
nn_model = build_nn()
history = nn_model.fit(
    X_tr, y_tr,
    validation_data=(X_val, y_val),
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    verbose=0,
)
scores = nn_model.evaluate(X_val, y_val, verbose=0)
holdout_accuracy = scores[1]  # index 0 is the loss, index 1 the accuracy metric
holdout_f1 = eval_f1_score(X_val, y_val, nn_model)
print(f"Accuracy: {holdout_accuracy}")
print(f"F1 Score: {holdout_f1}")

In [None]:
# Side-by-side learning curves (left: loss, right: accuracy) for the last fit.
plt.figure(figsize=(12,6))
curve_panels = (
    ("loss", "val_loss", "Loss"),
    ("accuracy", "val_accuracy", "Accuracy"),
)
for position, (train_key, val_key, y_label) in enumerate(curve_panels, start=1):
    plt.subplot(1,2,position)
    plt.plot(history.history[train_key])
    plt.plot(history.history[val_key])
    plt.legend(["Train","Validation"])
    plt.xlabel("Epochs")
    plt.ylabel(y_label)

### **Recurrent Network with Long Short-Term Memory Cells** ###

In [None]:
def build_LSTM():
    """Build and compile the recurrent classifier over spaCy token vectors.

    Architecture: variable-length sequence of 300-dim vectors -> LSTM(16)
    -> Dense(8, tanh) -> Dense(8, tanh) -> Dense(1, sigmoid).

    Returns
    -------
    tf.keras.Sequential
        Compiled model (binary cross-entropy, Adam with lr 5e-5, accuracy metric)
        whose output is a probability in [0, 1].
    """
    lstm_model = tf.keras.Sequential()
    lstm_model.add(layers.Input(shape=(None,300)))
    # NOTE(review): zero-padded time steps are fed to the LSTM unmasked;
    # a Masking layer may be worth evaluating - confirm intended.
    lstm_model.add(layers.LSTM(16))
    lstm_model.add(layers.Dense(8, activation="tanh"))
    lstm_model.add(layers.Dense(8, activation="tanh"))
    lstm_model.add(layers.Dense(1,activation="sigmoid"))

    # The output layer already applies a sigmoid, so the loss is given
    # probabilities rather than logits.
    lstm_model.compile(loss=tf.keras.losses.BinaryCrossentropy(from_logits=False),
                      optimizer=tf.keras.optimizers.Adam(5e-5),
                      metrics=['accuracy'])
    return lstm_model


In [None]:
EPOCHS =  30;
BATCH_SIZE = 64;

In [None]:
kfold = KFold(n_splits=4, shuffle=True, random_state=1)

In [None]:
# K-fold cross-validation of the LSTM on the padded spaCy token vectors.
# ``fold`` ends up equal to the number of folds and is reused for plotting.
fold = 0
history_by_fold = []
cv_results = []
for train, val in kfold.split(nlp_train,y_train):
    # KFold yields positional indices; .iloc keeps the label lookup positional
    # regardless of the Series index.
    nlp_fold_tr, nlp_fold_val = nlp_train[train], nlp_train[val]
    y_fold_tr, y_fold_val = y_train.iloc[train], y_train.iloc[val]

    lstm_model = build_LSTM()
    history = lstm_model.fit(nlp_fold_tr,y_fold_tr,
                            validation_data=(nlp_fold_val,y_fold_val),
                            epochs=EPOCHS,batch_size=BATCH_SIZE,verbose=0)
    scores = lstm_model.evaluate(nlp_fold_val,y_fold_val,verbose=0)

    print(f"-- Fold{fold} --")
    print(f"{lstm_model.metrics_names[0]}: {scores[0]}")
    print(f"{lstm_model.metrics_names[1]}: {scores[1]}")
    print(f"F1 Score: {eval_f1_score(nlp_fold_val,y_fold_val,lstm_model)}")

    cv_results.append(scores[1])
    history_by_fold.append(history)
    fold+=1
# Mean and spread of the per-fold validation accuracy.
print(f"{np.mean(cv_results)} +/- {np.std(cv_results)}")

In [None]:
# Per-fold loss/accuracy curves for the LSTM; reuses the helper defined for the
# dense model instead of repeating its plotting code.
plot_history(history_by_fold)

In [None]:
lstm_model = build_LSTM()
history = lstm_model.fit(nlp_tr,y_tr,validation_data=(nlp_val,y_val), epochs=EPOCHS, batch_size=BATCH_SIZE)

In [None]:
valid_predict  = (lstm_model.predict(nlp_val) > 0.5)
f1 = f1_score(y_val, valid_predict)
print(f" F1 Score: {f1}")

In [None]:

# Side-by-side learning curves (left: loss, right: accuracy) for the LSTM fit.
plt.figure(figsize=(12,6))
lstm_panels = (
    ("loss", "val_loss", "Loss"),
    ("accuracy", "val_accuracy", "Accuracy"),
)
for position, (train_key, val_key, y_label) in enumerate(lstm_panels, start=1):
    plt.subplot(1,2,position)
    plt.plot(history.history[train_key])
    plt.plot(history.history[val_key])
    plt.legend(["Train","Validation"])
    plt.xlabel("Epochs")
    plt.ylabel(y_label)

### **Classification With Keywords** ###

In [None]:
# Logistic regression on the keyword vectors alone, scored on the hold-out split.
lr_keywords = LogisticRegression(max_iter=500)
lr_keywords.fit(kw_tr,y_tr)
val_pred = lr_keywords.predict(kw_val)
print(f"Accuracy: {accuracy_score(y_val, val_pred)}")
print(f"F1 score: {f1_score(y_val,val_pred)}")

### **Ensemble Learning** ###

In [None]:
# Positive-class probabilities of the three base models on the training split.
# NOTE(review): these are in-sample predictions of models fitted on the same
# rows; out-of-fold predictions would give the meta-learner a less optimistic
# signal - confirm whether that matters here.
nn_tr_predict = nn_model.predict(X_tr)
lstm_tr_predict = lstm_model.predict(nlp_tr)
kw_tr_predict = lr_keywords.predict_proba(kw_tr)[:, 1].reshape(-1, 1)

# The same three probabilities on the validation split.
nn_val_predict = nn_model.predict(X_val)
lstm_val_predict = lstm_model.predict(nlp_val)
kw_val_predict = lr_keywords.predict_proba(kw_val)[:, 1].reshape(-1, 1)

# Column order (dense head, keyword model, LSTM) is what the meta-learner is
# trained on and must be reproduced at prediction time.
concat_tr = np.hstack([nn_tr_predict, kw_tr_predict, lstm_tr_predict])
concat_val = np.hstack([nn_val_predict, kw_val_predict, lstm_val_predict])

In [None]:
# Meta-learner: logistic regression over the three base-model probabilities.
lr = LogisticRegression()
lr.fit(concat_tr,y_tr)
val_pred = lr.predict(concat_val)
print(f"Accuracy: {accuracy_score(y_val, val_pred)}")
print(f"F1 score: {f1_score(y_val,val_pred)}")

## **Predicting Test Data** ##

### **Processing Test Data** ###

In [None]:
X_test = process_data(df_test.text)

In [None]:
nlp_test = build_nlp_vectors(df_test.text)

### **Model Prediction** ###

In [None]:
# Base-model probabilities for the test set. Keras returns (n, 1) arrays, so
# they are flattened before being stored as columns.
df_test["nn_predict"] = nn_model.predict(X_test).ravel()
df_test["lstm_predict"] = lstm_model.predict(nlp_test).ravel()
df_test["keyword_predict"] = lr_keywords.predict_proba(keyword_test)[:,1]
# Same column order as the features the meta-learner was fitted on.
features = ["nn_predict","keyword_predict","lstm_predict"]

test_features = df_test[features]
# The meta-learner was fitted on a plain array, so give it one here as well
# (avoids scikit-learn's feature-name mismatch warning).
predict = lr.predict(test_features.to_numpy())

In [None]:
output = pd.DataFrame({"id":df_test.id, "target":predict})
output.to_csv("submission.csv",index=False)
output