# Introduction

**The linear-model part of this notebook is based on the tutorial notebook**

https://www.kaggle.com/philculliton/nlp-getting-started-tutorial

**The sections for RidgeClassifier and XGBClassifier do not contribute to the final score but they are alternative models for this problem**

**The BERT section contributes to the final score**

In [None]:
import os
import numpy as np
import pandas as pd
import nltk
import string
import re
from collections import Counter
from sklearn import feature_extraction, linear_model, model_selection, preprocessing, decomposition
import xgboost as xgb 
!pip install contractions
import IPython
import contractions
from datetime import datetime

In [None]:
# Load the competition's train and test CSV files from the Kaggle input directory.
train_df = pd.read_csv("/kaggle/input/nlp-getting-started/train.csv")
test_df = pd.read_csv("/kaggle/input/nlp-getting-started/test.csv")
# Row counts of the two splits, as a quick sanity check.
print(len(train_df), len(test_df))

# Text preprocessing for linear models

In [None]:
# Module-level NLP helpers shared by clean_text().
lemmatizer = nltk.stem.WordNetLemmatizer()
stemmer = nltk.stem.PorterStemmer()
# Tokens to drop: NLTK's English stopwords followed by each ASCII punctuation character.
stop = [*nltk.corpus.stopwords.words("english"), *string.punctuation]
# Tweet tokenizer configured to lower-case text, strip @handles and reduce lengthened words.
twt = nltk.tokenize.TweetTokenizer(preserve_case=False, strip_handles=True, reduce_len=True)
# print(stop)

def clean_text(df, col='text', normalize='lemmatize', stopwords=True, add_keyword=False, fill_empty='NULL', drop_dupe=False, shuffle=False):
    """Clean the tweets in ``df[col]`` and store the result in a ``clean_text`` column.

    Each text is lower-cased, stripped of whitespace-separated words containing
    "http", has contractions expanded, digits and '#' removed, and is then
    tokenized with the module-level ``twt`` tokenizer, optionally filtered
    against ``stop``, and optionally lemmatized or stemmed.

    Args:
        df: Input frame. It is not modified; a copy is returned.
        col: Name of the column holding the raw text.
        normalize: 'lemmatize', 'stem', or anything else for no normalization.
        stopwords: When true, tokens found in the module-level ``stop`` list
            are removed.
        add_keyword: When true, the ``keyword`` column (with "%20" decoded to a
            space and missing values replaced by "") is appended to each text.
        fill_empty: Replacement for texts that are empty after cleaning.
            Pass ``False`` to leave them empty.
        drop_dupe: When true, rows with a duplicated ``col`` value are dropped.
        shuffle: When true, the returned frame's rows are randomly permuted.

    Returns:
        ``(pos, neg, df)`` where ``pos`` / ``neg`` are lists of token lists for
        rows whose ``target`` is 1 / 0, and ``df`` is the processed frame with
        the added ``clean_text`` column. Without a ``target`` column both
        lists are empty.
    """
    # De-duplicate BEFORE reading the text column: the texts and targets are
    # paired positionally below, so both must come from the same set of rows.
    if drop_dupe:
        df = df.drop_duplicates(col)
    # Work on a copy so that the caller's frame is never mutated.
    df = df.copy()

    if "target" in df.columns:
        targets = df["target"]
    else:
        # Unlabelled data: -1 matches neither the positive nor the negative class.
        targets = -np.ones(len(df))

    if add_keyword:
        df["keyword"] = df["keyword"].str.replace("%20", " ", regex=False).fillna("")
        text_series = df[col] + " " + df["keyword"]
    else:
        text_series = df[col]

    # Build the lookup sets once; set membership is O(1) per token.
    stop_tokens = set(stop) if stopwords else frozenset()
    junk_tokens = {"rt", "û_", "amp", "ûª", "ûªs", "ûò", "ûï", "ûó", "åè", "ìñ1", "\x89", "...", "..", "via"}

    cleaned_text, pos, neg = [], [], []
    for target, text in zip(targets, text_series):
        words = [word for word in text.lower().split(" ") if "http" not in word]
        text = " ".join(contractions.fix(word) for word in words).lower()
        text = re.sub(r'\d+|#', '', text)
        tokens = [
            token for token in twt.tokenize(text)
            if token not in stop_tokens and token not in junk_tokens
        ]
        if normalize == 'lemmatize':
            tokens = [lemmatizer.lemmatize(token) for token in tokens]
        elif normalize == 'stem':
            tokens = [stemmer.stem(token) for token in tokens]

        if target == 1:
            pos.append(tokens)
        elif target == 0:
            neg.append(tokens)

        # Decode any URL-encoded space as a substring, not as a whole-value match.
        joined = " ".join(tokens).replace("%20", " ")
        # Comparison against False is kept on purpose: any other value, including
        # None, is used as the filler, exactly as before.
        if not joined and fill_empty != False:  # noqa: E712
            joined = fill_empty
        cleaned_text.append(joined)

    df["clean_text"] = cleaned_text
    if shuffle:
        df = df.sample(frac=1)

    return pos, neg, df
        
# Clean both splits. The training split is additionally de-duplicated and shuffled.
pos_text, neg_text, train_df = clean_text(train_df, add_keyword=False, drop_dupe=True, shuffle=True)
_, _, test_df = clean_text(test_df, add_keyword=False)
# Flatten the per-tweet token lists into one flat token list per class.
pos_text = [token for tweet_tokens in pos_text for token in tweet_tokens]
neg_text = [token for tweet_tokens in neg_text for token in tweet_tokens]

In [None]:
# The 20 most frequent tokens of each class as (token, count) rows.
pos_common = pd.DataFrame(Counter(pos_text).most_common(20))
neg_common = pd.DataFrame(Counter(neg_text).most_common(20))
# Side by side: positive-class columns first, negative-class columns second.
pd.concat([pos_common, neg_common], axis=1)

In [None]:
# Rows whose cleaned text equals "NULL", the default fill_empty placeholder of clean_text().
display(train_df.loc[(train_df.clean_text == "NULL"), :])
display(test_df.loc[(test_df.clean_text == "NULL"), :])

In [None]:
# Eyeball ten randomly chosen rows from each split.
display(train_df.sample(frac=1).head(10))
display(test_df.sample(frac=1).head(10))

# Count and Vectorize approach (1-gram)

In [None]:
# Column that feeds the vectorizers.
feature_col = "clean_text"

count_vectorizer = feature_extraction.text.CountVectorizer()
count_vectorizer_sw = feature_extraction.text.CountVectorizer()
tfidf = feature_extraction.text.TfidfVectorizer()
# LSA: truncated SVD that reduces the TF-IDF matrix to 100 components.
LSA = decomposition.TruncatedSVD(n_components=100)

## Small demonstration on the first 50 rows only:
## raw "text" for count_vectorizer, cleaned text for the other vectorizers.
example_train_vectors = count_vectorizer.fit_transform(train_df["text"][0:50])
example_train_vectors_sw = count_vectorizer_sw.fit_transform(train_df[feature_col][0:50])
example_tfidf = tfidf.fit_transform(train_df[feature_col][0:50])
example_tfidf_lsa = LSA.fit_transform(example_tfidf)

In [None]:
## Compare the feature-vector shape of the first example under each representation.
## we use .todense() here because these vectors are "sparse" (only non-zero elements are kept to save space)
print('No cleaning')
print(example_train_vectors[0].todense().shape)
# print(example_train_vectors[0].todense())
print('Cleaned')
print(example_train_vectors_sw[0].todense().shape)
# print(example_train_vectors_sw[0].todense())
print('TF-IDF cleaned')
print(example_tfidf[0].todense().shape)
# print(example_tfidf[0].todense())
# The LSA output is indexed directly, without .todense().
print('TF-IDF + LSA cleaned')
print(example_tfidf_lsa[0].shape)
# print(example_tfidf_lsa[0])

In [None]:
# Re-fit every vectorizer on the full training split.
# NOTE(review): both count vectorizers are fit on the same column with the same
# settings here, so train_vectors and train_vectors_sw should be identical - confirm
# whether count_vectorizer was meant to use the raw "text" column as in the demo.
train_vectors = count_vectorizer.fit_transform(train_df[feature_col])
train_vectors_sw = count_vectorizer_sw.fit_transform(train_df[feature_col])
train_tfidf = tfidf.fit_transform(train_df[feature_col])

## note that we're NOT using .fit_transform() here. Using just .transform() makes sure
# that the tokens in the train vectors are the only ones mapped to the test vectors -
# i.e. that the train and test vectors use the same set of tokens.
test_vectors = count_vectorizer.transform(test_df[feature_col])
test_vectors_sw = count_vectorizer_sw.transform(test_df[feature_col])
test_tfidf = tfidf.transform(test_df[feature_col])

In [None]:
# Fit the SVD on the training TF-IDF matrix only, then project the test matrix with it.
train_tfidf_lsa = LSA.fit_transform(train_tfidf)
test_tfidf_lsa = LSA.transform(test_tfidf)

# **Linear Model: Ridge Classifier**

In [None]:
# clf = linear_model.RidgeClassifier(class_weight='balanced')
# ridge_params = {
#     "alpha": np.linspace(0, 2, 100),
#     "tol": np.linspace(1e-5, 1e-1, 2000)
# }
# ridge_rscv = model_selection.RandomizedSearchCV(clf, ridge_params, scoring=["f1", "precision", "recall"], refit="f1", cv=5, n_iter=100)
# ridge_rscv_lsa = model_selection.RandomizedSearchCV(clf, ridge_params, scoring=["f1", "precision", "recall"], refit="f1", cv=5, n_iter=100)

In [None]:
# search = ridge_rscv.fit(train_tfidf, train_df["target"])
# search_lsa = ridge_rscv_lsa.fit(train_tfidf_lsa, train_df["target"])
# IPython.display.clear_output()
# print("Best RidgeClassifier TF-IDF")
# print(search.best_score_)
# print(search.best_params_)
# print("Best RidgeClassifier TF-IDF LSA")
# print(search_lsa.best_score_)
# print(search_lsa.best_params_)

In [None]:
# scores_tfidf = model_selection.cross_validate(clf, train_tfidf, train_df["target"], cv=5, scoring=["f1", "precision", "recall"])

# scores_tfidf_lsa = model_selection.cross_validate(clf, train_tfidf_lsa, train_df["target"], cv=5, scoring=["f1", "precision", "recall"])

# print("RidgeClassifier TF-IDF F1:              ", scores_tfidf['test_f1'])
# print('RidgeClassifier TF-IDF & LSA F1:        ', scores_tfidf_lsa['test_f1'])
# print("RidgeClassifier TF-IDF Precision:       ", scores_tfidf['test_precision'])
# print('RidgeClassifier TF-IDF & LSA Precision: ', scores_tfidf_lsa['test_precision'])
# print('RidgeClassifier TF-IDF Recall:          ',  scores_tfidf['test_recall'])
# print('RidgeClassifier TF-IDF & LSA Recall:    ', scores_tfidf_lsa['test_recall'])

In [None]:
# scores_tfidf = model_selection.cross_validate(ridge_rscv.best_estimator_, train_tfidf, train_df["target"], cv=5, scoring=["f1", "precision", "recall"])

# scores_tfidf_lsa = model_selection.cross_validate(ridge_rscv_lsa.best_estimator_, train_tfidf_lsa, train_df["target"], cv=5, scoring=["f1", "precision", "recall"])

# print("Best RidgeClassifier TF-IDF F1:              ", scores_tfidf['test_f1'])
# print('Best RidgeClassifier TF-IDF & LSA F1:        ', scores_tfidf_lsa['test_f1'])
# print("Best RidgeClassifier TF-IDF Precision:       ", scores_tfidf['test_precision'])
# print('Best RidgeClassifier TF-IDF & LSA Precision: ', scores_tfidf_lsa['test_precision'])
# print('Best RidgeClassifier TF-IDF Recall:          ',  scores_tfidf['test_recall'])
# print('Best RidgeClassifier TF-IDF & LSA Recall:    ', scores_tfidf_lsa['test_recall'])

# **GBDT: XGB Classifier**
XGBoost turned out not to perform as well as the Ridge classifier.

In [None]:
# xgb_clf = xgb.XGBClassifier(random_state=765, tree_method='gpu_hist', predictor='gpu_predictor')
# xgb_params = {
#     "max_depth": [i for i in range(4, 14)],
#     "min_child_weight": np.linspace(0.25, 0.45, 100),
#     "gamma": np.linspace(0, 0.015, 1000),
#     "learning_rate": np.linspace(0.2, 0.5, 100),
# }
# xgb_rscv = model_selection.RandomizedSearchCV(xgb_clf, xgb_params, scoring=["f1", "precision", "recall"], refit="f1", cv=5, verbose=2)
# xgb_rscv_lsa = model_selection.RandomizedSearchCV(xgb_clf, xgb_params, scoring=["f1", "precision", "recall"], refit="f1", cv=5, verbose=2)

In [None]:
# search = xgb_rscv.fit(train_tfidf, train_df["target"])
# search_lsa = xgb_rscv_lsa.fit(train_tfidf_lsa, train_df["target"])
# IPython.display.clear_output()
# print("Best XGBClassifier TF-IDF")
# print(search.best_score_)
# print(search.best_params_)
# print("Best XGBClassifier TF-IDF LSA")
# print(search_lsa.best_score_)
# print(search_lsa.best_params_)

In [None]:
# scores_tfidf = model_selection.cross_validate(xgb_clf, train_tfidf, train_df["target"], cv=5, scoring=["f1", "precision", "recall"])
# scores_tfidf_lsa = model_selection.cross_validate(xgb_clf, train_tfidf_lsa, train_df["target"], cv=5, scoring=["f1", "precision", "recall"])

# print("XGBClassifier TF-IDF F1:              ", scores_tfidf['test_f1'])
# print('XGBClassifier TF-IDF & LSA F1:        ', scores_tfidf_lsa['test_f1'])
# print("XGBClassifier TF-IDF Precision:       ", scores_tfidf['test_precision'])
# print('XGBClassifier TF-IDF & LSA Precision: ', scores_tfidf_lsa['test_precision'])
# print('XGBClassifier TF-IDF Recall:          ',  scores_tfidf['test_recall'])
# print('XGBClassifier TF-IDF & LSA Recall:    ', scores_tfidf_lsa['test_recall'])

In [None]:
# scores_tfidf = model_selection.cross_validate(xgb_rscv.best_estimator_, train_tfidf, train_df["target"], cv=5, scoring=["f1", "precision", "recall"])
# scores_tfidf_lsa = model_selection.cross_validate(xgb_rscv_lsa.best_estimator_, train_tfidf_lsa, train_df["target"], cv=5, scoring=["f1", "precision", "recall"])

# print("Best XGBClassifier TF-IDF F1:              ", scores_tfidf['test_f1'])
# print('Best XGBClassifier TF-IDF & LSA F1:        ', scores_tfidf_lsa['test_f1'])
# print("Best XGBClassifier TF-IDF Precision:       ", scores_tfidf['test_precision'])
# print('Best XGBClassifier TF-IDF & LSA Precision: ', scores_tfidf_lsa['test_precision'])
# print('Best XGBClassifier TF-IDF Recall:          ',  scores_tfidf['test_recall'])
# print('Best XGBClassifier TF-IDF & LSA Recall:    ', scores_tfidf_lsa['test_recall'])

# **MAIN CONTENT: BERT**

**Note that this is not the best version; further fine-tuning and validation may help obtain a better score**

In [None]:
import torch
import torch.nn.functional as F
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import AdamW, get_linear_schedule_with_warmup

In [None]:
# Tokenizer matching the pretrained 'bert-base-uncased' checkpoint used for the model below.
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

Adding additional tokens for masking URLs and usernames in tweets

In [None]:
# Register the placeholder tokens that bert_tokenize() substitutes for URLs and @mentions.
bert_tokenizer.add_special_tokens({'additional_special_tokens': ['[LINK]', '[USER]']})
# Last expression of the cell: displays the tokenizer.
bert_tokenizer

As BERT is able to read complete passages and learn from the context, too much text preprocessing may not be beneficial.

Only minor preprocessing is applied to URLs, @usernames, and #hashtags, since they may otherwise be split into word pieces that carry no meaning.

*Note: The BERT model still performed quite well without the above processing*

Then tokenize the data

In [None]:
def bert_tokenize(df, tokenizer=bert_tokenizer, max_seq_len = 100):
    """Lightly mask the tweets in ``df['text']`` and encode them for BERT.

    Words containing "http" become "[LINK]", words containing "@" become
    "[USER]", newlines become spaces and every '#' character is removed.
    The masked text is stored in a new ``bert_text`` column of ``df``.

    Args:
        df: Frame with a ``text`` column. It is modified in place.
        tokenizer: Tokenizer providing ``encode_plus``.
        max_seq_len: Length passed to the tokenizer as ``max_length``.

    Returns:
        ``(input_sequences, attention_masks, df)``: per-row lists of token ids
        and attention masks, plus the same frame with ``bert_text`` added.
    """
    def _mask(word):
        # URLs take precedence over @mentions, as in a two-pass replacement.
        if "http" in word:
            return "[LINK]"
        if "@" in word:
            return "[USER]"
        return word

    masked_texts = []
    input_sequences = []
    # The attention mask marks the padded positions so the model does not attend to them.
    attention_masks = []

    # Keep the text as close to the original as possible; only mask and strip '#'.
    for raw in df['text']:
        pieces = raw.replace("\n", " ").split(" ")
        masked = " ".join(_mask(piece) for piece in pieces).replace('#', '')
        masked_texts.append(masked)

        encoded = tokenizer.encode_plus(masked, max_length=max_seq_len, pad_to_max_length=True)
        input_sequences.append(encoded['input_ids'])
        attention_masks.append(encoded['attention_mask'])

    df['bert_text'] = masked_texts
    return input_sequences, attention_masks, df

# Encode both splits. bert_tokenize() also adds a 'bert_text' column to each frame.
train_X, train_att, train_df = bert_tokenize(train_df)
# Labels as a NumPy array, in the same row order as train_X.
train_y = train_df['target'].values
test_X, test_att, test_df = bert_tokenize(test_df)

In [None]:
# Checking the tokenized format
print(train_X[0])
print(train_att[0])
print(test_X[0])
print(test_att[0])

Forming dataset

In [None]:
# Convert the Python lists / NumPy array produced above into torch tensors.
train_X = torch.tensor(train_X)
train_y = torch.tensor(train_y)
train_att = torch.tensor(train_att)
test_X = torch.tensor(test_X)
test_att = torch.tensor(test_att)

In [None]:
batch_size = 32
# Training batches are (input_ids, attention_mask, label) triples drawn in random order.
train_data = torch.utils.data.TensorDataset(train_X, train_att, train_y)
train_sampler = torch.utils.data.RandomSampler(train_data)
train_dataloader = torch.utils.data.DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

# Test batches carry no labels: (input_ids, attention_mask) pairs in their original order.
test_data = torch.utils.data.TensorDataset(test_X, test_att)
test_sampler = torch.utils.data.SequentialSampler(test_data)
test_dataloader = torch.utils.data.DataLoader(test_data, sampler=test_sampler, batch_size=batch_size)

**Pretrained model from bert-base-uncased**

resize_token_embeddings is required as we have added new special tokens

In [None]:
# Pretrained BERT with a two-class classification head.
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
# Resize the embedding matrix to the tokenizer's length, which now includes [LINK] and [USER].
model.resize_token_embeddings(len(bert_tokenizer))

In [None]:
# Use the first CUDA device when one is available, otherwise the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)
model.to(device)
# Clears this cell's output, which includes the device printed above.
IPython.display.clear_output()

In [None]:
# AdamW over all model parameters.
# NOTE(review): get_linear_schedule_with_warmup is imported but no scheduler is created here.
optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)
# Negative log-likelihood loss; train() and test() apply log_softmax to the logits before using it.
loss_fct = torch.nn.NLLLoss()

Define train and test functions

In [None]:
def train(epoch):
    """Run one optimisation pass over ``train_dataloader``.

    Uses the module-level ``model``, ``optimizer``, ``loss_fct`` and ``device``
    and prints a progress line every 20 batches.

    Args:
        epoch: Epoch number; only used in the progress message.
    """
    started = datetime.now()
    model.train()
    n_batches = len(train_dataloader)
    n_examples = len(train_dataloader.dataset)

    for step, batch in enumerate(train_dataloader, start=1):
        inputs, att_masks, labels = (t.to(device) for t in batch)
        model.zero_grad()

        # The first element of the model output is used as the class logits.
        model_out = model(inputs, attention_mask=att_masks)
        log_probs = F.log_softmax(model_out[0], dim=1)

        loss = loss_fct(log_probs.view(-1, 2), labels.view(-1))
        loss.backward()

        # Cap the global gradient norm at 1.0 before the parameter update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()

        if step % 20 != 0:
            continue
        print('Train Epoch: {} [{}/{} ({:.0%})] - Elapsed: {}  |  Loss: {:.4f}'.format(
            epoch, step * len(inputs), n_examples,
            step / n_batches, datetime.now() - started, loss.item()
        ))

In [None]:
def test(dataloader=None):
    """Evaluate ``model`` on labelled batches and print mean loss and accuracy.

    Args:
        dataloader: Iterable of ``(input_ids, attention_mask, labels)`` batches.
            Defaults to the module-level ``test_dataloader``. That loader is
            built without labels in this notebook, so a labelled validation
            loader must be passed explicitly for the call to succeed.

    Raises:
        ValueError: If a batch does not hold exactly three tensors, or if the
            dataloader yields no samples.
    """
    if dataloader is None:
        dataloader = test_dataloader

    t0 = datetime.now()
    model.eval()
    loss_sum, n_correct, n_samples = 0.0, 0, 0
    for batch in dataloader:
        if len(batch) != 3:
            # Fail with an explicit message instead of an opaque unpacking error.
            raise ValueError(
                "test() needs labelled batches of (input_ids, attention_mask, labels); "
                "got a batch with {} tensors".format(len(batch))
            )
        inputs, att_masks, labels = (t.to(device) for t in batch)
        with torch.no_grad():
            logits = model(inputs, attention_mask=att_masks)
            outputs = F.log_softmax(logits[0], dim=1)
            loss = loss_fct(outputs.view(-1, 2), labels.view(-1))

        batch_len = len(labels)
        # The loss is a per-batch mean; weight it by batch size so that a short
        # final batch does not skew the overall average.
        loss_sum += loss.item() * batch_len

        pred = np.argmax(outputs.cpu().numpy(), axis=1)
        # Plain NumPy comparison: accuracy_score was never imported in this notebook.
        n_correct += int((pred == labels.cpu().numpy()).sum())
        n_samples += batch_len

    if n_samples == 0:
        raise ValueError("test() received a dataloader without any samples")

    test_loss = loss_sum / n_samples
    test_acc = n_correct / n_samples
    print('\nTest set: Loss: {:.4f}, Accuracy: {:.1%} - Elapsed: {}\n'.format(
        test_loss, test_acc, datetime.now() - t0
    ))

In [None]:
# Number of passes over the training data.
nb_epoch = 2
for epoch in range(1, nb_epoch+1):
    train(epoch)
    # Evaluation is disabled: test_dataloader carries no labels.
#     test()

**Generating predictions for test data**

In [None]:
def predict(text):
    """Return the class probabilities the fine-tuned model assigns to one text.

    Args:
        text: A single tweet as a string. It is encoded as-is with
            ``bert_tokenizer``; no masking or cleaning is applied here.

    Returns:
        A 1-D tensor of softmax probabilities, one per class, on ``device``.
    """
    # Add a batch dimension of size 1 and move the ids to the model's device.
    input_ = torch.tensor(bert_tokenizer.encode(text)).unsqueeze(0).to(device)
    model.eval()
    # Inference only: skip autograd bookkeeping to save memory and time.
    with torch.no_grad():
        logits = model(input_ids=input_)[0]
    pred = F.softmax(logits, dim=1)[0]
    return pred

In [None]:
# Predict on the masked 'bert_text' column produced by bert_tokenize(), so the
# model sees the same [LINK] / [USER] substitutions it was trained on rather
# than the raw tweets.
predictions = []
for text in test_df.bert_text:
    prob = predict(text)
    # The index of the largest probability is the predicted class (0 or 1).
    pred = np.argmax(prob.cpu().detach().numpy())
    predictions.append(pred)

# Submission

In [None]:
# The competition's submission template; its "target" column is overwritten below.
sample_submission = pd.read_csv("/kaggle/input/nlp-getting-started/sample_submission.csv")

In [None]:
# Disabled: Ridge predictions on the training split, used by the error analysis further down.
# train_prediction = ridge_rscv.best_estimator_.predict(train_tfidf)
# train_df['pred_target'] = train_prediction

# Disabled alternative: submit the Ridge model found by the randomized search.
# ridge with rscv
# sample_submission["target"] = ridge_rscv.best_estimator_.predict(test_tfidf)

# Active: submit the BERT predictions, assumed to be in test_df row order.
# bert
sample_submission["target"] = predictions

In [None]:
# clean_text_wc = train_df.clean_text.str.count(' ').add(1)
# short_text_incorrect = train_df.loc[(clean_text_wc < 5) & (train_df.target != train_df.pred_target), :]
# (short_text_incorrect.target == 1).sum(), (short_text_incorrect.target == 0).sum()

In [None]:
# display(sample_submission.head(30))
# display(test_df['text'].head(30))
# Join the predictions with the test rows on "id" and show ten random rows for a manual check.
pd.merge(sample_submission, test_df, on=['id']).sample(frac=1).head(10)

In [None]:
# Write the submission file without the DataFrame index column.
sample_submission.to_csv("submission.csv", index=False)