In [None]:
# !pip install -U spacy[cuda101]
!pip install transformers

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import re
import json
import time
import copy
import datetime
import pandas as pd
import numpy as np
from statistics import mean
from collections import defaultdict
from tqdm import tqdm_notebook
from uuid import uuid4

## Scipy
from scipy.sparse import csr_matrix, hstack

## sklearn
from sklearn import svm
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score, precision_score, recall_score
from sklearn.naive_bayes import MultinomialNB as MB
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

## Keras Modules
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential, Model
from keras.metrics import Precision, Recall
from keras.layers import Dense, LSTM, GRU, Bidirectional, Dropout, Reshape, Flatten, concatenate, Input, Conv1D, GlobalMaxPooling1D
from keras.layers.embeddings import Embedding
from keras.regularizers import l1, l2, l1_l2
from keras import backend as K

import tensorflow as tf

## Torch Modules
import torch
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
from torch.utils.data import Dataset, DataLoader, TensorDataset

## PyTorch Transformer
from transformers import RobertaModel, RobertaTokenizer
from transformers import RobertaForSequenceClassification, RobertaConfig, AdamW, get_linear_schedule_with_warmup

import spacy

# spacy.prefer_gpu()

import nltk
from nltk.stem import WordNetLemmatizer as wn_lemmatizer
from nltk.tokenize import word_tokenize, wordpunct_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.corpus import wordnet as wn

# Fetch the NLTK stop-word corpus and keep the English list; `preprocess`
# uses `stop_words` to filter lemmas.
nltk.download('stopwords')
stop_words = stopwords.words('english')
# Load spaCy's small English pipeline. The model has to be installed already
# (the download command below is left disabled).
# NOTE(review): `nlp` is not referenced by any cell visible here.
# !python -m spacy download en
nlp = spacy.load('en_core_web_sm')

This notebook compares classical machine-learning models and deep-learning models on their ability to classify text, using the Kaggle `nlp-getting-started` tweet dataset.

In [None]:
## Check if Cuda is Available
# Prints True when PyTorch can see a CUDA device; the RoBERTa section selects
# its device with the same check.
print(torch.cuda.is_available())

In [None]:
# Single seed shared by the data split, the CV folds and the sklearn models.
RANDOM_STATE = 1234

# Seed the global RNGs of the numeric / deep-learning libraries too, so that
# weight initialisation, dropout and DataLoader shuffling are repeatable on a
# fresh "Restart & Run All".
np.random.seed(RANDOM_STATE)
torch.manual_seed(RANDOM_STATE)
tf.random.set_seed(RANDOM_STATE)

# Register `progress_apply` on pandas objects (used by the preprocessing cell).
tqdm_notebook().pandas()

# 0. Utility Functions

In [None]:
def stratified_cv(classifier, X_train, y_train, **kwargs):
    """
    Run stratified k-fold cross-validation and print the mean of each metric.

    Each fold keeps the class proportions of `y_train`. A deep copy of
    `classifier` is fitted per fold, so the object passed in is never modified.

    Parameters
    ----------
    classifier : estimator exposing `fit(X, y)` and `predict(X)`.
    X_train : feature matrix that supports indexing with an integer index array.
    y_train : labels exposing `.iloc` (e.g. a pandas Series).
    **kwargs : forwarded unchanged to `StratifiedKFold`.

    Returns
    -------
    None. The averaged accuracy, precision, recall and F1 are printed.
    """
    # (report line, metric function) in the order the report is printed.
    report_spec = (
        ("Average Accuracy: {:.4f}", accuracy_score),
        ("Average Precision: {:.4f}", precision_score),
        ("Average Recall: {:.4f}", recall_score),
        ("Average F1: {:.4f}", f1_score),
    )
    per_fold = [[] for _ in report_spec]

    splitter = StratifiedKFold(**kwargs)
    for fit_rows, held_out_rows in splitter.split(X_train, y_train):
        fold_model = copy.deepcopy(classifier)
        fold_model.fit(X_train[fit_rows], y_train.iloc[fit_rows])

        held_out_truth = y_train.iloc[held_out_rows]
        held_out_pred = fold_model.predict(X_train[held_out_rows])
        for scores, (_, metric) in zip(per_fold, report_spec):
            scores.append(metric(held_out_truth, held_out_pred))

    for scores, (template, _) in zip(per_fold, report_spec):
        print(template.format(mean(scores)))

In [None]:
def clean_text(text):
    """
    Strip mis-encoded characters, URLs, HTML tags and digits from `text`.

    The substitutions are applied strictly in the order listed: the specific
    repairs (e.g. restoring "China's") must run before the generic rule that
    deletes the same byte sequence.

    Returns the cleaned string. Each run of digits is replaced by one space.
    """
    # Mis-encoded sequences seen in the tweets, as (regex, replacement) pairs.
    encoding_repairs = (
        (r"\x89Û_", ""),
        (r"\x89ÛÒ", ""),
        (r"\x89ÛÓ", ""),
        (r"\x89ÛÏWhen", "When"),
        (r"\x89ÛÏ", ""),
        (r"China\x89Ûªs", "China's"),
        (r"let\x89Ûªs", "let's"),
        (r"\x89Û÷", ""),
        (r"\x89Ûª", ""),
        (r"\x89Û\x9d", ""),
        (r"å_", ""),
        (r"\x89Û¢", ""),
        (r"\x89Û¢åÊ", ""),
        (r"fromåÊwounds", "from wounds"),
        (r"åÊ", ""),
        (r"åÈ", ""),
        (r"JapÌ_n", "Japan"),
        (r"Ì©", "e"),
        (r"å¨", ""),
        (r"SuruÌ¤", "Suruc"),
        (r"åÇ", ""),
        (r"å£3million", "3 million"),
        (r"åÀ", ""),
    )
    # Generic clean-up: URL links, then HTML tags, then numbers.
    generic_rules = (
        (r'https?://\S+|www\.\S+', ''),
        (r'<.*?>', ''),
        (r'[\d]+', ' '),
    )

    for pattern, replacement in encoding_repairs + generic_rules:
        text = re.sub(pattern, replacement, text)
    return text

def preprocess(text, allowed_tags=["N", "J", "R", "V"]):
    """
    Lower-case, tokenize, POS-filter and lemmatize `text`, then drop stop words.

    Parameters
    ----------
    text : str to process.
    allowed_tags : first letters of the POS tags to keep. A token is kept only
        when the first letter of its tag is in this collection.

    Returns
    -------
    str : the surviving lemmas joined by single spaces. Lemmas that are in
    `stop_words` or are not purely alphabetic are removed.
    """
    # First letter of the POS tag -> WordNet POS constant; anything else is
    # lemmatized as a noun.
    pos_to_wordnet = {'N': wn.NOUN, 'J': wn.ADJ, 'R': wn.ADV, 'V': wn.VERB}
    lemmatizer = wn_lemmatizer()

    kept = []
    for token, tag in pos_tag(wordpunct_tokenize(text.lower())):
        tag_initial = tag[0]
        if tag_initial not in allowed_tags:
            continue
        lemma = lemmatizer.lemmatize(token, pos_to_wordnet.get(tag_initial, wn.NOUN))
        if lemma.isalpha() and lemma not in stop_words:
            kept.append(lemma)

    return ' '.join(kept)

In [None]:
def array_equal(array_1, array_2):
    """
    Compare two 1D numpy arrays position by position, treating NaN as equal to NaN.

    Parameters
    ----------
    array_1, array_2 : 1D numpy arrays (any dtype, including object arrays
        that mix strings and NaN).

    Returns
    -------
    bool : True when both arrays have the same shape and every pair of
    elements is identical, equal, or both NaN. The comparison is
    order-sensitive.
    """
    if array_1.shape != array_2.shape:
        return False

    for left, right in zip(array_1, array_2):
        if left is right or left == right:
            continue
        # NaN is the only value that differs from itself. Checking this instead
        # of relying on object identity also covers float-dtype arrays, where
        # every element access creates a new NaN object.
        if left != left and right != right:
            continue
        return False
    return True
    

# 1. Data Cleaning

In [None]:
# Load the labelled training set and the unlabelled test set from the Kaggle
# input mount of the `nlp-getting-started` competition.
data = pd.read_csv("/kaggle/input/nlp-getting-started/train.csv")
test = pd.read_csv("/kaggle/input/nlp-getting-started/test.csv")

# Preview the first rows of the training frame.
data.head()

In [None]:
# Sanity check: train and test must expose the same keywords, because each
# keyword is later pivoted into its own feature column.
# NOTE(review): `array_equal` compares position by position, so this also
# requires the unique keywords to appear in the same order in both files.
assert array_equal(data["keyword"].unique(), test["keyword"].unique()), "Keywords in train and test data are not equal"

In [None]:
# Inspect the first 20 rows of the test frame.
test.head(20)

In [None]:
# NOTE(review): this cell overwrites `data` and `test` in place and removes the
# `keyword` column, so it cannot be re-run without reloading the CSVs first.

# remove controversial tweets: texts that appear with more than one distinct label
unique_targets = data.groupby('text').agg(unique_target=('target', pd.Series.nunique))
controversial_tweets = unique_targets[unique_targets['unique_target'] > 1].index

data = data[~data['text'].isin(controversial_tweets)]

# remove duplicate rows, keeping the first occurrence of each text
data = data.drop_duplicates(subset='text', keep='first')

# remove special characters, url, html tags and digits (see `clean_text`)
# NOTE(review): cleaning runs after de-duplication, so texts that differed only
# in the removed parts can become identical again.
data['text'] = data['text'].apply(clean_text) 
test['text'] = test['text'].apply(clean_text)

# convert the keywords to feature maps: `value` is the cell content of the pivot,
# and missing keywords become the literal string "nan" so they get a column too
data['value'] = 1
data['keyword'] = data['keyword'].fillna("nan")

test['value'] = 1
test['keyword'] = test['keyword'].fillna("nan")

# One column per keyword; 1 where the row carries that keyword, 0 elsewhere.
# After reset_index the leading columns are id, text, target (train) and id, text (test).
data = pd.pivot_table(data, values="value", index=["id", "text", "target"], columns=["keyword"]).fillna(0).reset_index()
test = pd.pivot_table(test, values="value", index=["id", "text"], columns=["keyword"]).fillna(0).reset_index()

In [None]:
# (rows, columns) of the pivoted training frame.
data.shape

In [None]:
# (rows, columns) of the pivoted test frame.
test.shape

In [None]:
# Split the labelled data into a training set and a validation set.
# TRAIN_VAL is the fraction of rows sampled into `train`; the remaining rows
# form `val`. Both frames get a fresh 0..n-1 index.
TRAIN_VAL = 0.8
train = data.sample(frac=TRAIN_VAL, random_state=RANDOM_STATE)
val = data.drop(train.index).reset_index(drop=True)
train = train.reset_index(drop=True)

In [None]:
# Build lemmatized, stop-word-free copies of each split. `assign` returns a new
# frame, so `train`, `val` and `test` keep their cleaned but unlemmatized text.
train_pruned = train.assign(text=train['text'].progress_apply(preprocess))

val_pruned = val.assign(text=val['text'].progress_apply(preprocess))

test_pruned = test.assign(text=test['text'].progress_apply(preprocess))

## 1.1 Tf-idf matrix

In [None]:
# Stack train, validation and test rows (in that order) into one frame without
# the label column. The row order matters: the TF-IDF matrix built from
# `all_text` is later sliced back into the three splits by position.
all_text = pd.concat([train_pruned.drop(['target'], axis=1), val_pruned.drop(['target'], axis=1), test_pruned], ignore_index=True)

In [None]:
# exploring uni-gram and bi-gram features 
# Raw n-gram counts first, then TF-IDF re-weighting of those counts.
# NOTE(review): the vocabulary and IDF weights are fitted on train + validation
# + test text together, so the validation text is not strictly unseen.
cv = CountVectorizer(ngram_range=(1,2))
tfidf_transformer = TfidfTransformer()
counts = cv.fit_transform(all_text['text'])
tfidf = tfidf_transformer.fit_transform(counts)

In [None]:
# Append the keyword indicator columns to the TF-IDF matrix.
# `all_text` no longer has `target`, so its keyword columns start at position 2.
# Selecting them by dropping the two non-feature columns by name avoids the
# positional `iloc[:, 3:]`, which silently discarded the first keyword column.
augmented = hstack([tfidf, csr_matrix(all_text.drop(columns=["id", "text"]))]).tocsr()

In [None]:
# Slice the stacked feature matrices back into train / validation / test.
# Rows were concatenated in that order, so each split is a contiguous row range.
train_size = train_pruned.shape[0]
val_size = val_pruned.shape[0]

# rows [0, train_size)
X_train_tfidf = tfidf[:train_size,:]
X_train_augmented = augmented[:train_size,:]

# rows [train_size, train_size + val_size)
X_val_tfidf = tfidf[train_size: val_size + train_size,:]
X_val_augmented = augmented[train_size: val_size + train_size,:]

# all remaining rows
X_test_tfidf = tfidf[val_size + train_size:,:]
X_test_augmented = augmented[val_size + train_size:,:]

In [None]:
# Labels aligned row-for-row with the X_train_* / X_val_* matrices.
y_train_tfidf = train_pruned['target']
y_val_tfidf = val_pruned['target']

## 1.2 Keyword feature map

In [None]:
# Keyword-only feature matrices. The column offset differs per frame: train and
# validation skip id, text and target (3 columns); test has no target and skips
# only id and text (2 columns).
X_train_keyword = csr_matrix(train_pruned.iloc[:, 3:]).tocsr()
X_val_keyword = csr_matrix(val_pruned.iloc[:, 3:]).tocsr()
X_test_keyword = csr_matrix(test_pruned.iloc[:, 2:]).tocsr()

# 2. Naive Bayes

In [None]:
# Multinomial Naive Bayes on the TF-IDF features only.
mb_clf = MB().fit(X_train_tfidf, y_train_tfidf)

# In-sample report: scored on the same rows the model was fitted on.
pred = mb_clf.predict(X_train_tfidf)
print(classification_report(y_true=y_train_tfidf, y_pred=pred))

# Cross-validated estimate on shuffled stratified folds.
stratified_cv(mb_clf, X_train_tfidf, y_train_tfidf, shuffle=True, random_state=RANDOM_STATE)

In [None]:
# Multinomial Naive Bayes on TF-IDF plus keyword indicator features.
mb_clf_augmented = MB().fit(X_train_augmented, y_train_tfidf)

# In-sample report: scored on the same rows the model was fitted on.
pred = mb_clf_augmented.predict(X_train_augmented)
print(classification_report(y_true=y_train_tfidf, y_pred=pred))

# Cross-validated estimate on shuffled stratified folds.
stratified_cv(mb_clf_augmented, X_train_augmented, y_train_tfidf, shuffle=True, random_state=RANDOM_STATE)

In [None]:
# Multinomial Naive Bayes on the keyword indicator features only.
mb_clf_keyword = MB().fit(X_train_keyword, y_train_tfidf)

# In-sample report: scored on the same rows the model was fitted on.
pred = mb_clf_keyword.predict(X_train_keyword)
print(classification_report(y_true=y_train_tfidf, y_pred=pred))

# Cross-validated estimate on shuffled stratified folds.
stratified_cv(mb_clf_keyword, X_train_keyword, y_train_tfidf, shuffle=True, random_state=RANDOM_STATE)

In [None]:
# TODO: try sub-sampling features to prevent overfitting. 

In [None]:
# test_pruned['target'] = mb_clf_augmented.predict(X_test_augmented)
# test_pruned[['id', 'target']].to_csv("submission_2.csv", index=False)

# 3. SVM

In [None]:
# Linear-kernel SVM on the TF-IDF features only.
svm_clf = svm.SVC(kernel='linear', C=1.0, degree=3, gamma='auto').fit(X_train_tfidf, y_train_tfidf)

# In-sample report: scored on the same rows the model was fitted on.
pred = svm_clf.predict(X_train_tfidf)
print(classification_report(y_true=y_train_tfidf, y_pred=pred))

# Cross-validated estimate on shuffled stratified folds.
stratified_cv(svm_clf, X_train_tfidf, y_train_tfidf, shuffle=True, random_state=RANDOM_STATE)

In [None]:
# Linear-kernel SVM on TF-IDF plus keyword indicator features.
svm_clf_augmented = svm.SVC(kernel='linear', C=1.0, degree=3, gamma='auto').fit(X_train_augmented, y_train_tfidf)

# In-sample report: scored on the same rows the model was fitted on.
pred = svm_clf_augmented.predict(X_train_augmented)
print(classification_report(y_true=y_train_tfidf, y_pred=pred))

# Cross-validated estimate on shuffled stratified folds.
stratified_cv(svm_clf_augmented, X_train_augmented, y_train_tfidf, shuffle=True, random_state=RANDOM_STATE)

In [None]:
# Linear-kernel SVM on the keyword indicator features only. This is the model
# whose predictions are later used as a stacking feature.
svm_clf_keyword = svm.SVC(kernel='linear', C=1.0, degree=3, gamma='auto').fit(X_train_keyword, y_train_tfidf)

# In-sample report: scored on the same rows the model was fitted on.
pred = svm_clf_keyword.predict(X_train_keyword)
print(classification_report(y_true=y_train_tfidf, y_pred=pred))

# Cross-validated estimate on shuffled stratified folds.
stratified_cv(svm_clf_keyword, X_train_keyword, y_train_tfidf, shuffle=True, random_state=RANDOM_STATE)

In [None]:
# test_pruned['target'] = svm_clf_augmented.predict(X_test_augmented)
# test_pruned[['id', 'target']].to_csv("submission_svm.csv", index=False)

In [None]:
# Hard label predictions of the keyword-only SVM on every split; the stacking
# section uses them as the `svm_predicted` feature.
svm_train_preds = svm_clf_keyword.predict(X_train_keyword)
svm_val_preds = svm_clf_keyword.predict(X_val_keyword)
svm_test_preds = svm_clf_keyword.predict(X_test_keyword)

# 4. Random Forest

In [None]:
# Random forest (100 trees) on the TF-IDF features only.
rf_clf = RandomForestClassifier(random_state=RANDOM_STATE, n_estimators=100).fit(X_train_tfidf, y_train_tfidf)

# In-sample report: scored on the same rows the model was fitted on.
rf_preds = rf_clf.predict(X_train_tfidf)
print(classification_report(y_true=y_train_tfidf, y_pred=rf_preds))

# Cross-validated estimate on shuffled stratified folds.
stratified_cv(rf_clf, X_train_tfidf, y_train_tfidf, shuffle=True, random_state=RANDOM_STATE)

In [None]:
# Random forest (100 trees) on TF-IDF plus keyword indicator features.
rf_clf_augmented = RandomForestClassifier(n_estimators = 100, random_state = RANDOM_STATE)
rf_clf_augmented.fit(X_train_augmented, y_train_tfidf)
rf_preds_augmented = rf_clf_augmented.predict(X_train_augmented)

# Report the augmented model's own in-sample predictions. Previously this
# printed `rf_preds`, i.e. the TF-IDF-only forest from the preceding cell.
print(classification_report(y_train_tfidf, rf_preds_augmented))

# Cross-validated estimate on shuffled stratified folds.
stratified_cv(rf_clf_augmented, X_train_augmented, y_train_tfidf, random_state=RANDOM_STATE, shuffle=True)

In [None]:
# Hard label predictions of the augmented random forest on every split.
# The stacking section only references them in commented-out lines.
rf_train_preds = rf_clf_augmented.predict(X_train_augmented)
rf_val_preds = rf_clf_augmented.predict(X_val_augmented)
rf_test_preds = rf_clf_augmented.predict(X_test_augmented)

# 5. Logistic Regression

In [None]:
# L1-regularised logistic regression (liblinear solver) on the TF-IDF features only.
lr_clf = LogisticRegression(
    penalty='l1',
    solver='liblinear',
    max_iter=200,
    fit_intercept=True,
    random_state=RANDOM_STATE,
).fit(X_train_tfidf, y_train_tfidf)

# In-sample report: scored on the same rows the model was fitted on.
lr_preds = lr_clf.predict(X_train_tfidf)
print(classification_report(y_true=y_train_tfidf, y_pred=lr_preds))

# Cross-validated estimate on shuffled stratified folds.
stratified_cv(lr_clf, X_train_tfidf, y_train_tfidf, shuffle=True, random_state=RANDOM_STATE)

In [None]:
# Same logistic regression, fitted on TF-IDF plus keyword indicator features.
# Note: this rebinds `lr_clf` and `lr_preds`, replacing the TF-IDF-only model
# from the previous cell.
lr_clf = LogisticRegression(
    penalty='l1',
    solver='liblinear',
    max_iter=200,
    fit_intercept=True,
    random_state=RANDOM_STATE,
).fit(X_train_augmented, y_train_tfidf)

# In-sample report: scored on the same rows the model was fitted on.
lr_preds = lr_clf.predict(X_train_augmented)
print(classification_report(y_true=y_train_tfidf, y_pred=lr_preds))

# Cross-validated estimate on shuffled stratified folds.
stratified_cv(lr_clf, X_train_augmented, y_train_tfidf, shuffle=True, random_state=RANDOM_STATE)

# 6. TextCNN

In [None]:
# prevent tensorflow from using GPU to save memory for roBERTa
# An empty device list hides every GPU from TensorFlow, so the Keras model runs on CPU.
tf.config.set_visible_devices([], 'GPU')
# Read the setting back. NOTE(review): `visible_devices` is not used by any
# cell visible here.
visible_devices = tf.config.get_visible_devices()

In [None]:
# Whitespace-split each preprocessed text into a token list (one list per row).
train_corpus = train_pruned['text'].str.split().tolist()
val_corpus = val_pruned['text'].str.split().tolist()
test_corpus = test_pruned['text'].str.split().tolist()

In [None]:
# All token lists from the three splits; the Keras tokenizer is fitted on this.
corpus = train_corpus + val_corpus + test_corpus

In [None]:
# Fit the Keras tokenizer on every split so each token seen anywhere gets an id.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(corpus)

# Mapping from each word to its integer index.
word_index = tokenizer.word_index

# Token lists -> integer id sequences, one per row.
train_sequences, val_sequences, test_sequences = (
    tokenizer.texts_to_sequences(token_lists)
    for token_lists in (train_corpus, val_corpus, test_corpus)
)

# Pad every split to the longest sequence found in any split.
train_max_length = max(map(len, train_sequences))
val_max_length = max(map(len, val_sequences))
test_max_length = max(map(len, test_sequences))
max_length = max(train_max_length, val_max_length, test_max_length)

# "post" padding appends the padding values after the real tokens.
X_train_pad, X_val_pad, X_test_pad = (
    pad_sequences(id_sequences, maxlen=max_length, padding="post")
    for id_sequences in (train_sequences, val_sequences, test_sequences)
)
y_train = train_pruned['target']
y_val = val_pruned['target']

vocab = np.array(list(word_index))
# One more than the number of known words; passed as the embedding's input size.
vocab_size = len(word_index) + 1

In [None]:
# (number of training rows, padded sequence length)
X_train_pad.shape

In [None]:
# Size of each learned word-embedding vector in the TextCNN.
EMBEDDING_DIM = 30

In [None]:
def ConvNet(max_sequence_length, num_words, embedding_dim, labels_index):
    """
    Build, compile and summarise a multi-kernel-size 1D CNN text classifier.

    Parameters
    ----------
    max_sequence_length : length of each padded input id sequence.
    num_words : input dimension of the embedding layer.
    embedding_dim : size of each embedding vector.
    labels_index : number of units in the sigmoid output layer.

    Returns
    -------
    The compiled Keras `Model` (binary cross-entropy loss, Adam optimizer,
    accuracy / precision / recall metrics). The model summary is printed.
    """
    embedder = Embedding(num_words, embedding_dim, input_length=max_sequence_length)

    token_ids = Input(shape=(max_sequence_length,), dtype='int32')
    embedded = embedder(token_ids)
    print()

    def conv_branch(kernel_width):
        # One parallel branch: convolution -> dropout -> max over the sequence axis.
        features = Conv1D(filters=30, kernel_size=kernel_width, activation='relu')(embedded)
        features = Dropout(0.2)(features)
        return GlobalMaxPooling1D()(features)

    # One branch per kernel width, concatenated along the feature axis.
    pooled_branches = [conv_branch(kernel_width) for kernel_width in [2, 3, 4, 5, 6]]
    hidden = concatenate(pooled_branches, axis=1)
    hidden = Dense(30, activation='relu')(hidden)
    hidden = Dropout(0.2)(hidden)
    output = Dense(labels_index,
                   activation='sigmoid',
                   kernel_regularizer=l1_l2(0.01, 0.01))(hidden)

    model = Model(token_ids, output)
    model.compile(optimizer='adam',
                  loss='binary_crossentropy',
                  metrics=['acc', Precision(), Recall()])
    model.summary()
    return model

In [None]:
# Build the TextCNN with a single sigmoid output unit (binary target).
cnn_model = ConvNet(max_length, vocab_size, EMBEDDING_DIM, 1)

In [None]:
# Train for 10 epochs in batches of 64 and score the held-out validation split
# after every epoch. `hist` keeps the History object returned by `fit`.
hist = cnn_model.fit(X_train_pad, 
                 y_train, 
                 epochs=10, 
                 batch_size=64, 
                 validation_data=(X_val_pad, y_val), 
                 verbose=2) 

In [None]:
# Sigmoid outputs for each split, plus hard 0/1 labels thresholded at 0.5.
cnn_test_preds = cnn_model.predict(X_test_pad)
cnn_test_preds_binary = [1 if score >= 0.5 else 0 for score in cnn_test_preds]

cnn_train_preds = cnn_model.predict(X_train_pad)
cnn_train_preds_binary = [1 if score >= 0.5 else 0 for score in cnn_train_preds]

cnn_val_preds = cnn_model.predict(X_val_pad)
cnn_val_preds_binary = [1 if score >= 0.5 else 0 for score in cnn_val_preds]

In [None]:
# test['target'] = cnn_test_preds_binary
# test[['id', 'target']].to_csv("submission_cnn.csv", index=False)

# 7. Pretrained Models

This section fine-tunes RoBERTa (`roberta-base`), a transformer that was pretrained on a combination of five English-language datasets totaling over 160 GB of text.

In [None]:
# Use the GPU for RoBERTa when one is available, otherwise fall back to CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("GPU is available: {}".format(torch.cuda.is_available()))

In [None]:
# Load the pretrained `roberta-base` tokenizer and sequence-classification model,
# and move the model to `device`.
# Note: this rebinds `tokenizer`, which previously held the Keras Tokenizer used
# by the TextCNN section; `prepare_features` reads this global.
tokenizer = RobertaTokenizer.from_pretrained('roberta-base') 
model = RobertaForSequenceClassification.from_pretrained('roberta-base').to(device)

In [None]:
def prepare_features(data_set, labels=None, max_seq_length = 200, 
                     zero_pad = True, include_special_tokens = True): 
    """
    Encode texts with the global `tokenizer` into stacked PyTorch tensors.

    Parameters
    ----------
    data_set : iterable of str, one text per row.
    labels : optional labels aligned with `data_set` (list, numpy array or
        pandas Series).
    max_seq_length : `max_length` passed to the tokenizer.
    zero_pad : passed as `pad_to_max_length`.
    include_special_tokens : passed as `add_special_tokens`.

    Returns
    -------
    (input_ids, attention_masks) or, when `labels` is given,
    (input_ids, attention_masks, labels). Each is a tensor with one row per text.
    """
    
    ## Tokenize input
    input_ids = []
    attention_masks = []
    
    for sent in data_set:
        # NOTE(review): `pad_to_max_length` is deprecated in newer transformers
        # releases in favour of `padding` / `truncation` - confirm against the
        # installed version before changing it.
        encoded_dict = tokenizer.encode_plus(
                    sent,                      # Sentence to encode.
                    add_special_tokens = include_special_tokens, # Add the tokenizer's start / end special tokens.
                    max_length = max_seq_length,           # Max length according to our text data.
                    pad_to_max_length = zero_pad, # Pad all sentences to max_length.
                    return_attention_mask = True,   # Construct attn. masks.
                    return_tensors = 'pt',     # Return pytorch tensors.
               )
    
        # Add the encoded sentence to the id list. 
        input_ids.append(encoded_dict['input_ids'])

        # And its attention mask (simply differentiates padding from non-padding).
        attention_masks.append(encoded_dict['attention_mask'])
    
    # Each encoding has a leading batch dimension of 1; concatenate along it.
    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)
    
    if labels is not None: 
        # Convert through numpy so the values are read by position. Passing a
        # pandas Series straight to `torch.tensor` can depend on the Series
        # index being 0..n-1.
        labels = torch.tensor(np.asarray(labels))
        return input_ids, attention_masks, labels
    else: 
        return input_ids, attention_masks

In [None]:
# Fine-tuning hyper-parameters for RoBERTa.
BATCH_SIZE = 32        # rows per DataLoader batch
LEARNING_RATE = 1e-05  # AdamW learning rate
EPSILON = 1e-8         # AdamW epsilon
MAX_EPOCHS = 5         # passes over the training loader

In [None]:
# Re-create the train / validation split from `data`, whose text is cleaned but
# not lemmatized. Same fraction and seed as the earlier split, so the same rows
# land in each set.
train_val = 0.8
train = data.sample(frac=train_val, random_state=RANDOM_STATE)
val = data.drop(train.index).reset_index(drop=True)
train = train.reset_index(drop=True)

# Encode every split into id / attention-mask tensors (plus labels where known).
train_input_ids, train_attention_masks, train_labels = prepare_features(train['text'], train['target'])
val_input_ids, val_attention_masks, val_labels = prepare_features(val['text'], val['target'])
test_input_ids, test_attention_masks = prepare_features(test['text'])

# Wrap the tensors so a DataLoader can batch them row-wise.
training_set = TensorDataset(train_input_ids, train_attention_masks, train_labels)
validation_set = TensorDataset(val_input_ids, val_attention_masks, val_labels)
test_set = TensorDataset(test_input_ids, test_attention_masks)

# Two loader configurations that differ only in `shuffle`.
loading_params = dict(batch_size=BATCH_SIZE, shuffle=True, drop_last=False, num_workers=1)
loading_params_no_shuffle = {**loading_params, 'shuffle': False}

# Shuffled loaders, used by the training loop.
training_loader = DataLoader(training_set, **loading_params)
validation_loader = DataLoader(validation_set, **loading_params)

# Ordered loaders: predictions come back in row order, as stacking needs.
training_loader_no_shuffle = DataLoader(training_set, **loading_params_no_shuffle)
validation_loader_no_shuffle = DataLoader(validation_set, **loading_params_no_shuffle)

# The test loader is never shuffled.
testing_loader = DataLoader(test_set, **loading_params_no_shuffle)

In [None]:
#https://www.kaggle.com/datafan07/disaster-tweets-nlp-eda-bert-with-transformers

# No explicit loss function is needed: the model returns its own loss when it
# is called with `labels`.
# loss_function = nn.CrossEntropyLoss()

# AdamW over all model parameters.
optimizer = AdamW(model.parameters(),
                  lr = LEARNING_RATE, # args.learning_rate
                  eps = EPSILON # args.adam_epsilon
                )

# number of training steps: one optimizer step per batch per epoch
total_steps = len(training_loader) * MAX_EPOCHS

# Create the learning rate scheduler: linear decay over `total_steps`, no warm-up.
scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps = 0, # Default value in run_glue.py
                                            num_training_steps = total_steps)

In [None]:
def format_time(elapsed):
    """
    Render a duration in seconds as a `datetime.timedelta` string.

    `elapsed` is rounded to whole seconds first. Typical output is
    'h:mm:ss' (e.g. '0:01:05'); durations of a day or more are prefixed with
    the day count by `timedelta`.
    """
    whole_seconds = int(round(elapsed))
    duration = datetime.timedelta(seconds=whole_seconds)
    return str(duration)

In [None]:
# model = model.train()

# Fine-tune RoBERTa for MAX_EPOCHS epochs and score the validation set after each one.
for epoch in tqdm_notebook(range(MAX_EPOCHS)):
    # start time for each epoch
    t0 = time.time()
    
    total_train_loss = 0
    
    # Training mode for the weight updates below.
    model.train()
    
    print("EPOCH -- {} / {}".format(epoch, MAX_EPOCHS))
    for step, batch in enumerate(training_loader):
        # Progress line every 30 batches.
        if step % 30 == 0 and not step == 0: 
            elapsed = format_time(time.time() - t0)
            print(' Batch {} of {}. Elapsed: {:}'.format(step, len(training_loader), elapsed))
            
        input_ids = batch[0].to(device).to(torch.int64)
        input_masks = batch[1].to(device).to(torch.int64)
        labels = batch[2].to(device).to(torch.int64)          
                  
        # Always clear any previously calculated gradients before performing a backward pass. PyTorch doesn't do this automatically because accumulating the gradients is 'convenient while training RNNs'. 
        model.zero_grad()
                  
        # Index the output instead of tuple-unpacking it: newer transformers
        # releases return a ModelOutput object rather than a plain tuple, and
        # integer indexing works for both. With labels, position 0 is the loss.
        outputs = model(input_ids, 
                        token_type_ids=None,
                        attention_mask=input_masks, 
                        labels=labels)
        loss = outputs[0]
        
        total_train_loss += loss.item()
        loss.backward()
                  
        # Clip the norm of the gradients to 1.0. This is to help 
        # prevent the 'exploding gradients' problem.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        
        # update parameters and move a step forward using the computed gradients          
        optimizer.step()
        scheduler.step()
        
    avg_train_loss = total_train_loss / len(training_loader)
    training_time = format_time(time.time() - t0)
            
    print('')
    print(' Average training loss: {0:.4f}'.format(avg_train_loss))
    print(' Training epoch took: {:}'.format(training_time))
    
    print('Running Validation')
                  
    # Evaluation mode for scoring the validation set.
    model.eval()
        
    # Logits and labels are collected batch by batch from the same loader, so
    # they stay aligned even though `validation_loader` shuffles.
    val_predictions = []
    val_labels = []
    for batch in validation_loader:
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)
        
        with torch.no_grad():
            outputs = model(b_input_ids, 
                            token_type_ids=None, 
                            attention_mask=b_input_mask,
                            labels=b_labels)
        # With labels supplied, position 1 holds the logits.
        logits = outputs[1]
            
        val_predictions.append(logits.detach().cpu().numpy())
        val_labels.append(b_labels.to('cpu').numpy())

    # Stack the per-batch arrays into one array per quantity.
    val_predictions = np.concatenate(val_predictions, axis=0)
    val_labels = np.concatenate(val_labels, axis=0)
    
    # Predicted class = index of the largest logit.
    val_predictions_flat = np.argmax(val_predictions, axis=1)
    print(val_predictions_flat.shape)
        
    val_accuracy = accuracy_score(val_labels, val_predictions_flat)
    val_recall = recall_score(val_labels, val_predictions_flat)
    val_precision = precision_score(val_labels, val_predictions_flat)
    val_f1 = f1_score(val_labels, val_predictions_flat)
    
    print('\tAccuracy: {0:.4f}\n\tRecall: {1:.4f}\n\tPrecision: {2:.4f}\n\tF1_score: {3:.4f}\n\t'.
          format(val_accuracy, val_recall, val_precision, val_f1))

In [None]:
def _collect_logits(loader):
    """Run `model` over `loader` without gradients and return every batch's
    logits stacked into one numpy array, in loader order."""
    batch_logits = []
    for batch in loader:
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)

        with torch.no_grad():
            # No labels are passed, so position 0 of the output is the logits.
            # Integer indexing works for both tuple and ModelOutput returns.
            outputs = model(b_input_ids,
                            token_type_ids=None,
                            attention_mask=b_input_mask)

        batch_logits.append(outputs[0].detach().cpu().numpy())
    return np.concatenate(batch_logits, axis=0)


model.eval()

# Predictions for the training, validation and test sets. The unshuffled loaders
# keep the rows in the same order as `train`, `val` and `test`.
roberta_train_preds = _collect_logits(training_loader_no_shuffle)
roberta_val_preds = _collect_logits(validation_loader_no_shuffle)
roberta_test_preds = _collect_logits(testing_loader)

# Hard labels: index of the largest logit per row.
roberta_train_preds_binary = np.argmax(roberta_train_preds, axis=1)
roberta_val_preds_binary = np.argmax(roberta_val_preds, axis=1)
roberta_test_preds_binary = np.argmax(roberta_test_preds, axis=1)

# convert the raw logits into dataframes (one column per class) for stacking
roberta_train_preds = pd.DataFrame(roberta_train_preds, columns=["roberta_fake","roberta_real"])
roberta_val_preds = pd.DataFrame(roberta_val_preds, columns=["roberta_fake","roberta_real"])
roberta_test_preds = pd.DataFrame(roberta_test_preds, columns=["roberta_fake","roberta_real"])

In [None]:
# test[['id', 'target']].to_csv("submission_roberta.csv", index=False)

# 8. Stacking Model

In [None]:
def _with_base_predictions(frame, svm_preds, roberta_logits):
    """Add the SVM label predictions as `svm_predicted` and append the RoBERTa
    logit columns side by side (aligned on the row index)."""
    frame["svm_predicted"] = svm_preds
    return pd.concat([frame, roberta_logits], axis=1)


# Stacking features: keyword-SVM labels plus the two RoBERTa logits. The other
# base-model outputs (rf_*_preds, cnn_*_preds_binary, roberta_*_preds_binary)
# are not used as features here.
train_stacking = _with_base_predictions(train.loc[:, ["target"]], svm_train_preds, roberta_train_preds)
X_train_stacking = train_stacking.drop(["target"], axis=1)
y_train_stacking = train_stacking["target"]

val_stacking = _with_base_predictions(val.loc[:, ["target"]], svm_val_preds, roberta_val_preds)
X_val_stacking = val_stacking.drop(["target"], axis=1)
y_val_stacking = val_stacking["target"]

# The test frame has no target, so it starts from an empty column selection.
test_stacking = _with_base_predictions(test.loc[:, []], svm_test_preds, roberta_test_preds)
X_test_stacking = test_stacking

In [None]:
# Display the stacking feature frame for the training rows.
X_train_stacking

In [None]:
# Linear-kernel SVM meta-learner, scored by F1 on the validation stacking features.
svm_clf_stacking = svm.SVC(kernel='linear', C=1.0, degree=3, gamma='auto').fit(X_train_stacking, y_train_stacking)

pred = svm_clf_stacking.predict(X_val_stacking)
print(f1_score(y_true=y_val_stacking, y_pred=pred))

In [None]:
# L1-regularised logistic-regression meta-learner; this is the model used for
# the final submission.
lr_clf_stacking = LogisticRegression(
    penalty='l1',
    solver='liblinear',
    max_iter=200,
    fit_intercept=True,
    random_state=RANDOM_STATE,
).fit(X_train_stacking, y_train_stacking)

# Accuracy on the rows the meta-learner was fitted on (in-sample).
lr_preds = lr_clf_stacking.predict(X_train_stacking)
print(accuracy_score(y_true=y_train_stacking, y_pred=lr_preds))

In [None]:
# Shallow random-forest meta-learner (100 trees, depth 2).
# Note: this rebinds `rf_clf` and `rf_preds` from the earlier Random Forest section.
rf_clf = RandomForestClassifier(max_depth=2, n_estimators=100, random_state=RANDOM_STATE).fit(X_train_stacking, y_train_stacking)

# Accuracy on the rows the meta-learner was fitted on (in-sample).
rf_preds = rf_clf.predict(X_train_stacking)
print(accuracy_score(y_true=y_train_stacking, y_pred=rf_preds))

In [None]:
# Final test-set labels from the logistic-regression meta-learner, stored as a
# new `target` column on `test`.
test["target"] = lr_clf_stacking.predict(X_test_stacking)

In [None]:
# Write the submission file with the two columns id and target, no index column.
test[['id', 'target']].to_csv("submission_stacking.csv", index=False)