In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os


In [None]:
import re
import seaborn as sns

import matplotlib.pyplot as plt

from collections import defaultdict, Counter

from sklearn.feature_extraction.text import CountVectorizer

import nltk
from nltk.corpus import stopwords

from wordcloud import WordCloud 
from nltk.tokenize import word_tokenize

In [None]:
# Fetch the NLTK stop-word corpus (quiet: no progress output).
nltk.download('stopwords', quiet=True)

# Bind `stopwords` to the plain list of English stop words used by the EDA
# helpers. Reading it through `nltk.corpus` instead of calling `.words()` on
# the very name being reassigned keeps this cell safe to run more than once.
stopwords = nltk.corpus.stopwords.words('english')

sns.set(style="white", font_scale=1.2)
plt.rcParams["figure.figsize"] = [10,8]

# `pd.set_option` has to be *called*; assigning attributes on it (as this cell
# did before) changes no display option at all.
pd.set_option("display.max_columns", None)  # never hide columns
# Keep the row limit bounded: several later cells display whole frames.
pd.set_option("display.max_rows", 60)

In [None]:
# Read the labelled (train) and unlabelled (test) tweet tables.
train, test = (
    pd.read_csv("../input/nlp-getting-started/train.csv"),
    pd.read_csv("../input/nlp-getting-started/test.csv"),
)

In [None]:
# Preview the first rows of the training frame.
train.head()

In [None]:
# Shapes of both frames, followed by the test-to-train row ratio.
train.shape, test.shape, test.shape[0]/train.shape[0]

In [None]:
# Report the dimensions of the training frame ...
print('There are {} rows and {} columns in train'.format(train.shape[0],train.shape[1]))

# ... and of the test frame (this message used to say "train" as well).
print('There are {} rows and {} columns in test'.format(test.shape[0],test.shape[1]))


In [None]:
# Column dtypes and non-null counts of the training frame.
train.info()

In [None]:
# Number of missing values per column.
null_counts = pd.DataFrame({"Num_Null": train.isnull().sum()})

# Share of missing values relative to ALL rows. `train.count()` (used before)
# is the number of non-null entries, which overstates the percentage.
null_counts["Pct_Null"] = null_counts["Num_Null"] / len(train) * 100

null_counts

# Part 1: Exploratory Data Analysis (EDA)

In [None]:
# Frequency table of the `keyword` column (one row per distinct keyword).
keywords_vc = train["keyword"].value_counts().to_frame(name="Count")

# Horizontal bars for the 30 most frequent keywords.
top_keywords = keywords_vc.iloc[:30]
sns.barplot(x=top_keywords["Count"], y=top_keywords.index, orient='h')
plt.title("Top 30 Keywords")
plt.show()

In [None]:
# Number of distinct (non-null) keywords.
train["keyword"].nunique()

In [None]:
# Keyword frequencies split by label (target 1 vs. target 0).
disaster_keywords = train.loc[train["target"] == 1, "keyword"].value_counts()
nondisaster_keywords = train.loc[train["target"] == 0, "keyword"].value_counts()



In [None]:
# Side-by-side bar charts of the 30 most frequent keywords per class.
fig, ax = plt.subplots(1,2, figsize=(25,15))

keyword_panels = [
    (disaster_keywords, "Reds_d", "Top 30 Keywords - Disaster Tweets"),
    (nondisaster_keywords, "Blues_d", "Top 30 Keywords - Non-Disaster Tweets"),
]
for panel_ax, (counts, palette, title) in zip(ax, keyword_panels):
    top_counts = counts.iloc[:30]
    sns.barplot(y=top_counts.index, x=top_counts, orient='h', ax=panel_ax, palette=palette)
    panel_ax.set_title(title)
    panel_ax.set_xlabel("Keyword Frequency")

plt.tight_layout()
plt.show()

In [None]:
# Non-disaster tweets whose keyword contains "armageddon".
has_armageddon_keyword = train["keyword"].fillna("").str.contains("armageddon")
is_non_disaster = train["target"] == 0
armageddon_tweets = train[has_armageddon_keyword & is_non_disaster]

# Row 10, column position 3 — presumably the "text" column; TODO confirm.
print("An example tweet:\n", armageddon_tweets.iloc[10, 3])

armageddon_tweets.head()

In [None]:
def keyword_disaster_probabilities(x):
    """Return the share of tweets containing keyword `x` that are labelled 1.

    Reads the notebook-level `train` frame. A tweet counts as "having" the
    keyword when `x` occurs as a substring of its `keyword` value (missing
    keywords are treated as empty strings).

    Args:
        x: keyword text, matched literally (not as a regular expression).

    Returns:
        Number of matching tweets with `target == 1` divided by the number of
        matching tweets.
    """
    # Build the keyword mask once; `regex=False` keeps characters such as
    # "%" or "+" in a keyword from being interpreted as a pattern.
    has_keyword = train["keyword"].fillna("").str.contains(x, regex=False)
    # Parenthesised on purpose: `&` binds tighter than `==`, so the former
    # `mask & train["target"] == 1` compared the AND-result with 1.
    is_disaster = (train["target"] == 1)

    tweets_w_keyword = np.sum(has_keyword)
    tweets_w_keyword_disaster = np.sum(has_keyword & is_disaster)
    return tweets_w_keyword_disaster / tweets_w_keyword

# One probability per keyword, in the row order of `keywords_vc`.
keywords_vc["Disaster_Probability"] = [
    keyword_disaster_probabilities(keyword) for keyword in keywords_vc.index
]
keywords_vc.head()

In [None]:

# The 10 keywords with the highest disaster probability.
keywords_vc.sort_values(by="Disaster_Probability", ascending=False).head(10)

In [None]:
# The 10 keywords with the lowest disaster probability.
keywords_vc.sort_values(by="Disaster_Probability").head(10)

In [None]:
# Frequency of each distinct `location` value.
locations_vc = train["location"].value_counts()

# Horizontal bars for the 30 most frequent locations.
top_locations = locations_vc.iloc[:30]
sns.barplot(x=top_locations, y=top_locations.index, orient='h')
plt.title("Top 30 Locations")
plt.show()

In [None]:
# Number of distinct (non-null) locations.
train["location"].nunique()

In [None]:
# Location frequencies split by label (target 1 vs. target 0).
disaster_locations = train.loc[train["target"] == 1]["location"].value_counts()

nondisaster_locations = train.loc[train["target"] == 0]["location"].value_counts()



# Side-by-side bar charts of the 30 most frequent locations per class.
fig, ax = plt.subplots(1,2, figsize=(20,8))
sns.barplot(y=disaster_locations[0:30].index, x=disaster_locations[0:30], orient='h', ax=ax[0], palette="Reds_d")
sns.barplot(y=nondisaster_locations[0:30].index, x=nondisaster_locations[0:30], orient='h', ax=ax[1], palette="Blues_d")

# The x-axis counts locations; it was previously labelled "Keyword Frequency".
ax[0].set_title("Top 30 Locations - Disaster Tweets")
ax[0].set_xlabel("Location Frequency")
ax[1].set_title("Top 30 Locations - Non-Disaster Tweets")
ax[1].set_xlabel("Location Frequency")
plt.tight_layout()
plt.show()

In [None]:
# Character count of every tweet.
train["tweet_length"] = train["text"].map(len)

# Density histogram of tweet lengths.
length_ax = sns.distplot(train["tweet_length"])
length_ax.set_title("Histogram of Tweet Length")
length_ax.set_xlabel("Number of Characters")
length_ax.set_ylabel("Density")
plt.show()

In [None]:
# Shortest and longest tweet, in characters.
train["tweet_length"].min(), train["tweet_length"].max()

In [None]:
# One tweet-length distribution per target value, side by side.
g = sns.FacetGrid(train, col="target", height=5).map(sns.distplot, "tweet_length")
plt.suptitle("Distribution Tweet Length")
plt.show()

In [None]:
def count_words(x):
    """Return how many whitespace-separated tokens the string `x` contains."""
    tokens = x.split()
    return len(tokens)

# Word count of every tweet.
train["num_words"] = train["text"].map(count_words)

# Density histogram of the word counts (10 bins).
words_ax = sns.distplot(train["num_words"], bins=10)
words_ax.set_title("Histogram of Number of Words per Tweet")
words_ax.set_xlabel("Number of Words")
words_ax.set_ylabel("Density")
plt.show()

In [None]:
# One word-count distribution per target value, side by side.
g = sns.FacetGrid(train, col="target", height=5).map(sns.distplot, "num_words")
plt.suptitle("Distribution Number of Words")
plt.show()

In [None]:
def avg_word_length(x):
    """Return the mean character length of the whitespace-separated words in `x`."""
    words = x.split()
    total_chars = np.sum([len(word) for word in words])
    return total_chars / len(words)

# Mean word length of every tweet.
train["avg_word_length"] = train["text"].map(avg_word_length)

# Density histogram of the mean word lengths.
avg_len_ax = sns.distplot(train["avg_word_length"])
avg_len_ax.set_title("Histogram of Average Word Length")
avg_len_ax.set_xlabel("Average Word Length")
avg_len_ax.set_ylabel("Density")
plt.show()

In [None]:
# One average-word-length distribution per target value, side by side.
g = sns.FacetGrid(train, col="target", height=5).map(sns.distplot, "avg_word_length")

# Common Stopwords

In [None]:
def create_corpus(target):
    """Return every whitespace-separated token of all tweets labelled `target`.

    Reads the notebook-level `train` frame; tokens keep their original order
    and duplicates are preserved.
    """
    token_lists = train.loc[train["target"] == target, "text"].str.split()
    return [token for tokens in token_lists for token in tokens]

def create_corpus_dict(target):
    """Return `(stop_word, count)` pairs for tweets labelled `target`.

    Only tokens found in the notebook-level `stopwords` collection are
    counted; the pairs are ordered by descending count.
    """
    stop_word_counts = Counter(
        token for token in create_corpus(target) if token in stopwords
    )
    # most_common() sorts the items by count, highest first.
    return stop_word_counts.most_common()

In [None]:
# Stop-word frequencies per class. Target 1 is the disaster class and target 0
# the non-disaster class (as in the keyword/location cells); the two calls
# were previously swapped, so each panel showed the other class.
corpus_disaster_dict = create_corpus_dict(1)
corpus_non_disaster_dict = create_corpus_dict(0)

# Split the (word, count) pairs into parallel word / count sequences.
disaster_x, disaster_y = zip(*corpus_disaster_dict)
non_disaster_x, non_disaster_y = zip(*corpus_non_disaster_dict)

fig, ax = plt.subplots(1,2, figsize=(20,8))
sns.barplot(y=list(disaster_x)[0:30], x=list(disaster_y)[0:30], orient='h', palette="Reds_d", ax=ax[0])
sns.barplot(y=list(non_disaster_x)[0:30], x=list(non_disaster_y)[0:30], orient='h', palette="Blues_d", ax=ax[1]) 

ax[0].set_title("Top 30 Stop Words - Disaster Tweets")
ax[0].set_xlabel("Stop Word Frequency")
ax[1].set_title("Top 30 Stop Words - Non-Disaster Tweets")
ax[1].set_xlabel("Stop Word Frequency")
plt.tight_layout()
plt.show()

# Common non-stopwords

In [None]:
# Token frequencies per class (target 1 = disaster, target 0 = non-disaster).
corpus_disaster = create_corpus(1)
corpus_non_disaster = create_corpus(0)
counter_disaster = Counter(corpus_disaster)
counter_non_disaster = Counter(corpus_non_disaster)


def _top_non_stopwords(word_counts, limit=15, window=100):
    """Return (words, counts) of the first `limit` non-stop-words found among
    the `window` most common entries of `word_counts`."""
    kept = [
        (word, count)
        for word, count in word_counts.most_common()[0:window]
        if word not in stopwords
    ][:limit]
    return [word for word, _ in kept], [count for _, count in kept]


x_disaster, y_disaster = _top_non_stopwords(counter_disaster)
x_non_disaster, y_non_disaster = _top_non_stopwords(counter_non_disaster)

fig, ax = plt.subplots(1,2, figsize=(20,8))
non_stopword_panels = [
    (x_disaster, y_disaster, "Reds_d", "Top 15 Non-Stopwords - Disaster Tweets"),
    (x_non_disaster, y_non_disaster, "Blues_d", "Top 15 Non-Stopwords - Non-Disaster Tweets"),
]
for panel_ax, (words, counts, palette, title) in zip(ax, non_stopword_panels):
    sns.barplot(x=counts, y=words, orient='h', palette=palette, ax=panel_ax)
    panel_ax.set_title(title)
    panel_ax.set_xlabel("Word Frequency")
plt.tight_layout()
plt.show()

# Common Bigrams

In [None]:
def bigrams(target):
    """Return `(bigram, count)` pairs for tweets labelled `target`.

    Bigrams are extracted with `CountVectorizer(ngram_range=(2, 2))` from the
    notebook-level `train` frame; pairs are ordered by descending count.
    """
    texts = train[train["target"] == target]["text"]
    vectorizer = CountVectorizer(ngram_range=(2, 2))
    doc_term_counts = vectorizer.fit_transform(texts)
    # Column sums: one total per vocabulary entry, stored in a 1 x V matrix.
    totals = doc_term_counts.sum(axis=0)
    words_freq = [
        (gram, totals[0, column])
        for gram, column in vectorizer.vocabulary_.items()
    ]
    words_freq.sort(key=lambda pair: pair[1], reverse=True)
    return words_freq

In [None]:
# The 15 most frequent bigrams of each class.
bigrams_disaster, bigrams_non_disaster = bigrams(1)[:15], bigrams(0)[:15]

# Unzip the (bigram, count) pairs into parallel lists.
x_disaster, y_disaster = (list(column) for column in zip(*bigrams_disaster))
x_non_disaster, y_non_disaster = (list(column) for column in zip(*bigrams_non_disaster))

fig, ax = plt.subplots(1,2, figsize=(20,8))
bigram_panels = [
    (x_disaster, y_disaster, "Reds_d", "Top 15 Bigrams - Disaster Tweets"),
    (x_non_disaster, y_non_disaster, "Blues_d", "Top 15 Bigrams - Non-Disaster Tweets"),
]
for panel_ax, (grams, counts, palette, title) in zip(ax, bigram_panels):
    sns.barplot(x=counts, y=grams, orient='h', palette=palette, ax=panel_ax)
    panel_ax.set_title(title)
    panel_ax.set_xlabel("Word Frequency")
plt.tight_layout()
plt.show()

# Target distribution

In [None]:
# Relative frequency of each target value.
target_vc = train["target"].value_counts(normalize=True)
share_non_disaster, share_disaster = target_vc[0], target_vc[1]
print("Not Disaster: {:.2%}, Disaster: {:.2%}".format(share_non_disaster, share_disaster))

balance_ax = sns.barplot(x=target_vc.index, y=target_vc)
balance_ax.set_title("Histogram of Disaster vs. Non-Disaster")
balance_ax.set_xlabel("0 = Non-Disaster, 1 = Disaster")
plt.show()

In [None]:
# Display the training frame, including the feature columns added above.
train

In [None]:
from nltk.corpus import stopwords

#function for removing pattern
def remove_pattern(input_txt, pattern):
    """Return `input_txt` with every match of the regex `pattern` removed.

    Args:
        input_txt: text to clean.
        pattern: regular expression whose matches are deleted.

    Returns:
        The cleaned string.
    """
    # Substitute with the pattern itself. The previous version re-used each
    # *matched text* as a new regex, which raised on metacharacters such as
    # "(" and also stripped a short match out of longer ones
    # (e.g. "#fire" out of "#firefighter").
    return re.sub(pattern, '', input_txt)

In [None]:
# remove '#' handle
# Remove hashtags ("#" plus the word characters that follow it) into a new
# `tweet` column. Raw strings avoid the invalid "\w" escape warning that the
# plain string literal triggers on recent Python versions.
train['tweet'] = np.vectorize(remove_pattern)(train['text'], r"#[\w]*")
test['tweet'] = np.vectorize(remove_pattern)(test['text'], r"#[\w]*")
train.head()

In [None]:
#Delete everything except alphabet
# Replace every character that is not a letter or "#" with a space.
# `regex=True` is required: since pandas 2.0 `str.replace` treats the pattern
# as a literal string by default, which would leave the text unchanged.
train['tweet'] = train['tweet'].str.replace("[^a-zA-Z#]", " ", regex=True)

test['tweet'] = test['tweet'].str.replace("[^a-zA-Z#]", " ", regex=True)

train.head()

In [None]:
# Keep only words longer than 3 characters (words of length 3 or less are dropped)
for frame in (train, test):
    frame['tweet'] = frame['tweet'].map(
        lambda text: ' '.join(word for word in text.split() if len(word) > 3)
    )
train.head()


In [None]:
#convert all the words into lower case
# Lower-case the cleaned text of both frames.
for frame in (train, test):
    frame['tweet'] = frame['tweet'].str.lower()

In [None]:
# English stop words as a set, used for membership tests by `remove_stops`.
stops = set(stopwords.words('english'))

In [None]:
# tokens of words  
# Split every cleaned tweet into a list of word tokens.
train['tokenized_sents'] = train['tweet'].map(nltk.word_tokenize)
test['tokenized_sents'] = test['tweet'].map(nltk.word_tokenize)


In [None]:
#function to remove stop words
def remove_stops(row):
    """Return the tokens of `row['tokenized_sents']` that are not in `stops`."""
    return [token for token in row['tokenized_sents'] if token not in stops]

In [None]:
#removing stop words
# Filter stop words out of the token lists, then discard the intermediate
# `tweet` / `tokenized_sents` columns from both frames.
for frame in (train, test):
    frame['clean_tweet'] = frame.apply(remove_stops, axis=1)
    frame.drop(["tweet","tokenized_sents"], axis = 1, inplace = True)


In [None]:
#re-join the words after tokenization
def rejoin_words(row):
    """Return the tokens in `row['clean_tweet']` joined by single spaces."""
    return " ".join(row['clean_tweet'])



# Turn the token lists back into space-separated strings.
for frame in (train, test):
    frame['clean_tweet'] = frame.apply(rejoin_words, axis=1)

train.head()

# Visualization of all the words using word cloud

In [None]:
# Word cloud built from the cleaned text of every training tweet.
all_word = ' '.join(train['clean_tweet'])
cloud_options = dict(width=800, height=500, random_state=21, max_font_size=110)
wordcloud = WordCloud(**cloud_options).generate(all_word)

plt.figure(figsize=(15, 12))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis('off')
plt.show()

# Visualization of all the words which signify real disaster

In [None]:
# Word cloud built from the cleaned text of tweets labelled target == 1.
normal_words = ' '.join(train.loc[train['target'] == 1, 'clean_tweet'])
wordcloud = WordCloud(
    width=800, height=500, random_state=21, max_font_size=110
).generate(normal_words)

plt.figure(figsize=(15, 12))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis('off')
plt.show()

# Visualization of all the words that appear in non-disaster tweets
# 

In [None]:
# Word cloud built from the cleaned text of tweets labelled target == 0.
normal_words = ' '.join(train.loc[train['target'] == 0, 'clean_tweet'])
wordcloud = WordCloud(
    width=800, height=500, random_state=21, max_font_size=110
).generate(normal_words)

plt.figure(figsize=(10, 7))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis('off')
plt.show()

Import more libraries

In [None]:
import gc
import time
import math
import random
import warnings

In [None]:
import matplotlib.pyplot as plt

from datetime import date
from transformers import *
from sklearn.metrics import *
from tqdm.notebook import tqdm

import torch
import torch.nn as nn
import torch.utils.data
import torch.nn.functional as F

import warnings
warnings.filterwarnings("ignore")

In [None]:
import string
import folium
from colorama import Fore, Back, Style, init


In [None]:
import scipy as sp
import networkx as nx
from pandas import Timestamp

from PIL import Image
from IPython.display import SVG
from keras.utils import model_to_dot

import requests
from IPython.display import HTML

In [None]:
from tqdm import tqdm
import matplotlib.cm as cm
import matplotlib.pyplot as plt

tqdm.pandas()

In [None]:
import plotly.express as px
import plotly.graph_objects as go
import plotly.figure_factory as ff
from plotly.subplots import make_subplots

import transformers
import tensorflow as tf

In [None]:
from tensorflow.keras.callbacks import Callback
from sklearn.metrics import accuracy_score, roc_auc_score
from tensorflow.keras.callbacks import ModelCheckpoint, ReduceLROnPlateau, CSVLogger


In [None]:
#Tf 
from tensorflow.keras.models import Model
from kaggle_datasets import KaggleDatasets
from tensorflow.keras.optimizers import Adam
from tokenizers import BertWordPieceTokenizer
from tensorflow.keras.layers import Dense, Input, Dropout, Embedding
from tensorflow.keras.layers import LSTM, GRU, Conv1D, SpatialDropout1D


In [None]:
from tensorflow.keras import layers
from tensorflow.keras import optimizers
from tensorflow.keras import activations
from tensorflow.keras import constraints
from tensorflow.keras import initializers
from tensorflow.keras import regularizers

import tensorflow.keras.backend as K
from tensorflow.keras.layers import *
from tensorflow.keras.optimizers import *
from tensorflow.keras.activations import *
from tensorflow.keras.constraints import *
from tensorflow.keras.initializers import *
from tensorflow.keras.regularizers import *


In [None]:
from sklearn import metrics
from sklearn.utils import shuffle
from gensim.models import Word2Vec
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer,HashingVectorizer
from sklearn.model_selection import train_test_split

#Sklearn models-ML classification
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression

from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier


In [None]:
#NLP 
from nltk.stem.wordnet import WordNetLemmatizer 
from nltk.tokenize import word_tokenize
from nltk.tokenize import TweetTokenizer  

import nltk
from textblob import TextBlob


from nltk.corpus import wordnet
from nltk.corpus import stopwords
from nltk import WordNetLemmatizer
from nltk.stem import WordNetLemmatizer,PorterStemmer
from wordcloud import WordCloud, STOPWORDS
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.tokenize import sent_tokenize, word_tokenize


In [None]:
# Word-cloud stop words as a set.
stopword=set(STOPWORDS)


lem = WordNetLemmatizer()
tokenizer=TweetTokenizer()

# Seed every random number generator used in this notebook. Previously only
# NumPy was seeded, while the dataset split, batch sampling and dropout all
# draw from PyTorch's generator.
np.random.seed(0)
random.seed(0)
torch.manual_seed(0)
random_state = 42

In [None]:
!pip install GPUtil


In [None]:
from torch import nn
from transformers import AdamW, BertConfig, BertModel, BertTokenizer
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset, random_split
from transformers import get_linear_schedule_with_warmup
from sklearn.metrics import f1_score, accuracy_score

In [None]:
from GPUtil import showUtilization as gpu_usage
from numba import cuda

def free_gpu_cache():
    """Release cached GPU memory and print GPU utilisation before and after.

    Prints via GPUtil's `showUtilization`, runs the garbage collector, empties
    PyTorch's CUDA cache and resets CUDA device 0 through numba.
    """
    print("Initial GPU Usage")
    gpu_usage()

    # Collect garbage first so that tensors which are no longer referenced are
    # released before the cache is emptied. The former
    # `for obj in gc.get_objects(): del obj` loop only unbound the loop
    # variable and never freed a tensor.
    gc.collect()
    torch.cuda.empty_cache()

    # NOTE(review): closing the device context through numba presumably
    # invalidates CUDA tensors PyTorch still holds — confirm before calling
    # this while a model is in use.
    cuda.select_device(0)
    cuda.close()
    cuda.select_device(0)

    print("GPU Usage after emptying the cache")
    gpu_usage()

# 2nd Part

In [None]:
from torch import nn
from transformers import AdamW, BertConfig, BertModel, BertTokenizer
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset, random_split
from transformers import get_linear_schedule_with_warmup
from sklearn.metrics import f1_score, accuracy_score

In [None]:
# Reload the raw training data, keeping only the text and its label.
# This rebinds `train`, discarding the columns added during the EDA part.
train = pd.read_csv("../input/nlp-getting-started/train.csv")[["text","target"]]
train

In [None]:
# select device
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

device

# **Data Preprocessing**

In [None]:
# Rows that repeat an earlier (text, target) pair.
is_duplicate = train.duplicated()
dupli_sum = is_duplicate.sum()

if dupli_sum > 0:
    print(dupli_sum, " duplicates found\nremoving...")
    train = train.loc[~is_duplicate, :]
else:
    print("no duplicates found")
train

In [None]:
# Raw texts and labels as NumPy arrays.
X_train, y_train = train["text"].values, train["target"].values

In [None]:
# Inspect the array of raw tweet texts.
X_train

In [None]:
# Inspect the array of labels.
y_train

BERT depends on a special tokenizing format & vocabulary. Thus, we need to use its custom tokenizer.

We should take a look at the resulting number of tokens that we would get by using the tokenizer without padding/truncating the sequences. Based on that we can define our padding/truncating-strategy.

In [None]:
# BERT's own tokenizer (this rebinds the `tokenizer` name).
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

# Token count of every text, special tokens included, without padding or
# truncation.
lens = [
    tokenizer.encode_plus(text, add_special_tokens=True, return_tensors='pt')['input_ids'].size()[1]
    for text in X_train
]

In [None]:
fig, ax = plt.subplots(figsize=(10,5))

# `lens` holds the number of tokens per text, so the bars show how often each
# sequence length occurs (50 most frequent lengths) — not token IDs, as the
# labels previously claimed. Labels are set after plotting, on the same axes.
pd.Series(lens).value_counts().head(50).plot(kind="bar", ax=ax)
ax.set_title("Frequency of Sequence Lengths in the whole Dataset")
ax.set_xlabel("Number of Tokens per Text")
ax.set_ylabel("Frequency")

# Summary statistics used to pick the padding/truncation length.
lens_arr = np.array(lens)
print("text length mean: ", lens_arr.mean())
print("text length median: ", np.median(lens_arr))
print("text length standard deviation: ", lens_arr.std())
print("suitable sequence length: ", lens_arr.mean() + 2*lens_arr.std())

**Findings**:

* There is no large difference between the mean and the median token count.
* Truncate (or pad) the texts to a tokenized length of $\lceil \text{mean} + 2 \cdot \text{std} \rceil$ tokens for more generalization $\rightarrow$ `sequence_length = 58`

In [None]:
sequence_length = 58

# X_train_tokens[i] := tensor of sequence_length token ids representing text_{i}
# (special tokens added, padded or truncated to exactly sequence_length).
X_train_tokens = [
    tokenizer.encode_plus(
        text,
        add_special_tokens=True,     # special tokens for BERT
        max_length=sequence_length,
        padding="max_length",
        return_tensors='pt',         # pytorch tensor format
        truncation=True,
    )['input_ids']
    for text in X_train
]

In [None]:
# pytorch expects tensors
X_train_tokens = torch.cat(X_train_tokens, dim=0) # concat into one tensor

y_train = torch.tensor(y_train)

In [None]:
# tokenization of the first observation in the training set
# zero-padding is added
# Tokenization of one sample observation (index 5) of the training set.
sample_idx = 5
print('Original:\n', X_train[sample_idx])
print('Tokenization:\n', X_train_tokens[sample_idx])

# Dataloader
We will use Dataloaders to draw batches of data for our model individually.

One Dataloader is used to draw batches for training the model and the other Dataloader is used to draw data for validating its performance.


* create a training set and a validation set
* create a dataloader for each of these sets with a defined sampling policy and a batch size

In [None]:
batch_size = 32

# Pair every token sequence with its label (float, as nn.BCELoss expects).
dataset = TensorDataset(X_train_tokens, y_train.float())

# 80 % training / 20 % validation.
train_size = int(0.80 * len(dataset))
val_size = len(dataset) - train_size

# A seeded generator makes the split reproducible across kernel restarts.
train_set, val_set = random_split(dataset, [train_size, val_size],
                                  generator=torch.Generator().manual_seed(42))


# Training batches are drawn in random order ...
train_dataloader = DataLoader(train_set, 
                              sampler=RandomSampler(train_set), 
                              batch_size=batch_size)

# ... validation batches in a fixed order: shuffling adds nothing to
# evaluation and changes which samples end up in the last, smaller batch.
validation_dataloader = DataLoader(val_set, 
                                   sampler=SequentialSampler(val_set), 
                                   batch_size=batch_size)

Let's take a look at what we get from our Dataloaders:

In [None]:
# Draw a single batch and describe its two entries.
batch = next(iter(train_dataloader))
print("what is drawn from our dataloader? ", type(batch))

print("\nfirst entry: ", batch[0].size(), type(batch[0]), batch[0].dtype)
print("\nsecond entry: ", batch[1].size(), type(batch[1]), batch[1].dtype)

We can iteratively draw batches from the Dataloader

* each Batch is stored in a list
* the first entry of the list is a tensor of the dimension batch_size x features. The tokens are stored in it
* the second entry of the list is a tensor of dimension batch_size. The class labels of each observation in the batch are stored in it

# Model Creation
Our model consists of two main components. The first component is BERT, which creates a feature representation from given text sequences.

The second component is a classifier plugin which is used on top of the feature representation created by BERT.

To match our classifier to the feature representations, we have to investigate the latter:

In [None]:
bert = BertModel.from_pretrained("bert-base-uncased")
bert.to(device)
bert.eval()  # inspection only: switch dropout off

# One batch is enough to see the structure of BERT's output.
batch = next(iter(train_dataloader))
batch_features = batch[0].to(device)

# No gradients are needed for inspection; skipping them avoids building an
# autograd graph for the whole encoder.
with torch.no_grad():
    bert_output = bert(input_ids=batch_features)

print("bert output: ", type(bert_output), len(bert_output))

print("first entry: ", type(bert_output[0]), bert_output[0].size())

print("second entry: ", type(bert_output[1]), bert_output[1].size())

# For each batch, BERT provides

* an iterable having two entries
* the first entry contains a tensor of size batch_size x sequence_length x 768, which stores the representation of each token in each sequence of the batch. As we can see, each single token is represented by a vector of size 768.
* the second entry contains a tensor of size batch_size x 768. It contains the pooled representation of the whole sequence per observation in our batch. This is what we want to use as an interface.

In [None]:
class BertClassifier(nn.Module):
    """Pre-trained BERT encoder with a single sigmoid output unit.

    `forward` returns, for each sequence in the batch, a probability computed
    from BERT's pooled output (the second entry of the encoder output).
    """

    def __init__(self):
        super(BertClassifier, self).__init__()

        # Pre-trained encoder that produces the text representations.
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        # Custom layer; its input width has to match the dimensionality of
        # the BERT output, so it is read from the encoder's configuration.
        self.linear = nn.Linear(self.bert.config.hidden_size, 1)
        # Squashes the linear output to a probability.
        self.sigmoid = nn.Sigmoid()

    def forward(self, tokens):
        """Return a (batch_size, 1) tensor of probabilities for `tokens`.

        Args:
            tokens: tensor of token ids, one padded sequence per row.
        """
        # Mask the padding positions so the encoder does not attend to them;
        # without a mask BERT treats every padding token as real input.
        pad_id = self.bert.config.pad_token_id
        attention_mask = None if pad_id is None else (tokens != pad_id).long()

        bert_output = self.bert(input_ids=tokens, attention_mask=attention_mask)
        linear_output = self.linear(bert_output[1])  # [1] = pooled output
        proba = self.sigmoid(linear_output)
        return proba

# Train the Model and evaluate it on the Validation Set
note: this is a greedy training loop that takes the model with the best possible validation score. A more robust approach would be to consider the difference between the training score and the validation score as well.

Early stopping w.r.t. the epochs would most likely not provide any improvements, since binary classification on top of BERT usually overfits already after very few epochs (2 to 4).

In [None]:
def eval(y_batch, probas):
    """Return `(accuracy, weighted F1)` for one batch.

    Args:
        y_batch: tensor of true labels.
        probas: tensor of predicted probabilities; rounded to the nearest
            integer to obtain hard predictions.

    Note: this definition shadows Python's built-in `eval` in the notebook.
    """
    y_true = y_batch.cpu().detach().numpy()
    y_pred = np.round(probas.cpu().detach().numpy())

    return (
        accuracy_score(y_true=y_true, y_pred=y_pred),
        f1_score(y_true=y_true, y_pred=y_pred, average='weighted'),
    )


In [None]:

def train(model, optimizer, scheduler, epochs, name):
    """Fine-tune `model` and return it with the weights of its best epoch.

    Batches come from the notebook-level `train_dataloader` and
    `validation_dataloader` and are moved to the notebook-level `device`.
    Scores are computed with the notebook's `eval(y_batch, probas)` helper.

    Note: defining this function rebinds the name `train`, which earlier
    cells use for the training DataFrame.

    Args:
        model: module called as `model(tokens=x_batch)`; its flattened output
            is passed to `nn.BCELoss`, so it must already be a probability.
        optimizer: optimizer stepped once per training batch.
        scheduler: learning-rate scheduler stepped once per training batch.
        epochs: number of passes over `train_dataloader`.
        name: checkpoint stem; the best model is written to `name + ".pt"`.

    Returns:
        `[model, history]`. `model` carries the weights of the epoch with the
        highest validation F1 (or the final weights if no epoch scored above
        0) and is left in evaluation mode. `history` holds one dict per epoch
        with the keys `acc`, `f1`, `acc_val` and `f1_val`, each the mean of
        the per-batch scores.
    """
    history = []
    best_f1 = 0
    best_state = None         # CPU copy of the best weights seen so far
    loss_func = nn.BCELoss()  # stateless, so it is built once

    for epoch in range(epochs):

        # ===== train =====
        print("=== Epoch: ", epoch+1, " / ", epochs, " ===")
        # Validation below switches to eval mode, so training mode has to be
        # restored at the start of every epoch.
        model.train()
        acc_total = 0
        f1_total = 0

        for batch in train_dataloader:
            x_batch, y_batch = batch[0].to(device), batch[1].to(device)  # draw the batch

            probas = torch.flatten(model(tokens=x_batch))

            acc_batch, f1_batch = eval(y_batch, probas)
            acc_total += acc_batch
            f1_total += f1_batch

            model.zero_grad()  # reset the gradients
            batch_loss = loss_func(probas, y_batch)
            batch_loss.backward()  # calculate gradient per (learnable) weight
            optimizer.step()       # update (learnable) weights
            scheduler.step()       # update learning rate

        # len(train_dataloader) is the number of batches.
        acc_total = acc_total/len(train_dataloader)
        f1_total = f1_total/len(train_dataloader)

        print("accuracy: ", acc_total, "\nf1: ", f1_total)

        # ===== validate =====
        # Evaluation mode disables dropout; previously validation ran in
        # training mode, which made the scores noisy.
        model.eval()
        acc_val_total = 0
        f1_val_total = 0

        with torch.no_grad():  # no update is performed, so no gradients are needed
            for batch in validation_dataloader:
                x_batch, y_batch = batch[0].to(device), batch[1].to(device)

                probas = torch.flatten(model(tokens=x_batch))
                acc_batch, f1_batch = eval(y_batch, probas)
                acc_val_total += acc_batch
                f1_val_total += f1_batch

        acc_val_total = acc_val_total/len(validation_dataloader)
        f1_val_total = f1_val_total/len(validation_dataloader)

        print("validation accuracy: ", acc_val_total, "\nvalidation f1: ", f1_val_total, "\n")
        if(f1_val_total>best_f1):  # this epoch improved the validation performance
            torch.save(model, name+".pt")
            # Keep the best weights in memory as well, so the result does not
            # depend on re-loading (un-pickling) the checkpoint file.
            best_state = {key: value.detach().cpu().clone()
                          for key, value in model.state_dict().items()}
            best_f1 = f1_val_total

        history.append({"acc":acc_total, "f1":f1_total, "acc_val":acc_val_total, "f1_val":f1_val_total})

    # Restore the best epoch. If no epoch ever beat the initial score nothing
    # was saved; the former `torch.load` then failed or read a stale file.
    if best_state is not None:
        model.load_state_dict(best_state)
    model.eval()  # the returned model is meant for inference
    return [model, history]

In [None]:
epochs = 10

# Classifier on the selected device (`Module.to` returns the module itself).
baseline_bert_clf = BertClassifier().to(device)

adam = AdamW(baseline_bert_clf.parameters(), lr=5e-5, eps=1e-8)

# The scheduler is stepped once per batch, so the learning rate decays
# linearly over all batches of all epochs (no warm-up).
total_steps = len(train_dataloader) * epochs
sched = get_linear_schedule_with_warmup(
    adam,
    num_warmup_steps=0,
    num_training_steps=total_steps,
)

In [None]:
# Train with the same `epochs` value the scheduler's total step count was
# computed from; a separately hard-coded 10 could silently drift from it.
baseline_bert_clf, history = train(model=baseline_bert_clf,
                                   optimizer=adam,
                                   scheduler=sched,
                                   epochs=epochs,
                                   name="baseline_bert_clf")

In [None]:
# One row per epoch, with one column per key of the history dicts.
history_df = pd.DataFrame(history)

history_df

In [None]:
# analogously to above:
X_test = pd.read_csv("../input/nlp-getting-started/test.csv")["text"]

X_test_tokens = []

for text in X_test:
    
    encoded_dict = tokenizer.encode_plus(text,
                                         add_special_tokens=True,
                                         max_length=sequence_length,
                                         padding="max_length",
                                         return_tensors='pt',
                                         truncation=True)
    
    X_test_tokens.append(encoded_dict['input_ids'])
    
    
    
X_test_tokens = torch.cat(X_test_tokens, dim=0)



test_set = TensorDataset(X_test_tokens)

test_dataloader = DataLoader(test_set, 
                             sampler=SequentialSampler(test_set), 
                             batch_size=batch_size)

In [None]:
fig, ax = plt.subplots()

# Plot first: ticks set before the plot call can be overridden by it.
history_df.loc[:,['f1', 'f1_val']].plot(ax=ax)
ax.set_title("f1 history")
ax.set_ylabel("f1")
ax.set_xlabel("epoch")

# One tick per recorded epoch, labelled 1..n (rows are indexed 0..n-1).
# The former fixed range of 20 ticks did not match the number of epochs.
n_epochs = len(history_df)
ax.set_xticks(np.arange(n_epochs))
ax.set_xticklabels(np.arange(1, n_epochs + 1))
plt.savefig("f1.png")


# Predict on Test

In [None]:
# analogously to above:
# Read the test texts and tokenize them with the training-time settings.
X_test = pd.read_csv("../input/nlp-getting-started/test.csv")["text"]

X_test_tokens = torch.cat(
    [
        tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=sequence_length,
            padding="max_length",
            return_tensors='pt',
            truncation=True,
        )['input_ids']
        for text in X_test
    ],
    dim=0,
)

# Batches are drawn in file order so predictions line up with the ids.
test_set = TensorDataset(X_test_tokens)
test_dataloader = DataLoader(
    test_set,
    sampler=SequentialSampler(test_set),
    batch_size=batch_size,
)

In [None]:
all_preds = []

# Inference has to run in evaluation mode: with dropout still active the
# predictions are noisy and differ from run to run.
baseline_bert_clf.eval()

with torch.no_grad():  # no gradients needed for prediction
    for batch in test_dataloader:
        x_batch = batch[0].to(device)
        probas = baseline_bert_clf(tokens=x_batch)
        # Round the probabilities to hard 0/1 labels.
        preds = np.round(probas.cpu().detach().numpy()).astype(int).flatten()
        all_preds.extend(preds)

In [None]:
# Pair the ids of the sample submission with the predictions (aligned by row
# position) and write the submission file.
submission_ids = pd.read_csv("../input/nlp-getting-started/sample_submission.csv")["id"]
challenge_pred = pd.DataFrame({'id': submission_ids, 'target': pd.Series(all_preds)})
challenge_pred.to_csv("submission.csv", index=False)