# Natural Language Processing with Disaster Tweets
Predict which Tweets are about real disasters and which ones are not

## Imports

In [None]:
import numpy as np 
import pandas as pd 

import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('ggplot')

from collections import  Counter
from collections import defaultdict
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

from nltk.util import ngrams
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
stop=set(stopwords.words('english'))

from tqdm.notebook import tqdm

import os
import re
import time
import string
import random
import warnings
warnings.filterwarnings('ignore')

import torch
import torch.nn as nn
from torch.autograd import Variable
from torch.nn import functional as F

from torchtext import data, datasets
from torchtext.vocab import Vectors, GloVe

In [None]:
def seed_everything(seed_value):
    """Seed every random number generator used in this notebook.

    Seeds Python's ``random``, NumPy and PyTorch (and, when CUDA is
    available, every CUDA device) and exports ``PYTHONHASHSEED``.

    Args:
        seed_value: Integer seed shared by all generators.
    """
    random.seed(seed_value)
    np.random.seed(seed_value)
    torch.manual_seed(seed_value)
    # Hash randomisation is fixed when an interpreter starts, so this only
    # influences Python processes launched after this point.
    os.environ['PYTHONHASHSEED'] = str(seed_value)

    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed_value)
        torch.cuda.manual_seed_all(seed_value)
        torch.backends.cudnn.deterministic = True
        # The cuDNN auto-tuner may select a different kernel on every run,
        # which undoes the determinism requested above, so it must stay off.
        torch.backends.cudnn.benchmark = False


seed = 42
seed_everything(seed)

In [None]:
# Import Data
# Read the competition's train and test CSV files into DataFrames
# (paths are relative to the notebook's working directory).

train = pd.read_csv("../input/nlp-getting-started/train.csv")
test = pd.read_csv("../input/nlp-getting-started/test.csv")

In [None]:
# Preview the first rows of the training frame
train.head()

## Exploratory Data Analysis

In [None]:
# Report the (rows, columns) shape of each frame; `shape` unpacks straight into the two placeholders.
print('There are {} rows and {} columns in train'.format(*train.shape))
print('There are {} rows and {} columns in test'.format(*test.shape))

### Class Distribution
Before going any further, let's quickly check the class distribution. There are only two classes, `0` (No disaster) and `1` (Disaster).

In [None]:
# Class balance of the training labels: a pie chart of the shares next to a
# bar chart of the raw counts. The numbers shown in the labels are computed
# from the data so they stay correct if the training file changes.
class_names = {0: 'Not Disaster', 1: 'Disaster'}
target_counts = train.groupby('target').count()['id']
target_share = 100 * target_counts / target_counts.sum()

fig, axes = plt.subplots(ncols=2, figsize=(17, 4), dpi=100)
target_counts.plot(
    kind='pie',
    ax=axes[0],
    labels=[f'{class_names[c]} ({target_share[c]:.0f}%)' for c in target_counts.index],
)
sns.countplot(x=train['target'], hue=train['target'], ax=axes[1])

axes[0].set_ylabel('')
axes[1].set_ylabel('')
# Pin the tick positions before relabelling them; setting labels alone
# relies on whatever ticks the locator happens to produce.
axes[1].set_xticks(range(len(target_counts)))
axes[1].set_xticklabels([f'{class_names[c]} ({target_counts[c]})' for c in target_counts.index])
axes[0].tick_params(axis='x', labelsize=15)
axes[0].tick_params(axis='y', labelsize=15)
axes[1].tick_params(axis='x', labelsize=15)
axes[1].tick_params(axis='y', labelsize=15)

axes[0].set_title('Target Distribution in Training Set', fontsize=13)
axes[1].set_title('Target Count in Training Set', fontsize=13)

plt.show()

There are more tweets with class `0` than with class `1`.

## Number of characters in Tweets

In [None]:
# Histogram of tweet length in characters, one panel per class.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 5))
fig.suptitle('Characters in tweets')

# (axis, class label, bar colour, panel title) for each panel
panels = (
    (ax1, 0, 'green', 'Not disaster tweets'),
    (ax2, 1, 'red', 'Disaster tweets'),
)
for ax, label, color, title in panels:
    train_len = train.loc[train['target'] == label, 'text'].str.len()
    ax.hist(train_len, color=color)
    ax.set_title(title)

plt.show()

The character distribution is almost the same in both cases. We can also see that no tweet exceeds 160 characters.

## Number of words in Tweets

In [None]:
# Histogram of the number of whitespace-separated words per tweet, one panel per class.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 5))

panels = (
    (ax1, 0, 'green', 'Not disaster tweets'),
    (ax2, 1, 'red', 'Disaster tweets'),
)
for ax, label, color, title in panels:
    train_len = train.loc[train['target'] == label, 'text'].str.split().map(len)
    ax.hist(train_len, color=color)
    ax.set_title(title)

fig.suptitle('Words in a tweet')
plt.show()

In [None]:
def create_corpus(target):
    """Return all whitespace-separated tokens of the training tweets in one class.

    Args:
        target: Class label to select (compared against ``train['target']``).

    Returns:
        A flat list of tokens, in tweet order.
    """
    tweets = train.loc[train['target'] == target, 'text']
    return [token for tokens in tweets.str.split() for token in tokens]


corpus0 = create_corpus(0)
corpus1 = create_corpus(1)
len(corpus0)

## Common Stopwords in Tweets

In computing, stop words are words which are filtered out before or after processing of natural language data. [Wikipedia](https://en.wikipedia.org/wiki/Stop_word)
“Stop words” usually refers to the most common words in a language. There is no universal list of stop words shared by all NLP tools. Stop words are words that do not add much meaning to a sentence, so they can usually be ignored without sacrificing the meaning of the sentence.

Here, we'll use NLTK, The Natural Language Toolkit, to process stopwords in the tweets.

First, we'll analyze tweets with class `0`, i.e. No real disaster.

In [None]:
# Tally the stop words found in the non-disaster corpus and plot the ten most frequent.
dic = defaultdict(int)
for word in (token for token in corpus0 if token in stop):
    dic[word] += 1

# Ascending sort on the negated count is a stable "largest first" ordering.
top = sorted(dic.items(), key=lambda item: -item[1])[:10]

x, y = zip(*top)
plt.bar(x, y, color="green")

Now, we will analyze tweets with class `1`.

In [None]:
# Tally the stop words found in the disaster corpus and plot the ten most frequent.
dic = defaultdict(int)
for word in (token for token in corpus1 if token in stop):
    dic[word] += 1

# Ascending sort on the negated count is a stable "largest first" ordering.
top = sorted(dic.items(), key=lambda item: -item[1])[:10]

x, y = zip(*top)
plt.bar(x, y)

### Analyzing Punctuation
First let's check tweets indicating real disaster.

In [None]:
plt.figure(figsize=(10,5))

# Count the whitespace-delimited tokens of the disaster corpus that are exactly
# one punctuation character. Membership is tested against a set on purpose:
# `token in string.punctuation` is a substring test and would also accept
# multi-character tokens such as "()" or ":;".
special = set(string.punctuation)

dic = defaultdict(int)
for i in corpus1:
    if i in special:
        dic[i] += 1

x, y = zip(*dic.items())
plt.bar(x, y)

Now, we will move on to class `0`.

In [None]:
plt.figure(figsize=(10,5))

# Count the whitespace-delimited tokens of the non-disaster corpus that are
# exactly one punctuation character. Membership is tested against a set on
# purpose: `token in string.punctuation` is a substring test and would also
# accept multi-character tokens such as "()" or ":;".
special = set(string.punctuation)

dic = defaultdict(int)
for i in corpus0:
    if i in special:
        dic[i] += 1

x, y = zip(*dic.items())
plt.bar(x, y, color='green')

## Common Words
Let's take a look at the most common words in the dataset.

In [None]:
# Of the 40 most frequent tokens in the disaster corpus, keep the ones that
# are not stop words and plot their counts.
counter = Counter(corpus1)
most_common = counter.most_common()

kept = [(word, count) for word, count in most_common[:40] if word not in stop]
x = [word for word, _ in kept]
y = [count for _, count in kept]

sns.barplot(x=y,y=x)

### Most common bigrams

In [None]:
def get_top_tweet_bigrams(corpus, n=None):
    """Return the most frequent word bigrams of ``corpus``.

    Args:
        corpus: Collection of text documents accepted by ``CountVectorizer``.
        n: Number of bigrams to return; ``None`` returns all of them.

    Returns:
        A list of ``(bigram, count)`` pairs sorted by descending count.
    """
    vectorizer = CountVectorizer(ngram_range=(2, 2))
    # Column sums of the document-term matrix are the corpus-wide counts.
    totals = vectorizer.fit_transform(corpus).sum(axis=0)
    ranked = sorted(
        ((bigram, totals[0, column]) for bigram, column in vectorizer.vocabulary_.items()),
        key=lambda pair: pair[1],
        reverse=True,
    )
    return ranked[:n]

In [None]:
# Ten most frequent bigrams of the raw training tweets, drawn as horizontal bars.
plt.figure(figsize=(10,5))
top_tweet_bigrams = get_top_tweet_bigrams(train['text'], 10)
x = [bigram for bigram, _ in top_tweet_bigrams]
y = [count for _, count in top_tweet_bigrams]
sns.barplot(x=y,y=x)

Just by looking at the most common words, we can see that a lot of data cleaning is needed before applying any kind of model if we wish to achieve a good accuracy.

# Data Cleaning

We're only interested in `text` and `target` columns in this case, so we'll drop the other columns.

In [None]:
# Stack train and test so the same cleaning is applied to both at once.
# Rows are later told apart by whether `target` is null.
df=pd.concat([train,test])
df.shape

In [None]:
# Drop the two columns the models below do not use (modifies `df` in place).
df.drop(columns=['keyword','location'], inplace=True)
df.head()

In [None]:
def remove_URL(text):
    """Return ``text`` with every ``http(s)://...`` or ``www....`` URL removed.

    A URL is taken to run up to the next whitespace character.
    """
    return re.sub(r'https?://\S+|www\.\S+', r'', text)


remove_URL("New competition launched :https://www.kaggle.com/c/nlp-getting-started")

In [None]:
def remove_html(text):
    """Return ``text`` with every ``<...>`` span removed.

    The match is non-greedy, so each tag is stripped on its own and the text
    between tags is kept.
    """
    return re.sub(r'<.*?>', r'', text)


remove_html("<h1>Real or Fake</h1>")

In [None]:
# Code-point ranges treated as emoji. Each range is kept narrow on purpose:
# a single wide range such as U+24C2..U+1F251 also spans CJK, Hangul and most
# other non-Latin scripts and would delete ordinary text.
_EMOJI_PATTERN = re.compile("["
                            u"\U0001F600-\U0001F64F"  # emoticons
                            u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                            u"\U0001F680-\U0001F6FF"  # transport & map symbols
                            u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                            u"\U0001F900-\U0001F9FF"  # supplemental symbols & pictographs
                            u"\U0001F200-\U0001F251"  # enclosed ideographic supplement
                            u"\U00002600-\U000027BF"  # miscellaneous symbols & dingbats
                            u"\U00002B00-\U00002BFF"  # miscellaneous symbols & arrows
                            u"\U000024C2"             # circled M
                            u"\U0000FE0F"             # emoji variation selector
                            "]+", flags=re.UNICODE)


def remove_emoji(text):
    """Return ``text`` with emoji and pictographic symbols removed.

    Only the code-point ranges listed in ``_EMOJI_PATTERN`` are stripped;
    letters of any script are left untouched.
    """
    return _EMOJI_PATTERN.sub(r'', text)


remove_emoji("Omg another Earthquake 😔😔")

In [None]:
def remove_punct(text):
    """Return ``text`` with every ASCII punctuation character deleted."""
    # Mapping a character to None in a translation table deletes it.
    deletions = str.maketrans(dict.fromkeys(string.punctuation))
    return text.translate(deletions)


remove_punct("I am a #king")

In [None]:
def clean_data(text):
    """Apply the four cleaning steps to every element of a pandas Series.

    The order matters: URLs and HTML tags are removed while their
    punctuation is still present, and punctuation is stripped last.

    Args:
        text: Series of strings.

    Returns:
        A new Series with the cleaned strings.
    """
    for cleaner in (remove_URL, remove_html, remove_emoji, remove_punct):
        text = text.apply(cleaner)
    return text

In [None]:
# Replace the raw tweets (train and test rows alike) with their cleaned version
df.text = clean_data(df.text)

Let's quickly see the most common bigrams now and see if data cleaning helped or not.

In [None]:
# Ten most frequent bigrams after cleaning (train and test tweets together).
plt.figure(figsize=(10,5))
top_tweet_bigrams = get_top_tweet_bigrams(df['text'], 10)
x = [bigram for bigram, _ in top_tweet_bigrams]
y = [count for _, count in top_tweet_bigrams]
sns.barplot(x=y,y=x)

It seems to be working fine. Most of the junk values have been cleaned up. Now it's time to vectorize our corpus, and for that, we'll use GloVe.

Before that, a good preprocessing step is [lemmatization](https://en.wikipedia.org/wiki/Lemmatisation). We can do it using the [nltk](http://www.nltk.org/book/) library.

> Lemmatization in linguistics is the process of grouping together the inflected forms of a word so they can be analysed as a single item, identified by the word's lemma, or dictionary form.

In [None]:
# Distinct keywords with the URL-encoded space ('%20') decoded.
# NOTE(review): the [1:] slice presumably skips a leading NaN coming from rows
# without a keyword — confirm. `keywords` is not read by any code visible in
# this notebook.
keywords = train.keyword.unique()[1:]
keywords = list(map(lambda x: x.replace('%20', ' '), keywords))

# Lemmatizer instance shared by lemmatize_sentence
wnl = WordNetLemmatizer()

def lemmatize_sentence(sentence):
    """Lower-case a sentence and lemmatize each of its words as a verb.

    The sentence is split on single spaces, ``#`` is stripped from every
    word, and the lemmas are joined back with single spaces.

    Args:
        sentence: Text to normalise.

    Returns:
        The normalised sentence without leading or trailing whitespace.
    """
    lemmas = [
        wnl.lemmatize(word.replace('#', '').lower(), wordnet.VERB)
        for word in sentence.split(' ')
    ]
    return ' '.join(lemmas).strip()

In [None]:
# Lemmatize every (already cleaned) tweet in place
df['text'] = df['text'].apply(lambda x: lemmatize_sentence(x))

# LSTM Model
First, we'll create an LSTM model in PyTorch and see how well it performs.

## PyTorch Dataset

In [None]:
# save train, test and validation datasets into separate csv files.
def prepare_csv(df_train, df_test, seed=27, val_ratio=0.3):
    """Split the labelled frame and write train/val/test CSVs under ``cache/``.

    The rows of ``df_train`` are shuffled with a private random generator, so
    the split is reproducible for a given ``seed`` and the global NumPy
    random state is left untouched.

    Args:
        df_train: Labelled frame with at least ``id``, ``target`` and ``text``.
        df_test: Unlabelled frame with at least ``id`` and ``text``.
        seed: Seed of the shuffle that decides the split.
        val_ratio: Fraction of ``df_train`` written to the validation file.

    Writes:
        ``cache/dataset_train.csv``, ``cache/dataset_val.csv`` and
        ``cache/dataset_test.csv`` (the directory is created when missing).
    """
    # RandomState(seed).shuffle yields the same permutation as seeding and
    # using the global generator, without resetting the latter.
    rng = np.random.RandomState(seed)
    idx = np.arange(df_train.shape[0])
    rng.shuffle(idx)

    val_size = int(len(idx) * val_ratio)

    os.makedirs('cache', exist_ok=True)

    labelled_columns = ['id', 'target', 'text']

    # The first `val_size` shuffled rows form the validation set, the rest train.
    df_train.iloc[idx[val_size:], :][labelled_columns].to_csv(
        'cache/dataset_train.csv', index=False
    )

    df_train.iloc[idx[:val_size], :][labelled_columns].to_csv(
        'cache/dataset_val.csv', index=False
    )

    df_test[['id', 'text']].to_csv('cache/dataset_test.csv',
                   index=False)

In [None]:
# wrapper for iterating through TabularDataset
def get_iterator(dataset, batch_size, train=True,
                 shuffle=True, repeat=False):
    """Wrap ``dataset`` in a torchtext ``Iterator`` without length sorting.

    Batches are placed on the first CUDA device when one is available and on
    the CPU otherwise.

    Args:
        dataset: Dataset to iterate over.
        batch_size: Number of examples per batch.
        train: Forwarded to ``data.Iterator``.
        shuffle: Forwarded to ``data.Iterator``.
        repeat: Forwarded to ``data.Iterator``.

    Returns:
        The configured ``data.Iterator``.
    """
    if torch.cuda.is_available():
        device = torch.device('cuda:0')
    else:
        device = torch.device('cpu')

    return data.Iterator(
        dataset,
        batch_size=batch_size,
        device=device,
        train=train,
        shuffle=shuffle,
        repeat=repeat,
        sort=False,
    )

## Word Embeddings
One way to feed our model text data is to treat each word in our vocabulary as a separate feature and one-hot encode it (**Bag of Words encoding**). This works and performs decently, but there is a major drawback to this approach: the encoding carries no information about how similar two words are.

#### Mathematical intuition:
We are interested in constructing a vector space to represent our words. Suppose our vocabulary is: 'cat', 'dog', 'plant', 'leaf', 'man', 'woman'. Then we can form a vector space via:

$$ cat = (1, 0, 0, 0, 0, 0) $$
$$ dog = (0, 1, 0, 0, 0, 0) $$
$$ plant = (0, 0, 1, 0, 0, 0) $$
$$ leaf = (0, 0, 0, 1, 0, 0) $$
$$ man = (0, 0, 0, 0, 1, 0) $$
$$ woman = (0, 0, 0, 0, 0, 1) $$
 
But these vectors form an orthogonal basis, so when we take the dot product of them, we get `0`:

$$ cat \cdot dog = (1, 0, 0, 0, 0, 0) \cdot (0, 1, 0, 0, 0, 0) = 0$$
 
This means that all of these vectors are as far as possible from each other in the vector space, i.e., they are not similar. But words like cat and dog are similar in meaning, so it would be great if our word encodings could somehow capture this similarity.

So, instead of following the Bag of Words approach, we will import the pre-trained word vectors from GloVe and use them to construct our word embeddings. But what is GloVe?

"GloVe is an unsupervised learning algorithm for obtaining vector representations for words. Training is performed on aggregated global word-word co-occurrence statistics from a corpus, and the resulting representations showcase interesting linear substructures of the word vector space."

For example, we can take the vector for King, subtract the vector for Man and add the vector for Woman, and the resulting vector is remarkably close to the vector for Queen. Using these word encodings as opposed to the Bag of Words encodings will substantially improve our classification accuracy.

For embeddings, here we are using [GloVe](https://nlp.stanford.edu/projects/glove/):

> Glove produces dense vector embeddings of words, where words that occur together are close in the resulting vector space.

In [None]:
import logging
from copy import deepcopy

LOGGER = logging.getLogger('tweets_dataset')

def get_dataset(fix_length=100, lower=False, vectors=None):
    """Build the torchtext fields, vocabulary and train/val/test iterators.

    The tweets are taken from the cleaned and lemmatized ``df`` frame (rows
    with a label form train/validation, rows without one form the test set),
    written to ``cache/`` by ``prepare_csv`` and read back as
    ``TabularDataset`` objects.

    Args:
        fix_length: Not applied; every tweet is padded/truncated to 25 tokens.
            NOTE(review): honouring this argument would change the sequence
            length of the default ``get_dataset()`` call from 25 to 100.
        lower: Not applied; text is always lower-cased because the default
            GloVe 6B vectors are uncased.
        vectors: Pre-trained vectors for the text vocabulary. ``None`` loads
            ``GloVe(name='6B', dim=300)``.

    Returns:
        ``(TEXT, vocab_size, word_embeddings, train_iter, val_iter, test_iter)``.
    """
    if vectors is None:
        vectors = GloVe(name='6B', dim=300)

    LOGGER.debug('Preparing CSV files...')
    # Use the cleaned frame, not the raw `train`/`test`, so the model sees the
    # same text the cleaning and lemmatization steps produced.
    labelled = df[df['target'].notnull()]
    labelled = labelled.assign(target=labelled['target'].astype(int))
    unlabelled = df[df['target'].isnull()]
    prepare_csv(labelled, unlabelled)

    TEXT = data.Field(sequential=True,
                      lower=True,
                      include_lengths=True,
                      batch_first=True,
                      fix_length=25)
    # Labels go through a vocabulary, so the batches carry vocabulary indices.
    LABEL = data.Field(use_vocab=True,
                       sequential=False,
                       dtype=torch.long)
    # Integer dtype: float16 cannot represent integers above 2048 exactly.
    ID = data.Field(use_vocab=False,
                    sequential=False,
                    dtype=torch.long)

    LOGGER.debug('Reading train csv files...')

    train_temp, val_temp = data.TabularDataset.splits(
        path='cache/', format='csv', skip_header=True,
        train='dataset_train.csv', validation='dataset_val.csv',
        fields=[
            ('id', ID),
            ('target', LABEL),
            ('text', TEXT)
        ]
    )

    LOGGER.debug('Reading test csv file...')

    test_temp = data.TabularDataset(
        path='cache/dataset_test.csv', format='csv',
        skip_header=True,
        fields=[
            ('id', ID),
            ('text', TEXT)
        ]
    )

    LOGGER.debug('Building vocabulary...')

    TEXT.build_vocab(
        train_temp, val_temp, test_temp,
        max_size=20000,
        min_freq=10,
        vectors=vectors
    )
    LABEL.build_vocab(
        train_temp
    )

    word_embeddings = TEXT.vocab.vectors
    vocab_size = len(TEXT.vocab)

    train_iter = get_iterator(train_temp, batch_size=32,
                              train=True, shuffle=True,
                              repeat=False)
    # Evaluation data needs neither train mode nor shuffling.
    val_iter = get_iterator(val_temp, batch_size=32,
                            train=False, shuffle=False,
                            repeat=False)
    test_iter = get_iterator(test_temp, batch_size=32,
                             train=False, shuffle=False,
                             repeat=False)

    LOGGER.debug('Done preparing the datasets')

    return TEXT, vocab_size, word_embeddings, train_iter, val_iter, test_iter

In [None]:
# Build the vocabulary, the embedding matrix and the three iterators with the default settings
TEXT, vocab_size, word_embeddings, train_iter, val_iter, test_iter = get_dataset()

## PyTorch LSTM Model

In [None]:
class LSTMClassifier(torch.nn.Module):
    """LSTM text classifier on top of frozen pre-trained word embeddings.

    Pipeline: embedding lookup -> dropout -> multi-layer LSTM -> output of the
    last time step -> dropout -> linear layer -> sigmoid.

    Args:
        vocab_size: Number of rows of the embedding table.
        output_size: Number of output units.
        embedding_dim: Size of one word vector.
        hidden_dim: Size of the LSTM hidden state.
        n_layers: Number of stacked LSTM layers.
        weights: Pre-trained embedding matrix; it is installed as a
            non-trainable parameter.
    """

    def __init__(self, vocab_size, output_size, embedding_dim, hidden_dim, n_layers, weights):
        super().__init__()

        self.output_size = output_size
        self.n_layers = n_layers
        self.hidden_dim = hidden_dim

        self.word_embeddings = torch.nn.Embedding(vocab_size,
                                                  embedding_dim)
        # Swap in the pre-trained vectors and keep them frozen.
        self.word_embeddings.weight = torch.nn.Parameter(weights,
                                                         requires_grad=False)

        self.dropout_1 = torch.nn.Dropout(0.3)
        self.lstm = torch.nn.LSTM(embedding_dim,
                                  hidden_dim,
                                  n_layers,
                                  dropout=0.3,
                                  batch_first=True)

        self.dropout_2 = torch.nn.Dropout(0.3)
        self.label_layer = torch.nn.Linear(hidden_dim, output_size)

        # NOTE(review): the training cell feeds this output to
        # CrossEntropyLoss, which expects unnormalised logits; the sigmoid is
        # kept because the prediction cell relies on outputs in [0, 1].
        self.act = torch.nn.Sigmoid()

    def forward(self, x, hidden):
        """Classify a batch of token-id sequences.

        Args:
            x: Integer tensor of token ids, batch first.
            hidden: Initial ``(h_0, c_0)`` LSTM state, or ``None`` for zeros.

        Returns:
            ``(out, hidden)`` where ``out`` has one row of ``output_size``
            sigmoid activations per example and ``hidden`` is the final LSTM
            state.
        """
        embedded = self.dropout_1(self.word_embeddings(x))

        lstm_out, hidden = self.lstm(embedded, hidden)

        # Only the last time step feeds the classifier, so select it before
        # the linear layer instead of projecting every step and discarding
        # all but one.
        last_step = lstm_out[:, -1, :]

        out = self.label_layer(self.dropout_2(last_step))
        out = self.act(out)

        return out, hidden

    def init_hidden(self, batch_size):
        """Return a zero ``(h_0, c_0)`` pair for ``batch_size`` sequences.

        The tensors take their dtype and device from the model's first
        parameter, so no module-level ``device`` variable is required.
        """
        weight = next(self.parameters()).data
        state_shape = (self.n_layers, batch_size, self.hidden_dim)

        return (weight.new_zeros(state_shape),
                weight.new_zeros(state_shape))

In [None]:
def train_model(model, train_iter, val_iter, optim, loss, num_epochs, batch_size=32):
    """Train ``model`` and evaluate it on the validation iterator every epoch.

    Batches whose size differs from ``batch_size`` are skipped because the
    carried LSTM state has a fixed batch dimension.

    Args:
        model: Module with ``init_hidden(batch_size)`` whose forward pass
            takes ``(text, hidden)`` and returns ``(prediction, hidden)``.
        train_iter: Iterator over training batches exposing ``text`` and
            ``target``.
        val_iter: Iterator over validation batches with the same attributes.
        optim: Optimizer updating the model parameters.
        loss: Callable ``loss(prediction, target)`` returning a scalar tensor.
        num_epochs: Number of passes over ``train_iter``.
        batch_size: Expected number of examples per batch.

    Returns:
        ``(train_loss, train_acc, val_loss, val_acc)``: four lists holding
        the per-epoch mean of each quantity (accuracies in percent).
    """
    clip = 5  # maximum gradient norm
    val_loss_min = np.inf

    total_train_epoch_loss = list()
    total_train_epoch_acc = list()

    total_val_epoch_loss = list()
    total_val_epoch_acc = list()

    device = torch.device('cuda:0' if torch.cuda.is_available()
                           else 'cpu')

    h = model.init_hidden(batch_size)

    for epoch in range(num_epochs):

        model.train()

        train_epoch_loss = list()
        train_epoch_acc = list()

        val_epoch_loss = list()
        val_epoch_acc = list()

        for batch in tqdm(train_iter):
            text = batch.text[0].to(device)
            # Shift the label indices down by one to obtain 0-based class ids
            # (presumably index 0 of the label vocabulary is reserved —
            # TODO confirm against the LABEL field).
            target = (batch.target - 1).long().to(device)

            # Compare by value: `is not` tests object identity and is only
            # accidentally correct for small integers.
            if text.size(0) != batch_size:
                continue

            # Detach the carried state so gradients stop at the batch boundary.
            h = tuple(e.data for e in h)

            optim.zero_grad()

            prediction, h = model(text, h)

            loss_train = loss(prediction.squeeze(), target)
            loss_train.backward()

            num_corrects = (prediction.argmax(dim=1) == target).float().sum()
            acc = 100.0 * num_corrects / target.size(0)

            train_epoch_loss.append(loss_train.item())
            train_epoch_acc.append(acc.item())

            torch.nn.utils.clip_grad_norm_(model.parameters(), clip)

            optim.step()

        print(f'Train Epoch: {epoch}, Training Loss: {np.mean(train_epoch_loss):.4f}, Training Accuracy: {np.mean(train_epoch_acc): .2f}%')

        model.eval()

        with torch.no_grad():
            # Validation carries its own state so that it never overwrites
            # the state used by the training loop.
            val_h = model.init_hidden(batch_size)

            for batch in tqdm(val_iter):
                text = batch.text[0].to(device)
                target = (batch.target - 1).long().to(device)

                if text.size(0) != batch_size:
                    continue

                val_h = tuple(e.data for e in val_h)

                prediction, val_h = model(text, val_h)
                loss_val = loss(prediction.squeeze(), target)

                num_corrects = (prediction.argmax(dim=1) == target).float().sum()
                acc = 100.0 * num_corrects / target.size(0)

                val_epoch_loss.append(loss_val.item())
                val_epoch_acc.append(acc.item())

            print(f'Validation Epoch: {epoch}, Validation Loss: {np.mean(val_epoch_loss):.4f}, Validation Accuracy: {np.mean(val_epoch_acc): .2f}%')

            if np.mean(val_epoch_loss) <= val_loss_min:
                print('Validation loss decreased ({:.6f} --> {:.6f})'.
                      format(val_loss_min, np.mean(val_epoch_loss)))

                val_loss_min = np.mean(val_epoch_loss)

        total_train_epoch_loss.append(np.mean(train_epoch_loss))
        total_train_epoch_acc.append(np.mean(train_epoch_acc))

        total_val_epoch_loss.append(np.mean(val_epoch_loss))
        total_val_epoch_acc.append(np.mean(val_epoch_acc))

    return (total_train_epoch_loss, total_train_epoch_acc,
            total_val_epoch_loss, total_val_epoch_acc)

In [None]:
# Hyper-parameters of the LSTM run
lr = 1e-4
batch_size = 32  # passed to train_model, which skips batches of any other size
output_size = 2  # one output unit per class
hidden_size = 128
embedding_length = 300  # the embedding table itself is replaced by `word_embeddings` inside the model

model = LSTMClassifier(vocab_size=vocab_size, 
                       output_size=output_size, 
                       embedding_dim=embedding_length,
                       hidden_dim=hidden_size,
                       n_layers=2,
                       weights=word_embeddings
)

# `device` is also read as a global by LSTMClassifier.init_hidden
device = torch.device('cuda:0' if torch.cuda.is_available()
                      else 'cpu')
    
model.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=lr)
loss = torch.nn.CrossEntropyLoss()
    
# Train for 20 epochs and keep the per-epoch curves for the plots below
train_loss, train_acc, val_loss, val_acc = train_model(model=model,
                                                       train_iter=train_iter,
                                                       val_iter=val_iter,
                                                       optim=optimizer,
                                                       loss=loss,
                                                       num_epochs=20,
                                                       batch_size=batch_size)
    

In [None]:
# Per-epoch loss curves of the LSTM run.
plt.figure(figsize=(10, 6))
plt.title('Loss')
# x and y are passed by keyword: current seaborn releases reject them as
# positional arguments. The second curve is the validation split.
sns.lineplot(x=list(range(len(train_loss))), y=train_loss, label='train')
sns.lineplot(x=list(range(len(val_loss))), y=val_loss, label='val')

In [None]:
# Per-epoch accuracy curves of the LSTM run.
plt.figure(figsize=(10, 6))
plt.title('Accuracy')
# x and y are passed by keyword: current seaborn releases reject them as
# positional arguments. The second curve is the validation split.
sns.lineplot(x=list(range(len(train_acc))), y=train_acc, label='train')
sns.lineplot(x=list(range(len(val_acc))), y=val_acc, label='val')

## Predictions using LSTM

In [None]:
# Predict a class for every test tweet, in the order the test iterator yields them.
results_target = list()

# Switch off dropout explicitly instead of relying on the mode the training
# loop happened to leave the model in.
model.eval()

with torch.no_grad():
    for batch in tqdm(test_iter):
        # One forward pass per batch; hidden=None starts from a zero state.
        res, _ = model(batch.text[0], hidden=None)

        # Same decision rule as the accuracy reported during training:
        # the class with the largest output wins.
        results_target.extend(res.argmax(dim=1).cpu().tolist())

In [None]:
# Fill the submission template with the LSTM predictions.
# Assumes `results_target` is in the same row order as the template — TODO confirm.
sample_submission = pd.read_csv('../input/nlp-getting-started/sample_submission.csv')
sample_submission['target'] = list(map(int, results_target))
sample_submission.head()

In [None]:
# Write the LSTM submission file without the index column
sample_submission.to_csv('submission_lstm.csv', index=False)

# RoBERTa Transformer

In [None]:
import tokenizers
import transformers

In [None]:
# Reload the raw data and rebuild `df` for the RoBERTa section.
# Only clean_data is applied here; the lemmatization step is not repeated.
train = pd.read_csv("../input/nlp-getting-started/train.csv")
test = pd.read_csv("../input/nlp-getting-started/test.csv")

df=pd.concat([train,test])
df.drop(columns=['keyword','location'], inplace=True)
df.text = clean_data(df.text)

### Dataset Class
The Dataset class inherits from the `torch.utils.data.Dataset` class. It is mandatory to override the `__getitem__()` and `__len__()` methods. This class preprocesses all data needed for the model. The text is tokenized using the `roberta-base` vocabulary. The RoBERTa model for text classification expects its input in the form:
> [cls_token] [......token_ids_of_tokenized_text.......] [sep_token]

We have taken `max_len=128` here. If a text is shorter than `max_len`, we pad it with 1's and set the corresponding mask to 0.

In [None]:
class Dataset(torch.utils.data.Dataset):
    """Tokenizes tweets for RoBERTa and serves fixed-length tensors.

    Args:
        df: pandas Series of texts (indexed positionally).
        y: Optional pandas Series of labels aligned with ``df``.
        max_len: Length every token sequence is padded or truncated to.
    """

    def __init__(self,df,y=None, max_len=128):
        self.df = df
        self.y = y
        self.max_len= max_len
        self.tokenizer = transformers.RobertaTokenizer.from_pretrained('roberta-base')

    def __getitem__(self,index):
        """Return a dict with ``ids``, ``masks`` and, when labels exist, ``out``."""
        row = self.df.iloc[index]
        ids,masks = self.get_input_data(row)
        data = {}
        data['ids'] = ids
        data['masks'] = masks
        if self.y is not None:
            data['out'] = torch.tensor(self.y.iloc[index],dtype=torch.float32)
        return data

    def __len__(self):
        return len(self.df)

    def get_input_data(self,row):
        """Encode one text into padded token ids and the matching attention mask.

        Args:
            row: The text to encode.

        Returns:
            ``(ids, mask)``: two int64 tensors of length ``max_len``; the
            mask is 1 on real tokens and 0 on padding.
        """
        token_ids = self.tokenizer.encode(row,add_special_tokens=True,add_prefix_space=True)

        # Over-long inputs are cut to max_len, keeping the closing separator,
        # so that every example has the same length and can be batched.
        if len(token_ids) > self.max_len:
            token_ids = token_ids[:self.max_len - 1] + [self.tokenizer.sep_token_id]

        n_real = len(token_ids)
        n_pad = self.max_len - n_real

        # Pad with the tokenizer's own pad id. In the roberta-base vocabulary
        # id 0 is the leading <s> token, so padding with 0 and masking on
        # `ids != 0` would hide that first token from the model.
        padded = torch.tensor(token_ids + [self.tokenizer.pad_token_id] * n_pad,
                              dtype=torch.int64)
        mask = torch.tensor([1] * n_real + [0] * n_pad, dtype=torch.int64)
        return padded, mask

In [None]:
# Rows with a label are training data; rows without one came from the test file.
train_df = df[df.target.notnull()]
test_df = df[df.target.isnull()]

In [None]:
# Hold out 20% of the labelled tweets for validation, stratified on the label.
# NOTE(review): no random_state is passed, so the split is not pinned by this call.
train_x, val_x, train_y, val_y = train_test_split(train_df.text, train_df.target, test_size=0.2, stratify=train_df.target)

In [None]:
# DataLoader settings shared by the train, validation and test loaders
batch_size = 32
num_workers = 2

# Training batches are reshuffled every epoch
train_loader = torch.utils.data.DataLoader(
    Dataset(train_x,train_y),
    batch_size=batch_size,
    shuffle=True,
    num_workers=num_workers
)

# Validation batches keep their order
val_loader = torch.utils.data.DataLoader(
    Dataset(val_x,val_y),
    batch_size=batch_size,
    shuffle=False,
    num_workers=num_workers
)

In [None]:
# Sanity check: pull one batch from the training loader and display it
next(iter(train_loader))

In [None]:
class RobertaClassificationModel(nn.Module):
    """``roberta-base`` encoder followed by a three-layer classification head.

    The head maps the encoder state at sequence position 0 through
    768 -> 512 -> 256 -> 1 linear layers, with dropout (p=0.5) before each
    layer and leaky-ReLU after the first two. The output is a single
    unnormalised score per example.
    """

    def __init__(self):
        super().__init__()
        # The attribute is called `distilBert`, but the encoder loaded here is RoBERTa.
        self.distilBert = transformers.RobertaModel.from_pretrained('roberta-base')
        self.l0 = nn.Linear(768,512)
        self.d0 = nn.Dropout(0.5)
        self.l1 = nn.Linear(512,256)
        self.d1 = nn.Dropout(0.5)
        self.l2 = nn.Linear(256,1)
        self.d2 = nn.Dropout(0.5)

    def forward(self,ids,masks):
        """Return one score per example.

        Args:
            ids: Token-id tensor passed to the encoder.
            masks: Attention mask passed to the encoder.
        """
        encoder_out = self.distilBert(ids,attention_mask=masks)
        # First element of the encoder output, at sequence position 0
        first_token = encoder_out[0][:,0]

        hidden = F.leaky_relu(self.l0(self.d0(first_token)))
        hidden = F.leaky_relu(self.l1(self.d1(hidden)))
        return self.l2(self.d2(hidden))

In [None]:
# Instantiate the RoBERTa classifier on the GPU when one is available, else on the CPU
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
model = RobertaClassificationModel().to(device)

In [None]:
# Display the module tree of the model
model

In [None]:
def accuracy_score(outputs,labels):
    """Return the fraction of examples whose rounded sigmoid equals the label.

    Args:
        outputs: Tensor of raw scores.
        labels: Tensor of 0/1 labels with the same shape as ``outputs``.

    Returns:
        A scalar tensor: number of matches divided by ``labels.size(0)``.
    """
    predicted = torch.sigmoid(outputs).round()
    n_correct = predicted.eq(labels).sum().float()
    return n_correct / labels.size(0)

In [None]:
def train_model(model, dataloaders_dict, criterion, optimizer, num_epochs):
    """Fine-tune ``model`` and evaluate it after every epoch.

    Each epoch runs a ``'train'`` phase followed by a ``'val'`` phase. Loss
    and accuracy are averaged over the examples of the phase, so a smaller
    final batch is weighted correctly. The final weights are saved to
    ``model.pth``.

    Args:
        model: Module called as ``model(ids, masks)`` returning one score per
            example.
        dataloaders_dict: Dict with ``'train'`` and ``'val'`` loaders whose
            batches are dicts with ``ids``, ``masks`` and ``out``.
        criterion: Loss called as ``criterion(scores, labels)``.
        optimizer: Optimizer updating the model parameters.
        num_epochs: Number of epochs to run.

    Returns:
        ``(losses, accuracies)``: two dicts mapping ``'train'``/``'val'`` to
        the list of per-epoch values.
    """
    # Use the GPU when there is one instead of requiring CUDA unconditionally.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)

    losses = {'train': list(), 'val': list()}
    accuracies = {'train': list(), 'val': list()}

    for epoch in range(num_epochs):
        for phase in ['train', 'val']:
            is_train = phase == 'train'
            model.train(is_train)

            running_loss = 0.0
            running_correct = 0.0
            n_seen = 0

            loader = dataloaders_dict[phase]

            for data in tqdm(loader, total=len(loader)):
                ids = data['ids'].to(device)
                masks = data['masks'].to(device)
                # Labels arrive as (batch,); the model scores are (batch, 1).
                labels = data['out'].to(device).unsqueeze(1)

                with torch.set_grad_enabled(is_train):
                    x = model(ids, masks)
                    loss = criterion(x, labels)

                    if is_train:
                        optimizer.zero_grad()
                        # The graph is not reused, so it is not retained.
                        loss.backward()
                        optimizer.step()

                n_batch = ids.size(0)
                # Weight both metrics by the batch size ...
                running_loss += loss.item() * n_batch
                running_correct += accuracy_score(x, labels).item() * n_batch
                n_seen += n_batch

            # ... and normalise by the number of examples, not of batches.
            epoch_loss = running_loss / max(n_seen, 1)
            epoch_acc = running_correct / max(n_seen, 1)

            losses[phase].append(epoch_loss)
            accuracies[phase].append(epoch_acc)

            print('Epoch {}/{} | {:^5} | Loss: {:.4f} | Accuracy: {:.4f}'.format(
                epoch + 1, num_epochs, phase, epoch_loss, epoch_acc))

    torch.save(model.state_dict(), 'model.pth')

    return losses, accuracies

In [None]:
# Loaders keyed by phase name, as train_model expects
dataloaders_dict = {'train': train_loader, 'val': val_loader}

# BCEWithLogitsLoss applies the sigmoid itself, so the model outputs raw scores
optimizer = torch.optim.AdamW(model.parameters(), lr=3e-5)
criterion = nn.BCEWithLogitsLoss()

In [None]:
# Fine-tune for five epochs and keep the per-epoch curves for the plots below
num_epochs = 5

losses, accuracies = train_model(model, dataloaders_dict, criterion, optimizer, num_epochs)

In [None]:
# Per-epoch loss curves of the RoBERTa run.
plt.figure(figsize=(10, 6))
plt.title('Loss')
# x and y are passed by keyword: current seaborn releases reject them as
# positional arguments.
sns.lineplot(x=list(range(len(losses['train']))), y=losses['train'], label='train')
sns.lineplot(x=list(range(len(losses['val']))), y=losses['val'], label='val')

In [None]:
# Per-epoch accuracy curves of the RoBERTa run.
plt.figure(figsize=(10, 6))
plt.title('Accuracy')
# x and y are passed by keyword: current seaborn releases reject them as
# positional arguments.
sns.lineplot(x=list(range(len(accuracies['train']))), y=accuracies['train'], label='train')
sns.lineplot(x=list(range(len(accuracies['val']))), y=accuracies['val'], label='val')

## Predictions

In [None]:
def make_predictions(test_loader):
    """Predict a 0/1 label for every example served by ``test_loader``.

    A fresh ``RobertaClassificationModel`` is created and loaded with the
    weights stored in ``model.pth``, the file written by ``train_model``.

    Args:
        test_loader: Loader whose batches are dicts with ``ids`` and ``masks``.

    Returns:
        A ``uint8`` NumPy array of shape ``(n_examples, 1)``.
    """
    # Use the GPU when there is one instead of requiring CUDA unconditionally.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    model = RobertaClassificationModel()
    # map_location lets weights saved on a GPU be loaded on a CPU-only machine.
    model.load_state_dict(torch.load('model.pth', map_location=device))
    model.to(device)
    model.eval()

    batch_scores = []

    with torch.no_grad():
        for data in tqdm(test_loader, total=len(test_loader)):
            ids = data['ids'].to(device)
            masks = data['masks'].to(device)
            batch_scores.append(model(ids, masks).cpu())

    if not batch_scores:
        return np.empty((0, 1), dtype=np.uint8)

    scores = torch.cat(batch_scores).numpy()
    # sigmoid(score) > 0.5 exactly when score > 0, so threshold the raw
    # scores and skip the exponential.
    return (scores > 0).astype(np.uint8)

In [None]:
# Unlabelled test tweets, served in their original order so that the
# predictions line up with the rows of the submission file
test_loader = torch.utils.data.DataLoader(
    Dataset(test_df['text'],y=None),
    batch_size=batch_size,
    shuffle=False,
    num_workers=num_workers)

In [None]:
# Sanity check: pull one batch from the test loader and display it
next(iter(test_loader))

In [None]:
# Run the saved RoBERTa model over the whole test set
predictions = make_predictions(test_loader)

In [None]:
# Fill the submission template with the RoBERTa predictions and save it.
# NOTE(review): this cell reads the template through an absolute /kaggle path,
# while the LSTM section uses a relative one — confirm both resolve to the same file.
submission = pd.read_csv('/kaggle/input/nlp-getting-started/sample_submission.csv')
submission['target'] = predictions
submission.to_csv('submission_roberta.csv', index=False)
submission.head()