# Huggingface datasets



In [None]:
# Most basic stuff for EDA.

import numpy as np
import pandas as pd
pd.set_option('display.max_columns', 50)
pd.set_option('display.max_rows', 150)
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

# !pip install transformers
# !pip install datasets
from datasets import load_dataset, load_metric
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer 

import warnings
warnings.filterwarnings("ignore")

# Read the data


In [None]:
# Load the training tweets and the test tweets from the data directory.
train_tweets, test_tweets = (
    pd.read_csv('../data/train.csv'),
    pd.read_csv('../data/test.csv'),
)

In [None]:
# Preview the first rows of the training frame.
train_tweets.head()

# EDA

In [None]:
# Horizontal bar chart of the number of rows per value of the `target` column.
sns.set_style('whitegrid')
sns.countplot(data=train_tweets, y='target')

In [None]:
# Frequency of each `location` value, showing the 20 most common ones.
location_counts = train_tweets['location'].value_counts()
location_counts.head(20)


Let's clean the text

In [None]:
# Basic helper functions that clean text by removing URLs, emojis, HTML tags and punctuation.
import re
import string

def remove_URL(text):
    """Return ``text`` with every ``http(s)://...`` or ``www....`` link deleted.

    A link is matched up to (not including) the next whitespace character.
    """
    return re.sub(r'https?://\S+|www\.\S+', '', text)


def remove_emoji(text):
    """Return ``text`` with emoji and pictographic symbols removed.

    The character class lists explicit emoji/symbol blocks only. An earlier
    version used the single range U+24C2..U+1F251, which also covers CJK
    ideographs, Hangul, kana and many other scripts and therefore deleted
    ordinary non-English text.

    Parameters
    ----------
    text : str
        Input string.

    Returns
    -------
    str
        ``text`` without the matched characters; all other characters
        (including whitespace around a removed emoji) are left untouched.
    """
    emoji_pattern = re.compile(
        '['
        '\U0001F600-\U0001F64F'  # emoticons
        '\U0001F300-\U0001F5FF'  # symbols & pictographs
        '\U0001F680-\U0001F6FF'  # transport & map symbols
        '\U0001F1E0-\U0001F1FF'  # flags (regional indicator symbols)
        '\U0001F900-\U0001F9FF'  # supplemental symbols & pictographs
        '\U00002600-\U000027BF'  # miscellaneous symbols and dingbats
        '\U00002B00-\U00002BFF'  # miscellaneous symbols and arrows
        '\U000024C2'             # circled latin capital letter M
        '\U0001F200-\U0001F251'  # enclosed ideographic supplement
        '\U0000FE0F'             # emoji variation selector
        ']+',
        flags=re.UNICODE)
    return emoji_pattern.sub('', text)


def remove_html(text):
    """Return ``text`` with HTML tags and character entities deleted.

    Matches ``<...>`` spans (shortest match, single line) and entities of the
    form ``&name;``, ``&#123;`` or ``&#x1f;`` (lower-case only).
    """
    pattern = r'<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});'
    return re.compile(pattern).sub('', text)


def remove_punct(text):
    """Return ``text`` without any character listed in ``string.punctuation``.

    Only ASCII punctuation is affected; every other character is kept as is.
    """
    punctuation = set(string.punctuation)
    return ''.join(ch for ch in text if ch not in punctuation)

# Run the cleaning helpers, in this order, over the raw `text` column of the
# train frame and then the test frame; the result is stored in `text_clean`.
for frame in (train_tweets, test_tweets):
    cleaned = frame['text']
    for step in (remove_URL, remove_emoji, remove_html, remove_punct):
        cleaned = cleaned.apply(step)
    frame['text_clean'] = cleaned

In [None]:
# Replace every missing value in both frames with an empty string, in place.
for frame in (train_tweets, test_tweets):
    frame.fillna("", inplace=True)

In [None]:
# Name of the pretrained checkpoint passed to the `from_pretrained` calls in this notebook.
model_nm = 'microsoft/deberta-v3-small'

In [None]:
# Load the tokenizer for the checkpoint named in `model_nm`.
tokz = AutoTokenizer.from_pretrained(model_nm)

In [None]:
# The tokenizer's separator token; used as the delimiter when text fields are joined.
sep = tokz.sep_token

In [None]:
# Build the model input string: cleaned text, location and keyword, joined
# with the tokenizer's separator token between consecutive fields.
combined = train_tweets['text_clean']
for column in ('location', 'keyword'):
    combined = combined + sep + train_tweets[column]
train_tweets['train'] = combined

In [None]:
# Preview the training frame again, now including the added columns.
train_tweets.head()

# Training

Time to import some stuff we'll need for training

In [None]:
from torch.utils.data import DataLoader
import warnings,transformers,logging,torch
from transformers import TrainingArguments,Trainer
from transformers import AutoModelForSequenceClassification,AutoTokenizer

from datasets import load_dataset, Dataset, DatasetDict


In [None]:
def tok_func(x):
    """Tokenize the `train` field of ``x`` with the notebook-level tokenizer `tokz`."""
    texts = x['train']
    return tokz(texts)

In [None]:
# Wrap the train and test frames as Hugging Face `Dataset` objects.
ds, eval_ds = (Dataset.from_pandas(frame) for frame in (train_tweets, test_tweets))

In [None]:
# List the column names of the training frame.
train_tweets.columns

In [None]:
# Tokenize in batches, drop the raw columns listed below, and rename
# `target` to `label`.
raw_columns = ['id', 'keyword', 'location', 'text', 'text_clean']
tok_ds = ds.map(tok_func, batched=True, remove_columns=raw_columns)
tok_ds = tok_ds.rename_column('target', 'label')

In [None]:
# Inspect the first tokenized example.
tok_ds[0]

In [None]:
# Hold out 20% of the tokenized data for validation. The seed is fixed so that
# a fresh "Restart & Run All" produces the same split and comparable metrics.
dds = tok_ds.train_test_split(test_size=0.2, seed=42)

# Initial model

In [None]:
# Hyper-parameters fed into `TrainingArguments`.
lr = 8e-5    # learning rate
bs = 32      # per-device training batch size (evaluation uses twice this)
wd = 0.01    # weight decay
epochs = 4   # number of training epochs

In [None]:
# Training configuration used by every Trainer in this notebook: cosine
# learning-rate schedule with a 10% warm-up, evaluation after each epoch, and
# an evaluation batch twice the size of the training batch.
# fp16 mixed precision is enabled only when a CUDA device is available, because
# TrainingArguments rejects fp16=True on a machine without a supported
# accelerator. On a CUDA machine the behaviour is unchanged.
args = TrainingArguments('outputs', learning_rate=lr, warmup_ratio=0.1, lr_scheduler_type='cosine',
    fp16=torch.cuda.is_available(),
    evaluation_strategy="epoch", per_device_train_batch_size=bs, per_device_eval_batch_size=bs*2,
    num_train_epochs=epochs, weight_decay=wd, report_to='none')

We can now create our model and a `Trainer`, a class that combines the data and the model (just like `Learner` in fastai).

In [None]:
from datasets import load_metric

# Metric objects are loaded once and reused on every evaluation call.
accuracy = load_metric("accuracy")
f1 = load_metric('f1')


def compute_metrics(eval_pred):
    """Compute accuracy and F1 for one evaluation pass.

    ``eval_pred`` unpacks into ``(logits, labels)``; the predicted class is the
    argmax over the last axis of ``logits``. Returns a dict with the keys
    ``'accuracy'`` and ``'f1'``.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    scores = {}
    for name, metric in (('accuracy', accuracy), ('f1', f1)):
        scores[name] = metric.compute(predictions=predictions, references=labels)[name]
    return scores

In [None]:
# Baseline: the library's sequence-classification model for the checkpoint,
# configured for two labels, wired into a Trainer with the 80/20 split.
model = AutoModelForSequenceClassification.from_pretrained(model_nm, num_labels=2)
trainer = Trainer(
    model,
    args,
    train_dataset=dds['train'],
    eval_dataset=dds['test'],
    tokenizer=tokz,
    compute_metrics=compute_metrics,
)

In [None]:
#trainer.train()

# New Model

In [None]:
import torch.nn as nn
from transformers.modeling_outputs import SequenceClassifierOutput
from transformers import AutoModel, AutoConfig

In [None]:
class CustomModel(nn.Module):
    """Pretrained encoder with a hand-written linear classification head.

    The encoder is loaded with ``output_attentions`` and
    ``output_hidden_states`` enabled, so both are returned alongside the
    logits. The classifier input width is read from the encoder's config
    instead of being hard-coded, so checkpoints with a hidden size other than
    768 also work.

    Parameters
    ----------
    checkpoint : str
        Name or path handed to ``AutoConfig`` / ``AutoModel.from_pretrained``.
    num_labels : int
        Number of output classes.
    """

    def __init__(self, checkpoint, num_labels):
        super().__init__()
        self.num_labels = num_labels

        # Load the encoder body for the given checkpoint.
        config = AutoConfig.from_pretrained(checkpoint, output_attentions=True, output_hidden_states=True)
        self.model = AutoModel.from_pretrained(checkpoint, config=config)

        # Size the head from the encoder itself rather than assuming 768.
        hidden_size = self.model.config.hidden_size
        self.dropout = nn.Dropout(0.1)
        self.classifier = nn.Linear(hidden_size, num_labels)

    def forward(self, input_ids=None, attention_mask=None, labels=None):
        """Run the encoder and classify from the first token's representation.

        Returns a ``SequenceClassifierOutput`` carrying ``logits``, the
        encoder's ``hidden_states`` and ``attentions``, and a cross-entropy
        ``loss`` when ``labels`` is given (``None`` otherwise).
        """
        outputs = self.model(input_ids=input_ids, attention_mask=attention_mask)

        # outputs[0] is indexed as (batch, position, feature); dropout is
        # applied before the head.
        sequence_output = self.dropout(outputs[0])

        # Position 0 of every sequence feeds the classifier; the slice is
        # already (batch, feature), so no reshape is needed.
        logits = self.classifier(sequence_output[:, 0, :])

        loss = None
        if labels is not None:
            loss_fct = nn.CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))

        return SequenceClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


In [None]:
# Train the hand-written head instead of the stock classification model.
custom_model = CustomModel(checkpoint=model_nm, num_labels=2)
# NOTE(review): `model` is reloaded here but the Trainer below receives
# `custom_model`; this load looks unused in the visible cells.
model = AutoModelForSequenceClassification.from_pretrained(model_nm, num_labels=2)

trainer = Trainer(
    custom_model,
    args,
    train_dataset=dds['train'],
    eval_dataset=dds['test'],
    tokenizer=tokz,
    compute_metrics=compute_metrics,
)

In [None]:
#trainer.train(ignore_keys_for_eval=['hidden_states','attentions'])

# LSTM model

In [None]:
from torch.nn.utils.rnn import pack_padded_sequence

class LSTMModel(nn.Module):
    """Pretrained encoder whose token outputs are pooled by a bidirectional LSTM.

    The final hidden states of the two LSTM directions are concatenated, passed
    through dropout and fed to a linear classifier. The LSTM input width and
    hidden size are read from the encoder's config rather than hard-coded to
    768, so other checkpoints also work.

    Parameters
    ----------
    checkpoint : str
        Name or path handed to ``AutoConfig`` / ``AutoModel.from_pretrained``.
    num_labels : int
        Number of output classes.
    """

    def __init__(self, checkpoint, num_labels):
        super().__init__()
        self.num_labels = num_labels

        # Load the encoder body for the given checkpoint.
        config = AutoConfig.from_pretrained(checkpoint, output_attentions=True, output_hidden_states=True)
        self.model = AutoModel.from_pretrained(checkpoint, config=config)

        encoder_dim = self.model.config.hidden_size
        self.lstm_hidden_size = encoder_dim
        self.lstm = nn.LSTM(encoder_dim, self.lstm_hidden_size, bidirectional=True)

        self.dropout = nn.Dropout(0.1)
        # Two directions are concatenated, hence twice the LSTM hidden size.
        self.classifier = nn.Linear(2 * self.lstm_hidden_size, num_labels)

    def forward(self, input_ids=None, attention_mask=None, labels=None):
        """Encode, pool with the LSTM and classify.

        Returns a ``SequenceClassifierOutput`` carrying ``logits``, the
        encoder's ``hidden_states`` and ``attentions``, and a cross-entropy
        ``loss`` when ``labels`` is given (``None`` otherwise).
        """
        # Per-example sequence length = number of attended positions. Lengths
        # must live on the CPU for pack_padded_sequence.
        # NOTE(review): this assumes padding sits at the end of each sequence.
        if attention_mask is None:
            # No mask supplied: treat every position as a real token.
            sent_lengths = torch.full((input_ids.size(0),), input_ids.size(1), dtype=torch.long)
        else:
            sent_lengths = attention_mask.sum(dim=-1).cpu()

        outputs = self.model(input_ids=input_ids, attention_mask=attention_mask)

        # outputs[0] is (batch_size, seq_length, hidden_dim); packing lets the
        # LSTM stop at each example's true length instead of reading padding.
        packed = pack_padded_sequence(outputs[0], sent_lengths, enforce_sorted=False, batch_first=True)
        _, (last_hidden, _) = self.lstm(packed)

        # Concatenate the final states of both directions:
        # (batch_size, 2*hidden_size)
        output_hidden = torch.cat((last_hidden[0], last_hidden[1]), dim=1)
        output_hidden = self.dropout(output_hidden)

        logits = self.classifier(output_hidden)

        loss = None
        if labels is not None:
            loss_fct = nn.CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))

        return SequenceClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


In [None]:
# Train the LSTM-pooled variant.
custom_model = LSTMModel(checkpoint=model_nm, num_labels=2)
# NOTE(review): `model` is reloaded here but the Trainer below receives
# `custom_model`; this load looks unused in the visible cells.
model = AutoModelForSequenceClassification.from_pretrained(model_nm, num_labels=2)

trainer = Trainer(
    custom_model,
    args,
    train_dataset=dds['train'],
    eval_dataset=dds['test'],
    tokenizer=tokz,
    compute_metrics=compute_metrics,
)

In [None]:
# The custom model also returns hidden states and attentions; keep them out of
# the per-epoch evaluation so only the logits reach `compute_metrics`.
eval_ignore_keys = ['hidden_states', 'attentions']
trainer.train(ignore_keys_for_eval=eval_ignore_keys)

# Eval dataset

In [None]:
# Evaluate on the held-out split. The custom model returns hidden states and
# attentions next to the logits; without `ignore_keys` the Trainer would gather
# all of them and `compute_metrics` would receive a tuple instead of the logits
# array.
metrics = trainer.evaluate(eval_dataset=dds['test'], ignore_keys=['hidden_states', 'attentions'])

In [None]:
# Display the metrics returned by `trainer.evaluate`.
metrics

In [None]:
# Predict on the held-out split. `ignore_keys` drops the hidden states and
# attentions returned by the custom model, so `.predictions` is the logits
# array alone; argmax over axis 1 then gives the predicted class per example.
pred_output = trainer.predict(
    dds['test'],
    ignore_keys=['hidden_states', 'attentions'],
    metric_key_prefix='predict',
)
predictions = np.argmax(pred_output.predictions, axis=1)

In [None]:
# Display the predicted class indices for the held-out split.
predictions