In [1]:
import os
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertModel, AdamW, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('good_quality.csv')
df.head()

Unnamed: 0,text,tag
0,"killa cam , killa cam , camkilla cam , killa c...",rap
1,"yeah , hah , yeah , roc a fellawe invite you t...",rap
2,"ugh , killa baby kanye , this that heron flow ...",rap
3,so they ask me young boywhat you gon ' do the ...,rap
4,"hahauh huhno homo young mula , baby i say , he...",rap


In [3]:
# get distinct tags
target_cols = df['tag'].unique()
target_cols

array(['rap', 'pop', 'rock', 'rb', 'country'], dtype=object)

In [4]:
df.tag.value_counts()

tag
rap        105822
pop         98869
rock        83318
rb          45667
country     34872
Name: count, dtype: int64

In [5]:
# sample 30% of the data
df = df.sample(frac=0.5).reset_index(drop=True)
# reset the index
df.reset_index(drop=True, inplace=True)
df.head()

Unnamed: 0,text,tag
0,"deniz st k p r rhey hey can m , rinna nay , ri...",rock
1,",",rock
2,gezer on the beat gece gene tehlikeli ve bad t...,rap
3,"yeah , when the devil come boythe devil gon ' ...",rap
4,s l nge skutan kan g s l nge hj rtat kan sl s ...,pop


In [6]:
from sklearn.preprocessing import OneHotEncoder
encoder = OneHotEncoder()
label = encoder.fit_transform(df[['tag']]).toarray()

# append one hot encoded label to the dataframe with distinct tags as columns
df = pd.concat([df, pd.DataFrame(label, columns=target_cols)], axis=1)
df.head()

Unnamed: 0,text,tag,rap,pop,rock,rb,country
0,"deniz st k p r rhey hey can m , rinna nay , ri...",rock,0.0,0.0,0.0,0.0,1.0
1,",",rock,0.0,0.0,0.0,0.0,1.0
2,gezer on the beat gece gene tehlikeli ve bad t...,rap,0.0,0.0,1.0,0.0,0.0
3,"yeah , when the devil come boythe devil gon ' ...",rap,0.0,0.0,1.0,0.0,0.0
4,s l nge skutan kan g s l nge hj rtat kan sl s ...,pop,0.0,1.0,0.0,0.0,0.0


In [7]:
# split the data into training and validation
df.drop(columns=['tag'], inplace=True)
# reset the index
df.reset_index(drop=True, inplace=True)

df_train, df_val = train_test_split(df, test_size=0.2, random_state=42)
df_train.shape, df_val.shape

((147419, 6), (36855, 6))

In [8]:
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)

In [9]:
from transformers import AutoTokenizer, AutoModel

In [10]:
# Defining some key variables that will be used later on in the training
MAX_LEN = 200
TRAIN_BATCH_SIZE = 32
VALID_BATCH_SIZE = 32
EPOCHS = 2
LEARNING_RATE = 2e-5
tokenizer = AutoTokenizer.from_pretrained('roberta-base')

In [11]:
class BERTDataset(Dataset):
    def __init__(self, df, tokenizer, max_len):
        self.df = df
        self.max_len = max_len
        self.text = df.text
        self.tokenizer = tokenizer
        self.targets = df[target_cols].values

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        text = self.text[index]
        inputs = self.tokenizer.encode_plus(
            text,
            truncation=True,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            return_token_type_ids=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]

        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'targets': torch.tensor(self.targets[index], dtype=torch.float)
        }

In [12]:
train_dataset = BERTDataset(df_train, tokenizer, MAX_LEN)
valid_dataset = BERTDataset(df_val, tokenizer, MAX_LEN)

In [13]:
train_loader = DataLoader(train_dataset, batch_size=TRAIN_BATCH_SIZE, 
                          num_workers=4, shuffle=True, pin_memory=True)
valid_loader = DataLoader(valid_dataset, batch_size=VALID_BATCH_SIZE, 
                          num_workers=4, shuffle=False, pin_memory=True)

In [20]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [12]:
class BERTClass(torch.nn.Module):
    def __init__(self):
        super(BERTClass, self).__init__()
        self.roberta = AutoModel.from_pretrained('roberta-base')
        # self.l2 = torch.nn.Dropout(0.3)
        self.fc = torch.nn.Linear(768,5)
    
    def forward(self, ids, mask, token_type_ids):
        _, features = self.roberta(ids, attention_mask = mask, token_type_ids = token_type_ids, return_dict=False)
        # output_2 = self.l2(output_1)
        output = self.fc(features)
        return output

model = BERTClass()
model.to(device);

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


NameError: name 'device' is not defined

In [13]:
def loss_fn(outputs, targets):
    return torch.nn.BCEWithLogitsLoss()(outputs, targets)

In [14]:
optimizer = AdamW(params =  model.parameters(), lr=LEARNING_RATE, weight_decay=1e-6)



In [18]:
def train(epoch):
    model.train()
    for _,data in enumerate(train_loader, 0):
        ids = data['ids'].to(device, dtype = torch.long)
        mask = data['mask'].to(device, dtype = torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
        targets = data['targets'].to(device, dtype = torch.float)

        outputs = model(ids, mask, token_type_ids)

        loss = loss_fn(outputs, targets)
        if _%40 == 0:
            print(f'Epoch: {epoch}, Loss:  {loss.item()}, processed: {_/len(train_loader)*100}%')
            
        
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

In [24]:
for epoch in range(EPOCHS):
    train(epoch)

Epoch: 0, Loss:  0.14179185032844543, processed: 0.0%
Epoch: 0, Loss:  0.1984497457742691, processed: 0.8682439765574127%
Epoch: 0, Loss:  0.13869385421276093, processed: 1.7364879531148254%
Epoch: 0, Loss:  0.18641141057014465, processed: 2.604731929672238%
Epoch: 0, Loss:  0.2038928121328354, processed: 3.472975906229651%
Epoch: 0, Loss:  0.17198549211025238, processed: 4.341219882787064%
Epoch: 0, Loss:  0.11221449822187424, processed: 5.209463859344476%
Epoch: 0, Loss:  0.2029007524251938, processed: 6.077707835901888%
Epoch: 0, Loss:  0.10868886858224869, processed: 6.945951812459302%
Epoch: 0, Loss:  0.1311841458082199, processed: 7.814195789016713%
Epoch: 0, Loss:  0.1835675686597824, processed: 8.682439765574127%
Epoch: 0, Loss:  0.12578506767749786, processed: 9.55068374213154%
Epoch: 0, Loss:  0.1619919389486313, processed: 10.418927718688952%
Epoch: 0, Loss:  0.07997047156095505, processed: 11.287171695246364%
Epoch: 0, Loss:  0.10955436527729034, processed: 12.1554156718037

In [15]:
def validation():
    model.eval()
    fin_targets=[]
    fin_outputs=[]
    with torch.no_grad():
        for _, data in enumerate(valid_loader, 0):
            ids = data['ids'].to(device, dtype = torch.long)
            mask = data['mask'].to(device, dtype = torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
            targets = data['targets'].to(device, dtype = torch.float)
            outputs = model(ids, mask, token_type_ids)
            fin_targets.extend(targets.cpu().detach().numpy().tolist())
            fin_outputs.extend(torch.sigmoid(outputs).cpu().detach().numpy().tolist())
    return fin_outputs, fin_targets

In [26]:
from sklearn import metrics

outputs, targets = validation()
outputs = np.array(outputs) >= 0.5
accuracy = metrics.accuracy_score(targets, outputs)
f1_score_micro = metrics.f1_score(targets, outputs, average='micro')
f1_score_macro = metrics.f1_score(targets, outputs, average='macro')
print(f"Accuracy Score = {accuracy}")
print(f"F1 Score (Micro) = {f1_score_micro}")
print(f"F1 Score (Macro) = {f1_score_macro}")

Accuracy Score = 0.6339167005833672
F1 Score (Micro) = 0.6570641634580253
F1 Score (Macro) = 0.6287798081604311


In [27]:
torch.save(model.state_dict(), 'model.bin')

## Load the model

In [24]:
model = BERTClass()
model.load_state_dict(torch.load('model.bin'))
model.to('cpu')

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BERTClass(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm

In [25]:
def predict(text):
    model.eval()
    inputs = tokenizer.encode_plus(
        text,
        truncation=True,
        add_special_tokens=True,
        max_length=MAX_LEN,
        padding='max_length',
        return_token_type_ids=True
    )
    ids = torch.tensor(inputs['input_ids'], dtype=torch.long).unsqueeze(0)
    mask = torch.tensor(inputs['attention_mask'], dtype=torch.long).unsqueeze(0)
    token_type_ids = torch.tensor(inputs["token_type_ids"], dtype=torch.long).unsqueeze(0)
    outputs = model(ids, mask, token_type_ids)
    outputs = torch.sigmoid(outputs).cpu().detach().numpy()
    return outputs

In [45]:
test = df_val.iloc[:10]

In [62]:
i = 2

In [63]:
test.iloc[i].text

"crushing in the sheets , two bodies in the heatwe ' re in love , we re fucking in the cartime flies when you don ' t wanna die so by this rate we gotta try and stay alive cause we ' re tight , two peas in a pod , two hooks sinking down from the same little fishing rodnothing makes me happier than watching you winhey that s something new , i was a selfish kidyeah that ' s pretty weird right , that ' s pretty strangewell have you ever seen a jellyfish ? now that shit ' s crazyi think you saved mei know you saved mei don ' t even know if i ' m real or if you are or if we ' re just a simulation staged from a sports bar on marswhere the aliens drink , watching dishes build up in the kitchen sinkand they ' re placing their bets , hanging their heads , got money on who s gonna die in bedwho s gonna play with themselves againwho ' s breaking down mentally nextbut how nice is it , that i can watch you wini couldn t care less which world i ' m inand that ' s the only sense that i can makeand i 

In [64]:
test.iloc[i]

text       crushing in the sheets , two bodies in the heatwe ' re in love , we re fucking in the cartime flies when you don ' t wanna die so by this rate we gotta try and stay alive cause we ' re tight , two peas in a pod , two hooks sinking down from the same little fishing rodnothing makes me happier than watching you winhey that s something new , i was a selfish kidyeah that ' s pretty weird right , that ' s pretty strangewell have you ever seen a jellyfish ? now that shit ' s crazyi think you saved mei know you saved mei don ' t even know if i ' m real or if you are or if we ' re just a simulation staged from a sports bar on marswhere the aliens drink , watching dishes build up in the kitchen sinkand they ' re placing their bets , hanging their heads , got money on who s gonna die in bedwho s gonna play with themselves againwho ' s breaking down mentally nextbut how nice is it , that i can watch you wini couldn t care less which world i ' m inand that ' s the only sense that i can 

In [54]:
for text in test.text:
    outputs = predict(text)
    outputs_max = np.argmax(outputs)
    tag = target_cols[outputs_max]
    print(outputs)
    print(f"Predicted Tag: {tag}")
    print()


[[0.00173563 0.00118704 0.9944351  0.00154147 0.00105888]]
Predicted Tag: rock

[[8.0763958e-03 4.2739205e-02 2.8744020e-04 2.6778230e-03 9.5094585e-01]]
Predicted Tag: country

[[0.0039003  0.09399678 0.00234132 0.00330439 0.91061956]]
Predicted Tag: country

[[0.00172226 0.00691838 0.00162187 0.00150994 0.99212986]]
Predicted Tag: country

[[0.00174412 0.0063479  0.00193696 0.00143883 0.9921787 ]]
Predicted Tag: country

[[9.7395533e-01 1.9941580e-02 2.9332418e-04 3.4603490e-03 7.9612574e-03]]
Predicted Tag: rap

[[8.4874898e-01 1.4867063e-01 1.7747722e-04 6.4873165e-03 1.4381234e-02]]
Predicted Tag: rap

[[0.00156563 0.8235386  0.13255306 0.05709378 0.00647233]]
Predicted Tag: pop

[[0.00133492 0.00102144 0.9942796  0.00194161 0.00119073]]
Predicted Tag: rock

[[0.00441524 0.51239955 0.00279554 0.11510053 0.3540607 ]]
Predicted Tag: pop

