In [1]:
import sys
sys.path.append('..')

In [2]:
from sklearn.model_selection import train_test_split
import pandas as pd
from torch import nn
from transformers import AutoTokenizer, AutoModel
import torch
!set 'PYTORCH_CUDA_ALLOC_CONF=max_split_size_mb:512'


In [3]:
df = pd.read_csv("../data/train.csv")
df_test = pd.read_csv("../data/test.csv")

In [4]:
labels = 'toxic severe_toxic obscene threat insult identity_hate'.split()
random_state = 1
input_dim = 768
learning_rate = 0.001
batch_size = 16
epoch_num = 2
weight_decay = 0.01

In [5]:
df.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\r\nWhy the edits made under my use...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\r\nMore\r\nI can't make any real suggestions...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [6]:
df_not_zero = df.loc[~(
    df.drop(['comment_text', 'id'], axis=1) == 0).all(axis=1)]
df_zero = df.loc[(
    df.drop(['comment_text', 'id'], axis=1) == 0).all(axis=1)]
df_zero = df_zero.iloc[:df_not_zero.shape[0]]

In [7]:
df = pd.concat([df_zero, df_not_zero]).sample(
    frac=1, random_state=random_state)
df = df.sample(frac=0.6)

In [8]:
device = torch.device('cuda')
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
model = AutoModel.from_pretrained("distilbert-base-uncased").to(device)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_projector.weight', 'vocab_transform.bias', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [9]:
from src.utils.preprocessing import Processor
processor = Processor(labels, tokenizer, device)
train, val = train_test_split(df, test_size=0.2, train_size=0.8, shuffle=True)

In [10]:
X_train = processor.tokenize_data(train)
X_val = processor.tokenize_data(val)
y_train = processor.extract_target(train)
y_val = processor.extract_target(val)

In [11]:
from torch.utils.data import DataLoader, TensorDataset
from src.models.classifier import Classifier
dataset = TensorDataset(X_train, y_train)
dataloader = DataLoader(dataset, batch_size=batch_size)
test_dataloader = DataLoader(TensorDataset(
    X_val, y_val), batch_size=batch_size)

classifier = Classifier(input_dim, device).to(device)
loss_fn = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(
    classifier.parameters(), lr=learning_rate, weight_decay=weight_decay)

In [12]:
from src.models.train import train_loop, test_loop
for epoch in range(epoch_num):
    train_loop(dataloader, model, classifier, loss_fn, optimizer)
    test_loop(
        test_dataloader, model, classifier, loss_fn)

1it [00:00,  1.11it/s]

loss: 0.713284  [   16/15576]


33it [00:21,  1.56it/s]

loss: 0.323902  [  528/15576]


65it [00:42,  1.56it/s]

loss: 0.580533  [ 1040/15576]


97it [01:03,  1.55it/s]

loss: 0.456023  [ 1552/15576]


129it [01:23,  1.52it/s]

loss: 0.391364  [ 2064/15576]


161it [01:43,  1.78it/s]

loss: 0.389000  [ 2576/15576]


193it [02:02,  1.62it/s]

loss: 0.381786  [ 3088/15576]


225it [02:23,  1.57it/s]

loss: 0.359925  [ 3600/15576]


257it [02:44,  1.53it/s]

loss: 0.315283  [ 4112/15576]


289it [03:05,  1.51it/s]

loss: 0.316853  [ 4624/15576]


321it [03:23,  1.79it/s]

loss: 0.300775  [ 5136/15576]


353it [03:41,  1.79it/s]

loss: 0.391315  [ 5648/15576]


385it [03:59,  1.79it/s]

loss: 0.406272  [ 6160/15576]


417it [04:17,  1.79it/s]

loss: 0.355827  [ 6672/15576]


449it [04:35,  1.79it/s]

loss: 0.216079  [ 7184/15576]


481it [04:53,  1.79it/s]

loss: 0.470193  [ 7696/15576]


513it [05:11,  1.78it/s]

loss: 0.477455  [ 8208/15576]


545it [05:29,  1.79it/s]

loss: 0.336466  [ 8720/15576]


577it [05:47,  1.78it/s]

loss: 0.358078  [ 9232/15576]


609it [06:05,  1.78it/s]

loss: 0.315844  [ 9744/15576]


641it [06:23,  1.79it/s]

loss: 0.344520  [10256/15576]


673it [06:41,  1.79it/s]

loss: 0.265690  [10768/15576]


705it [06:59,  1.79it/s]

loss: 0.312732  [11280/15576]


737it [07:17,  1.79it/s]

loss: 0.413596  [11792/15576]


769it [07:35,  1.80it/s]

loss: 0.339914  [12304/15576]


801it [07:53,  1.79it/s]

loss: 0.351562  [12816/15576]


833it [08:11,  1.79it/s]

loss: 0.414689  [13328/15576]


865it [08:29,  1.79it/s]

loss: 0.516447  [13840/15576]


897it [08:47,  1.82it/s]

loss: 0.313054  [14352/15576]


929it [09:05,  1.80it/s]

loss: 0.312462  [14864/15576]


961it [09:23,  1.79it/s]

loss: 0.376588  [15376/15576]


974it [09:30,  1.71it/s]
100%|██████████| 244/244 [00:49<00:00,  4.94it/s]


Test Error: 
 Accuracy: [0.57806882 0.94812532 0.73831536 0.98613251 0.74884438 0.95839753], Avg loss: 0.369683 



1it [00:00,  1.73it/s]

loss: 0.369560  [   16/15576]


33it [00:18,  1.80it/s]

loss: 0.321477  [  528/15576]


65it [00:37,  1.69it/s]

loss: 0.532433  [ 1040/15576]


97it [00:55,  1.77it/s]

loss: 0.437561  [ 1552/15576]


129it [01:14,  1.77it/s]

loss: 0.388731  [ 2064/15576]


161it [01:32,  1.77it/s]

loss: 0.393344  [ 2576/15576]


193it [01:51,  1.77it/s]

loss: 0.424722  [ 3088/15576]


225it [02:09,  1.76it/s]

loss: 0.371643  [ 3600/15576]


257it [02:27,  1.77it/s]

loss: 0.305359  [ 4112/15576]


289it [02:46,  1.76it/s]

loss: 0.313549  [ 4624/15576]


321it [03:04,  1.76it/s]

loss: 0.270863  [ 5136/15576]


353it [03:23,  1.75it/s]

loss: 0.394129  [ 5648/15576]


385it [03:41,  1.76it/s]

loss: 0.399679  [ 6160/15576]


417it [04:00,  1.76it/s]

loss: 0.372746  [ 6672/15576]


449it [04:18,  1.77it/s]

loss: 0.241230  [ 7184/15576]


481it [04:37,  1.74it/s]

loss: 0.446281  [ 7696/15576]


513it [04:56,  1.74it/s]

loss: 0.487410  [ 8208/15576]


545it [05:15,  1.71it/s]

loss: 0.332198  [ 8720/15576]


577it [05:34,  1.69it/s]

loss: 0.337737  [ 9232/15576]


609it [05:53,  1.69it/s]

loss: 0.326762  [ 9744/15576]


641it [06:11,  1.71it/s]

loss: 0.350348  [10256/15576]


673it [06:30,  1.70it/s]

loss: 0.285515  [10768/15576]


705it [06:49,  1.68it/s]

loss: 0.333383  [11280/15576]


737it [07:08,  1.69it/s]

loss: 0.396754  [11792/15576]


769it [07:27,  1.77it/s]

loss: 0.340893  [12304/15576]


801it [07:45,  1.71it/s]

loss: 0.352587  [12816/15576]


833it [08:04,  1.67it/s]

loss: 0.398247  [13328/15576]


865it [08:22,  1.70it/s]

loss: 0.526069  [13840/15576]


897it [08:41,  1.77it/s]

loss: 0.284543  [14352/15576]


929it [08:59,  1.78it/s]

loss: 0.321116  [14864/15576]


961it [09:18,  1.77it/s]

loss: 0.386942  [15376/15576]


974it [09:25,  1.72it/s]
100%|██████████| 244/244 [00:50<00:00,  4.80it/s]

Test Error: 
 Accuracy: [0.62455059 0.94812532 0.73780175 0.98613251 0.74910118 0.95839753], Avg loss: 0.361501 






In [19]:
x_test = processor.tokenize_data(df_test)
emb = model(x_test).last_hidden_state[:, 0]
with torch.no_grad():
    pred = classifier(emb)
    result = torch.sigmoid(pred)
merged = pd.concat([df_test['id'], pd.DataFrame(
    result.to('cpu').numpy(), columns=labels)], axis=1)

OutOfMemoryError: CUDA out of memory. Tried to allocate 224.36 GiB (GPU 0; 6.00 GiB total capacity; 2.54 GiB already allocated; 2.25 GiB free; 2.72 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [18]:
merged

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,0.571744,0.067052,0.262303,0.032668,0.242250,0.070918
1,0000247867823ef7,0.678830,0.057663,0.300749,0.023797,0.266873,0.056454
2,00013b17ad220c46,0.635623,0.093876,0.322714,0.038011,0.288685,0.072750
3,00017563c3f7919a,0.476595,0.058050,0.176729,0.027210,0.169526,0.063326
4,00017695ad8997eb,0.686566,0.120731,0.377411,0.049431,0.326054,0.088727
...,...,...,...,...,...,...,...
153159,fffcd0960ee309b5,,,,,,
153160,fffd7a9a6eb32c16,,,,,,
153161,fffda9e8d6fafa9e,,,,,,
153162,fffe8f1340a79fc2,,,,,,
