In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the input directory tree and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/learn-ai-bbc/BBC News Train.csv
/kaggle/input/learn-ai-bbc/BBC News Sample Solution.csv
/kaggle/input/learn-ai-bbc/BBC News Test.csv


In [2]:
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import Adam

from transformers import DistilBertTokenizer, DistilBertModel
from transformers import AutoModel, AutoTokenizer 
from transformers import BertTokenizer, RobertaTokenizer,RobertaForSequenceClassification

import random
import time
from tqdm.notebook import tqdm
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

In [3]:
# Load the training articles and the test articles from the competition input.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/learn-ai-bbc/BBC News Train.csv",
        "/kaggle/input/learn-ai-bbc/BBC News Test.csv",
    )
)

In [4]:
# Preview the first rows of the training frame.
train_df.head()

Unnamed: 0,ArticleId,Text,Category
0,1833,worldcom ex-boss launches defence lawyers defe...,business
1,154,german business confidence slides german busin...,business
2,1101,bbc poll indicates economic gloom citizens in ...,business
3,1976,lifestyle governs mobile choice faster bett...,tech
4,917,enron bosses in $168m payout eighteen former e...,business


In [5]:
# Number of distinct category labels; dropna=False keeps this equal to len(unique()).
train_df['Category'].nunique(dropna=False)

5

In [6]:
# Print column dtypes and non-null counts of the training frame.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1490 entries, 0 to 1489
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   ArticleId  1490 non-null   int64 
 1   Text       1490 non-null   object
 2   Category   1490 non-null   object
dtypes: int64(1), object(2)
memory usage: 35.0+ KB


In [7]:
# Character count of every article, stored as a new `len` column on the training frame.
train_df['len'] = train_df['Text'].map(len)
# Average article length in characters.
train_df['len'].mean()

2233.461744966443

In [8]:
# Seed settings: fix every random number generator in use so runs are repeatable.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)

# Turn off cuDNN autotuning and request deterministic cuDNN algorithms.
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

# GPU settings: use CUDA when it is available, otherwise fall back to the CPU.
device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(device_name)
device.type

'cuda'

In [9]:
# Fit the encoder on the training categories; fit() returns the encoder itself.
label_encoder = LabelEncoder().fit(train_df['Category'].values)
# Display the fitted encoder as the cell output.
label_encoder

LabelEncoder()

In [10]:
# Shell out to nvidia-smi to show the attached GPU, driver/CUDA versions and memory usage.
!nvidia-smi

Tue Nov 29 08:37:08 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.82.01    Driver Version: 470.82.01    CUDA Version: 11.4     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   51C    P0    43W / 250W |      2MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
# Encode the category strings as integer class ids. LabelEncoder operates on a
# 1-D array of labels, so the column is passed as-is instead of being reshaped
# to (n_samples, 1), which scikit-learn has to flatten again with a warning.
temp = label_encoder.transform(train_df['Category'].values)

In [None]:
# Disabled scratch code, kept inside a string literal so it never executes.
# NOTE(review): `model_name` is not defined anywhere in the visible notebook,
# so this snippet would need that variable before it could be re-enabled.
'''
from transformers import AutoConfig

config = AutoConfig.from_pretrained(model_name)
print(config.__class__)
print(config)
'''

In [11]:
class BBCNEWS(Dataset):
    """BBC News articles, tokenized on the fly with the ``roberta-base`` tokenizer.

    The constructor reads the competition train or test CSV and keeps the raw
    article texts and ids in memory; each ``__getitem__`` call tokenizes one
    article.

    Args:
        max_seq_len: Length every article is padded or truncated to.
        label_enc: Fitted label encoder exposing ``transform`` and ``classes_``
            (e.g. a scikit-learn ``LabelEncoder``). Only used when ``test`` is False.
        test: When True, load the test CSV and yield features without labels.
    """

    TRAIN_CSV = "/kaggle/input/learn-ai-bbc/BBC News Train.csv"
    TEST_CSV = "/kaggle/input/learn-ai-bbc/BBC News Test.csv"

    def __init__(self, max_seq_len, label_enc,test=False):
        self.test = test
        self.tokenizer = AutoTokenizer.from_pretrained("roberta-base")
        self.max_seq_len = max_seq_len
        self.enc = label_enc

        # Both splits share the text and id columns; only the source file differs.
        self.df = pd.read_csv(self.TEST_CSV if self.test else self.TRAIN_CSV)
        self.txt = self.df['Text'].values
        self.id = self.df['ArticleId'].values

        if not self.test:
            class_ids = torch.tensor(self.enc.transform(self.df['Category'].values)).long()
            # Fix the one-hot width to the encoder's class count so it does not
            # depend on which labels happen to occur in the loaded file.
            self.label = F.one_hot(class_ids, num_classes=len(self.enc.classes_))

    def __len__(self):
        """Return the number of articles in the loaded split."""
        return len(self.txt)

    def __getitem__(self, idx):
        """Tokenize article ``idx``.

        Returns:
            ``{'ids': LongTensor, 'mask': LongTensor}`` for the test split, or a
            ``(features, one_hot_label)`` tuple for the training split. The
            tensors are returned exactly as the tokenizer shaped them
            (presumably ``(1, max_seq_len)`` for a single text with
            ``return_tensors="pt"``), so consumers squeeze axis 1 after batching.
        """
        encoded = self.tokenizer(self.txt[idx],
                                 padding='max_length',
                                 max_length=self.max_seq_len,
                                 truncation=True,
                                 return_tensors="pt")
        # The tokenizer already returns tensors; cast instead of re-wrapping them
        # with torch.tensor(), which emits a copy-construct warning per sample.
        features = {
            'ids': encoded['input_ids'].to(torch.long),
            'mask': encoded['attention_mask'].to(torch.long),
        }
        if self.test:
            return features
        return features, self.label[idx]

In [12]:
class MODEL(nn.Module):
    """Pretrained ``roberta-base`` encoder followed by a two-layer classification head."""

    def __init__(self):
        super().__init__()
        # Pretrained encoder; the hidden state of its first token feeds the head.
        self.model = AutoModel.from_pretrained("roberta-base")
        # Head: 768 -> 128 -> 5, with a ReLU between the two linear layers.
        self.fc1 = nn.Linear(768, 128)
        self.fc2 = nn.Linear(128, 5)
        self.relu = nn.ReLU()

    def forward(self, ids, mask):
        """Return raw (unnormalised) scores with one column per output class.

        Args:
            ids: Token ids passed to the encoder as its first argument.
            mask: Attention mask passed to the encoder as ``attention_mask``.
        """
        encoder_out = self.model(ids, attention_mask=mask)
        # Keep only position 0 of every sequence as the sentence representation.
        first_token = encoder_out.last_hidden_state[:, 0]
        hidden = self.relu(self.fc1(first_token))
        return self.fc2(hidden)

In [14]:
# Training hyperparameters used by the data loaders, the optimizer and the training loop.
BATCH_SIZE = 16  # samples per mini-batch
LEARNING_RATE = 1e-5  # step size handed to Adam
EPOCHS = 50  # number of full passes over the training loader

In [15]:
# Both splits share the sequence length and the fitted label encoder.
dataset_kwargs = dict(max_seq_len=512, label_enc=label_encoder)
dataset_train = BBCNEWS(test=False, **dataset_kwargs)
dataset_test = BBCNEWS(test=True, **dataset_kwargs)

# Only the training loader shuffles; the test loader keeps the dataset order.
loader_kwargs = dict(batch_size=BATCH_SIZE, num_workers=0, pin_memory=True)
train_loader = DataLoader(dataset_train, shuffle=True, **loader_kwargs)
test_loader = DataLoader(dataset_test, shuffle=False, **loader_kwargs)

In [None]:
# Build the classifier and move it to the selected device.
cls_model = MODEL()
cls_model = cls_model.to(device)
# Loss criterion, placed on the same device.
cross_entropy_loss = nn.CrossEntropyLoss()
cross_entropy_loss = cross_entropy_loss.to(device)
# Adam over every parameter of the model.
optim = Adam(cls_model.parameters(), lr=LEARNING_RATE)

In [None]:
## Train
# Mean training loss of every epoch, in order.
losses = []
# Scales the loss before backward so the mixed-precision step stays usable.
scaler = torch.cuda.amp.GradScaler()
# from_pretrained() returns the encoder in evaluation mode; switch the whole
# model to training mode so dropout is active while fine-tuning.
cls_model.train()
for i in range(1, EPOCHS+1):
    epoch_loss = 0
    for data, label in tqdm(train_loader):
        # Clear gradients left over from the previous step.
        optim.zero_grad()
        # Drop the singleton axis each sample carries: (B, 1, L) -> (B, L).
        input_ids = data['ids'].squeeze(1).to(device)
        attention_mask = data['mask'].squeeze(1).to(device)
        # Forward pass and loss under float16 autocast.
        with torch.cuda.amp.autocast(dtype=torch.float16):
            batch_output = cls_model(input_ids, attention_mask)
            # The one-hot labels are passed as float class probabilities.
            loss = cross_entropy_loss(batch_output, label.float().to(device))
        # Backward pass on the scaled loss.
        scaler.scale(loss).backward()
        # Optimizer step through the scaler, then refresh the scale factor.
        scaler.step(optim)
        scaler.update()
        # Accumulate the batch loss for the epoch average.
        epoch_loss += loss.item()
    epoch_loss /= len(train_loader)
    losses.append(epoch_loss)
    print(f"Epoch {i}   Loss : {epoch_loss}")

In [None]:
# Persist the whole fine-tuned module object (not just its state dict) to the working directory.
checkpoint_path = './model.pt'
torch.save(cls_model, checkpoint_path)

In [16]:
# Inference
# NOTE: torch.load unpickles a complete module object here; only load
# checkpoints this notebook produced itself, never files from an untrusted source.
model = torch.load("model.pt", map_location=device)
model.eval()
preds = []
# No gradients are needed for prediction; skipping them avoids building the
# autograd graph for every test batch.
with torch.no_grad():
    for batch in test_loader:
        # Drop the singleton axis each sample carries: (B, 1, L) -> (B, L).
        input_ids = batch['ids'].squeeze(1).to(device)
        attention_mask = batch['mask'].squeeze(1).to(device)
        batch_output = model(input_ids, attention_mask)
        # Highest-scoring class index per row, collected as plain Python ints.
        preds.extend(batch_output.argmax(dim=1).tolist())
# Map class indices back to the original category names.
preds = label_encoder.inverse_transform(preds)



In [18]:
# Load the sample submission; its frame is filled with predictions and written out below.
sub = pd.read_csv("/kaggle/input/learn-ai-bbc/BBC News Sample Solution.csv")

In [20]:
# Preview the first rows of the test frame.
test_df.head()

Unnamed: 0,ArticleId,Text
0,1018,qpr keeper day heads for preston queens park r...
1,1319,software watching while you work software that...
2,1138,d arcy injury adds to ireland woe gordon d arc...
3,459,india s reliance family feud heats up the ongo...
4,1020,boro suffer morrison injury blow middlesbrough...


In [21]:
# Attach the predictions by position.
# NOTE(review): this assumes the sample-solution rows are in the same order as
# the test CSV that produced `preds` — TODO confirm (e.g. compare ArticleId columns).
sub['Category'] = preds

In [23]:
# Write the submission file without the index column, keeping the header row.
sub.to_csv("submission.csv", index=False, header=True)

In [17]:
# Print the decoded category predictions for a quick visual check.
print(preds)

['sport' 'tech' 'sport' 'business' 'sport' 'sport' 'politics' 'politics'
 'entertainment' 'business' 'business' 'tech' 'politics' 'tech'
 'entertainment' 'sport' 'politics' 'tech' 'entertainment' 'entertainment'
 'business' 'politics' 'sport' 'business' 'politics' 'sport' 'business'
 'sport' 'sport' 'business' 'politics' 'tech' 'business' 'business'
 'sport' 'sport' 'sport' 'business' 'entertainment' 'entertainment' 'tech'
 'politics' 'entertainment' 'tech' 'sport' 'tech' 'entertainment'
 'business' 'politics' 'business' 'politics' 'business' 'business'
 'business' 'tech' 'politics' 'tech' 'entertainment' 'sport' 'tech'
 'sport' 'entertainment' 'tech' 'politics' 'business' 'entertainment'
 'sport' 'tech' 'sport' 'sport' 'business' 'sport' 'business' 'politics'
 'tech' 'sport' 'tech' 'tech' 'tech' 'entertainment' 'politics' 'sport'
 'entertainment' 'entertainment' 'business' 'entertainment' 'business'
 'entertainment' 'business' 'tech' 'business' 'politics' 'sport' 'tech'
 'sport' 'spor

In [None]:
import torch
from transformers import RobertaTokenizer, RobertaForSequenceClassification

# Stand-alone check of a public emotion classifier on a single sentence.
# NOTE: this cell rebinds the names `tokenizer` and `model`.
emotion_checkpoint = "cardiffnlp/twitter-roberta-base-emotion"
tokenizer = RobertaTokenizer.from_pretrained(emotion_checkpoint)
model = RobertaForSequenceClassification.from_pretrained(
    emotion_checkpoint,
    problem_type="multi_label_classification",
)

inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")

# Forward pass only; gradients are not needed.
with torch.no_grad():
    outputs = model(**inputs)
logits = outputs.logits

# Index of the largest logit, then its human-readable label from the model config.
predicted_class_id = logits.argmax().item()
model.config.id2label[predicted_class_id]

In [None]:
# This checkpoint is a DistilRoBERTa model, so let AutoTokenizer pick the
# tokenizer class that matches its files; BertTokenizer is a different
# tokenizer family and does not match this checkpoint.
tok = AutoTokenizer.from_pretrained("mrm8488/distilroberta-finetuned-financial-news-sentiment-analysis")