In [None]:
!pip install transformers==3.0.0

In [None]:
import torch
import numpy as np
import pandas as pd
import transformers
from sklearn.model_selection import train_test_split
from transformers import AutoModel, BertTokenizerFast

In [None]:
# Report whether PyTorch can see a CUDA-capable GPU in this session; the
# boolean is shown as the cell output.
torch.cuda.is_available()

# Extracting Data and combining headline and content

In [None]:
# Load the raw news table and derive the modelling frame `df` with two
# columns: `text` (headline + '. ' + content) and `tag` (integer class id).
raw_df = pd.read_csv('../input/chinese-official-daily-news-since-2016/chinese_news.csv')

# Category names as they appear in the raw `tag` column -> integer class ids.
TAG_TO_ID = {'国内' : 0, '国际' : 1, '详细全文' : 2}

# Building a fresh frame (instead of assigning into a column slice) avoids
# pandas' SettingWithCopyWarning.
df = pd.DataFrame({
    # The concatenation is NaN whenever the headline or the content is missing.
    'text': raw_df['headline'] + '. ' + raw_df['content'],
    # Tags that are not keys of TAG_TO_ID become NaN.
    'tag': raw_df['tag'].map(TAG_TO_ID),
})

# dropna() returns a new frame, so its result has to be assigned; otherwise
# the incomplete rows silently remain in `df`.
df = df.dropna()
# With the NaN rows gone the labels can be stored as true integers.
df = df.astype({'tag': 'int64'})
df.head()

In [None]:
# Stratified hold-out split on the class label. No test_size is passed, so
# scikit-learn's default proportion applies. A fixed seed makes the split
# (and everything downstream of it) reproducible across kernel restarts.
SPLIT_SEED = 42
x_train, x_test, y_train, y_test = train_test_split(
    df['text'],
    df['tag'],
    stratify=df['tag'],
    random_state=SPLIT_SEED,
)

# Preparing Data for Model

In [None]:
# The tokenizer vocabulary and the encoder weights are loaded from the same
# pretrained checkpoint name.
PRETRAINED_NAME = 'bert-base-chinese'
tokenizer = BertTokenizerFast.from_pretrained(PRETRAINED_NAME)
bert = AutoModel.from_pretrained(PRETRAINED_NAME)

In [None]:
# Index labels of the rows that actually have text. The same labels are used
# to subset the targets so that inputs and labels stay aligned.
train_idx = x_train.dropna().index
test_idx = x_test.dropna().index

MAX_SEQ_LEN = 50  # every sequence is padded / truncated to this many tokens


def encode_texts(texts):
    """Tokenize a list of strings into fixed-length model inputs.

    Args:
        texts: list of raw text strings.

    Returns:
        The tokenizer's batch encoding, padded/truncated to MAX_SEQ_LEN.
        It provides at least the 'input_ids' and 'attention_mask' entries.
    """
    return tokenizer.batch_encode_plus(
        texts,
        max_length=MAX_SEQ_LEN,
        # `padding='max_length'` replaces the deprecated pad_to_max_length=True.
        padding='max_length',
        truncation=True,
    )


train_tokens = encode_texts(x_train[train_idx].to_list())
test_tokens = encode_texts(x_test[test_idx].to_list())

In [None]:
def _as_tensors(encoding, labels):
    """Turn one tokenized split and its label Series into three tensors.

    Returns a tuple (input ids, attention mask, labels).
    """
    ids = torch.tensor(encoding['input_ids'])
    attn = torch.tensor(encoding['attention_mask'])
    target = torch.tensor(labels.to_list())
    return ids, attn, target


# Labels are subset with the same index used for the texts, keeping them aligned.
train_seq, train_mask, train_y = _as_tensors(train_tokens, y_train[train_idx])
test_seq, test_mask, test_y = _as_tensors(test_tokens, y_test[test_idx])

In [None]:
from torch.utils.data import TensorDataset, RandomSampler, DataLoader

In [None]:
BATCH_SIZE = 32  # mini-batch size shared by the training and test loaders

# Training batches are drawn in random order.
train_data = TensorDataset(train_seq, train_mask, train_y)
train_sampler = RandomSampler(train_data)
trainloader = DataLoader(train_data,
                         sampler=train_sampler,
                         batch_size=BATCH_SIZE)

# Evaluation gains nothing from shuffling; iterating sequentially keeps the
# order of the collected predictions deterministic between runs.
test_data = TensorDataset(test_seq, test_mask, test_y)
test_sampler = torch.utils.data.SequentialSampler(test_data)
testloader = DataLoader(test_data,
                        sampler=test_sampler,
                        batch_size=BATCH_SIZE)

In [None]:
# Freeze every pretrained encoder weight so that no gradients are computed
# for (and no updates applied to) the BERT parameters.
for weight in bert.parameters():
    weight.requires_grad_(False)

# Making our Model

In [None]:
from torch import nn
from transformers import AdamW
import torch.nn.functional as F
from sklearn.utils.class_weight import compute_class_weight

In [None]:
class BertClassifier(nn.Module):
    """Encoder followed by a single linear classification layer.

    Args:
        bert: encoder module. It is called as
            ``bert(sent_id, attention_mask=mask)`` and must return an
            indexable output whose element 1 is the per-sequence feature
            vector (presumably BERT's pooled [CLS] output — verify for other
            encoders).
        num_classes: number of target classes; defaults to 3.
    """

    def __init__(self, bert, num_classes=3):
        super().__init__()
        self.bert = bert
        # Take the feature width from the encoder's config when it has one,
        # falling back to 768 otherwise.
        hidden_size = getattr(getattr(bert, 'config', None), 'hidden_size', 768)
        self.fc1 = nn.Linear(hidden_size, num_classes)

    def forward(self, sent_id, mask):
        """Return log-probabilities over the classes, one row per input.

        Args:
            sent_id: token-id tensor passed to the encoder.
            mask: attention mask passed to the encoder as ``attention_mask``.
        """
        outputs = self.bert(sent_id, attention_mask=mask)
        # Index instead of tuple-unpacking: unpacking a dict-like model output
        # would yield its keys rather than the tensors.
        cls_hs = outputs[1]
        return F.log_softmax(self.fc1(cls_hs), dim=1)

In [None]:
# Prefer the GPU, but fall back to the CPU so that this cell does not raise
# on a machine without CUDA.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = BertClassifier(bert)
model = model.to(device)

In [None]:
# AdamW optimizer over the model's parameters with a fixed learning rate.
LEARNING_RATE = 1e-5
optimizer = AdamW(params=model.parameters(), lr=LEARNING_RATE)

In [None]:
# Balanced class weights, computed on the labels that are actually trained on
# (train_idx excludes the rows whose text is missing). Keyword arguments are
# used because newer scikit-learn releases reject positional classes / y.
train_labels = y_train[train_idx]
class_weights = compute_class_weight(class_weight='balanced',
                                     classes=np.unique(train_labels),
                                     y=train_labels)
class_weights

In [None]:
# Put the class weights on whatever device the model lives on, rather than
# assuming CUDA, so that the loss and the model outputs always match.
weights = torch.tensor(class_weights, dtype = torch.float)
weights = weights.to(next(model.parameters()).device)

# Weighted negative log-likelihood; the model's forward pass returns
# log-probabilities (log_softmax), which is the input NLLLoss expects.
criterion = nn.NLLLoss(weight = weights)

# Fine-tuning our model (BERT is frozen; only the classification head is trained)

In [None]:
from tqdm.notebook import tqdm

In [None]:
epochs = 10

# Move batches to whichever device the model is on instead of assuming CUDA.
device = next(model.parameters()).device

# NOTE(review): no model.train() call is added on purpose — it would also put
# the wrapped encoder's dropout layers into training mode; confirm before changing.
for e in range(epochs):
    train_loss = 0.0  # running sum of the per-batch losses for this epoch
    for batch in tqdm(trainloader):
        batch = [i.to(device) for i in batch]
        sent_id, masks, labels = batch

        optimizer.zero_grad()
        preds = model(sent_id, masks)
        loss = criterion(preds, labels)
        train_loss += loss.item()

        loss.backward()
        # Cap the global gradient norm at 1.0 before the update step.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()
    # Mean batch loss over the epoch.
    print(f'Epoch:{e+1}\t\tTraining Loss: {train_loss / len(trainloader)}')

In [None]:
# Collect predicted and true class ids for the whole test set as plain ints.
pred_label = []
true_label = []

# Move batches to whichever device the model is on instead of assuming CUDA.
device = next(model.parameters()).device

model.eval()
# Inference only: disabling autograd avoids building a graph for every batch.
with torch.no_grad():
    for batch in tqdm(testloader):
        batch = [i.to(device) for i in batch]
        sent_id, masks, labels = batch

        preds = model(sent_id, masks)
        # argmax over the class dimension; .tolist() yields Python ints rather
        # than a list of 0-d tensors.
        pred_label.extend(torch.argmax(preds, dim = 1).tolist())
        true_label.extend(labels.tolist())

In [None]:
from sklearn.metrics import confusion_matrix, classification_report

In [None]:
# Confusion matrix of the test-set predictions, shown as the cell output.
confusion_matrix(y_true=true_label, y_pred=pred_label)

In [None]:
# Text summary of the per-class metrics for the test-set predictions.
report = classification_report(y_true=true_label, y_pred=pred_label)
print(report)