In [2]:
import sys
from google.colab import drive
drive.mount('/content/gdrive/')
sys.path.append('/content/gdrive/MyDrive/data')

Mounted at /content/gdrive/


In [8]:
SYSPATH = '/content/gdrive/MyDrive/data/'
WORD_EMBEDDING_FILE = SYSPATH + 'sgns.weibo.word.bz2'

In [77]:
import torch 
import torch.nn as nn

config = {
    'train_file_path': SYSPATH + 'train.csv',
    'test_file_path': SYSPATH + 'test.csv',
    # 10% data are validation set
    'train_val_ratio': 0.1,
    'vocab_size': 30000,
    'batch_size': 64,
    'num_epochs': 10,
    'learning_rate': 1e-3,
    'logging_step': 100,
    'seed': 2021
}

config['device'] = 'gpu' if torch.cuda.is_available() else 'cpu'



import random
import numpy as np

def seed_everything(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    return seed

seed_everything(config['seed'])

2021

In [6]:
# reading the dataset
from collections import Counter
from tqdm import tqdm
import jieba

def get_vocab(config):
  token_counter = Counter()
  with open(config['train_file_path'], 'r', encoding='utf8') as f:
    lines = f.readlines()[1:]
    for line in tqdm(lines, desc='Counting tokens', total=len(lines)):
        sentence = line.split(',')[-1].strip()
        # seperate sentence
        sentence_cut = list(jieba.cut(sentence))
        token_counter.update(sentence_cut)
  vocab = [token for token, _ in token_counter.most_common(config['vocab_size'])]
  f.close()
  return vocab

In [10]:
vocab = get_vocab(config)

Counting tokens: 100%|██████████| 53360/53360 [00:07<00:00, 7529.02it/s]


In [11]:
# open word embedding
import bz2
with bz2.open(WORD_EMBEDDING_FILE) as f:
  token_vector = f.readlines()

In [17]:
# get word embedding for each word
import random
def get_embedding(vocab):
  token2embedding = {}
  voc_size, dim = token_vector[0].split()
  print(f'{voc_size} tokens in embedding file, vector size is {dim}')
  for line in tqdm(token_vector[1:]):
    line = line.split() 
    token = line[0].decode('utf8')
    vector = line[1:]
    if token in vocab:
      token2embedding[token] = list(map(float, vector))

  # 4 special character
  token2id = {token: id for id, token in enumerate(token2embedding.keys(), 4)}
  id2embedding = {token2id[token]: embedding for token, embedding in token2embedding.items()}
  UNK, PAD, BOS, EOD = '<unk> <pad> <bos> <eos>'.split()
  token2id[PAD] = 0
  token2id[UNK] = 1
  token2id[BOS] = 2
  token2id[EOD] = 3

  id2embedding[0] = [0.] * int(dim)
  id2embedding[1] = [0.] * int(dim)
  id2embedding[2] = [random.uniform(-1, 1)] * int(dim)
  id2embedding[3] = [random.uniform(-1, 1)] * int(dim)
  embedding = [id2embedding[i] for i in range(len(id2embedding))]
  return torch.tensor(embedding, dtype=torch.float), token2id, len(vocab) + 4

In [18]:
embedding, token2id, config['vocab_size'] = get_embedding(vocab)

b'195202' tokens in embedding file, vector size is b'300'


100%|██████████| 195202/195202 [02:45<00:00, 1176.00it/s]


In [19]:
# get tokenized id, use it to find corresponding word embedding
def tokenizer(sentence, token2id):
  ids = [token2id.get(token, 1) for token in jieba.cut(sentence)]
  return ids

# Read dataset

In [28]:
import pandas as pd
from collections import defaultdict

# for training set, we split training set into training and validation set.
def read_data(config, token2id, mode='train'):
  df = pd.read_csv(config[mode + '_file_path'], sep=',')
  if mode == 'train':
    X_train, y_train = defaultdict(list), []
    X_val, y_val = defaultdict(list), []
    num_val = int(config['train_val_ratio'] * len(df))
  else:
    X_test, y_test = defaultdict(list), []

  for i, row in df.iterrows():
    label = row[1] if mode == 'train' else 0
    sentence = row[-1]
    inputs = tokenizer(sentence, token2id)
    if mode == 'train':
      if i < num_val:
        X_val['input_ids'].append(inputs)
        y_val.append(label)
      else:
        X_train['input_ids'].append(inputs)
        y_train.append(label)
    else:
      X_test['input_ids'].append(inputs)
      y_test.append(label)

  if mode == 'train':
    label2id = {label: i for i, label in enumerate(np.unique(y_train))}
    id2label = {i: label for label, i in label2id.items()}

    y_train = torch.tensor([label2id[label] for label in y_train], dtype=torch.long)
    y_val = torch.tensor([label2id[label] for label in y_val], dtype=torch.long)
    return X_train, y_train, X_val, y_val, label2id, id2label
  else:
    y_test = torch.tensor(y_test, dtype=torch.long)
    return X_test, y_test

In [37]:
from torch.utils.data import Dataset
class NEWSDataset(Dataset):
  def __init__(self, X, y):
    self.x = X
    self.y = y
    
  def __getitem__(self, idx):
    return {
        'input_ids': self.x['input_ids'][idx],
        'label': self.y[idx]
        }

  def __len__(self):
    return self.y.size(0)

In [38]:
def collete_fn(examples):
  input_ids_list = []
  labels = []
  for example in examples:
    input_ids_list.append(example['input_ids'])
    labels.append(example['label'])
  # find the longest sentence in input_ids_list
  max_length = max(len(input_ids) for input_ids in input_ids_list)

  # create tensor, dimension = sample_size * max_length
  input_ids_tensor = torch.zeros((len(labels), max_length), dtype=torch.long)
  # fill tensor 
  for i, input_ids in enumerate(input_ids_list):
    seq_len = len(input_ids)
    input_ids_tensor[i, : seq_len] = torch.tensor(input_ids, dtype=torch.long)
  return {'input_ids': input_ids_tensor,
          'label': torch.tensor(labels, dtype=torch.long)} 


In [39]:
from torch.utils.data import DataLoader

def build_dataloader(config, vocab):
  X_train, y_train, X_val, y_val, label2id, id2label = read_data(config, token2id, mode='train')
  X_test, y_test = read_data(config, token2id, mode='test')
  train_dataset = NEWSDataset(X_train, y_train)
  val_dataset = NEWSDataset(X_val, y_val)
  test_dataset = NEWSDataset(X_test, y_test)

  train_dataloader = DataLoader(dataset=train_dataset, batch_size=config['batch_size'], num_workers=4, shuffle=True, collate_fn=collete_fn)
  val_dataloader = DataLoader(dataset=val_dataset, batch_size=config['batch_size'], num_workers=4, shuffle=False, collate_fn=collete_fn)
  test_dataloader = DataLoader(dataset=test_dataset, batch_size=config['batch_size'], num_workers=4, shuffle=False, collate_fn=collete_fn)
  return id2label, train_dataloader, val_dataloader, test_dataloader

In [40]:
id2label, train_dataloader, val_dataloader, test_dataloader = build_dataloader(config, vocab)

  cpuset_checked))


In [41]:
# sentence -> [word1, word2, word3] -> [id1, id2, id3]
for batch in train_dataloader:
  print(batch['input_ids'])
  break

  cpuset_checked))


tensor([[24721,  2610, 20301,  ...,     0,     0,     0],
        [    1,     1, 17923,  ...,     0,     0,     0],
        [10417,    13,    29,  ...,     0,     0,     0],
        ...,
        [  348,    68,  3295,  ...,     0,     0,     0],
        [    1,   102, 17883,  ...,     0,     0,     0],
        [14909,  7163,   389,  ...,     0,     0,     0]])


In [None]:
X_train, y_train, X_val, y_val, label2id, id2label = read_data(config, token2id, mode='train')
X_test, y_test = read_data(config, token2id, mode='test')

In [42]:
model_config = {
    'embedding_pretrained' : embedding,
    'num_filters' : 256,
    'emb_size' : embedding.shape[1],
    'dropout' : 0.3,
    'filter_sizes' : [2,3,5],
    'num_classes' : len(label2id)
}

# TextCNN

In [63]:
import torch.nn.functional as F
class Model(nn.Module):
    def __init__(self, config):
      super(Model, self).__init__()
      # create embedding
      self.embedding = nn.Embedding.from_pretrained(config['embedding_pretrained'].float(), freeze=True)
      # convolution layer
      self.convs = nn.ModuleList([nn.Conv2d(1, config['num_filters'], (k, config['emb_size'])) for k in config['filter_sizes']])
      # add dropout
      self.dropout = nn.Dropout(config['dropout'])
      # linear layer
      self.fc = nn.Linear(config['num_filters'] * len(config['filter_sizes']), config['num_classes'])

    def convs_and_pool(self, x, conv):

        x = F.relu(conv(x)).squeeze(3)
        x = F.max_pool1d(x, x.size(2)).squeeze(2)
        return x

    def forward(self, input_ids=None, label=None):
        # out [batch_size, seq_len, embedding_dim]
        out = self.embedding(input_ids)
        
        # H: seq_len; W:embedding_dim
        # out [batch_size, 1, seq_len, embedding_dim]
        out = out.unsqueeze(1)

        # (batch_size, out_channels)
        out = torch.cat([self.convs_and_pool(out, conv) for conv in self.convs], 1)

        out = self.dropout(out)

        out = self.fc(out)

        output = (out, )

        # During training
        if label is not None: 
            loss_fct = nn.CrossEntropyLoss()
            loss = loss_fct(out, label)
            output = (loss, ) + output

        # train output (loss, out)
        # test output (out)
        return output

In [64]:
model = Model(model_config)

# Training model

In [47]:
from torch.optim import AdamW

In [66]:
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score

def evaluation(config, model, val_dataloader):
    model.eval()
    preds = []
    labels = []
    val_loss =0.
    val_iterator = tqdm(val_dataloader, desc='Evaluation', total=len(val_dataloader))
    with torch.no_grad():
        for batch in val_iterator:
            labels.append(batch['label'])
            batch = {item: value.to(config['device']) for item, value in batch.items()}

            # val output (loss, out)
            loss, logits = model(**batch)[:2]
            val_loss += loss.item()

            preds.append(logits.argmax(dim=-1).detach().cpu())
    
    avg_val_loss = val_loss/len(val_dataloader)
    labels = torch.cat(labels, dim=0).numpy()
    preds = torch.cat(preds, dim=0).numpy()

    accuracy = accuracy_score(labels, preds)
    recall = recall_score(labels, preds, average='macro')
    precision = precision_score(labels, preds, average='macro')
    f1 = f1_score(labels, preds, average='macro')

    return [avg_val_loss, accuracy, recall, precision, f1]

In [80]:
def train(model, config, id2label, train_dataloader, val_dataloader):
  optimer = AdamW(model.parameters(), lr=config['learning_rate'])
  model.to(config['device'])
  
  global_step = 0
  train_loss = 0
  logging_loss = 0
  accuracys = []

  for epoch in range(config['num_epochs']):
    train_iterator = train_dataloader
    model.train()
    for batch in train_iterator:
      batch = {item: value.to(config['device']) for item, value in batch.items()}
      # train output (loss, out)
      loss = model(**batch)[0]
      model.zero_grad()
      loss.backward()
      optimer.step()
      train_loss += loss
      global_step += 1

      if global_step % config['logging_step'] == 0:
                print_train_loss = (train_loss - logging_loss) / config['logging_step']
                logging_loss = train_loss
                result = evaluation(config, model, val_dataloader)
                avg_val_loss, accuracy = result[0], result[1]
                accuracys.append(accuracy)
                print_log = f'>>> training loss: {print_train_loss:.4f}, valid loss: {avg_val_loss:.4f}, valid accuracy: {accuracy:.4f}'
                print(print_log)
                model.train()

    return model, accuracys

In [81]:
best_model, accuracys = train(model, config, id2label, train_dataloader, val_dataloader)

  cpuset_checked))
Evaluation: 100%|██████████| 84/84 [00:12<00:00,  6.78it/s]


>>> training loss: 0.9816, valid loss: 1.4050, valid accuracy: 0.5309


  cpuset_checked))
Evaluation: 100%|██████████| 84/84 [00:07<00:00, 11.35it/s]


>>> training loss: 0.0000, valid loss: 1.4001, valid accuracy: 0.5305


  cpuset_checked))
Evaluation: 100%|██████████| 84/84 [00:07<00:00, 11.38it/s]


>>> training loss: 0.0000, valid loss: 1.3973, valid accuracy: 0.5264


  cpuset_checked))
Evaluation: 100%|██████████| 84/84 [00:07<00:00, 10.68it/s]


>>> training loss: 0.0000, valid loss: 1.3989, valid accuracy: 0.5347


  cpuset_checked))
Evaluation: 100%|██████████| 84/84 [00:07<00:00, 10.53it/s]


>>> training loss: 0.0000, valid loss: 1.3812, valid accuracy: 0.5345


  cpuset_checked))
Evaluation: 100%|██████████| 84/84 [00:08<00:00, 10.12it/s]


>>> training loss: 0.0000, valid loss: 1.3700, valid accuracy: 0.5388


  cpuset_checked))
Evaluation: 100%|██████████| 84/84 [00:07<00:00, 11.23it/s]


>>> training loss: 0.0000, valid loss: 1.3931, valid accuracy: 0.5266


In [82]:
accuracys

[0.5309220389805097,
 0.5305472263868066,
 0.526424287856072,
 0.5346701649175413,
 0.5344827586206896,
 0.5387931034482759,
 0.5266116941529235]

In [83]:
def predict(config, id2label, model, test_dataloader):
    test_iterator = tqdm(test_dataloader, desc='Evaluation', total=len(test_dataloader))
    model.eval()
    test_preds = []
    with torch.no_grad():
        for batch in test_iterator:
            batch = {item: value.to(config['device']) for item, value in batch.items()}

            logits = model(**batch)[1]

            test_preds.append(logits.argmax(dim=-1).detach().cpu())
    
    test_preds = torch.cat(test_preds, dim=0).numpy()
    test_preds = [id2label[id_] for id_ in test_preds]

    test_df = pd.read_csv(config['test_file_path'], sep=',')
    test_df.insert(1, column='label', value=test_preds)
    test_df.drop(columns=['sentence'], inplace=True)
    test_df.to_csv(SYSPATH  + 'prediction.csv', index=False, encoding='utf8')
    return test_df

In [95]:
prediction = predict(config, id2label, best_model, test_dataloader)
actual = pd.read_csv(SYSPATH  + 'solution.csv')

  cpuset_checked))
Evaluation: 100%|██████████| 157/157 [00:18<00:00,  8.67it/s]


In [96]:
accuracy_score(actual['label'], prediction['label'])

0.5398

In [97]:
recall_score(actual['label'], prediction['label'], average='weighted')

0.5398

In [98]:
precision_score(actual['label'], prediction['label'], average='weighted')

0.551366540671474

In [99]:
f1_score(actual['label'], prediction['label'], average='weighted')

0.5403641114111613