In [1]:
import sys
from google.colab import drive
drive.mount('/content/gdrive/')
sys.path.append('/content/gdrive/MyDrive/data')

Mounted at /content/gdrive/


In [2]:
SYSPATH = '/content/gdrive/MyDrive/data/'
WORD_EMBEDDING_FILE = SYSPATH + 'sgns.weibo.word.bz2'

In [3]:
import torch 
import torch.nn as nn

config = {
    'train_file_path': SYSPATH + 'train.csv',
    'test_file_path': SYSPATH + 'test.csv',
    # 10% data are validation set
    'train_val_ratio': 0.1,
    'vocab_size': 30000,
    'batch_size': 64,
    'num_epochs': 10,
    'learning_rate': 1e-3,
    'logging_step': 100,
    'seed': 2021
}

config['device'] = 'gpu' if torch.cuda.is_available() else 'cpu'



import random
import numpy as np

def seed_everything(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    return seed

seed_everything(config['seed'])

2021

In [4]:
# reading the dataset
from collections import Counter
from tqdm import tqdm
import jieba

def get_vocab(config):
  token_counter = Counter()
  with open(config['train_file_path'], 'r', encoding='utf8') as f:
    lines = f.readlines()[1:]
    for line in tqdm(lines, desc='Counting tokens', total=len(lines)):
        sentence = line.split(',')[0].strip()
        # seperate sentence
        sentence_cut = list(jieba.cut(sentence))
        token_counter.update(sentence_cut)
  vocab = [token for token, _ in token_counter.most_common(config['vocab_size'])]
  f.close()
  return vocab

In [5]:
vocab = get_vocab(config)

Counting tokens:   0%|          | 0/180000 [00:00<?, ?it/s]Building prefix dict from the default dictionary ...
Dumping model to file cache /tmp/jieba.cache
Loading model cost 1.048 seconds.
Prefix dict has been built successfully.
Counting tokens: 100%|██████████| 180000/180000 [00:29<00:00, 6015.63it/s]


In [6]:
# open word embedding
import bz2
with bz2.open(WORD_EMBEDDING_FILE) as f:
  token_vector = f.readlines()

In [7]:
# get word embedding for each word
import random
def get_embedding(vocab):
  token2embedding = {}
  voc_size, dim = token_vector[0].split()
  print(f'{voc_size} tokens in embedding file, vector size is {dim}')
  for line in tqdm(token_vector[1:]):
    line = line.split() 
    token = line[0].decode('utf8')
    vector = line[1:]
    if token in vocab:
      token2embedding[token] = list(map(float, vector))

  # 4 special character
  token2id = {token: id for id, token in enumerate(token2embedding.keys(), 4)}
  id2embedding = {token2id[token]: embedding for token, embedding in token2embedding.items()}
  UNK, PAD, BOS, EOD = '<unk> <pad> <bos> <eos>'.split()
  token2id[PAD] = 0
  token2id[UNK] = 1
  token2id[BOS] = 2
  token2id[EOD] = 3

  id2embedding[0] = [0.] * int(dim)
  id2embedding[1] = [0.] * int(dim)
  id2embedding[2] = [random.uniform(-1, 1)] * int(dim)
  id2embedding[3] = [random.uniform(-1, 1)] * int(dim)
  embedding = [id2embedding[i] for i in range(len(id2embedding))]
  return torch.tensor(embedding, dtype=torch.float), token2id, len(vocab) + 4

In [9]:
embedding, token2id, config['vocab_size'] = get_embedding(vocab)

b'195202' tokens in embedding file, vector size is b'300'


100%|██████████| 195202/195202 [03:05<00:00, 1053.83it/s]


In [8]:
# get tokenized id, use it to find corresponding word embedding
def tokenizer(sentence, token2id):
  ids = [token2id.get(token, 1) for token in jieba.cut(sentence)]
  return ids

# Read dataset

In [10]:
import pandas as pd
from collections import defaultdict

# for training set, we split training set into training and validation set.
def read_data(config, token2id, mode='train'):
  df = pd.read_csv(config[mode + '_file_path'], sep=',')
  if mode == 'train':
    X_train, y_train = defaultdict(list), []
    X_val, y_val = defaultdict(list), []
    num_val = int(config['train_val_ratio'] * len(df))
  else:
    X_test, y_test = defaultdict(list), []

  for i, row in df.iterrows():
    label = row[1] if mode == 'train' else 0
    sentence = row[0]
    inputs = tokenizer(sentence, token2id)
    if mode == 'train':
      if i < num_val:
        X_val['input_ids'].append(inputs)
        y_val.append(label)
      else:
        X_train['input_ids'].append(inputs)
        y_train.append(label)
    else:
      X_test['input_ids'].append(inputs)
      y_test.append(label)

  if mode == 'train':
    label2id = {label: i for i, label in enumerate(np.unique(y_train))}
    id2label = {i: label for label, i in label2id.items()}

    y_train = torch.tensor([label2id[label] for label in y_train], dtype=torch.long)
    y_val = torch.tensor([label2id[label] for label in y_val], dtype=torch.long)
    return X_train, y_train, X_val, y_val, label2id, id2label
  else:
    y_test = torch.tensor(y_test, dtype=torch.long)
    return X_test, y_test

In [26]:
from torch.utils.data import Dataset
class NEWSDataset(Dataset):
  def __init__(self, X, y):
    self.x = X
    self.y = y
    
  def __getitem__(self, idx):
    return {
        'input_ids': self.x['input_ids'][idx],
        'label': self.y[idx]
        }

  def __len__(self):
    return self.y.size(0)

In [12]:
def collete_fn(examples):
  input_ids_list = []
  labels = []
  for example in examples:
    input_ids_list.append(example['input_ids'])
    labels.append(example['label'])
  # find the longest sentence in input_ids_list
  max_length = max(len(input_ids) for input_ids in input_ids_list)

  # create tensor, dimension = sample_size * max_length
  input_ids_tensor = torch.zeros((len(labels), max_length), dtype=torch.long)
  # fill tensor 
  for i, input_ids in enumerate(input_ids_list):
    seq_len = len(input_ids)
    input_ids_tensor[i, : seq_len] = torch.tensor(input_ids, dtype=torch.long)
  return {'input_ids': input_ids_tensor,
          'label': torch.tensor(labels, dtype=torch.long)} 


In [13]:
from torch.utils.data import DataLoader

def build_dataloader(config, vocab):
  X_train, y_train, X_val, y_val, label2id, id2label = read_data(config, token2id, mode='train')
  X_test, y_test = read_data(config, token2id, mode='test')
  train_dataset = NEWSDataset(X_train, y_train)
  val_dataset = NEWSDataset(X_val, y_val)
  test_dataset = NEWSDataset(X_test, y_test)

  train_dataloader = DataLoader(dataset=train_dataset, batch_size=config['batch_size'], num_workers=4, shuffle=True, collate_fn=collete_fn)
  val_dataloader = DataLoader(dataset=val_dataset, batch_size=config['batch_size'], num_workers=4, shuffle=False, collate_fn=collete_fn)
  test_dataloader = DataLoader(dataset=test_dataset, batch_size=config['batch_size'], num_workers=4, shuffle=False, collate_fn=collete_fn)
  return id2label, train_dataloader, val_dataloader, test_dataloader

In [14]:
id2label, train_dataloader, val_dataloader, test_dataloader = build_dataloader(config, vocab)

  cpuset_checked))


In [15]:
# sentence -> [word1, word2, word3] -> [id1, id2, id3]
for batch in train_dataloader:
  print(batch['input_ids'])
  break

  cpuset_checked))


tensor([[    1,  1324,  1939,  9114, 14634,   597,     1,     0,     0,     0,
             0,     0,     0,     0,     0],
        [22007,  8463,  7599,    90,   262,  2755,   150,   941,     1,     1,
             1,  5836,     0,     0,     0],
        [  179,    68,   895,    98,     1,  2693,  1974,     9,  7402, 23025,
             1,  1169,  7334,     0,     0],
        [ 4822,   744, 24815, 20036,     1,   298,  7613, 17256,  1138, 10636,
             0,     0,     0,     0,     0],
        [ 1361,  6419,   842, 10498, 18963,  5120,     1,  6273, 11225, 22894,
             0,     0,     0,     0,     0],
        [13683,  9036,  3619, 18593,     1, 15112,     1,  8108,  4396,  5048,
          3940,     0,     0,     0,     0],
        [ 7772,    54,     1,    86,  2238,     1, 14412,     9,    31,    65,
            20,     5,   736,     0,     0],
        [    1,     9,  3264,  2990,  2742,   551,  8097,    16,     0,     0,
             0,     0,     0,     0,     0],
        

In [16]:
X_train, y_train, X_val, y_val, label2id, id2label = read_data(config, token2id, mode='train')
X_test, y_test = read_data(config, token2id, mode='test')

In [17]:
model_config = {
    'embedding_pretrained' : embedding,
    'num_filters' : 256,
    'emb_size' : embedding.shape[1],
    'dropout' : 0.3,
    'filter_sizes' : [2,3,5],
    'num_classes' : len(label2id)
}

# TextCNN

In [18]:
import torch.nn.functional as F
class Model(nn.Module):
    def __init__(self, config):
      super(Model, self).__init__()
      # create embedding
      self.embedding = nn.Embedding.from_pretrained(config['embedding_pretrained'].float(), freeze=True)
      # convolution layer
      self.convs = nn.ModuleList([nn.Conv2d(1, config['num_filters'], (k, config['emb_size'])) for k in config['filter_sizes']])
      # add dropout
      self.dropout = nn.Dropout(config['dropout'])
      # linear layer
      self.fc = nn.Linear(config['num_filters'] * len(config['filter_sizes']), config['num_classes'])

    def convs_and_pool(self, x, conv):

        x = F.relu(conv(x)).squeeze(3)
        x = F.max_pool1d(x, x.size(2)).squeeze(2)
        return x

    def forward(self, input_ids=None, label=None):
        # out [batch_size, seq_len, embedding_dim]
        out = self.embedding(input_ids)
        
        # H: seq_len; W:embedding_dim
        # out [batch_size, 1, seq_len, embedding_dim]
        out = out.unsqueeze(1)

        # (batch_size, out_channels)
        out = torch.cat([self.convs_and_pool(out, conv) for conv in self.convs], 1)

        out = self.dropout(out)

        out = self.fc(out)

        output = (out, )

        # During training
        if label is not None: 
            loss_fct = nn.CrossEntropyLoss()
            loss = loss_fct(out, label)
            output = (loss, ) + output

        # train output (loss, out)
        # test output (out)
        return output

In [19]:
model = Model(model_config)

# Training model

In [20]:
from torch.optim import AdamW

In [21]:
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score

def evaluation(config, model, val_dataloader):
    model.eval()
    preds = []
    labels = []
    val_loss =0.
    val_iterator = tqdm(val_dataloader, desc='Evaluation', total=len(val_dataloader))
    with torch.no_grad():
        for batch in val_iterator:
            labels.append(batch['label'])
            batch = {item: value.to(config['device']) for item, value in batch.items()}

            # val output (loss, out)
            loss, logits = model(**batch)[:2]
            val_loss += loss.item()

            preds.append(logits.argmax(dim=-1).detach().cpu())
    
    avg_val_loss = val_loss/len(val_dataloader)
    labels = torch.cat(labels, dim=0).numpy()
    preds = torch.cat(preds, dim=0).numpy()

    accuracy = accuracy_score(labels, preds)
    recall = recall_score(labels, preds, average='macro')
    precision = precision_score(labels, preds, average='macro')
    f1 = f1_score(labels, preds, average='macro')

    return [avg_val_loss, accuracy, recall, precision, f1]

In [22]:
def train(model, config, id2label, train_dataloader, val_dataloader):
  optimer = AdamW(model.parameters(), lr=config['learning_rate'])
  model.to(config['device'])
  
  global_step = 0
  train_loss = 0
  logging_loss = 0
  accuracys = []

  for epoch in range(config['num_epochs']):
    train_iterator = train_dataloader
    model.train()
    for batch in train_iterator:
      batch = {item: value.to(config['device']) for item, value in batch.items()}
      # train output (loss, out)
      loss = model(**batch)[0]
      model.zero_grad()
      loss.backward()
      optimer.step()
      train_loss += loss
      global_step += 1

      if global_step % config['logging_step'] == 0:
                print_train_loss = (train_loss - logging_loss) / config['logging_step']
                logging_loss = train_loss
                result = evaluation(config, model, val_dataloader)
                avg_val_loss, accuracy = result[0], result[1]
                accuracys.append(accuracy)
                print_log = f'>>> training loss: {print_train_loss:.4f}, valid loss: {avg_val_loss:.4f}, valid accuracy: {accuracy:.4f}'
                print(print_log)
                model.train()

    return model, accuracys

In [23]:
best_model, accuracys = train(model, config, id2label, train_dataloader, val_dataloader)

  cpuset_checked))
Evaluation: 100%|██████████| 282/282 [00:18<00:00, 15.12it/s]


>>> training loss: 1.1033, valid loss: 0.6642, valid accuracy: 0.7899


  cpuset_checked))
Evaluation: 100%|██████████| 282/282 [00:19<00:00, 14.47it/s]


>>> training loss: 0.0000, valid loss: 0.5725, valid accuracy: 0.8107


  cpuset_checked))
Evaluation: 100%|██████████| 282/282 [00:18<00:00, 15.25it/s]


>>> training loss: 0.0000, valid loss: 0.5139, valid accuracy: 0.8342


  cpuset_checked))
Evaluation: 100%|██████████| 282/282 [00:18<00:00, 15.28it/s]


>>> training loss: 0.0000, valid loss: 0.5047, valid accuracy: 0.8371


  cpuset_checked))
Evaluation: 100%|██████████| 282/282 [00:18<00:00, 15.35it/s]


>>> training loss: 0.0000, valid loss: 0.4837, valid accuracy: 0.8451


  cpuset_checked))
Evaluation: 100%|██████████| 282/282 [00:18<00:00, 15.16it/s]


>>> training loss: 0.0000, valid loss: 0.4672, valid accuracy: 0.8476


  cpuset_checked))
Evaluation: 100%|██████████| 282/282 [00:18<00:00, 15.10it/s]


>>> training loss: 0.0000, valid loss: 0.4495, valid accuracy: 0.8546


  cpuset_checked))
Evaluation: 100%|██████████| 282/282 [00:18<00:00, 14.90it/s]


>>> training loss: 0.0000, valid loss: 0.4478, valid accuracy: 0.8533


  cpuset_checked))
Evaluation: 100%|██████████| 282/282 [00:19<00:00, 14.67it/s]


>>> training loss: 0.0000, valid loss: 0.4350, valid accuracy: 0.8593


  cpuset_checked))
Evaluation: 100%|██████████| 282/282 [00:19<00:00, 14.81it/s]


>>> training loss: 0.0000, valid loss: 0.4271, valid accuracy: 0.8631


  cpuset_checked))
Evaluation: 100%|██████████| 282/282 [00:18<00:00, 14.99it/s]


>>> training loss: 0.0000, valid loss: 0.4249, valid accuracy: 0.8617


  cpuset_checked))
Evaluation: 100%|██████████| 282/282 [00:18<00:00, 15.00it/s]


>>> training loss: 0.0000, valid loss: 0.4402, valid accuracy: 0.8550


  cpuset_checked))
Evaluation: 100%|██████████| 282/282 [00:18<00:00, 15.01it/s]


>>> training loss: 0.0000, valid loss: 0.4198, valid accuracy: 0.8665


  cpuset_checked))
Evaluation: 100%|██████████| 282/282 [00:18<00:00, 15.15it/s]


>>> training loss: 0.0000, valid loss: 0.4246, valid accuracy: 0.8626


  cpuset_checked))
Evaluation: 100%|██████████| 282/282 [00:18<00:00, 15.12it/s]


>>> training loss: 0.0000, valid loss: 0.4080, valid accuracy: 0.8649


  cpuset_checked))
Evaluation: 100%|██████████| 282/282 [00:18<00:00, 15.00it/s]


>>> training loss: 0.0000, valid loss: 0.4105, valid accuracy: 0.8662


  cpuset_checked))
Evaluation: 100%|██████████| 282/282 [00:18<00:00, 15.23it/s]


>>> training loss: 0.0000, valid loss: 0.4053, valid accuracy: 0.8675


  cpuset_checked))
Evaluation: 100%|██████████| 282/282 [00:18<00:00, 15.18it/s]


>>> training loss: 0.0000, valid loss: 0.4006, valid accuracy: 0.8676


  cpuset_checked))
Evaluation: 100%|██████████| 282/282 [00:18<00:00, 15.26it/s]


>>> training loss: 0.0000, valid loss: 0.4070, valid accuracy: 0.8645


  cpuset_checked))
Evaluation: 100%|██████████| 282/282 [00:18<00:00, 15.19it/s]


>>> training loss: 0.0000, valid loss: 0.4031, valid accuracy: 0.8680


  cpuset_checked))
Evaluation: 100%|██████████| 282/282 [00:18<00:00, 15.28it/s]


>>> training loss: 0.0000, valid loss: 0.4080, valid accuracy: 0.8671


  cpuset_checked))
Evaluation: 100%|██████████| 282/282 [00:18<00:00, 15.22it/s]


>>> training loss: 0.0000, valid loss: 0.3903, valid accuracy: 0.8734


  cpuset_checked))
Evaluation: 100%|██████████| 282/282 [00:18<00:00, 15.17it/s]


>>> training loss: 0.0000, valid loss: 0.3989, valid accuracy: 0.8689


  cpuset_checked))
Evaluation: 100%|██████████| 282/282 [00:19<00:00, 14.35it/s]


>>> training loss: 0.0000, valid loss: 0.3908, valid accuracy: 0.8722


  cpuset_checked))
Evaluation: 100%|██████████| 282/282 [00:19<00:00, 14.32it/s]


>>> training loss: 0.0000, valid loss: 0.3958, valid accuracy: 0.8697


In [24]:
accuracys

[0.7899444444444444,
 0.8106666666666666,
 0.8342222222222222,
 0.8371111111111111,
 0.8451111111111111,
 0.8476111111111111,
 0.8545555555555555,
 0.8533333333333334,
 0.8593333333333333,
 0.8631111111111112,
 0.8616666666666667,
 0.855,
 0.8665,
 0.8626111111111111,
 0.8649444444444444,
 0.8662222222222222,
 0.8675,
 0.8675555555555555,
 0.8645,
 0.868,
 0.8670555555555556,
 0.8734444444444445,
 0.8688888888888889,
 0.8721666666666666,
 0.8697222222222222]

In [29]:
def predict(config, id2label, model, test_dataloader):
    test_iterator = tqdm(test_dataloader, desc='Evaluation', total=len(test_dataloader))
    model.eval()
    test_preds = []
    with torch.no_grad():
        for batch in test_iterator:
            batch = {item: value.to(config['device']) for item, value in batch.items()}

            logits = model(**batch)[1]

            test_preds.append(logits.argmax(dim=-1).detach().cpu())
    
    test_preds = torch.cat(test_preds, dim=0).numpy()
    test_preds = [id2label[id_] for id_ in test_preds]

    test_df = pd.read_csv(config['test_file_path'], sep=',')
    test_df.insert(1, column='predicted_label', value=test_preds)
    test_df.drop(columns=['sentence'], inplace=True)
    return test_df

In [30]:
prediction = predict(config, id2label, best_model, test_dataloader)

  cpuset_checked))
Evaluation: 100%|██████████| 157/157 [00:09<00:00, 16.08it/s]


In [32]:
accuracy_score(prediction['predicted_label'], prediction['label'])

0.8704

In [33]:
recall_score(prediction['predicted_label'], prediction['label'], average='weighted')

0.8704

In [34]:
precision_score(prediction['predicted_label'], prediction['label'], average='weighted')

0.8753419

In [35]:
f1_score(prediction['predicted_label'], prediction['label'], average='weighted')

0.8707552758742632