<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Imports-&amp;-Inits" data-toc-modified-id="Imports-&amp;-Inits-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Imports &amp; Inits</a></span></li><li><span><a href="#Data-&amp;-Model" data-toc-modified-id="Data-&amp;-Model-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Data &amp; Model</a></span></li><li><span><a href="#dc-=-DataContainer(df,-NewsDataset,-consts.vectorizer_file,-consts.batch_size,-is_load=False)" data-toc-modified-id="dc-=-DataContainer(df,-NewsDataset,-consts.vectorizer_file,-consts.batch_size,-is_load=False)-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>dc = DataContainer(df, NewsDataset, consts.vectorizer_file, consts.batch_size, is_load=False)</a></span></li></ul></div>

# AG News Classifier with ConvNet
A convolutional network (ConvNet) that classifies AG News article titles into their news categories.

## Imports & Inits

In [1]:
# Enable IPython's autoreload extension in mode 2, so imported modules are reloaded
# before each cell executes and edits to local modules are picked up without a restart.
%load_ext autoreload
%autoreload 2

In [2]:
import pdb
import pandas as pd
import numpy as np
import torch

from torch import nn
from torch.nn import functional as F
from torch import optim
from torch.utils.data import DataLoader
from pathlib import Path

from ignite.engine import Events, create_supervised_evaluator
from ignite.metrics import Accuracy, Loss
from ignite.contrib.handlers import ProgressBar

In [3]:
from consts import consts
from ag.data import NewsDataset, DataContainer
# Display the configuration object so the settings used for this run are recorded in the output.
consts

Namespace(batch_size=256, checkpointer_name='classifier', checkpointer_prefix='cbow', cw_file=PosixPath('../data/ag_news/work_dir/class_weights.pth'), device='cuda:3', dropout_p=0.1, early_stopping_criteria=5, embedding_size=100, glove_path=PosixPath('../pretrained_path/glove6B/glove.6B.100d.txt'), hidden_dim=100, learning_rate=0.001, metric_file=PosixPath('../data/ag_news/work_dir/metrics.csv'), model_dir=PosixPath('../data/ag_news/work_dir/models'), n_channels=100, num_epochs=100, path=PosixPath('../data/ag_news'), proc_dataset_csv=PosixPath('../data/ag_news/news_with_splits.csv'), save_every=2, save_total=5, use_glove=False, vectorizer_file=PosixPath('../data/ag_news/work_dir/vectorizer.json'), work_dir=PosixPath('../data/ag_news/work_dir'))

## Data & Model

In [4]:
# Load the pre-split dataset. The CSV apparently stores its index as an unnamed first
# column, which pandas reads back as a spurious "Unnamed: 0" column; drop it when
# present (errors='ignore' keeps this working if the file is rewritten without it).
df = pd.read_csv(consts.proc_dataset_csv).drop(columns='Unnamed: 0', errors='ignore')
df.head()

Unnamed: 0,category,split,title
0,Business,train,"Jobs, tax cuts key issues for Bush"
1,Business,train,Jarden Buying Mr. Coffee #39;s Maker
2,Business,train,Retail sales show festive fervour
3,Business,train,Intervoice's Customers Come Calling
4,Business,train,Boeing Expects Air Force Contract


In [5]:
dc = DataContainer(df, NewsDataset, consts.vectorizer_file, consts.batch_size, is_load=True)

# Class weights are cached on disk; they are only recomputed when the cache file is missing.
try:
  class_weights = torch.load(consts.cw_file)
except FileNotFoundError:
  cat_vocab = dc.cat_vocab
  class_counts = df['category'].value_counts().to_dict()
  # Order the per-category counts by the value cat_vocab.lookup_token returns for each
  # category (presumably its class index, so position i matches class i - TODO confirm).
  freq = [class_counts[cat] for cat in sorted(class_counts, key=cat_vocab.lookup_token)]
  # Inverse-frequency weighting: the less frequent a category, the larger its weight.
  class_weights = 1.0 / torch.tensor(freq, dtype=torch.float32)
  torch.save(class_weights, consts.cw_file)

Creating and saving vectorizer


In [6]:
class NewsClassifier(nn.Module):
  """ConvNet text classifier: embedding -> 1D conv stack -> average pool -> 2-layer MLP.

  Args:
    emb_sz: size of each token embedding (also the conv stack's input channels).
    vocab_size: number of rows in the embedding table.
    n_channels: number of output channels of every convolution.
    hidden_dim: width of the hidden fully connected layer.
    n_classes: number of output classes.
    dropout_p: dropout probability used on the pooled features and the hidden layer.
    pretrained: optional (vocab_size, emb_sz) array-like or tensor of initial embedding
      weights. When given, the embedding is initialised from a copy of it and stays
      trainable. When None, the embedding is randomly initialised.
    padding_idx: index of the padding token in the embedding table.

  Raises:
    ValueError: if `pretrained` is given and its shape is not (vocab_size, emb_sz).
  """
  def __init__(self, emb_sz, vocab_size, n_channels, hidden_dim, n_classes, dropout_p,
               pretrained=None, padding_idx=0):
    super(NewsClassifier, self).__init__()

    if pretrained is None:
      self.emb = nn.Embedding(vocab_size, emb_sz, padding_idx)
    else:
      # `is None` is used above because truth-testing a multi-element tensor raises.
      # detach().clone() keeps training from mutating the caller's weights in place.
      weights = torch.as_tensor(pretrained, dtype=torch.float32).detach().clone()
      if tuple(weights.shape) != (vocab_size, emb_sz):
        raise ValueError(
          "pretrained embeddings have shape {}, expected {}".format(
            tuple(weights.shape), (vocab_size, emb_sz)))
      # freeze=False keeps the embedding trainable, like the randomly initialised case.
      self.emb = nn.Embedding.from_pretrained(weights, freeze=False, padding_idx=padding_idx)

    # No padding is used, so every layer shortens the sequence; with these kernel sizes
    # and strides the input needs at least 17 positions to leave one output position.
    self.convnet = nn.Sequential(
      nn.Conv1d(in_channels=emb_sz, out_channels=n_channels, kernel_size=3),
      nn.ELU(),
      nn.Conv1d(in_channels=n_channels, out_channels=n_channels, kernel_size=3, stride=2),
      nn.ELU(),
      nn.Conv1d(in_channels=n_channels, out_channels=n_channels, kernel_size=3, stride=2),
      nn.ELU(),
      nn.Conv1d(in_channels=n_channels, out_channels=n_channels, kernel_size=3),
      nn.ELU()
    )

    self.dropout = nn.Dropout(p=dropout_p)
    self.relu = nn.ReLU()
    self.fc1 = nn.Linear(in_features=n_channels, out_features=hidden_dim)
    self.fc2 = nn.Linear(in_features=hidden_dim, out_features=n_classes)
    self.softmax = nn.Softmax(dim=1)

  def forward(self, x_in, apply_softmax=False):
    """Compute class scores for a batch of token-index sequences.

    Args:
      x_in: integer tensor of token indices, shape (batch, seq_len).
      apply_softmax: if True, return softmax probabilities instead of raw scores.
        Leave False when the output feeds nn.CrossEntropyLoss, which expects raw scores.

    Returns:
      Tensor of shape (batch, n_classes).
    """
    # embed and permute so features are channels: (batch, seq, emb) -> (batch, emb, seq)
    x_emb = self.emb(x_in).permute(0,2,1)
    features = self.convnet(x_emb)

    # average over all remaining positions and remove the now size-1 dimension
    remaining_size = features.size(dim=2)
    features = F.avg_pool1d(features, remaining_size).squeeze(dim=2)
    features = self.dropout(features)

    # mlp classifier
    hidden_vector = self.fc1(features)
    hidden_vector = self.dropout(hidden_vector)
    hidden_vector = self.relu(hidden_vector)
    prediction_vector = self.fc2(hidden_vector)

    if apply_softmax:
      prediction_vector = self.softmax(prediction_vector)

    return prediction_vector

In [7]:
# Build the model from the shared hyperparameters and the sizes reported by the data container.
classifier = NewsClassifier(
  emb_sz=consts.embedding_size,
  vocab_size=dc.vocab_size,
  n_channels=consts.n_channels,
  hidden_dim=consts.hidden_dim,
  n_classes=dc.n_cats,
  dropout_p=consts.dropout_p,
)
# The per-class weights rescale each category's contribution to the loss.
loss_fn = nn.CrossEntropyLoss(weight=class_weights)
classifier

NewsClassifier(
  (emb): Embedding(3566, 100, padding_idx=0)
  (convnet): Sequential(
    (0): Conv1d(100, 100, kernel_size=(3,), stride=(1,))
    (1): ELU(alpha=1.0)
    (2): Conv1d(100, 100, kernel_size=(3,), stride=(2,))
    (3): ELU(alpha=1.0)
    (4): Conv1d(100, 100, kernel_size=(3,), stride=(2,))
    (5): ELU(alpha=1.0)
    (6): Conv1d(100, 100, kernel_size=(3,), stride=(1,))
    (7): ELU(alpha=1.0)
  )
  (dropout): Dropout(p=0.1)
  (relu): ReLU()
  (fc1): Linear(in_features=100, out_features=100, bias=True)
  (fc2): Linear(in_features=100, out_features=4, bias=True)
  (softmax): Softmax()
)

In [8]:
# Create an iterator over the training loader (presumably a DataLoader) to pull batches by hand.
itr = iter(dc.train_dl)

In [9]:
# Fetch one batch; assumes each batch unpacks into (inputs, targets) - TODO confirm.
# NOTE: re-running this cell advances `itr`, so x and y will hold a different batch.
x,y = next(itr)