<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Imports-&amp;-Inits" data-toc-modified-id="Imports-&amp;-Inits-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Imports &amp; Inits</a></span></li><li><span><a href="#Data-&amp;-Pretrained-Embeddings" data-toc-modified-id="Data-&amp;-Pretrained-Embeddings-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Data &amp; Pretrained Embeddings</a></span></li><li><span><a href="#Model" data-toc-modified-id="Model-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Model</a></span></li><li><span><a href="#Training" data-toc-modified-id="Training-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Training</a></span></li><li><span><a href="#Going-through-the-model" data-toc-modified-id="Going-through-the-model-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>Going through the model</a></span></li></ul></div>

# AG News Classifier with ConvNet
A convolutional text classifier that assigns AG News article titles to their news categories.

## Imports & Inits

In [1]:
# Load IPython's autoreload extension in mode 2, which re-imports modules before
# each cell runs, so edits to the project modules imported below take effect
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import pdb
import pandas as pd
import numpy as np
import torch

from torch import nn
from torch.nn import functional as F
from torch import optim
from torch.utils.data import DataLoader
from pathlib import Path

from ignite.engine import Events, create_supervised_evaluator
from ignite.metrics import Accuracy, Loss
from ignite.contrib.handlers import ProgressBar

In [3]:
from consts import consts
from ag.data import NewsDataset, DataContainer
from ag.model import NewsClassifier
from ag.pretrained_emb import PretrainedEmbeddings
# Display every attribute stored on the `consts` configuration object as a dict.
vars(consts)

{'path': PosixPath('../data/ag_news'),
 'work_dir': PosixPath('../data/ag_news/work_dir'),
 'proc_dataset_csv': PosixPath('../data/ag_news/news_with_splits.csv'),
 'model_dir': PosixPath('../data/ag_news/work_dir/models'),
 'vectorizer_file': PosixPath('../data/ag_news/work_dir/vectorizer.json'),
 'metric_file': PosixPath('../data/ag_news/work_dir/metrics.csv'),
 'cw_file': PosixPath('../data/ag_news/work_dir/class_weights.pth'),
 'use_glove': False,
 'glove_path': PosixPath('../pretrained/glove6B/glove.6B.100d.txt'),
 'embedding_size': 100,
 'hidden_dim': 100,
 'n_channels': 100,
 'dropout_p': 0.1,
 'batch_size': 256,
 'learning_rate': 0.001,
 'num_epochs': 100,
 'device': 'cuda:3',
 'checkpointer_prefix': 'cbow',
 'checkpointer_name': 'classifier',
 'early_stopping_criteria': 5,
 'save_every': 2,
 'save_total': 5}

## Data & Pretrained Embeddings

In [4]:
# Read the dataset CSV referenced by `consts.proc_dataset_csv` and preview the first
# rows (category, split and title columns, per the output below).
# NOTE(review): the preview also shows an `Unnamed: 0` column, which suggests the CSV
# was written with its index; passing `index_col=0` would drop it — confirm nothing
# downstream relies on that column before changing.
df = pd.read_csv(consts.proc_dataset_csv)
df.head()

Unnamed: 0,category,split,title
0,Business,train,"Jobs, tax cuts key issues for Bush"
1,Business,train,Jarden Buying Mr. Coffee #39;s Maker
2,Business,train,Retail sales show festive fervour
3,Business,train,Intervoice's Customers Come Calling
4,Business,train,Boeing Expects Air Force Contract


In [5]:
dc = DataContainer(df, NewsDataset, consts.vectorizer_file, consts.batch_size, is_load=True)

# Reuse the cached class weights when the file exists; otherwise derive
# inverse-frequency weights from the category counts and cache them.
try:
    class_weights = torch.load(consts.cw_file)
except FileNotFoundError:
    counts_by_category = df['category'].value_counts().to_dict()
    # Order categories by the value `cat_vocab.lookup_token` returns for each of them
    # (presumably the class index, so weight i lines up with class i — confirm).
    ordered_categories = sorted(counts_by_category, key=dc.cat_vocab.lookup_token)
    ordered_counts = [counts_by_category[category] for category in ordered_categories]
    class_weights = 1.0 / torch.tensor(ordered_counts, dtype=torch.float32)
    torch.save(class_weights, consts.cw_file)

In [6]:
# Load the embedding file at `consts.glove_path`, then build embeddings restricted
# to the entries of the title vocabulary.
pe = PretrainedEmbeddings.from_file(consts.glove_path)
# Values of the vocabulary's index<->token mapping (presumably the tokens — confirm).
title_vocab_entries = list(dc.title_vocab.idx_token_bidict.values())
pe.make_custom_embeddings(title_vocab_entries)

Loading file...
Finished!


## Model

In [7]:
# Positional hyper-parameters shared by all three model variants.
shared_args = (consts.embedding_size, dc.vocab_size, consts.n_channels, consts.hidden_dim,
               dc.n_cats, consts.dropout_p)

# Variant 1: no pretrained embeddings supplied.
classifier = NewsClassifier(*shared_args)

# Variant 2: embedding layer given the custom pretrained embeddings.
classifier_pretrained = NewsClassifier(*shared_args, pretrained=pe.custom_embeddings)

# Variant 3: same pretrained embeddings, with `freeze_pretrained=True`.
classifier_pretrained_freeze = NewsClassifier(*shared_args, pretrained=pe.custom_embeddings,
                                              freeze_pretrained=True)

# Cross-entropy loss with per-class rescaling weights.
loss_fn = nn.CrossEntropyLoss(weight=class_weights)

In [15]:
# Iterator over `dc.train_dl` (presumably the training DataLoader) so that single
# batches can be pulled with next().
itr = iter(dc.train_dl)

In [24]:
# Sanity check: push one batch through every model variant and compare the losses.
x, y = next(itr)
model_variants = (classifier, classifier_pretrained, classifier_pretrained_freeze)
y_preds = [model(x) for model in model_variants]
losses = [loss_fn(y_pred, y) for y_pred in y_preds]
losses

[tensor(1.3897, grad_fn=<NllLossBackward>),
 tensor(1.3838, grad_fn=<NllLossBackward>),
 tensor(1.3851, grad_fn=<NllLossBackward>)]

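## Training

*This section has no cells yet; the models above are only instantiated and sanity-checked on a single batch.*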

## Going through the model

In [None]:
# Free-standing layers used by the walkthrough cells below, so each step can be
# run on its own and its output shape inspected.
emb = nn.Embedding(dc.vocab_size, consts.embedding_size)

# Length-preserving convolution (kernel 3, padding 1) followed by ELU.
conv1s = nn.Sequential(
    nn.Conv1d(consts.embedding_size, consts.n_channels, kernel_size=3, padding=1),
    nn.ELU(),
)

# Down-sampling convolution (kernel 3, stride 2, no padding) followed by ELU.
conv2s = nn.Sequential(
    nn.Conv1d(consts.n_channels, consts.n_channels, kernel_size=3, stride=2),
    nn.ELU(),
)

dropout = nn.Dropout(consts.dropout_p)
relu = nn.ReLU()

# Linear head: n_channels -> hidden_dim -> n_cats.
fc1 = nn.Linear(consts.n_channels, consts.hidden_dim)
fc2 = nn.Linear(consts.hidden_dim, dc.n_cats)
softmax = nn.Softmax(dim=1)

In [None]:
# Create a new iterator over `dc.train_dl`, rebinding `itr`, for the
# layer-by-layer walkthrough in the following cells.
itr = iter(dc.train_dl)

In [None]:
# Pull the next (inputs, targets) batch and show the shape of each tensor.
x,y = next(itr)
print(x.shape, y.shape)

In [None]:
# Embed the index batch, which appends an axis of size embedding_size, then swap
# the last two axes: nn.Conv1d treats axis 1 as channels and the last axis as
# the sequence.
print(emb)
t = emb(x)
print(t.shape)
t = t.permute(0,2,1)
print(t.shape)

In [None]:
# First convolution block: kernel_size=3 with padding=1 and the default stride of 1,
# so the sequence length is unchanged while the channel axis becomes n_channels.
print(conv1s)
t = conv1s(t)
print(t.shape)

In [None]:
# Strided convolution block (kernel_size=3, stride=2, no padding): shortens the
# sequence axis from L to floor((L - 3) / 2) + 1.
# Note: this cell rebinds `t`, so re-running it applies the layer again.
print(conv2s)
t = conv2s(t)
print(t.shape)

In [None]:
# Re-applies the same `conv2s` module (shared weights) to shrink the sequence
# axis once more by the same rule, L -> floor((L - 3) / 2) + 1.
print(conv2s)
t = conv2s(t)
print(t.shape)

In [None]:
# Re-applies `conv1s` (length-preserving). `conv1s` expects embedding_size input
# channels while `t` now has n_channels, so this only works because both are 100
# in the displayed configuration.
print(conv1s)
t = conv1s(t)
print(t.shape)

In [None]:
# Global average pooling: collapse the whole remaining sequence axis to length 1.
# The kernel size is read from the tensor instead of being hard-coded to 3, so the
# result has a length-1 axis (as the later squeeze(dim=2) requires) and every
# position contributes to the average whatever the batch's sequence length is.
remaining_size = t.size(dim=2)
p = F.avg_pool1d(t, remaining_size)
print(p.shape)
# Broadcasts p against every position of t; True only if each value already
# equals its channel average.
torch.all(p==t)

In [None]:
# Drop the pooled sequence axis so the linear layers receive a 2-D tensor.
# squeeze(dim=2) only removes the axis when its size is 1; otherwise it is a no-op.
p = p.squeeze(dim=2)
print(p.shape)

In [None]:
# First linear layer (n_channels -> hidden_dim) followed by dropout.
# `dropout` is a freshly constructed module and therefore in training mode, so it
# randomly zeroes activations here.
print(fc1)
p = fc1(p)
p = dropout(p)
print(p.shape)

In [None]:
# Final linear layer: maps hidden_dim features to dc.n_cats output scores.
# The scores are left unnormalised; no softmax is applied in this cell.
print(fc2)
p = fc2(p)
print(p.shape)