In [1]:
import warnings;warnings.filterwarnings('ignore')

In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

In [3]:
from torch.utils.data import Dataset,TensorDataset,DataLoader

In [4]:
from tqdm import tqdm_notebook,tnrange

In [5]:
import pandas as pd
import numpy as np

In [6]:
from sklearn.model_selection import train_test_split

In [7]:
from bidict import bidict

In [8]:
from pathlib import Path

In [9]:
path = Path("C:/Users/sappusamy/Documents/SriWK/datasets/bbc/")

In [49]:
classes = ["sport","business","politics"]

In [50]:
texts=[]
labels=[]
for folder in path.iterdir():
    if folder.name in classes:
        for file in folder.iterdir():
            with open(file,'r') as f:
                texts.append(f.read())
                labels.append(folder.name)

In [51]:
len(texts),len(labels)

(1438, 1438)

In [52]:
data = pd.DataFrame({'text':texts,'label':labels})

In [53]:
data = data.sample(frac=1).reset_index(drop=True)

In [54]:
train,remain = train_test_split(data,train_size=0.7)
val,test = train_test_split(remain,train_size=0.5)

In [55]:
len(train),len(val),len(test)

(1006, 216, 216)

In [56]:
# train.to_csv("train.csv",index=False)
# val.to_csv("val.csv",index=False)
# test.to_csv("test.csv",index=False)

In [18]:
train.text = train.text.apply(lambda x: ' '.join(x.lower().split()))
val.text = val.text.apply(lambda x: ' '.join(x.lower().split()))
test.text = test.text.apply(lambda x: ' '.join(x.lower().split()))

In [19]:
chars=set()
for text in train.text:
    chars.update(text)

In [20]:
unk_token='<unk>'
pad_token='<pad>'

In [21]:
char2idx={}
char2idx[pad_token]=0
char2idx[unk_token]=1
for c in set(chars):
    char2idx[c] = len(char2idx)

In [22]:
label2idx = {}
for i in train.label.unique():
    label2idx[i]=len(label2idx)

In [23]:
max_len = 1014

In [24]:
def convert_char2idx(text):
    text = text[:max_len].lower()
    text = [char2idx[c] if c in char2idx else char2idx[unk_token] for c in text]
    if len(text)<max_len:
        text = text + [char2idx[pad_token]]*(max_len-len(text))
    return np.array(text)

In [25]:
def convert_label2idx(label):
    return label2idx[label]

In [26]:
train_x = torch.LongTensor(np.stack(train.text.apply(convert_char2idx)))
train_y = torch.LongTensor(np.stack(train.label.apply(convert_label2idx)))

In [27]:
train_x.shape,train_y.shape

(torch.Size([735, 1014]), torch.Size([735]))

In [28]:
val_x = torch.LongTensor(np.stack(val.text.apply(convert_char2idx)))
val_y = torch.LongTensor(np.stack(val.label.apply(convert_label2idx)))

In [29]:
val_x.shape,val_y.shape

(torch.Size([158, 1014]), torch.Size([158]))

In [30]:
test_x = torch.LongTensor(np.stack(test.text.apply(convert_char2idx)))
test_y = torch.LongTensor(np.stack(test.label.apply(convert_label2idx)))

In [31]:
test_x.shape,test_y.shape

(torch.Size([158, 1014]), torch.Size([158]))

In [32]:
class MyDataset(Dataset):
    def __init__(self,x,y):
        self.x = x
        self.y = y
        
    def __len__(self):
        return self.x.shape[0]
    def __getitem__(self,i):
        return self.x[i],self.y[i]

In [33]:
train_ds = MyDataset(train_x,train_y)

In [34]:
val_ds = MyDataset(val_x,val_y)

In [35]:
test_ds = MyDataset(test_x,test_y)

In [36]:
def to_cuda(x):
    if torch.cuda.is_available():
        x = x.cuda()
    return x

## Model:

In [37]:
class TextClassification(nn.Module):
    def __init__(self,vs=100,emb_dim=16,features=256,hidden_units=1024,num_classes=2,seq_len=1014):
        super().__init__()
        self.emb = nn.Embedding(vs,emb_dim)
        self.model = nn.Sequential(nn.Sequential(nn.Conv1d(16,256,kernel_size=7),nn.ReLU(),nn.MaxPool1d(kernel_size=3)),
                     nn.Sequential(nn.Conv1d(256,256,kernel_size=7),nn.ReLU(),nn.MaxPool1d(kernel_size=3)),
                     nn.Sequential(nn.Conv1d(256,256,kernel_size=3),nn.ReLU()),
                     nn.Sequential(nn.Conv1d(256,256,kernel_size=3),nn.ReLU()),
                     nn.Sequential(nn.Conv1d(256,256,kernel_size=3),nn.ReLU()),
                     nn.Sequential(nn.Conv1d(256,256,kernel_size=3),nn.ReLU(),
                     nn.MaxPool1d(kernel_size=3)))
        L = int((seq_len-96)/27)
        self.fc_layer = nn.Sequential(nn.Linear(L*features,hidden_units),
                                      nn.Linear(hidden_units,hidden_units),
                                      nn.Linear(hidden_units,num_classes))
        self.init_parameters()

        
    def forward(self,x):
        x = self.emb(x).transpose(1,2) # bs,emb_dim,char_len
        x = self.model(x)
        bs,_,_ = x.shape
        x = x.flatten(start_dim=1,end_dim=2)
        logits = self.fc_layer(x)
        return logits
    
    def init_parameters(self):
        for p in self.model.parameters():
            nn.init.normal_(p,mean=0,std=0.05)
        for p in self.fc_layer.parameters():
            nn.init.normal_(p,mean=0,std=0.05)

In [38]:
model = to_cuda(TextClassification(vs=len(char2idx),num_classes=len(label2idx)))

## Training Process:

In [39]:
def accuracy(preds,y):
    return (preds==y).float().mean().item()

In [40]:
epochs=1
bs=4
initial_lr = 1e-4

In [41]:
loss_fn = nn.CrossEntropyLoss()

In [42]:
optimizer = optim.Adam(model.parameters(),lr=initial_lr,weight_decay=1e-5)

In [43]:
class MyScheduler():
    def __init__(self,optimizer,fraction,max_times,interval):
        self.optimizer = optimizer
        self.fraction = fraction
        self.max_times = max_times
        self.interval = interval
        self.step_count = 0
        self.max_count = 0
        
    def step(self):
        self.step_count+=1
        if self.step_count%self.interval==0 and self.max_count<self.max_times:
            for param_group in self.optimizer.param_groups:
                param_group['lr']*=self.fraction
            self.max_count+=1

In [44]:
scheduler = MyScheduler(optimizer,0.5,10,3)

In [45]:
print("{:15} {:15} {:15} {:15}".format("Epochs","Train Loss","Val Loss","Val Accuracy"))
for epoch in tnrange(epochs,desc='Epochs'):
    train_losses = []
    val_losses = []
    train_dl = DataLoader(train_ds,batch_size=bs,shuffle=True)
    t = tqdm_notebook(train_dl,total=len(train_dl),leave=False)
    for x,y in t:
        x,y = to_cuda(x),to_cuda(y)
        optimizer.zero_grad()
        logits = model(x)
        loss = loss_fn(logits,y)
        loss.backward()
        t.set_postfix(loss=loss.item())
        train_losses.append(loss.item())
        optimizer.step()
    scheduler.step()
        
    train_loss = np.array(train_losses).mean()
    
    val_dl = DataLoader(val_ds,batch_size=bs,shuffle=True)
    val_preds=[]
    val_y=[]
    with torch.no_grad():
        t = tqdm_notebook(val_dl,total=len(val_dl),leave=False)
        for x,y in t:
            x,y = to_cuda(x),to_cuda(y)
            logits = model(x)
            loss = loss_fn(logits,y)
            t.set_postfix(loss=loss.item())
            val_losses.append(loss.item())
            val_preds.append(logits.detach().clone().cpu())
            val_y.append(y.clone().cpu())
            
    val_loss = np.array(val_losses).mean()
    val_acc = accuracy(torch.cat(val_preds).softmax(-1).argmax(-1),torch.cat(val_y))
    
    print("{:15} {:15} {:15} {:15}".format(epoch+1,round(train_loss,4),round(val_loss,4),round(val_acc,4)))
        
        

Epochs          Train Loss      Val Loss        Val Accuracy   


HBox(children=(IntProgress(value=0, description='Epochs', max=1, style=ProgressStyle(description_width='initia…

HBox(children=(IntProgress(value=0, max=184), HTML(value='')))

HBox(children=(IntProgress(value=0, max=40), HTML(value='')))

              1          3.6481          1.1452          0.4747



In [47]:
torch.cat(val_preds).softmax(-1).argmax(-1)

tensor([0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 2, 0, 0, 0, 2, 0, 2, 0, 0,
        0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 0, 0, 2,
        0, 0, 0, 0, 2, 0, 2, 1, 1, 2, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 2, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 2, 0, 2, 0, 0, 2, 1, 1, 2, 0, 0, 0, 0, 0, 2, 0, 0,
        0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 2, 0, 0, 0, 2, 0, 0, 0, 0, 1, 1, 0, 1, 2, 2, 0, 0, 0, 2, 0, 1,
        0, 0, 1, 0, 2, 0, 0, 0, 0, 0, 0, 0, 1, 1])

In [48]:
torch.cat(val_y)

tensor([2, 2, 0, 0, 2, 0, 0, 2, 0, 0, 0, 2, 0, 0, 0, 2, 0, 2, 1, 2, 0, 0, 0, 0,
        0, 0, 2, 0, 0, 0, 0, 0, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 2, 0, 0, 1,
        2, 0, 2, 1, 2, 0, 2, 2, 2, 0, 2, 2, 0, 0, 2, 1, 2, 2, 0, 1, 0, 2, 0, 1,
        2, 2, 0, 0, 0, 2, 2, 2, 0, 2, 0, 2, 0, 1, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2,
        0, 2, 0, 0, 0, 2, 0, 2, 2, 0, 0, 0, 0, 2, 0, 0, 0, 2, 0, 0, 0, 1, 0, 1,
        2, 2, 2, 0, 1, 2, 2, 0, 0, 2, 0, 0, 0, 1, 2, 2, 0, 0, 0, 0, 0, 0, 2, 2,
        0, 2, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 0, 0])

In [45]:
x[0][:100]

tensor([ 5, 24,  3, 66,  1, 61, 69, 63, 58,  3, 58, 69, 21, 68,  1, 28,  5, 28,
        66, 68, 63, 58, 68, 58,  1,  3, 63,  1, 66, 31,  4, 10, 68,  1,  1, 68,
         5, 31,  4, 28, 61, 68, 66, 21,  3, 61,  1, 66, 69, 31, 21, 34,  1,  5,
        24,  3, 66,  1, 41, 69, 25, 68,  1, 28,  5, 28, 66, 68, 63, 58, 68, 58,
         1, 69,  1, 61, 69, 63, 58,  3, 58, 69, 21, 68,  1, 27,  4, 31,  1, 69,
        35, 35, 68, 47, 68, 58, 35, 34,  1, 28])