# News Classification
This notebook demonstrates how to classify Bengali news articles by category with a bidirectional LSTM.

### Text cleaning

In [None]:
import re

!wget https://github.com/stopwords-iso/stopwords-bn/blob/master/stopwords-bn.txt

# For classification, stop words, punctuation and markup carry no useful signal,
# so all of them are stripped before the text reaches the model.

# Every character listed here is turned into a single space.
_PUNCT_TO_SPACE = str.maketrans({ch: ' ' for ch in (
    '/', ';', '—', '=', '%', '>', '<', '_', '…', '–', '*', '~', '}', '{',
    '\\', '[', ']', '#', '+', '∗', '&', '|', '`', '@', '^', '$', '•',
    '.', '‘', '’', '“', '”', '"', "'", '।', ',', '-', '?', '!', ':',
)})

# path -> frozenset of stop words, so the file is parsed once instead of once per document.
# Re-running this cell resets the cache.
_STOPWORD_CACHE = {}


def _load_stopwords(path):
    """Return the stop words stored one per line in ``path`` (cached per path)."""
    if path not in _STOPWORD_CACHE:
        # utf-8-sig also accepts plain UTF-8 and drops a leading BOM if present
        with open(path, 'r', encoding='utf-8-sig') as f:
            # strip the trailing newline and apply the same zero-width-joiner removal
            # as the text itself, otherwise the entries can never match a token
            words = (line.strip().replace('\u200c', '').replace('\u200d', '') for line in f)
            _STOPWORD_CACHE[path] = frozenset(word for word in words if word)
    return _STOPWORD_CACHE[path]


def clean_and_tokenize(txt, stopwords_path='stopwords-bn.txt'):
    """Clean a raw article and return its remaining tokens joined by single spaces.

    Steps: normalise whitespace, drop every character outside the
    Devanagari-to-Gurmukhi block range (which contains Bengali), ASCII
    punctuation and the General Punctuation block, drop markup tags and
    parenthesised text, turn punctuation into spaces and finally remove
    stop words.

    Args:
        txt: raw text of one document.
        stopwords_path: file with one stop word per line.

    Returns:
        The cleaned text as a single space-separated string ('' if nothing is left).

    Raises:
        OSError: if ``stopwords_path`` cannot be opened.
    """
    # newlines/tabs/NBSP are word boundaries; convert them before the whitelist
    # below deletes them and glues neighbouring words together
    txt = re.sub(r'\s+', ' ', txt)

    # keep Indic letters (incl. Bengali digits), ASCII punctuation and General Punctuation;
    # ASCII letters and digits are deliberately not part of the whitelist
    txt = re.sub(r'[^\u0900-\u0A7F\u0020-\u002F\u003A-\u0040\u005B-\u0060\u2000-\u206F]', '', txt)

    # markup tags (their names are already gone, the brackets remain)
    txt = re.sub(r'<.*?>', '', txt)

    # zero-width (non-)joiners
    txt = txt.replace('\u200c', '').replace('\u200d', '')

    # parenthesised asides
    txt = re.sub(r'\([^)]*\)', ' ', txt)

    txt = txt.translate(_PUNCT_TO_SPACE)

    # stop words are removed as whole tokens; substring replacement would also
    # eat parts of longer words that merely contain a stop word
    stopwords = _load_stopwords(stopwords_path)
    return ' '.join(token for token in txt.split() if token not in stopwords)

### Process the dataset

In [None]:
import pandas as pd
from torchtext.data import Field, LabelField, TabularDataset, BucketIterator
import numpy as np

# Load the bdnews24 corpus; the cells below use its `contents` and `en_category` columns.
data = pd.read_csv('../input/bdnews24-corpus/bdnews24.csv', encoding='utf-8', index_col=None)

In [None]:
# list every distinct category label present in the corpus
np.unique(data['en_category'].values)

In [None]:
# Hand-picked categories to keep; the position in this list is the integer class id.
cats = {
    name: class_id
    for class_id, name in enumerate([
        'bangladesh', 'business', 'cricket', 'economy',
        'politics', 'tech', 'sport', 'world',
    ])
}

In [None]:
# number of articles available for each selected category
for cat in cats:
    n_articles = (data.en_category == cat).sum()
    print(f'{cat}: {n_articles}')

In [None]:
# Keep only the rows of the selected categories and only the `contents` &
# `en_category` columns, renumbering the rows from zero.
in_selected_cats = data.en_category.isin(cats)
filtered = data.loc[in_selected_cats, ['contents', 'en_category']].reset_index(drop=True)

filtered

In [None]:
%%time
from sklearn.model_selection import train_test_split

x = filtered['contents']
y = filtered['en_category']

train_x, _, train_y, _ = train_test_split(x, y, stratify=y, test_size=0.95)

# clean the text
train_x = train_x.apply(lambda x: clean_and_tokenize(str(x)))

# convert the categories to ids
train_y = train_y.apply(lambda x: cats[x])

# first split the train dataset into (train, rest) set
train_x, rest_x, train_y, rest_y = train_test_split(train_x, train_y, stratify=train_y, test_size=0.2, random_state=747)

# then split the rest into (test, valid) set
test_x, valid_x, test_y, valid_y = train_test_split(rest_x, rest_y, stratify=rest_y, test_size=0.5, random_state=747)

print(len(test_y), len(valid_y), len(train_y))

In [None]:
# number of test examples per class id (position i = class id i)
np.bincount(test_y)

In [None]:
# Persist every split as a two-column CSV: label first, text second.

def _label_text_frame(texts, labels):
    """Combine a text Series and its label Series into an ['en_category', 'contents'] frame."""
    combined = pd.concat([texts, labels], axis=1).reset_index()
    return combined[['en_category', 'contents']]


train_df = _label_text_frame(train_x, train_y)
test_df = _label_text_frame(test_x, test_y)
valid_df = _label_text_frame(valid_x, valid_y)

for frame, csv_path in ((train_df, 'train.csv'), (test_df, 'test.csv'), (valid_df, 'valid.csv')):
    frame.to_csv(csv_path, header=['classlabel', 'content'], encoding='utf-8', index=False)

In [None]:
# check that train.csv, test.csv and valid.csv were written
!ls

## Training

In [None]:
import torch
import torch.nn.functional as F
from torchtext import data
from torchtext import datasets
import time
import random
import pandas as pd
import numpy as np

# request deterministic cuDNN kernels so GPU runs are repeatable
torch.backends.cudnn.deterministic = True

In [None]:
# --- reproducibility ---
RANDOM_SEED = 123
# Seed every RNG in play, not only torch: Python's `random` and numpy are
# imported by this notebook and would otherwise differ between runs.
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)

# --- training ---
VOCABULARY_SIZE = 60_000
LEARNING_RATE = 1e-3
BATCH_SIZE = 32
NUM_EPOCHS = 50
DROPOUT = 0.5
DEVICE = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

# --- model ---
EMBEDDING_DIM = 200
BIDIRECTIONAL = True
HIDDEN_DIM = 200
NUM_LAYERS = 2
OUTPUT_DIM = 8  # one output per entry in `cats`

In [None]:
import math
from decimal import Decimal  # used by both helpers below; it was never imported before


def remove_exponent(d):
    """Return Decimal ``d`` without trailing zeros or exponent notation.

    Integral values are quantized to a plain integer (``5.00`` -> ``5``);
    everything else is normalized (``1.50`` -> ``1.5``).
    """
    return d.quantize(Decimal(1)) if d == d.to_integral() else d.normalize()


def millify(n, precision=0, drop_nulls=True, prefixes=None):
    """Humanize a number, e.g. ``1234`` -> ``'1k'``.

    Args:
        n: the number to format (anything ``float()`` accepts).
        precision: number of decimals kept in the scaled value.
        drop_nulls: if True, strip trailing zeros from the scaled value.
        prefixes: optional custom suffixes for 10**3, 10**6, ...; replaces the
            default ``k, M, B, T, P, E, Z, Y``.

    Returns:
        The scaled value followed by its magnitude suffix, as a string.
    """
    millnames = ['', 'k', 'M', 'B', 'T', 'P', 'E', 'Z', 'Y']
    if prefixes:
        millnames = ['']
        millnames.extend(prefixes)
    n = float(n)
    # index of the thousands group, clamped to the available suffixes
    millidx = max(0, min(len(millnames) - 1,
                         int(math.floor(0 if n == 0 else math.log10(abs(n)) / 3))))
    result = '{:.{precision}f}'.format(n / 10**(3 * millidx), precision=precision)
    if drop_nulls:
        result = remove_exponent(Decimal(result))
    return '{0}{dx}'.format(result, dx=millnames[millidx])

In [None]:
# Field describing the article text. The spaCy tokenizer is left disabled, so
# torchtext's default tokenizer applies.
TEXT = data.Field(sequential=True,
#                   tokenize='spacy',
                  include_lengths=True) # necessary for packed_padded_sequence

# Field describing the class label. It is declared as float here; the training
# and evaluation code casts it with .long() before use.
LABEL = data.LabelField(dtype=torch.float)

In [None]:
# column order of the CSV files: label first, text second
fields = [('classlabel', LABEL), ('content', TEXT)]


def _load_split(csv_path):
    """Read one split CSV (header row skipped) into a TabularDataset."""
    return data.TabularDataset(path=csv_path, format='csv',
                               skip_header=True, fields=fields)


train_dataset = _load_split("train.csv")
test_dataset = _load_split("test.csv")
valid_dataset = _load_split("valid.csv")

In [None]:
# size of every split (the test split used to be mislabelled as "Valid")
print(f'Num Train: {len(train_dataset)}')
print(f'Num Test: {len(test_dataset)}')
print(f'Num Valid: {len(valid_dataset)}')

In [None]:
# Build the vocabulary from the training split only, so that no information
# about the validation/test texts leaks into the model; tokens unseen during
# training map to <unk> at evaluation time. VOCABULARY_SIZE caps the number of
# words kept, and words occurring fewer than twice are dropped.
TEXT.build_vocab(train_dataset,
                 max_size=VOCABULARY_SIZE,
                 min_freq=2)
LABEL.build_vocab(train_dataset)

print(f'Vocabulary size: {len(TEXT.vocab)}')
print(f'Number of classes: {len(LABEL.vocab)}')

In [None]:
# One iterator per split, all yielding batches of BATCH_SIZE examples on DEVICE.
# sort_key measures an example by its number of tokens; sorting within each
# batch is what pack_padded_sequence in the model's forward pass relies on.
train_loader, valid_loader, test_loader = data.BucketIterator.splits(
    (train_dataset, valid_dataset, test_dataset), 
    batch_size=BATCH_SIZE,
    sort_within_batch=True, # necessary for packed_padded_sequence
    sort_key=lambda x: len(x.content),
    device=DEVICE)

In [None]:
# Sanity check: show the tensor shapes of the first batch of every split.
for heading, loader in (('Train', train_loader),
                        ('\nValid:', valid_loader),
                        ('\nTest:', test_loader)):
    print(heading)
    for batch in loader:
        # batch.content is (token ids, lengths) because the field includes lengths
        print(f'Text matrix size: {batch.content[0].size()}')
        print(f'Target vector size: {batch.classlabel.size()}')
        break

In [None]:
import torch.nn as nn


class RNN(nn.Module):
    """LSTM text classifier: embedding -> (bi)LSTM -> two linear layers.

    Args:
        input_dim: vocabulary size.
        embedding_dim: size of the token embeddings.
        bidirectional: whether the LSTM reads the sequence in both directions.
        hidden_dim: LSTM hidden size per direction.
        num_layers: number of stacked LSTM layers.
        output_dim: number of classes (logits returned by ``forward``).
        dropout: dropout probability used on embeddings, between LSTM layers
            and around the first linear layer.
        pad_idx: vocabulary index of the padding token.
    """

    def __init__(self, input_dim, embedding_dim, bidirectional, hidden_dim, num_layers, output_dim, dropout, pad_idx):

        super().__init__()

        self.bidirectional = bidirectional
        self.embedding = nn.Embedding(input_dim, embedding_dim, padding_idx=pad_idx)
        self.rnn = nn.LSTM(embedding_dim,
                           hidden_dim,
                           num_layers=num_layers,
                           bidirectional=bidirectional,
                           # inter-layer dropout is meaningless (and warns) for a single layer
                           dropout=dropout if num_layers > 1 else 0.0)
        # The classifier sees the final hidden state of the LAST layer only:
        # one vector per direction. Its width therefore depends on the number
        # of directions, not on the number of layers.
        num_directions = 2 if bidirectional else 1
        self.fc1 = nn.Linear(hidden_dim * num_directions, 64)
        self.fc2 = nn.Linear(64, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, text, text_length):
        """Return class logits of shape (batch, output_dim).

        Args:
            text: LongTensor of token ids, shape (seq_len, batch).
            text_length: CPU tensor with the true length of every sequence,
                sorted in decreasing order as pack_padded_sequence requires.
        """
        embedded = self.dropout(self.embedding(text))
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, text_length)
        _, (hidden, _) = self.rnn(packed_embedded)
        # hidden is (num_layers * num_directions, batch, hidden_dim); the last
        # layer's states sit at the end: [-2] forward and [-1] backward.
        if self.bidirectional:
            last_hidden = torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1)
        else:
            last_hidden = hidden[-1, :, :]
        features = self.dropout(last_hidden)
        features = self.dropout(self.fc1(features))
        return self.fc2(features)

In [None]:
# the embedding table needs one row per vocabulary entry
INPUT_DIM = len(TEXT.vocab)

# index of the padding token, so its embedding is excluded from training
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]

# re-seed right before construction so the initial weights are repeatable
torch.manual_seed(RANDOM_SEED)
model = RNN(
    input_dim=INPUT_DIM,
    embedding_dim=EMBEDDING_DIM,
    bidirectional=BIDIRECTIONAL,
    hidden_dim=HIDDEN_DIM,
    num_layers=NUM_LAYERS,
    output_dim=OUTPUT_DIM,
    dropout=DROPOUT,
    pad_idx=PAD_IDX,
).to(DEVICE)
print(model)
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

In [None]:
def compute_accuracy(model, data_loader, device):
    """Return the accuracy of ``model`` on ``data_loader`` in percent.

    Args:
        model: callable as ``model(text, lengths)`` returning class logits;
            it is switched to eval mode.
        data_loader: iterable of batches exposing ``content`` (a
            ``(token ids, lengths)`` pair) and ``classlabel``.
        device: unused; kept so existing callers keep working.

    Returns:
        A 0-dim float tensor with the percentage of correct predictions, or
        NaN when the loader yields no scorable example.
    """
    model.eval()
    correct_pred, num_examples = 0, 0
    with torch.no_grad():
        for batch_data in data_loader:
            text, text_lengths = batch_data.content
            labels = batch_data.classlabel

            # Zero-length sequences cannot be packed. Drop just those rows
            # instead of throwing the whole batch away.
            keep = text_lengths > 0
            if not keep.all():
                if not keep.any():
                    continue
                text, text_lengths, labels = text[:, keep], text_lengths[keep], labels[keep]

            # pack_padded_sequence wants the lengths on the CPU
            logits = model(text, text_lengths.to('cpu'))
            _, predicted_labels = torch.max(logits, 1)
            num_examples += labels.size(0)
            correct_pred += (predicted_labels.long() == labels.long()).sum()

    if num_examples == 0:
        # nothing was scored: correct_pred is still the int 0 and the ratio is undefined
        return torch.tensor(float('nan'))
    return correct_pred.float() / num_examples * 100

In [None]:
# Train for NUM_EPOCHS epochs, reporting train/validation accuracy after each
# epoch and test accuracy once at the very end.
start_time = time.time()

for epoch in range(NUM_EPOCHS):
    model.train()
    for batch_idx, batch_data in enumerate(train_loader):

        text, text_lengths = batch_data.content
        labels = batch_data.classlabel.long()

        # Zero-length sequences cannot be packed. Drop just those rows
        # instead of skipping every example in the batch.
        keep = text_lengths > 0
        if not keep.all():
            if not keep.any():
                continue
            text, text_lengths, labels = text[:, keep], text_lengths[keep], labels[keep]

        ### FORWARD AND BACK PROP
        optimizer.zero_grad()
        # pack_padded_sequence wants the lengths on the CPU
        logits = model(text, text_lengths.to('cpu'))
        cost = F.cross_entropy(logits, labels)
        cost.backward()

        ### UPDATE MODEL PARAMETERS
        optimizer.step()

        ### LOGGING
        if not batch_idx % 50:
            print (f'Epoch: {epoch+1:03d}/{NUM_EPOCHS:03d} | '
                   f'Batch {batch_idx:03d}/{len(train_loader):03d} | '
                   f'Cost: {cost:.4f}')

    # compute_accuracy switches to eval mode and disables gradients itself;
    # model.train() at the top of the loop restores training mode.
    print(f'training accuracy: '
          f'{compute_accuracy(model, train_loader, DEVICE):.2f}%'
          f'\nvalid accuracy: '
          f'{compute_accuracy(model, valid_loader, DEVICE):.2f}%')

    print(f'Time elapsed: {(time.time() - start_time)/60:.2f} min')

print(f'Total Training Time: {(time.time() - start_time)/60:.2f} min')
print(f'Test accuracy: {compute_accuracy(model, test_loader, DEVICE):.2f}%')

## Inference

In [None]:
# category name -> class id (same table that was used to label the data)
cats = dict(zip(
    ['bangladesh', 'business', 'cricket', 'economy', 'politics', 'tech', 'sport', 'world'],
    range(8),
))

# class id -> category name, for turning a prediction back into text
map_dict = dict((class_id, name) for name, class_id in cats.items())

def predict(model, sentence, device=None):
    """Classify one raw text.

    Args:
        model: trained classifier, callable as ``model(text, lengths)``.
        sentence: raw (uncleaned) text.
        device: device for the input tensor; defaults to the device the
            model's parameters live on.

    Returns:
        ``(predictions, prob, class_id)`` where ``predictions`` is the softmax
        output of shape (1, n_classes) with columns in ``LABEL.vocab`` order,
        ``prob`` is the winning probability and ``class_id`` is the class id
        as stored in the training CSV (the key expected by ``map_dict``).

    Raises:
        ValueError: if nothing is left of ``sentence`` after cleaning.
    """
    model.eval()
    if device is None:
        # input and model must be on the same device
        device = next(model.parameters()).device

    tokens = clean_and_tokenize(sentence).split()
    if not tokens:
        raise ValueError('sentence contains no usable tokens after cleaning')

    indexed = [TEXT.vocab.stoi[token] for token in tokens]
    # shape (seq_len, 1): a batch holding the single sentence
    tensor = torch.LongTensor(indexed).unsqueeze(1).to(device)
    length_tensor = torch.LongTensor([len(indexed)])  # stays on the CPU for packing

    with torch.no_grad():
        output = model(tensor, length_tensor)
        predictions = torch.softmax(output, dim=1)

    probs, label_idx = predictions.max(dim=1)

    # The model predicts an index into LABEL.vocab, whose order is decided by
    # build_vocab and is not the id order of `cats`. Translate it back to the
    # id that was written to the CSV.
    class_id = int(LABEL.vocab.itos[label_idx.item()])

    return predictions, probs.item(), class_id
    

In [None]:
# https://www.prothomalo.com/bangladesh/%E0%A6%86%E0%A6%A8%E0%A6%BF%E0%A6%B8%E0%A7%81%E0%A6%B2-%E0%A6%B9%E0%A6%A4%E0%A7%8D%E0%A6%AF%E0%A6%BE-%E0%A6%AE%E0%A6%BE%E0%A6%AE%E0%A6%B2%E0%A6%BE%E0%A7%9F-%E0%A6%86%E0%A6%97%E0%A6%BE%E0%A6%AE-%E0%A6%9C%E0%A6%BE%E0%A6%AE%E0%A6%BF%E0%A6%A8-%E0%A6%AA%E0%A7%87%E0%A6%B2%E0%A7%87%E0%A6%A8-%E0%A6%8F%E0%A6%95-%E0%A6%9A%E0%A6%BF%E0%A6%95%E0%A6%BF%E0%A7%8E%E0%A6%B8%E0%A6%95
news = """
সিনিয়র সহকারী পুলিশ সুপার (এএসপি) আনিসুল করিমকে হত্যার অভিযোগে করা মামলায় চিকিৎসক নুসরাত ফারজানা আগাম জামিন পেয়েছেন। আগামী ৫ জানুয়ারি পর্যন্ত তাঁকে জামিন দেওয়া হয়েছে।
জামিন চেয়ে তাঁর করা আবেদনের শুনানি নিয়ে আজ বুধবার বিচারপতি হাবিবুল গনি ও বিচারপতি মো. রিয়াজ উদ্দিন খানের সমন্বয়ে গঠিত হাইকোর্ট বেঞ্চ এ আদেশ দেন। ওই সময়ের মধ্যে ঢাকার চিফ মেট্রোপলিটন ম্যাজিস্ট্রেট আদালতে জামিননামা দিয়ে মেট্রোপলিটন সেশন জজ আদালতে আত্মসমর্পণ করতে বলা হয়।
আদালতে উপস্থিত হয়ে আইনজীবীর মাধ্যমে আজ আগাম জামিনের আরজি জানান নুসরাত ফারজানা। আদালতে তাঁর পক্ষে শুনানি করেন আইনজীবী রুহুল কুদ্দুস। রাষ্ট্রপক্ষে শুনানি করেন সহকারী অ্যাটর্নি জেনারেল মাহফুজুর রহমান লিখন।

"""
# run on the device the model was moved to; a hard-coded 'cuda' fails on CPU-only machines
preds, probs, label = predict(model, news, DEVICE)

print(f'Class Label: {label} -> {map_dict[label]}')