In [1]:
from io import StringIO
import json
from google.colab import files
uploaded = files.upload()

Saving own 3.json to own 3.json


In [0]:
import torch
from torchtext import data

In [0]:
TEXT = data.Field(tokenize = 'spacy', lower = True)
LABEL = data.LabelField()

In [0]:
news = data.TabularDataset(
    path='own 3.json', format='json',
    fields={'headline': ('headline', TEXT),
            'short_description' : ('desc', TEXT),
             'category': ('category', LABEL)})

In [0]:
import random
SEED = 1234

trn, vld, tst = news.split(split_ratio=[0.7, 0.1, 0.1], random_state = random.seed(SEED))

In [210]:
vars(trn[0])

{'category': 'POLITICS',
 'desc': ['“',
  'it',
  '’s',
  'mind',
  '-',
  'boggling',
  'they',
  'could',
  'come',
  'up',
  'with',
  'laws',
  'like',
  'that',
  '.',
  '”'],
 'headline': ['this',
  'texas',
  'gun',
  'owner',
  'has',
  'been',
  'an',
  'nra',
  'member',
  'for',
  '46',
  'years',
  '.',
  'now',
  'he',
  "'s",
  'speaking',
  'out',
  '.']}

In [9]:
TEXT.build_vocab(trn, 
                 vectors = "glove.6B.100d", 
                 unk_init = torch.Tensor.normal_)

LABEL.build_vocab(trn)

.vector_cache/glove.6B.zip: 862MB [06:30, 2.21MB/s]                           
100%|█████████▉| 399743/400000 [00:18<00:00, 21087.66it/s]

In [212]:
print(len(TEXT.vocab))
print(len(LABEL.vocab))

1281
20


In [187]:
print(LABEL.vocab.stoi)

defaultdict(<function _default_unk_index at 0x7ff670244b70>, {'POLITICS': 0, 'ENTERTAINMENT': 1, 'HEALTHY LIVING': 2, 'THE WORLDPOST': 3, 'BLACK VOICES': 4, 'COMEDY': 5, 'QUEER VOICES': 6, 'CRIME': 7, 'STYLE': 8, 'TASTE': 9, 'WEIRD NEWS': 10, 'ARTS & CULTURE': 11, 'GREEN': 12, 'WOMEN': 13, 'BUSINESS': 14, 'IMPACT': 15, 'LATINO VOICES': 16, 'PARENTS': 17, 'SPORTS': 18, 'TECH': 19, 'EDUCATION': 0, 'RELIGION': 0, 'MEDIA': 0})


In [0]:
BATCH_SIZE = 64

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (trn, vld, tst), 
    batch_size = BATCH_SIZE, 
    device = device,
    sort_key= lambda x: len(x.headline), 
    sort_within_batch= False
    )

In [0]:
import torch.nn as nn

class RNN(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, bidirectional, dropout):
        
        super().__init__()
                
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        
        self.lstm_head = nn.LSTM(embedding_dim, hidden_dim, num_layers = n_layers, bidirectional = bidirectional, dropout = dropout)
        
        self.lstm_desc = nn.LSTM(embedding_dim, hidden_dim, num_layers = n_layers, bidirectional = bidirectional, dropout = dropout)
        
        self.fc_head = nn.Linear(hidden_dim * 2, 100)
        
        self.fc_desc = nn.Linear(hidden_dim * 2, 100)
        
        self.fc_total = nn.Linear(200, output_dim)
        
        self.dropout = nn.Dropout(dropout)
                
    def forward(self, headline, description):
                        
        embedded_head = self.dropout(self.embedding(headline))
        
        embedded_desc = self.dropout(self.embedding(description))
                                    
        output_head, (hidden_head, cell_head) = self.lstm_head(embedded_head)
        
        output_desc, (hidden_desc, cell_desc) = self.lstm_desc(embedded_desc)
        
        hidden_head = self.dropout(torch.cat((hidden_head[-2, :, :], hidden_head[-1, :, :]), dim = 1))
        
        hidden_desc = self.dropout(torch.cat((hidden_desc[-2, :, :], hidden_desc[-1, :, :]), dim = 1))
        
        full_head = self.fc_head(hidden_head)
        
        full_desc = self.fc_desc(hidden_desc)
        
        hidden_total = torch.cat((full_head, full_desc), 1)
        
        return self.fc_total(hidden_total)

In [0]:
INPUT_DIM = len(TEXT.vocab)
EMBEDDING_DIM = 100
HIDDEN_DIM = 256
OUTPUT_DIM = len(LABEL.vocab)
N_LAYERS = 2
BIDIRECTIONAL = True
DROPOUT = 0.2

model = RNN(INPUT_DIM, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM, N_LAYERS, BIDIRECTIONAL, DROPOUT)

In [324]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 4,855,008 trainable parameters


In [343]:
pretrained_embeddings = TEXT.vocab.vectors

print(pretrained_embeddings.shape)

torch.Size([1281, 100])


In [326]:
model.embedding.weight.data.copy_(pretrained_embeddings)

tensor([[ 0.3207,  0.8505, -1.2247,  ...,  0.4005, -0.8813,  1.2950],
        [ 0.8659, -0.6052, -0.1966,  ..., -0.3823,  0.6323, -0.8673],
        [-0.0382, -0.2449,  0.7281,  ..., -0.1459,  0.8278,  0.2706],
        ...,
        [ 1.6897,  1.0285, -1.0856,  ...,  0.5481,  0.8064,  0.5556],
        [ 1.3328,  0.6778, -0.6681,  ...,  0.4397, -0.0898, -1.5322],
        [-0.0346,  0.2088,  0.2014,  ..., -0.6978,  0.3013, -0.5122]])

In [0]:
import torch.optim as optim

optimizer = optim.Adam(model.parameters())

In [0]:
criterion = nn.CrossEntropyLoss()

model = model.to(device)
criterion = criterion.to(device)

In [0]:
def categorical_accuracy(preds, y):
    max_preds = preds.argmax(dim = 1, keepdim = True)
    correct = max_preds.squeeze(1).eq(y)
    return correct.sum() / torch.FloatTensor([y.shape[0]])

In [0]:
def train(model, iterator, optimizer, criterion):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.train()
    
    for batch in iterator:
        
        optimizer.zero_grad()
                        
        predictions = model(batch.headline, batch.desc).squeeze(1)
        
        loss = criterion(predictions, batch.category)
        
        acc = categorical_accuracy(predictions, batch.category)
        
        loss.backward()
        
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [0]:
def evaluate(model, iterator, criterion):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.eval()
    
    with torch.no_grad():
    
        for batch in iterator:
            
            predictions = model(batch.headline, batch.desc).squeeze(1)
            
            loss = criterion(predictions, batch.category)
            
            acc = categorical_accuracy(predictions, batch.category)

            epoch_loss += loss.item()
            epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)


In [0]:
import time

def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs


In [333]:
N_EPOCHS = 5

best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):

    start_time = time.time()
    
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)
    
    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'news_classification_model.pt')
    
    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

Epoch: 01 | Epoch Time: 0m 2s
	Train Loss: 2.939 | Train Acc: 18.13%
	 Val. Loss: 2.575 |  Val. Acc: 37.50%
Epoch: 02 | Epoch Time: 0m 2s
	Train Loss: 2.566 | Train Acc: 35.89%
	 Val. Loss: 2.374 |  Val. Acc: 37.50%
Epoch: 03 | Epoch Time: 0m 2s
	Train Loss: 2.418 | Train Acc: 36.06%
	 Val. Loss: 2.384 |  Val. Acc: 37.50%
Epoch: 04 | Epoch Time: 0m 2s
	Train Loss: 2.374 | Train Acc: 37.16%
	 Val. Loss: 2.439 |  Val. Acc: 37.50%
Epoch: 05 | Epoch Time: 0m 2s
	Train Loss: 2.327 | Train Acc: 36.38%
	 Val. Loss: 2.382 |  Val. Acc: 37.50%


In [334]:
test_loss, test_acc = evaluate(model, test_iterator, criterion)

print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')

Test Loss: 1.665 | Test Acc: 58.82%


In [0]:
import spacy
nlp = spacy.load('en')

def predict_category(model, head, desc):
    model.eval()
    head = head.lower()
    desc = desc.lower()
    tokenized_head = [tok.text for tok in nlp.tokenizer(head)]
    tokenized_desc = [tok.text for tok in nlp.tokenizer(desc)]
    indexed_head = [TEXT.vocab.stoi[t] for t in tokenized_head]
    indexed_desc = [TEXT.vocab.stoi[t] for t in tokenized_desc]
    tensor_head = torch.LongTensor(indexed_head).to(device)
    tensor_desc = torch.LongTensor(indexed_desc).to(device)
    tensor_head = tensor_head.unsqueeze(1)
    tensor_desc = tensor_desc.unsqueeze(1)
    prediction = model(tensor_head, tensor_desc)
    max_pred = prediction.argmax(dim=1)
    return max_pred.item()

In [337]:
pred = predict_category(model, "Trump’s Art Of Distraction", "The conversation surrounding Trump’s latest racist rants has provoked us to revisit author Toni Morrison’s 1975 keynote address at Portland State University on the true purpose of racism..")
print(f'Predicted category is: {pred} = {LABEL.vocab.itos[pred]}')

Predicted category is: 0 = POLITICS


In [338]:
pred = predict_category(model, "Kyle ‘Bugha’ Giersdorf, 16, Wins Fortnite World Cup And Takes Home $ 3 Million Prize", "Fortnite has nearly 250 million registered players and raked in an estimated $2.4 billion last year.")
print(f'Predicted category is: {pred} = {LABEL.vocab.itos[pred]}')

Predicted category is: 0 = POLITICS


In [339]:
import pandas as pd
df = pd.read_json('/content/own 3.json', lines=True)
df.head()

Unnamed: 0,category,headline,authors,link,short_description,date
0,ENTERTAINMENT,Jennifer Holliday Won't Perform At Donald Trum...,Paige Lavender,https://www.huffingtonpost.com/entry/jennifer-...,"She apologized to the LGBT community for ""bein...",2017-01-14
1,WEIRD NEWS,Girl Held At Gunpoint Refused To Give Up Chick...,Hilary Hanson,https://www.huffingtonpost.com/entry/gunpoint-...,She reportedly smacked the gun away and told h...,2017-01-14
2,POLITICS,Sephora Is Selling An Eyeshadow Named 'Druggie...,Ryan Grim and Landess Kearns,https://www.huffingtonpost.com/entry/sephora-u...,"In the midst of the nation's drug epidemic, we...",2017-01-14
3,POLITICS,The Obamas Are Ready To Get Out Of The White H...,Mollie Reilly,https://www.huffingtonpost.com/entry/obama-fam...,"Life under constant scrutiny ""has gotten prett...",2017-01-14
4,POLITICS,Undocumented Immigrants Tell Trump They're Not...,Elise Foley,https://www.huffingtonpost.com/entry/immigrati...,Immigrant-rights advocates held more than 70 e...,2017-01-14


In [340]:
pred = predict_category(model, "Freezing of DA insensitive, inhumane", "govt should shelve bullet train, Central Vista projects: Congress")
print(f'Predicted category is: {pred} = {LABEL.vocab.itos[pred]}')

Predicted category is: 0 = POLITICS


In [341]:
pred = predict_category(model, "Julianne Hough steps out with another man amid marital problems with husband Brooks Laich, plus more celeb love life news for late April 2020", "Wonderwall.com is taking a look back at the stars who found love, suffered heartbreak or experienced some other major event in their love lives in late April 2020, starting with this former Dancing With the Stars pro... In early March, there were reports that Julianne Hough and her husband of two years, Brooks Laich, were totally fine after working through a rough patch in their marriage.")
print(f'Predicted category is: {pred} = {LABEL.vocab.itos[pred]}')

Predicted category is: 0 = POLITICS
