# Text (News) Classification with PyTorch and Deep Learning

In [19]:
import torch
import torchtext
from torchtext.datasets import text_classification
import os

NGRAMS = 2        # n-gram order handed to the dataset builder
BATCH_SIZE = 16   # mini-batch size for the DataLoaders
DATA_ROOT = 'news_data'  # directory passed to the dataset builder as `root`

# exist_ok=True creates the directory only when it is missing, without a separate
# check-then-create step.
os.makedirs(DATA_ROOT, exist_ok=True)

train_dataset, test_dataset = text_classification.DATASETS['AG_NEWS'](
    root=DATA_ROOT, ngrams=NGRAMS, vocab=None)

# Prefer the GPU when one is available.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

ag_news_csv.tar.gz: 11.8MB [00:00, 13.4MB/s]
120000lines [00:07, 15367.60lines/s]
120000lines [00:13, 8695.06lines/s]
7600lines [00:00, 8646.11lines/s]


In [8]:
import torch.nn as nn
import torch.nn.functional as F
class TextSentiment(nn.Module):
    """Bag-of-embeddings text classifier.

    An ``nn.EmbeddingBag`` reduces each bag of token ids to one ``embed_dim``
    vector (using EmbeddingBag's default mode), and a linear layer maps that
    vector to ``num_class`` scores.
    """

    def __init__(self, vocab_size, embed_dim, num_class):
        super().__init__()
        # sparse=True is forwarded to EmbeddingBag as-is.
        self.embedding = nn.EmbeddingBag(vocab_size, embed_dim, sparse=True)
        self.fc = nn.Linear(embed_dim, num_class)
        self.init_weights()

    def init_weights(self):
        """Fill both weight matrices from U(-0.5, 0.5) and zero the linear bias."""
        bound = 0.5
        # Embedding first, then the linear layer, so the RNG is consumed in a fixed order.
        for weight in (self.embedding.weight, self.fc.weight):
            nn.init.uniform_(weight, -bound, bound)
        nn.init.zeros_(self.fc.bias)

    def forward(self, text, offsets):
        """Return class scores, one row per bag delimited by ``offsets``."""
        return self.fc(self.embedding(text, offsets))

In [9]:
# Model hyper-parameters; two of them are read off the training dataset.
VOCAB_SIZE = len(train_dataset.get_vocab())   # len() of the training vocabulary
EMBED_DIM = 32                                # width of each embedding vector
NUM_CLASS = len(train_dataset.get_labels())   # len() of the dataset's label collection
NUN_CLASS = NUM_CLASS  # original (misspelled) name, kept so existing references still resolve
model = TextSentiment(VOCAB_SIZE, EMBED_DIM, NUM_CLASS).to(device)

In [10]:
def generate_batch(batch):
    """Collate ``(label, token_tensor)`` entries into EmbeddingBag-style inputs.

    Args:
        batch: sequence of entries where ``entry[0]`` is the label and
            ``entry[1]`` is a tensor of token ids (assumed 1-D -- confirm
            against the dataset).

    Returns:
        ``(text, offsets, label)``: every token tensor concatenated into one
        tensor, the start index of each entry inside ``text``, and the labels
        as a tensor.
    """
    label = torch.tensor([entry[0] for entry in batch])

    # Walk the entries once, recording where each one begins in the flat tensor.
    pieces, starts, cursor = [], [], 0
    for entry in batch:
        tokens = entry[1]
        pieces.append(tokens)
        starts.append(cursor)
        cursor += len(tokens)

    offsets = torch.tensor(starts)
    text = torch.cat(pieces)
    return text, offsets, label

In [11]:
from torch.utils.data import DataLoader

def train_func(sub_train_):
    """Run one training epoch over ``sub_train_``.

    Relies on the module-level ``model``, ``criterion``, ``optimizer``,
    ``scheduler``, ``device``, ``BATCH_SIZE`` and ``generate_batch``. The
    scheduler is stepped once, after all batches have been processed.

    Returns:
        ``(loss, accuracy)``: the sum of the per-batch criterion values divided
        by ``len(sub_train_)``, and the fraction of samples whose arg-max
        prediction equals the label.
    """
    loader = DataLoader(sub_train_, batch_size=BATCH_SIZE, shuffle=True,
                        collate_fn=generate_batch)
    loss_total, n_correct = 0, 0
    for text, offsets, cls in loader:
        optimizer.zero_grad()
        text = text.to(device)
        offsets = offsets.to(device)
        cls = cls.to(device)
        output = model(text, offsets)
        loss = criterion(output, cls)
        loss_total += loss.item()
        loss.backward()
        optimizer.step()
        n_correct += (output.argmax(1) == cls).sum().item()

    # One scheduler step per call adjusts the learning rate for the next epoch.
    scheduler.step()

    n_samples = len(sub_train_)
    return loss_total / n_samples, n_correct / n_samples

In [12]:
def test(data_):
    loss = 0
    acc = 0
    data = DataLoader(data_, batch_size=BATCH_SIZE, collate_fn=generate_batch)
    for text, offsets, cls in data:
        text, offsets, cls = text.to(device), offsets.to(device), cls.to(device)
        with torch.no_grad():
            output = model(text, offsets)
            loss = criterion(output, cls)
            loss += loss.item()
            acc += (output.argmax(1) == cls).sum().item()

    return loss / len(data_), acc / len(data_)

In [13]:
import time
from torch.utils.data.dataset import random_split
N_EPOCHS = 10
# NOTE(review): initialised here but never read or updated in this cell.
min_valid_loss = float('inf')

criterion = torch.nn.CrossEntropyLoss().to(device)
optimizer = torch.optim.SGD(model.parameters(), lr=4.0)
# StepLR with step size 1 and gamma=0.9; train_func() calls scheduler.step() once per epoch.
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1, gamma=0.9)

# Use 95% of the training set for training and the remainder for validation.
# NOTE(review): no random seed is set, so the split differs between runs.
train_len = int(len(train_dataset) * 0.95)
sub_train_, sub_valid_ = \
    random_split(train_dataset, [train_len, len(train_dataset) - train_len])

for epoch in range(N_EPOCHS):

    start_time = time.time()
    train_loss, train_acc = train_func(sub_train_)
    valid_loss, valid_acc = test(sub_valid_)

    # divmod yields whole minutes and leftover seconds as integers; the previous
    # `secs / 60` produced a float that only printed correctly because %d truncates.
    mins, secs = divmod(int(time.time() - start_time), 60)

    print('Epoch: %d' %(epoch + 1), " | time in %d minutes, %d seconds" %(mins, secs))
    print(f'\tLoss: {train_loss:.4f}(train)\t|\tAcc: {train_acc * 100:.1f}%(train)')
    print(f'\tLoss: {valid_loss:.4f}(valid)\t|\tAcc: {valid_acc * 100:.1f}%(valid)')

Epoch: 1  | time in 0 minutes, 24 seconds
	Loss: 0.0262(train)	|	Acc: 84.7%(train)
	Loss: 0.0001(valid)	|	Acc: 90.6%(valid)
Epoch: 2  | time in 0 minutes, 24 seconds
	Loss: 0.0119(train)	|	Acc: 93.7%(train)
	Loss: 0.0002(valid)	|	Acc: 90.8%(valid)
Epoch: 3  | time in 0 minutes, 24 seconds
	Loss: 0.0070(train)	|	Acc: 96.3%(train)
	Loss: 0.0002(valid)	|	Acc: 90.8%(valid)
Epoch: 4  | time in 0 minutes, 23 seconds
	Loss: 0.0039(train)	|	Acc: 98.0%(train)
	Loss: 0.0002(valid)	|	Acc: 91.0%(valid)
Epoch: 5  | time in 0 minutes, 23 seconds
	Loss: 0.0023(train)	|	Acc: 99.0%(train)
	Loss: 0.0002(valid)	|	Acc: 91.2%(valid)
Epoch: 6  | time in 0 minutes, 23 seconds
	Loss: 0.0015(train)	|	Acc: 99.4%(train)
	Loss: 0.0002(valid)	|	Acc: 91.3%(valid)
Epoch: 7  | time in 0 minutes, 24 seconds
	Loss: 0.0011(train)	|	Acc: 99.6%(train)
	Loss: 0.0002(valid)	|	Acc: 91.4%(valid)
Epoch: 8  | time in 0 minutes, 23 seconds
	Loss: 0.0008(train)	|	Acc: 99.7%(train)
	Loss: 0.0002(valid)	|	Acc: 91.2%(valid)
Epoch: 9

In [14]:
# Evaluate the trained model on the held-out test split using the same test()
# helper that produced the validation metrics during training.
print('Checking the results of test dataset...')
test_loss, test_acc = test(test_dataset)
print(f'\tLoss: {test_loss:.4f}(test)\t|\tAcc: {test_acc * 100:.1f}%(test)')

Checking the results of test dataset...
	Loss: 0.0002(test)	|	Acc: 90.0%(test)


In [15]:
import re
from torchtext.data.utils import ngrams_iterator
from torchtext.data.utils import get_tokenizer

# Category names keyed by 1-based class id (predict() returns argmax + 1).
ag_news_label = {
    class_id: name
    for class_id, name in enumerate(
        ("World", "Sports", "Business", "Sci/Tec"), start=1)
}

def predict(text, model, vocab, ngrams):
    """Classify a raw string and return its 1-based class id.

    Args:
        text: the string to classify.
        model: callable taking ``(token_ids, offsets)`` and returning class scores.
        vocab: object indexed by token (``vocab[token]``) to obtain token ids.
        ngrams: n-gram order forwarded to ``ngrams_iterator``.

    Returns:
        The arg-max over dimension 1 of the model output, plus one.
    """
    tokenize = get_tokenizer("basic_english")
    with torch.no_grad():
        token_ids = []
        for token in ngrams_iterator(tokenize(text), ngrams):
            token_ids.append(vocab[token])
        # A single offset of 0 makes the whole text one bag.
        scores = model(torch.tensor(token_ids), torch.tensor([0]))
        best = scores.argmax(1).item()
        return best + 1

# Vocabulary object of the training dataset; predict() indexes it by token.
vocab = train_dataset.get_vocab()
# predict() builds its input tensors without a device argument, so the model is
# moved to the CPU for inference.
# NOTE(review): test()/train_func() still move batches to `device`; re-running
# them after this line on a CUDA machine would mix devices -- confirm intended.
model = model.to("cpu")


In [16]:
# Sample article (a film review) to classify. The trailing backslashes continue
# the string literal across lines, so each following line's leading indentation
# becomes part of `text`.
text = "A horror version of Fantasy Island, frankly, sounds more interesting than a \
        conventional reboot, which explains why Sony would hand the keys to Blumhouse, \
        the reigning maestro of that genre, and let them run with it. \
        That seed of potential, however, sails away on a tide of numbing stupidity. \
        A frantic woman races through the jungle as the movie begins, serving notice \
        that this isn't grandma's Fantasy Island, the escapist TV show that premiered in 1978. \
        That tease is followed by more familiar images, as a handful of contest winners \
        land on an idyllic island (played by Fiji, incidentally), before being ushered in \
        to meet their host, the mysterious Mr. Roarke (Michael Pena), who walks them through the rules. \
        Their fantasy, he explains, will be as real as you make it, in a locale where anything and \
        everything is possible. But they must see each experience through to its conclusion, setting \
        them on disparate adventures, which -- barring the odd moment of creepiness -- \
        start promisingly enough, before becoming increasingly fantastic and eventually, deadly. \
        The movie exhibits promise at first too, if only because it's hard to anticipate \
        where all this is going, in a The Twilight Zone kind of way. The lesson appears to \
        involve being careful what you wish for -- a tried-and-true wrinkle of such fare. \
        Gwen (Maggie Q), for example, has the chance to undo a choice that took her life in \
        a completely different direction, while Melanie (Lucy Hale) plots sweet revenge  \
        against a woman (Mr. Robot's Portia Doubleday) who tormented her in school. \
        Lucy Hale, Austin Stowell and Michael Peña in 'Fantasy Island.'\
        Gradually, though, the situations conjured courtesy of director/co-writer \
        Jeff Wadlow (Blumhouse's Truth or Dare) become more and more outlandish, and make \
        less and less sense. By the time an inkling of what's going on comes into focus, \
        any reasonable person would have long since asked where and when they can claim their \
        luggage and disembark.\
        It's a shame, since the general idea of taking creative liberties with such a title -- \
        one with which the target audience probably identifies by name only -- sounds fertile. \
        While there are amusing if somewhat obvious callbacks to the original (yes, someone yells \
        The plane!), the assumption is clearly that the demo most likely to see the movie \
        couldn't pick Ricardo Montalban out of a lineup.\
        Blumhouse -- whose hits include Get Out and Happy Death Day -- has been extraordinarily\
        shrewd about mining and stretching the parameters of horror, as well as leveraging familiar\
        concepts in different ways. (The studio will put its stamp on another well-worn \
        premise next month, with a new version of The Invisible Man.)\
        For the squeamish, it's somewhat reassuring to note that Fantasy Island delivers\
        PG-13-level scares, so the action isn't particularly grisly, just awfully silly.\
        Granted, one person's fantasy can easily be another's nightmare, but in this case,\
        the likely effect on an even moderately discriminating viewer will merely be a \
        nagging headache. The bottom line is that visiting Fantasy Island -- even on \
        someone else's dime -- isn't a trip worth taking."

# predict() returns a 1-based class id; ag_news_label turns it into a category name.
print("This is a %s news" %ag_news_label[predict(text, model, vocab, 2)])


This is a Sci/Tec news


In [17]:
# Second sample article. As above, the backslash continuations fold each
# following line's leading indentation into the string.
text1 = "The Army will not investigate Lt. Col. Alexander Vindman, the former \
        National Security Council staffer who testified in the president’s \
        impeachment investigation, the service’s top civilian said Friday.\
        Army Secretary Ryan McCarthy made the announcement at an event just \
        days after President Donald Trump said he imagined the military would \
        take a look at whether Vindman should face disciplinary action for the \
        horrible things he told House investigators about the president’s \
        phone call with Ukrainian President Volodymyr Zelensky last July.\
        Vindman was ousted from his position on the NSC last week after the \
        Senate acquitted Trump. Vindman’s lawyer said the move amounted to retribution.\
        McCarthy on Friday downplayed Vindman’s return to the Army, saying he simply \
        returned to the service a few months earlier than planned and would have a \
        bridging assignment for a couple of months in the Army’s headquarters office in Washington.\
        Then he will be heading to a senior service college this summer. \
        There’s no investigation into him, McCarthy said at a National Press Club luncheon.\
        On Tuesday, Trump told reporters if you look at what happened, \
        [the military is] going to certainly, I would imagine, take a look at that.\
        It turned out that what he reported was very different [than what occurred], \
        Trump added. And also when you look at the person he reports to, said horrible \
        things, avoided the chain of command, leaked, did a lot of bad things. \
        And so we sent him on his way to a much different location, and the military \
        can handle him anyway they want."

# NOTE(review): "Politics" is not one of the four names in ag_news_label
# (World, Sports, Business, Sci/Tec), so the predicted category can never
# equal this ground-truth string.
print("The ground truth is Politics")
print("The predicted category is %s news" %ag_news_label[predict(text1, model, vocab, 2)])


The ground truth is Politics
The predicted category is Sports news


In [18]:
# Third sample article (science reporting). As above, the backslash
# continuations fold each following line's leading indentation into the string.
text2 = "Cramped in a small submarine 2,500 meters below the Pacific’s surface in 2006, \
        microbiologist Hiroyuki Imachi scanned the ocean floor for signs of microbial life.\
        As the sub drifted over the bottom of Japan’s Nankai Trough — a hotbed of \
        understudied microbes living off methane bubbling up from tectonic faults — \
        Imachi spotted a nest of small clams against a whitish microbial mat, \
        suggestive of an active methane seep below. The submersible’s robotic \
        arm plunged a 25-centimeter tube into the blackish-gray sediment to retrieve a core of muck.\
        It would take another 12 years of lab work for Imachi and colleagues \
        to isolate a prize they hadn’t even set out to find — a single-celled \
        microbe from an ancient lineage of Archaea, a domain of life \
        superficially similar to bacteria. That find could help biologists \
        reconstruct one of life’s greatest leaps toward complexity, from \
        simple bacteria-like organisms to more complicated eukaryotes, \
        the enormous group of chromosome-carrying creatures that includes \
        humans, platypuses, fungi and many others.  \
        Patience is very important in doing successful science, says Imachi, \
        of the Japan Agency for Marine-Earth Science and Technology in Yokosuka. \
        He and his colleagues published their findings in the Jan. \
        23 Nature, to enthusiastic acclaim from fellow microbiologists. \
        I’m very lucky. The Japanese research vessel Shinkai 6500 dove \
        2,500 meters into the Nankai Trough off of Japan’s Kii peninsula \
        to sample the microbial diversity in the sediment around a methane seep in 2006. \
        Many scientists think an unusual meal kicked off the evolution of more complicated \
        cells about 2 billion years ago. An ancient archaean, the theory goes, \
        gobbled up a bacterium that, instead of being dinner, sparked a symbiotic \
        relationship in a process called endosymbiosis (SN: 6/8/74). Eventually, \
        the bacterium evolved into mitochondria, the energy-producing cellular \
        structures that fueled the rise of complex life.\
        Living remnants of ancient archaeal lineages persist in some of Earth’s \
        most extreme environments, and scientists are exploring these microbial \
        hot spots for clues about the ancestor of all eukaryotes. \
        One such environment is the deep-sea floor. Despite making up about \
        65 percent of Earth’s surface, biologists have only a faint picture \
        of the microbial multitudes that thrive there. Genetic sequencing of \
        dredged up mud has given biologists one way of studying these communities \
        of bacteria and archaea uniquely adapted to the cold, oxygen-less deep. \
        But genes can reveal only so much.So scientists seek to grow cultures of \
        microbes in the lab to study what these organisms look like and how they behave. \
        But extreme microbes present unique challenges. Simply plating these organisms \
        on a petri dish, providing nutrients and waiting for growth hadn’t ever worked — \
        possibly because scientists weren’t effectively re-creating the microbes’ extreme \
        environment, says Masaru Nobu, a microbiologist at the National Institute of \
        Advanced Industrial Science and Technology in Tsukuba, Japan, who joined Imachi’s \
        project after it started. So Imachi, Nobu and their colleagues tried to \
        re-create a methane seep in the lab, drawing inspiration from a bioreactor \
        used to treat municipal sewage. The team pumped methane gas into a meter-tall \
        cylindrical chamber, kept at 10° Celsius and stacked with polyurethane sponges \
        that mimic porous deep-sea sediment. A slow, steady flow of artificial seawater \
        kept the sponges saturated. The team then watered down a clump of mud from the \
        Nankai Trough sediment core, sopped up the slurry with the sponges, stacked them \
        in the reactor — and waited. There was a lot of nervousness, Nobu says of that \
        time in December 2006. “We didn’t know if we’d get what we wanted. \
        Cramped in a small submarine 2,500 meters below the Pacific’s surface in 2006, \
        microbiologist Hiroyuki Imachi scanned the ocean floor for signs of microbial life. \
        As the sub drifted over the bottom of Japan’s Nankai Trough — a hotbed of \
        understudied microbes living off methane bubbling up from tectonic faults — \
        Imachi spotted a nest of small clams against a whitish microbial mat, \
        suggestive of an active methane seep below. The submersible’s robotic arm \
        plunged a 25-centimeter tube into the blackish-gray sediment to retrieve a core of muck.\
        It would take another 12 years of lab work for Imachi and colleagues to isolate a \
        prize they hadn’t even set out to find — a single-celled microbe from an ancient \
        lineage of Archaea, a domain of life superficially similar to bacteria. \
        That find could help biologists reconstruct one of life’s greatest leaps \
        toward complexity, from simple bacteria-like organisms to more complicated \
        eukaryotes, the enormous group of chromosome-carrying creatures that includes \
        humans, platypuses, fungi and many others.  \
        Patience is very important in doing successful science, says Imachi, of \
        the Japan Agency for Marine-Earth Science and Technology in Yokosuka. \
        He and his colleagues published their findings in the Jan. 23 Nature, \
        to enthusiastic acclaim from fellow microbiologists. “I’m very lucky."

# The stated ground truth is free text; the prediction is looked up in
# ag_news_label from the 1-based class id returned by predict().
print("The ground truth is Science")
print("The predicted category is %s news" %ag_news_label[predict(text2, model, vocab, 2)])


The ground truth is Science
The predicted category is Sci/Tec news
