# Chapter 5: Text Classification

In [1]:
import spacy
import torchtext
import pandas as pd
import torch.nn as nn
import torch.optim as optim
from torchtext.legacy import data
import torch

## Loading & Data Cleaning

In [2]:
device = "cuda"

In [3]:
# You'll probably need to use the 'python' engine to load the CSV
# tweetsDF = pd.read_csv("training.1600000.processed.noemoticon.csv", header=None)
tweetsDF = pd.read_csv("training.1600000.processed.noemoticon.csv", engine="python", header=None, encoding='ISO-8859-1')


In [4]:
tweetsDF[0].value_counts()

0
4    10001
0     9999
Name: count, dtype: int64

In [5]:
tweetsDF["sentiment_cat"] = tweetsDF[0].astype('category')

In [6]:
tweetsDF.columns

Index([0, 1, 2, 3, 4, 5, 'sentiment_cat'], dtype='object')

In [7]:
tweetsDF["sentiment"] = tweetsDF["sentiment_cat"].cat.codes

In [8]:
tweetsDF["sentiment"]

0        0
1        0
2        0
3        0
4        0
        ..
19995    1
19996    1
19997    1
19998    1
19999    1
Name: sentiment, Length: 20000, dtype: int8

In [9]:
tweetsDF.to_csv("train-processed.csv", header=None, index=None)      

In [10]:
tweetsDF.sample(10000).to_csv("train-processed-sample.csv", header=None, index=None) 

In [11]:
LABEL = data.LabelField()
TWEET = data.Field('spacy', tokenizer_language='en_core_web_sm', lower=True)

fields = [('score',None), ('id',None), ('date',None), ('query',None),
          ('name',None), ('tweet', TWEET), ('category',None), ('label',LABEL)]

## Create our Dataset and DataLoaders

In [12]:
twitterDataset = data.dataset.TabularDataset(
        path="train-processed-sample.csv", 
        format="CSV", 
        fields=fields,
        skip_header=False)

In [16]:
(train, test, valid) = twitterDataset.split(split_ratio=[0.6,0.2,0.2],
                stratified=True, strata_field='label')

(len(train),len(test),len(valid))

(6000, 2000, 2000)

In [20]:
vars(train.examples[765])

{'tweet': ['being',
  'grumpy.',
  'abu',
  'dhabi',
  'is',
  'so',
  'boring.',
  'there',
  'really',
  'is',
  'nothing',
  'to',
  'photograph',
  'and',
  'no',
  'one',
  'to',
  'go',
  'with'],
 'label': '0'}

In [26]:
vocab_size = 20000
TWEET.build_vocab(train, max_size = vocab_size)
LABEL.build_vocab(train)
TWEET.vocab.freqs.most_common(20)

[('i', 2702),
 ('to', 2221),
 ('the', 2028),
 ('a', 1431),
 ('my', 1180),
 ('and', 1118),
 ('is', 965),
 ('you', 851),
 ('in', 828),
 ('for', 825),
 ('it', 760),
 ('of', 752),
 ('on', 590),
 ('so', 528),
 ('have', 510),
 ("i'm", 473),
 ('but', 473),
 ('me', 469),
 ('that', 466),
 ('at', 440)]

In [27]:
train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train, valid, test),
    batch_size = 32,
    device = device,
    sort_key = lambda x: len(x.tweet),
    sort_within_batch = False)

## Our First LSTM

In [29]:
class OurFirstLSTM(nn.Module):
    def __init__(self, hidden_size, embedding_dim, vocab_size):
        super(OurFirstLSTM, self).__init__()
    
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.encoder = nn.LSTM(input_size=embedding_dim,  
                hidden_size=hidden_size, num_layers=1)
        self.predictor = nn.Linear(hidden_size, 2)

    def forward(self, seq):
        output, (hidden,_) = self.encoder(self.embedding(seq))
        preds = self.predictor(hidden.squeeze(0))
        return preds

model = OurFirstLSTM(100,300, 20002)
# model.to(device)

## Training

In [30]:
optimizer = optim.Adam(model.parameters(), lr=2e-2)
criterion = nn.CrossEntropyLoss()

def train(epochs, model, optimizer, criterion, train_iterator, valid_iterator):
    for epoch in range(1, epochs+1):
     
        training_loss = 0.0
        valid_loss = 0.0
        model.train()
        for batch_idx, batch in enumerate(train_iterator):
            optimizer.zero_grad()
            predict = model(batch.tweet)
            loss = criterion(predict,batch.label)
            loss.backward()
            optimizer.step()
            training_loss += loss.data.item() * batch.tweet.size(0)
        training_loss /= len(train_iterator)
 
        
        model.eval()
        for batch_idx,batch in enumerate(valid_iterator):
            predict = model(batch.tweet)
            loss = criterion(predict,batch.label)
            valid_loss += loss.data.item() * batch.tweet.size(0)
 
        valid_loss /= len(valid_iterator)
        print('Epoch: {}, Training Loss: {:.2f}, Validation Loss: {:.2f}'.format(epoch, training_loss, valid_loss))

In [31]:
train(5, model, optimizer, criterion, train_iterator, valid_iterator)        

AssertionError: Torch not compiled with CUDA enabled

## Making predictions

In [46]:
def classify_tweet(tweet):
    categories = {0: "Negative", 1:"Positive"}
    processed = TWEET.process([TWEET.preprocess(tweet)])
    processed = processed.to(device)
    model.eval()
    return categories[model(processed).argmax().item()]

## Data Augmentation

In [None]:
def random_deletion(words, p=0.5):
    if len(words) == 1:
        return words
    remaining = list(filter(lambda x: random.uniform(0,1) > p,words))
    if len(remaining) == 0:
        return [random.choice(words)]
    else:
        return remaining

In [None]:
def random_swap(sentence, n=5):
    length = range(len(sentence))
    for _ in range(n):
        idx1, idx2 = random.sample(length, 2)
        sentence[idx1], sentence[idx2] = sentence[idx2], sentence[idx1]
    return sentence

In [None]:
# Note: you'll have to define remove_stopwords() and get_synonyms() elsewhere

def random_insertion(sentence,n):
    words = remove_stopwords(sentence)
    for _ in range(n):
        new_synonym = get_synonyms(random.choice(words))
        sentence.insert(randrange(len(sentence)+1), new_synonym)
    return sentence

In [None]:
# Install googletrans version 3.1.0a0 (temporary fix for #57)
!pip install googletrans==3.1.0a0

In [None]:
import googletrans
import random

translator = googletrans.Translator()

sentences = ['The cat sat on the mat']

translations_fr = translator.translate(sentences, dest='fr')
fr_text = [t.text for t in translations_fr] 
translations_en = translator.translate(fr_text, dest='en')
en_text = [t.text for t in translations_en]
print(en_text)   

available_langs = list(googletrans.LANGUAGES.keys())
tr_lang = random.choice(available_langs)
print(f"Translating to {googletrans.LANGUAGES[tr_lang]}")

translations = translator.translate(sentences, dest=tr_lang)
t_text = [t.text for t in translations]
print(t_text)

translations_en_random = translator.translate(t_text, src=tr_lang, dest='en')
en_text = [t.text for t in translations_en_random]
print(en_text)

In [4]:
import torch
import os

In [2]:
torch.rand(1)

tensor([0.8026])

In [5]:
if os.fork(): print(f'In parent: {torch.rand(1)}')
else:
    print(f'In child: {torch.rand(1)}')
    os._exit(os.EX_OK)

In parent: tensor([0.2059])
In child: tensor([0.2059])


In [6]:
from random import random

In [10]:
if os.fork(): print(f'In parent: {random()}')
else:
    print(f'In child: {random()}')
    os._exit(os.EX_OK)

In parent: 0.14434929568933064
In child: 0.4794340068991143


In [26]:
if os.fork():
    print(f'In parent: {torch.rand(1)}')
else:
    torch.manual_seed(os.getpid())  # Reseed the RNG in the child process
    print(f'In child: {torch.rand(1)}')
    os._exit(os.EX_OK)

In parent: tensor([0.8511])


In child: tensor([0.1616])


In [None]:
embed = nn.Embedding(vocab_size, dimension_size)

In [25]:
cat_mat_embed = nn.Embedding(5, 3)
cat_tensor = torch.tensor([1])
cat_mat_embed.forward(cat_tensor)

tensor([[ 0.2169, -0.0104,  0.7701]], grad_fn=<EmbeddingBackward>)

In [27]:
cat_tensor

tensor([1])

In [28]:
!pip install torchtext


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.1.2[0m[39;49m -> [0m[32;49m24.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
