# Chapter 5: Text Classification

In [1]:
import spacy
import torch
import torchtext
import pandas as pd
import torch.nn as nn
import torch.optim as optim
from torchtext import data

## Loading & Data Cleaning

In [2]:
device = "mps"
torch.backends.mps.is_available()

True

In [3]:
# You'll probably need to use the 'python' engine to load the CSV
# tweetsDF = pd.read_csv("training.1600000.processed.noemoticon.csv", header=None)
tweetsDF = pd.read_csv("training.1600000.processed.noemoticon.csv", encoding='latin1', header=None)

In [4]:
tweetsDF.head(5)

Unnamed: 0,0,1,2,3,4,5
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [5]:
tweetsDF[0].value_counts()

0
4    10001
0     9999
Name: count, dtype: int64

In [6]:
tweetsDF["sentiment_cat"] = tweetsDF[0].astype('category')

In [7]:
tweetsDF["sentiment_cat"]

0        0
1        0
2        0
3        0
4        0
        ..
19995    4
19996    4
19997    4
19998    4
19999    4
Name: sentiment_cat, Length: 20000, dtype: category
Categories (2, int64): [0, 4]

In [8]:
tweetsDF["sentiment"] = tweetsDF["sentiment_cat"].cat.codes

In [9]:
tweetsDF["sentiment"]

0        0
1        0
2        0
3        0
4        0
        ..
19995    1
19996    1
19997    1
19998    1
19999    1
Name: sentiment, Length: 20000, dtype: int8

In [13]:
tweetsDF.to_csv("train-processed.csv", header=None, index=None)      
tweetsDF.sample(10000).to_csv("train-processed-sample.csv", header=None, index=None) 

In [14]:
from torchtext import data
LABEL = data.LabelField()
TWEET = data.Field(tokenize='spacy', tokenizer_language='en_core_web_sm', lower=True)
# using LABEL and TWEET to define the fields below the rest are ignored
# the below fields will m=be mapped to CSV columns
fields = [('score',None), ('id',None), ('date',None), ('query',None),
          ('name',None), ('tweet', TWEET), ('category',None), ('label',LABEL)]

## Create our Dataset and DataLoaders

In [15]:
twitterDataset = data.dataset.TabularDataset(
        path="train-processed-sample.csv", 
        format="CSV", 
        fields=fields,
        skip_header=False)
twitterDataset.fields

{'score': None,
 'id': None,
 'date': None,
 'query': None,
 'name': None,
 'tweet': <torchtext.data.field.Field at 0x14111aad0>,
 'category': None,
 'label': <torchtext.data.field.LabelField at 0x173b10950>}

In [16]:
(train, test, valid) = twitterDataset.split(split_ratio=[0.6,0.2,0.2],
                                            stratified=True, strata_field='label')

(len(train),len(test),len(valid))

(6000, 2000, 2000)

In [17]:
vars(train.examples[7])

{'tweet': ['@diysara',
  'sounds',
  'like',
  'a',
  'wonderful',
  'day',
  'planned',
  'for',
  'tomorrow',
  '.',
  ' ',
  'it',
  'should',
  'bring',
  'you',
  'smiles',
  'today',
  'thinking',
  'about',
  'it',
  '&',
  'amp',
  ';',
  'looking',
  'forward',
  'to',
  'it',
  '.'],
 'label': '1'}

In [18]:
vocab_size = 20000
TWEET.build_vocab(train, max_size = vocab_size)
len(TWEET.vocab)

13386

In [19]:
TWEET.vocab.freqs.most_common(10)

[('i', 3590),
 ('!', 3208),
 ('.', 2946),
 ('to', 2214),
 (' ', 2125),
 (',', 1989),
 ('the', 1920),
 ('a', 1468),
 ('and', 1225),
 ('my', 1196)]

In [24]:
TWEET.vocab.stoi

defaultdict(<bound method Vocab._default_unk_index of <torchtext.vocab.Vocab object at 0x173db7390>>,
            {'<unk>': 0,
             '<pad>': 1,
             'i': 2,
             '!': 3,
             '.': 4,
             'to': 5,
             ' ': 6,
             ',': 7,
             'the': 8,
             'a': 9,
             'and': 10,
             'my': 11,
             'it': 12,
             'you': 13,
             'is': 14,
             'for': 15,
             '?': 16,
             '...': 17,
             'in': 18,
             'of': 19,
             "'s": 20,
             'on': 21,
             'that': 22,
             "n't": 23,
             'have': 24,
             'me': 25,
             'so': 26,
             'but': 27,
             'do': 28,
             "'m": 29,
             'just': 30,
             '-': 31,
             'at': 32,
             'not': 33,
             'be': 34,
             'with': 35,
             'this': 36,
             'good': 37,
             'da

In [20]:
LABEL.build_vocab(train)

In [22]:
LABEL.vocab.stoi

defaultdict(None, {'0': 0, '1': 1})

In [25]:
train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train, valid, test),
    batch_size = 32,
    device = device,
    sort_key = lambda x: len(x.tweet),
    sort_within_batch = False)

## Our First LSTM

In [26]:
class OurFirstLSTM(nn.Module):
    def __init__(self, hidden_size, embedding_dim, vocab_size):
        super(OurFirstLSTM, self).__init__()
    
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.encoder = nn.LSTM(input_size=embedding_dim,  
                hidden_size=hidden_size, num_layers=1)
        self.predictor = nn.Linear(hidden_size, 2)

    def forward(self, seq):
        output, (hidden,_) = self.encoder(self.embedding(seq))
        preds = self.predictor(hidden.squeeze(0))
        return preds

model = OurFirstLSTM(100,300, 20002)
model.to(device)

OurFirstLSTM(
  (embedding): Embedding(20002, 300)
  (encoder): LSTM(300, 100)
  (predictor): Linear(in_features=100, out_features=2, bias=True)
)

## Training

In [27]:
optimizer = optim.Adam(model.parameters(), lr=2e-2)
criterion = nn.CrossEntropyLoss()

def train(epochs, model, optimizer, criterion, train_iterator, valid_iterator):
    for epoch in range(1, epochs+1):
     
        training_loss = 0.0
        valid_loss = 0.0
        model.train()
        for batch_idx, batch in enumerate(train_iterator):
            optimizer.zero_grad()
            predict = model(batch.tweet)
            loss = criterion(predict,batch.label)
            loss.backward()
            optimizer.step()
            training_loss += loss.data.item() * batch.tweet.size(0)
            print(f'Epoch: {epoch}, loss.data.item(): {loss.data.item()}, batch.tweet.size(0): {batch.tweet.size(0)}')
        training_loss /= len(train_iterator)
        print(f'training_loss /= len(train_iterator): {training_loss / len(train_iterator)}, training_loss: {training_loss}, len(train_iterator): {len(train_iterator)}')
 
        
        model.eval()
        for batch_idx,batch in enumerate(valid_iterator):
            predict = model(batch.tweet)
            loss = criterion(predict,batch.label)
            valid_loss += loss.data.item() * batch.tweet.size(0)
 
        valid_loss /= len(valid_iterator)
        print('Epoch: {}, Training Loss: {:.2f}, Validation Loss: {:.2f}'.format(epoch, training_loss, valid_loss))

In [28]:
train(5, model, optimizer, criterion, train_iterator, valid_iterator)        

Epoch: 1, loss.data.item(): 0.706108570098877, batch.tweet.size(0): 33
Epoch: 1, loss.data.item(): 1.7492945194244385, batch.tweet.size(0): 32
Epoch: 1, loss.data.item(): 1.1907867193222046, batch.tweet.size(0): 32
Epoch: 1, loss.data.item(): 0.7320588231086731, batch.tweet.size(0): 33
Epoch: 1, loss.data.item(): 0.8326799273490906, batch.tweet.size(0): 32
Epoch: 1, loss.data.item(): 0.75807785987854, batch.tweet.size(0): 40
Epoch: 1, loss.data.item(): 1.0076727867126465, batch.tweet.size(0): 30
Epoch: 1, loss.data.item(): 0.7613364458084106, batch.tweet.size(0): 34
Epoch: 1, loss.data.item(): 0.7509464025497437, batch.tweet.size(0): 38
Epoch: 1, loss.data.item(): 0.7172070145606995, batch.tweet.size(0): 37
Epoch: 1, loss.data.item(): 0.68511962890625, batch.tweet.size(0): 35
Epoch: 1, loss.data.item(): 0.6943508386611938, batch.tweet.size(0): 36
Epoch: 1, loss.data.item(): 0.7985957860946655, batch.tweet.size(0): 31
Epoch: 1, loss.data.item(): 0.845305323600769, batch.tweet.size(0): 3

## Making predictions

In [46]:
def classify_tweet(tweet):
    categories = {0: "Negative", 1:"Positive"}
    processed = TWEET.process([TWEET.preprocess(tweet)])
    processed = processed.to(device)
    model.eval()
    return categories[model(processed).argmax().item()]

## Data Augmentation

In [None]:
def random_deletion(words, p=0.5):
    if len(words) == 1:
        return words
    remaining = list(filter(lambda x: random.uniform(0,1) > p,words))
    if len(remaining) == 0:
        return [random.choice(words)]
    else:
        return remaining

In [None]:
def random_swap(sentence, n=5):
    length = range(len(sentence))
    for _ in range(n):
        idx1, idx2 = random.sample(length, 2)
        sentence[idx1], sentence[idx2] = sentence[idx2], sentence[idx1]
    return sentence

In [None]:
# Note: you'll have to define remove_stopwords() and get_synonyms() elsewhere

def random_insertion(sentence,n):
    words = remove_stopwords(sentence)
    for _ in range(n):
        new_synonym = get_synonyms(random.choice(words))
        sentence.insert(randrange(len(sentence)+1), new_synonym)
    return sentence

In [None]:
# Install googletrans version 3.1.0a0 (temporary fix for #57)
!pip install googletrans==3.1.0a0

In [None]:
import googletrans
import random

translator = googletrans.Translator()

sentences = ['The cat sat on the mat']

translations_fr = translator.translate(sentences, dest='fr')
fr_text = [t.text for t in translations_fr] 
translations_en = translator.translate(fr_text, dest='en')
en_text = [t.text for t in translations_en]
print(en_text)   

available_langs = list(googletrans.LANGUAGES.keys())
tr_lang = random.choice(available_langs)
print(f"Translating to {googletrans.LANGUAGES[tr_lang]}")

translations = translator.translate(sentences, dest=tr_lang)
t_text = [t.text for t in translations]
print(t_text)

translations_en_random = translator.translate(t_text, src=tr_lang, dest='en')
en_text = [t.text for t in translations_en_random]
print(en_text)