## Text Classification

In [1]:
import spacy
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchtext.legacy import data
from sklearn.model_selection import train_test_split

from IPython.display import clear_output

from utils import display_classification_result

### Read data from CSV file

In [2]:
data_df = pd.read_csv("../data/spam.csv")

In [3]:
data_df

Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will Ì_ b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [4]:
print(len(data_df[data_df['label'] == 'spam']))
print(len(data_df[data_df['label'] == 'ham']))

747
4825


### Split data to train and validation

In [5]:
train_df, valid_df = train_test_split(data_df, test_size=0.1, random_state=1)

In [6]:
print(len(train_df), len(valid_df))

5014 558


In [7]:
train_df.to_csv('../data/spam-train.csv', index=False)
valid_df.to_csv('../data/spam-valid.csv', index=False)

In [8]:
train_df = pd.read_csv('../data/spam-train.csv')
valid_df = pd.read_csv('../data/spam-valid.csv')

valid_df

Unnamed: 0,label,text
0,ham,Convey my regards to him
1,ham,"[‰Û_] anyway, many good evenings to u! s"
2,ham,My sort code is and acc no is . The bank is n...
3,ham,Sorry i din lock my keypad.
4,spam,"Hi babe its Chloe, how r u? I was smashed on s..."
...,...,...
553,ham,Tyler (getting an 8th) has to leave not long a...
554,ham,K. I will sent it again
555,ham,Sday only joined.so training we started today:)
556,spam,FreeMsg Hey there darling it's been 3 week's n...


### Using torchtext
1. Define fields
2. Define datasets (train, validation, test)
3. Build vocabulary for each field
3. Define iterators for each dataset

In [9]:
TEXT = data.Field(tokenize='spacy')
LABEL = data.LabelField()



In [10]:
train_data, valid_data = data.TabularDataset.splits(
    path="../data/",
    train='spam-train.csv',
    validation='spam-valid.csv',
    format='CSV',
    skip_header=True,
    fields=[('label', LABEL), ('text', TEXT)]
)

print(vars(train_data[0]))
print(vars(valid_data[0]))

{'label': 'ham', 'text': ['Gud', 'gud', '..', 'k', ',', 'chikku', 'tke', 'care', '..', 'sleep', 'well', 'gud', 'nyt']}
{'label': 'ham', 'text': ['Convey', 'my', 'regards', 'to', 'him']}


In [11]:
# Build vocabulary for texts
vocab_size = 20_000

TEXT.build_vocab(train_data,
                 max_size=vocab_size)

# Build vocabulary for labels
LABEL.build_vocab(train_data)
print(LABEL.vocab.stoi)

defaultdict(None, {'ham': 0, 'spam': 1})


In [12]:
batch_size = 8
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

train_iterator = data.BucketIterator(train_data, batch_size, sort_key=lambda x: len(x.text), device=device)
valid_iterator = data.BucketIterator(
    valid_data, 
    batch_size, 
    sort_key=lambda x: len(x.text), 
    device=device,
    train=False,
    shuffle=False)

## Build Model

In [13]:
### FastText Model ###
class FastText(nn.Module):
    """ A simple model which first embeds the words in the input text and then averages them
        to create an embedding for the text. Then by using a linear layer, the text is mapped
        into one of the desired classess.
    """
    
    def __init__(self, vocab_size, embedding_dim, output_dim, pad_idx):
        """ Constructor
        
            Inputs:
                - vocab_size: number of unique words in the vocabulary
                - embedding_dim: size of embedding vectors
                - output_dim: number of classes
                - pad_idx: index of <PAD> token in the vocabulary
        """
        
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)
        self.fc = nn.Linear(embedding_dim, output_dim)
        
    def forward(self, text):
        # text = [sent len, batch size]
        embedded = self.embedding(text)      # embedded = [sent len, batch size, emb dim]
        embedded = embedded.permute(1, 0, 2) # embedded = [batch size, sent len, emb dim]
        pooled = F.avg_pool2d(embedded, (embedded.shape[1], 1)).squeeze(1) # pooled = [batch size, embedding_dim]
        return self.fc(pooled)

In [14]:
model = FastText(vocab_size=vocab_size, embedding_dim=100, output_dim=2, pad_idx=0)
model = model.to(device)

optimizer = optim.Adam(model.parameters())
criterion = nn.CrossEntropyLoss().to(device)

## Train the model

In [15]:
N_EPOCHS = 5

for i in range(N_EPOCHS):
    
    # train
    model.train()
    
    train_loss, train_acc = 0.0, 0.0
    
    for batch in train_iterator:
        # forward step
        prediction = model(batch.text)
        loss = criterion(prediction, batch.label)
        acc = (prediction.argmax(1) == batch.label).float().mean()
        
        train_loss += loss.item()
        train_acc += acc
        
        print(f'Epoch {i+1} | Training | Loss = {loss:.4f} | Acc = {acc*100.0:.2f}')
        clear_output(wait=True)
        
        # backward
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    
    # validation
    model.eval()
    valid_loss, valid_acc = 0.0, 0.0
    with torch.no_grad():
        for batch in valid_iterator:
            # forward step
            prediction = model(batch.text)
            loss = criterion(prediction, batch.label)
            acc = (prediction.argmax(1) == batch.label).float().mean()

            valid_loss += loss.item()
            valid_acc += acc

            print(f'Epoch {i+1} | Validation | Loss = {loss:.4f} | Acc = {acc*100.0:.2f}')
            clear_output(wait=True)
       
    
    clear_output(wait=True)
    train_loss /= len(train_iterator)
    train_acc /= len(train_iterator)
    valid_loss /= len(valid_iterator)
    valid_acc /= len(valid_iterator)
    
    print(f'Train Loss = {train_loss:.4f} | Train Acc = {train_acc*100:.2f} | Valid Loss = {valid_loss:.4f} | Valid Acc = {valid_acc*100:.2f}')


Train Loss = 0.0466 | Train Acc = 98.54 | Valid Loss = 0.0994 | Valid Acc = 95.89


### Testing model on user inputs

In [16]:
nlp = spacy.load("en_core_web_sm")


def predict(model, sentence):
    model.eval()
    
    # tokenize
    tokens = [t.text for t in nlp(sentence)]
    
    # numericalize
    indexed = [TEXT.vocab.stoi[t] for t in tokens]
    
    # convert to torch tensor and add batch dimension
    indexed = torch.LongTensor(indexed).unsqueeze(1).to(device)
    
    # predict the label
    prediction = model(indexed)
    
    return LABEL.vocab.itos[prediction.argmax(1).item()]
    

In [17]:
docs = ["Are you ready for the tea party????? It's gonna be wild",
        "URGENT Reply to this message for GUARANTEED FREE TEA"]


for doc in docs:
    label = predict(model, doc)
    display_classification_result(doc, label, LABEL.vocab.stoi[label])

ham     [41mAre you ready for the tea party????? It's gonna be wild[m
spam    [43mURGENT Reply to this message for GUARANTEED FREE TEA[m
