## Text Classification

In [1]:
import spacy
import pandas as pd

import torch
import torch.nn as nn
import torch.optim as optim
from torchtext.legacy import data

from sklearn.model_selection import train_test_split
from IPython.display import clear_output

from utils import *
from config import RCNNConfig
from rcnn import RCNN
from training import train_model

import warnings
# Install a global "ignore" filter so no warnings are shown in any later cell output.
# NOTE(review): this also hides deprecation notices from the libraries imported above — confirm that is intended.
warnings.filterwarnings('ignore')

### Read data from CSV file

In [2]:
# Load the raw dataset from disk into a DataFrame.
data_df = pd.read_csv(filepath_or_buffer="../data/spam.csv")

In [3]:
# Show the loaded frame as this cell's rich output.
data_df

Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will Ì_ b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [4]:
# Report how many rows carry each label: spam first, then ham.
for class_name in ('spam', 'ham'):
    print(len(data_df[data_df['label'] == class_name]))

747
4825


### Split data into train and validation sets

In [5]:
# Hold out 10% of the rows for validation; the fixed random_state keeps the split repeatable.
train_df, valid_df = train_test_split(
    data_df,
    test_size=0.1,
    random_state=1,
)

In [6]:
# Row counts of the two partitions (train, then validation).
print(train_df.shape[0], valid_df.shape[0])

5014 558


In [7]:
# Write both partitions to CSV without the pandas index column.
for frame, csv_path in ((train_df, '../data/spam-train.csv'),
                        (valid_df, '../data/spam-valid.csv')):
    frame.to_csv(csv_path, index=False)

In [8]:
# Read the two partition files back from disk (train first, then validation).
train_df, valid_df = map(
    pd.read_csv,
    ('../data/spam-train.csv', '../data/spam-valid.csv'),
)

# Preview the validation partition as the cell output.
valid_df

Unnamed: 0,label,text
0,ham,Convey my regards to him
1,ham,"[‰Û_] anyway, many good evenings to u! s"
2,ham,My sort code is and acc no is . The bank is n...
3,ham,Sorry i din lock my keypad.
4,spam,"Hi babe its Chloe, how r u? I was smashed on s..."
...,...,...
553,ham,Tyler (getting an 8th) has to leave not long a...
554,ham,K. I will sent it again
555,ham,Sday only joined.so training we started today:)
556,spam,FreeMsg Hey there darling it's been 3 week's n...


### Using torchtext
1. Define fields
2. Define datasets (train, validation)
3. Build vocabulary for each field
4. Define iterators for each dataset

In [9]:
# Field for the message text, tokenized with spaCy. The model name is given explicitly
# so the training tokenizer is the same `en_core_web_sm` pipeline that `predict` loads
# for inference, instead of relying on torchtext's default language shortcut.
TEXT = data.Field(tokenize='spacy', tokenizer_language='en_core_web_sm')

# Field for the class label.
LABEL = data.LabelField()

In [10]:
# CSV columns are mapped positionally: label first, then text.
csv_fields = [('label', LABEL), ('text', TEXT)]

# Both files live under ../data/; the header row of each file is skipped.
train_data, valid_data = data.TabularDataset.splits(
    path="../data/",
    format='CSV',
    skip_header=True,
    train='spam-train.csv',
    validation='spam-valid.csv',
    fields=csv_fields,
)

# Print the first parsed example of each split as a quick sanity check.
for split in (train_data, valid_data):
    print(vars(split[0]))

{'label': 'ham', 'text': ['Gud', 'gud', '..', 'k', ',', 'chikku', 'tke', 'care', '..', 'sleep', 'well', 'gud', 'nyt']}
{'label': 'ham', 'text': ['Convey', 'my', 'regards', 'to', 'him']}


In [11]:
# Build vocabulary for texts
# Upper bound passed to the text vocabulary as `max_size`.
vocab_size = 20_000

# Both vocabularies are built from the training split only.
TEXT.build_vocab(train_data, max_size=vocab_size)
LABEL.build_vocab(train_data)

# Show the label -> index mapping.
print(LABEL.vocab.stoi)

defaultdict(None, {'ham': 0, 'spam': 1})


In [13]:
batch_size = 8
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


def _text_length(example):
    """Sort key for bucketing: number of tokens in the example's text."""
    return len(example.text)


# Training iterator: default iterator settings, batches placed on `device`.
train_iterator = data.BucketIterator(
    dataset=train_data,
    batch_size=batch_size,
    sort_key=_text_length,
    device=device,
)

# Validation iterator: marked as non-training and not shuffled.
valid_iterator = data.BucketIterator(
    dataset=valid_data,
    batch_size=batch_size,
    sort_key=_text_length,
    device=device,
    train=False,
    shuffle=False,
)

## Build Model

In [14]:
config = RCNNConfig()

# Size the model from the vocabulary that was actually built rather than from the
# `max_size` cap: torchtext adds its special tokens on top of `max_size`, so the real
# index range is len(TEXT.vocab), which can be larger or smaller than the cap.
# NOTE(review): assumes RCNN's second argument sizes its embedding table — confirm in rcnn.py.
model = RCNN(config, len(TEXT.vocab))
model = model.to(device)

# Adam with its default hyper-parameters.
optimizer = optim.Adam(model.parameters())
# Multiply the learning rate by 0.5 on every scheduler step.
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.5)
criterion = nn.CrossEntropyLoss().to(device)

## Train the model

In [15]:
# Path handed to train_model as `fname`; per the original note, the trained model is saved here.
# (Plain string: the previous f-string prefix had no placeholders to format.)
fname = 'models/rcnn.pt'

# Run 5 epochs of training with the objects built in the cells above.
train_model(
    model, device, train_iterator, valid_iterator,
    optimizer, criterion, scheduler,
    n_epochs=5, fname=fname,
)

Unnamed: 0,Epoch,Train Loss,Valid Loss,Train Acc,Valid Acc,Time
0,1,0.223,0.123,91.66,96.25,0m 8s
1,2,0.082,0.085,97.89,97.32,0m 8s
2,3,0.05,0.11,98.88,95.89,0m 8s
3,4,0.037,0.127,99.2,95.54,0m 8s
4,5,0.03,0.104,99.44,96.79,0m 8s


### Testing model on user inputs

In [16]:
nlp = spacy.load("en_core_web_sm")


def predict(model, sentence, device=device):
    """Classify a single raw sentence and return its label string.

    Args:
        model: Classifier to run. It is switched to eval mode and called with a
            LongTensor of token indices shaped (number_of_tokens, 1).
        sentence: Raw text to classify.
        device: Torch device the input tensor is moved to before the forward pass.

    Returns:
        The entry of ``LABEL.vocab.itos`` selected by the arg-max of the model output.

    Raises:
        ValueError: If ``sentence`` yields no tokens (for example an empty string).
    """
    # Note: the model is left in eval mode after this call.
    model.eval()

    # tokenize
    tokens = [t.text for t in nlp(sentence)]
    if not tokens:
        # A zero-length input tensor would otherwise fail inside the model with an
        # unrelated-looking error; report the real problem instead.
        raise ValueError("Cannot classify an empty sentence: no tokens were produced.")

    # numericalize
    indexed = [TEXT.vocab.stoi[t] for t in tokens]

    # convert to torch tensor and add batch dimension
    indexed = torch.LongTensor(indexed).unsqueeze(1).to(device)

    # predict the label; gradients are not needed for inference
    with torch.no_grad():
        prediction = model(indexed)

    return LABEL.vocab.itos[prediction.argmax(1).item()]
    

In [17]:
# Two hand-written sample messages to try the trained model on.
docs = [
    "Are you ready for the tea party????? It's gonna be wild",
    "URGENT Reply to this message for GUARANTEED FREE TEA",
]

# Classify each sample and render it together with its predicted label and label index.
for doc in docs:
    predicted = predict(model, doc)
    display_classification_result(doc, predicted, LABEL.vocab.stoi[predicted])

ham     [41mAre you ready for the tea party????? It's gonna be wild[m
spam    [43mURGENT Reply to this message for GUARANTEED FREE TEA[m
