## ULMFiT - Text Classification

### Install the libraries if necessary

In [1]:
!pip install torch_nightly -f https://download.pytorch.org/whl/nightly/cu92/torch_nightly.html
!pip install fastai

Looking in links: https://download.pytorch.org/whl/nightly/cu92/torch_nightly.html
Collecting torch_nightly
[31m  ERROR: Could not find a version that satisfies the requirement torch_nightly (from versions: none)[0m
[31mERROR: No matching distribution found for torch_nightly[0m
Collecting fastai
[?25l  Downloading https://files.pythonhosted.org/packages/c1/e2/42342ded0385d694e3250e74f43f0dc9a3ff3d5c2241a2ddd98236b5f9de/fastai-1.0.57-py3-none-any.whl (233kB)
[K     |████████████████████████████████| 235kB 2.9MB/s eta 0:00:01
Collecting fastprogress>=0.1.19 (from fastai)
  Downloading https://files.pythonhosted.org/packages/83/db/794db47024a26c75635c35f0ee5431aa8b528e895ad1ed958041290f83f7/fastprogress-0.1.21-py3-none-any.whl
Collecting spacy>=2.0.18 (from fastai)
[?25l  Downloading https://files.pythonhosted.org/packages/9a/2a/057bca697905031a6179227278a49fe9e7841a6d444bc2e13f1266b9f7dc/spacy-2.1.8-cp37-cp37m-macosx_10_6_intel.macosx_10_9_intel.macosx_10_9_x86_64.macosx_10_10_int

Installing collected packages: fastprogress, plac, murmurhash, cymem, srsly, preshed, wasabi, blis, thinc, spacy, torch, nvidia-ml-py3, torchvision, pynvx, fastai
  Running setup.py install for nvidia-ml-py3 ... [?25ldone
[?25hSuccessfully installed blis-0.2.4 cymem-2.0.2 fastai-1.0.57 fastprogress-0.1.21 murmurhash-1.0.2 nvidia-ml-py3-7.352.0 plac-0.9.6 preshed-2.0.1 pynvx-1.0.0 spacy-2.1.8 srsly-0.1.0 thinc-7.0.8 torch-1.2.0 torchvision-0.4.0 wasabi-0.2.2


### Import libraries

In [2]:
import fastai
from fastai import *
from fastai.text import * 
import pandas as pd
import numpy as np
from functools import partial
import io
import os

### Fetch the dataset

In [3]:
from sklearn.datasets import fetch_20newsgroups

# Load the 20 Newsgroups posts with headers, footers and quoted replies
# removed; shuffling with a fixed seed keeps the ordering reproducible.
dataset = fetch_20newsgroups(
    remove=('headers', 'footers', 'quotes'),
    random_state=1,
    shuffle=True,
)

# Raw post bodies, one string per document.
documents = dataset.data

In [25]:
# List the newsgroup names. The position of a name in this list appears to be
# the integer stored in `dataset.target` (e.g. 1 -> comp.graphics,
# 10 -> rec.sport.hockey) -- these are the ids used when filtering below.
dataset.target_names

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

### Create a dataframe from the fetched dataset

In [4]:
# Two-column frame: the integer newsgroup id ('label') followed by the post
# body ('text'). The column order is kept as label first, text second.
df = pd.DataFrame(
    {
        'label': dataset.target,
        'text': dataset.data,
    }
)

In [5]:
# (rows, columns) of the full frame, before any filtering.
df.shape

(11314, 2)

In [6]:
# Reduce the task to binary classification: keep only the rows labelled 1 or 10
# and renumber the index from 0 so later positional access is safe.
df = df[df['label'].isin([1, 10])].reset_index(drop=True)

In [7]:
# Number of documents per retained label -- a quick class-balance check.
df['label'].value_counts()

10    600
1     584
Name: label, dtype: int64

### Clean the text

In [8]:
# Replace every character that is not an ASCII letter with a space.
# `regex=True` must be explicit: from pandas 2.0 on, Series.str.replace treats
# the pattern as a literal string by default, so without it the character
# class would never match and the text would be left uncleaned.
df['text'] = df['text'].str.replace("[^a-zA-Z]", " ", regex=True)

In [9]:
import nltk
from nltk.corpus import stopwords

# Fetch the stop-word corpus into the local nltk_data directory.
nltk.download('stopwords')

# English stop words, used below to filter the tokenized documents.
stop_words = stopwords.words('english')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/subashgandyer/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


### Prepare the data

In [10]:
# Build a set once so each membership test is a hash lookup instead of a scan
# over the whole stop-word list.
stop_word_set = set(stop_words)

# Tokenize on whitespace and drop stop words in a single pass per document.
# NOTE: matching is case-sensitive (unchanged from the original), so
# capitalised forms such as "It" or "This" are kept.
tokenized_doc = df['text'].apply(
    lambda text: [token for token in text.split() if token not in stop_word_set]
)

# De-tokenize: join the surviving tokens back into one string per document.
# Iterating the Series walks the rows by position, so this no longer relies on
# `df` having a 0..n-1 index (the old `tokenized_doc[i]` lookup was label-based
# and raised KeyError on any other index).
detokenized_doc = [' '.join(tokens) for tokens in tokenized_doc]

df['text'] = detokenized_doc

### Split the dataset into training and validation data

In [11]:
from sklearn.model_selection import train_test_split

# Hold out 40% of the rows for validation, stratified on the label so both
# parts keep the same class ratio; the fixed seed makes the split repeatable.
df_trn, df_val = train_test_split(
    df,
    test_size=0.4,
    stratify=df['label'],
    random_state=12,
)

In [12]:
# Sizes of the training and validation frames produced by the split.
df_trn.shape, df_val.shape

((710, 2), (474, 2))

### Create the data bunches for the language model and the classifier

In [13]:
# DataBunch for fine-tuning the language model on the newsgroup text.
data_lm = TextLMDataBunch.from_df(
    path="",
    train_df=df_trn,
    valid_df=df_val,
)

# DataBunch for the classifier. It is given the language model's vocabulary so
# that both share the same token-to-id mapping; batches hold 32 documents.
data_clas = TextClasDataBunch.from_df(
    path="",
    train_df=df_trn,
    valid_df=df_val,
    vocab=data_lm.train_ds.vocab,
    bs=32,
)

In [26]:
# Display the language-model DataBunch summary to inspect the tokenized text.
data_lm

TextLMDataBunch;

Train: LabelList (710 items)
x: LMTextList
xxbos xxmaj it looks like xxmaj edmonton xxmaj oilers decided take xxmaj european xxunk spring xxmaj ranford xxmaj tugnutt xxmaj benning xxmaj manson xxmaj smith xxmaj buchberger xxmaj corson playing xxmaj canada xxmaj podein xxmaj weight playing xxup us xxmaj is xxmaj kravchuk playing xxmaj xxunk i know nagging injuries late season xxmaj podein interesting case eligible play xxmaj cape xxmaj breton xxup ahl playoffs like xxmaj kovalev xxmaj zubov xxmaj andersson obviously xxmaj sather xxmaj pocklington total xxunk everyone makes certainly case massively xxunk xxmaj paramount xxmaj new xxmaj york xxmaj rangers,xxbos xxmaj this xxunk xxmaj speaking die hard i i read xxunk hard xxunk xxmaj toronto xxmaj cup finals xxmaj first anyone planet heard team xxmaj detroit xxmaj al xxmaj xxunk however spell idiot name must xxmaj chicago xxup espn said even close xxmaj chicago xxunk win xxmaj norris xxmaj division xxmaj playoffs team clo

In [27]:
# Display the classifier DataBunch summary to inspect the tokenized text.
data_clas

TextClasDataBunch;

Train: LabelList (710 items)
x: TextList
xxbos xxmaj it looks like xxmaj edmonton xxmaj oilers decided take xxmaj european xxunk spring xxmaj ranford xxmaj tugnutt xxmaj benning xxmaj manson xxmaj smith xxmaj buchberger xxmaj corson playing xxmaj canada xxmaj podein xxmaj weight playing xxup us xxmaj is xxmaj kravchuk playing xxmaj xxunk i know nagging injuries late season xxmaj podein interesting case eligible play xxmaj cape xxmaj breton xxup ahl playoffs like xxmaj kovalev xxmaj zubov xxmaj andersson obviously xxmaj sather xxmaj pocklington total xxunk everyone makes certainly case massively xxunk xxmaj paramount xxmaj new xxmaj york xxmaj rangers,xxbos xxmaj this xxunk xxmaj speaking die hard i i read xxunk hard xxunk xxmaj toronto xxmaj cup finals xxmaj first anyone planet heard team xxmaj detroit xxmaj al xxmaj xxunk however spell idiot name must xxmaj chicago xxup espn said even close xxmaj chicago xxunk win xxmaj norris xxmaj division xxmaj playoffs team clo

### Tokens explanation
- **xxunk** is for an unknown word (one that isn’t present in the current vocabulary)
- **xxpad** is the token used for padding, if we need to regroup several texts of different lengths in a batch
- **xxbos** represents the beginning of a text in your dataset
- **xxmaj** is used to indicate the next word begins with a capital in the original text
- **xxup** is used to indicate the next word is written in all caps in the original text

### Create the language model learner

In [16]:
# Language-model learner with the AWD-LSTM architecture; drop_mult=0.7 scales
# the architecture's dropout values.
learn = language_model_learner(
    data=data_lm,
    arch=AWD_LSTM,
    drop_mult=0.7,
)

### Fine-tune the language model (about 25% next-token accuracy after one epoch)

In [17]:
# Fine-tune the language model for one epoch using the one-cycle policy,
# with 1e-2 as the maximum learning rate of the cycle.
learn.fit_one_cycle(cyc_len=1, max_lr=1e-2)

epoch,train_loss,valid_loss,accuracy,time
0,6.093817,5.196084,0.24764,02:43


In [None]:
# Run the learning-rate finder on the language-model learner.
# NOTE(review): this cell sits after the fit above and has no execution count,
# so it was presumably not used to choose the 1e-2 rate -- confirm.
learn.lr_find()

In [None]:
# Plot what the recorder captured during the preceding lr_find run
# (presumably loss against learning rate -- confirm against the fastai docs).
learn.recorder.plot()

### Save the fine-tuned encoder and load it into the classifier

In [18]:
# Save the fine-tuned language model's encoder under the name 'ft_enc' so the
# classifier learner can load it later.
learn.save_encoder('ft_enc')

In [22]:
# Classifier learner on the same AWD-LSTM architecture.
# NOTE: this rebinds `learn`, replacing the language-model learner.
learn = text_classifier_learner(
    data=data_clas,
    arch=AWD_LSTM,
    drop_mult=0.7,
)

# Load the encoder weights saved as 'ft_enc' after language-model fine-tuning.
learn.load_encoder('ft_enc')

RNNLearner(data=TextClasDataBunch;

Train: LabelList (710 items)
x: TextList
xxbos xxmaj it looks like xxmaj edmonton xxmaj oilers decided take xxmaj european xxunk spring xxmaj ranford xxmaj tugnutt xxmaj benning xxmaj manson xxmaj smith xxmaj buchberger xxmaj corson playing xxmaj canada xxmaj podein xxmaj weight playing xxup us xxmaj is xxmaj kravchuk playing xxmaj xxunk i know nagging injuries late season xxmaj podein interesting case eligible play xxmaj cape xxmaj breton xxup ahl playoffs like xxmaj kovalev xxmaj zubov xxmaj andersson obviously xxmaj sather xxmaj pocklington total xxunk everyone makes certainly case massively xxunk xxmaj paramount xxmaj new xxmaj york xxmaj rangers,xxbos xxmaj this xxunk xxmaj speaking die hard i i read xxunk hard xxunk xxmaj toronto xxmaj cup finals xxmaj first anyone planet heard team xxmaj detroit xxmaj al xxmaj xxunk however spell idiot name must xxmaj chicago xxup espn said even close xxmaj chicago xxunk win xxmaj norris xxmaj division xxmaj p

In [None]:
# Run the learning-rate finder on the classifier learner.
# NOTE(review): this cell has no execution count, so it was presumably not
# executed in the saved run -- confirm.
learn.lr_find()

In [None]:
# Plot what the recorder captured during the preceding lr_find run
# (presumably loss against learning rate -- confirm against the fastai docs).
learn.recorder.plot()

### Train the classifier (about 95% validation accuracy after one epoch)

In [23]:
# Train the classifier for one epoch using the one-cycle policy,
# with 1e-2 as the maximum learning rate of the cycle.
learn.fit_one_cycle(cyc_len=1, max_lr=1e-2)

epoch,train_loss,valid_loss,accuracy,time
0,0.363133,0.189936,0.951477,06:13


### Confusion Matrix

In [24]:
# Per-class scores and true labels returned by the learner
# (the recorded run produced one row per validation document).
preds, targets = learn.get_preds()

# Predicted class = index of the highest score in each row. Use the tensor's
# own argmax instead of routing a torch tensor through np.argmax, which relies
# on fragile NumPy-to-torch dispatch.
predictions = preds.argmax(dim=1)

# Confusion matrix: rows are predicted classes, columns are actual classes.
# Naming the axes replaces the uninformative default row_0 / col_0 labels, and
# the tensors are converted to NumPy arrays explicitly before handing them to pandas.
pd.crosstab(
    predictions.cpu().numpy(),
    targets.cpu().numpy(),
    rownames=['predicted'],
    colnames=['actual'],
)

col_0,0,1
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
0,215,4
1,19,236
