<a href="https://colab.research.google.com/github/sayanbanerjee32/NLP-with-fastai2.0/blob/main/fastai_on_20newsgroup.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Install fastai 2 version

uncomment if required

In [1]:
#!pip install fastai --upgrade

# Data
Import data

In [2]:
from sklearn.datasets import fetch_20newsgroups
import pandas as pd

In [3]:
def twenty_newsgroup_to_df(data_set = 'train'):
    newsgroups_train = fetch_20newsgroups(subset=data_set, 
                                          remove=('headers', 'footers',
                                                  'quotes'))

    df = pd.DataFrame([newsgroups_train.data, newsgroups_train.target.tolist()]).T
    df.columns = ['text', 'target']

    targets = pd.DataFrame(newsgroups_train.target_names)
    targets.columns = ['title']

    train = pd.merge(df, targets, left_on='target', right_index=True)
    #out['date'] = pd.to_datetime('now')
    #out.to_csv('20_newsgroup.csv')
    return train


train_data_df = twenty_newsgroup_to_df('train')
test_data_df = twenty_newsgroup_to_df('test')

In [4]:
train_data_df.info(),test_data_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 11314 entries, 0 to 11302
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    11314 non-null  object
 1   target  11314 non-null  object
 2   title   11314 non-null  object
dtypes: object(3)
memory usage: 353.6+ KB
<class 'pandas.core.frame.DataFrame'>
Int64Index: 7532 entries, 0 to 7519
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    7532 non-null   object
 1   target  7532 non-null   object
 2   title   7532 non-null   object
dtypes: object(3)
memory usage: 235.4+ KB


(None, None)

Number of classes in train and test

In [5]:
len(train_data_df.target.unique()),len(train_data_df.title.unique())

(20, 20)

In [6]:
len(test_data_df.target.unique()),len(test_data_df.title.unique())

(20, 20)

# Initial model using transfer learning

In [7]:
from fastai.text.all import *

## create data loader from data frame

Defining a seed for reproducible results

In [8]:
def random_seed(seed_value, use_cuda):
    np.random.seed(seed_value) # cpu vars
    torch.manual_seed(seed_value) # cpu  vars
    random.seed(seed_value) # Python
    if use_cuda: 
        torch.cuda.manual_seed(seed_value)
        torch.cuda.manual_seed_all(seed_value) # gpu vars
        torch.backends.cudnn.deterministic = True  #needed
        torch.backends.cudnn.benchmark = False

In [21]:
random_seed(32, True)
dls = TextDataLoaders.from_df(train_data_df, text_col='text', label_col='target')
dls.show_batch(max_n=3)

Unnamed: 0,text,category
0,"xxbos xxmaj due to the resolution and size it is in 14 parts . \n\n xxmaj this is a uuencoded bitmap . xxunk 256 colors . \n xxmaj the picture is a xxunk xxunk on a desert with blue sky background . \n xxmaj the size is just right for centered wallpaper on a 1024x768 display \n because it leaves a border at the bottom just big enough for icons \n to fit in without being on top of the picture . xxmaj xxunk image \n quality and resolution - i have not seen much better . \n\n xxmaj for those of you who have n't worked with xxunk image files , here \n is how to put it back together . \n\t 1 . save the 14 parts to 14 individual files \n\t 2 . use a text editor to remove the header and xxunk \n\t▁ in each file",2
1,"xxbos xxrep 10 - cut here xxrep 10 - part 01 / 01 \n begin 644 xxunk \n xxup xxunk 1d & -$ , _ _ \ @ p $ ? # _ ' @ ! xxunk ( # $ y. # xxup dm \n xxup xxunk < v5r = xxunk * \n m _ "" z , # f \ $ h0 ( xxup c , xxunk ( . \ -r xxup rt ! xxunk / j \ * xxunk / xxup xxunk . \n xxup xxunk * % [ ! * xxunk xxup xxunk "" zc < xxunk & xxunk + h8 ! xxunk \n xxup xxunk _ # xxup xxunk / xxup xxunk [ b \ xxup xxunk , \n xxup xxunk / xxunk / / \ xxup xxunk / [ xxunk xxunk \ xxup xxunk # / \n xxup xxunk > xxunk # xxup cl xxup",5
2,"xxbos xxmaj accounts of anti - armenian xxmaj human xxmaj right xxmaj violations in xxmaj azerbaijan # 008 xxmaj part b \n▁ xxmaj prelude to xxmaj current xxmaj events in nagorno - karabakh \n\n\t\t\t\t ( part b of # 008 ) \n\n▁ + xxrep 66 - + \n▁ | | \n▁ | "" oh , yes , i just remembered . xxmaj while they were raping me they | \n▁ | repeated quite frequently , "" let the xxmaj armenian women have babies | \n▁ | for us , xxmaj muslim babies , let them bear xxmaj azerbaijanis for the | \n▁ | struggle against the xxmaj armenians . "" xxmaj then they said , "" those | \n▁ | xxmaj muslims can carry on our holy cause . xxmaj heroes ! "" xxmaj they repeated | \n▁ | it very often . "" | \n▁ | | \n▁ +",16


## create text learner

use F1 score as metrics and find optimum learning rate

In [22]:
#dls = TextDataLoaders.from_folder(untar_data(URLs.IMDB), valid='test')
fscore = F1Score(average='weighted')
#ls_func = CrossEntropyLossFlat()
random_seed(32, True)
learn = text_classifier_learner(dls, AWD_LSTM, drop_mult=0.5, 
                                #loss_func = ls_func,
                                metrics=[accuracy,fscore])
learn.fine_tune(2, 1e-2)
#learn.lr_find()

epoch,train_loss,valid_loss,accuracy,f1_score,time
0,,3.056136,0.056145,0.045367,01:05


epoch,train_loss,valid_loss,accuracy,f1_score,time
0,,3.0191,0.045093,0.028286,02:05
1,,3.016025,0.043767,0.037344,02:12


Tune the model with suggested learning rate that is min learning rate devided by 10.

In [11]:
#learn.fit_one_cycle(1, 0.001) # min / 10
learn.fit_one_cycle(1, 2e-2)

epoch,train_loss,valid_loss,accuracy,f1_score,time
0,,3.032755,0.037577,0.029659,01:06


In [12]:
learn.freeze_to(-2)
learn.fit_one_cycle(1, slice(1e-2/(2.6**4),1e-2))

epoch,train_loss,valid_loss,accuracy,f1_score,time
0,,3.01296,0.05084,0.040125,01:15


In [13]:
learn.freeze_to(-3)
learn.fit_one_cycle(1, slice(5e-3/(2.6**4),5e-3))

epoch,train_loss,valid_loss,accuracy,f1_score,time
0,,3.010344,0.042882,0.036728,01:47


In [14]:
learn.unfreeze()
learn.fit_one_cycle(2, slice(1e-3/(2.6**4),1e-3))

epoch,train_loss,valid_loss,accuracy,f1_score,time
0,,3.022345,0.043767,0.038739,02:11
1,,3.026042,0.041556,0.038303,02:12


In [15]:
#learn.fine_tune(5,0.01)

## Predict on test data

In [16]:
predict_df = test_data_df.text.apply(learn.predict)
#predict_df.head()
predict_df_class = [x[0] for x in predict_df.values]
predict_df_prob = [max(x[2].tolist()) for x in predict_df.values]
print(predict_df_class[:10])
print(predict_df_prob[:10])

['15', '15', '15', '9', '12', '12', '12', '9', '12', '17']
[0.07944145053625107, 0.08027864992618561, 0.08629996329545975, 0.07384373992681503, 0.0765143483877182, 0.07064009457826614, 0.07354335486888885, 0.0668676346540451, 0.08083569258451462, 0.08404545485973358]


## Classification report

In [17]:
from sklearn.metrics import classification_report

In [18]:
act_class = [str(c) for c in test_data_df.target.values]
len(np.unique(act_class))

20

In [19]:
print(classification_report(act_class, predict_df_class))

              precision    recall  f1-score   support

           0       0.07      0.01      0.01       319
           1       0.01      0.01      0.01       389
          10       0.02      0.01      0.01       399
          11       0.05      0.05      0.05       396
          12       0.08      0.08      0.08       393
          13       0.06      0.07      0.06       396
          14       0.06      0.03      0.04       394
          15       0.00      0.01      0.00       398
          16       0.06      0.07      0.07       364
          17       0.06      0.11      0.08       376
          18       0.02      0.00      0.01       310
          19       0.09      0.00      0.01       251
           2       0.09      0.10      0.09       394
           3       0.03      0.03      0.03       392
           4       0.03      0.05      0.04       385
           5       0.05      0.06      0.05       395
           6       0.04      0.07      0.05       390
           7       0.04    