<a href="https://colab.research.google.com/github/sayanbanerjee32/NLP-with-fastai2.0/blob/main/fastai_on_20newsgroup.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Install fastai 2 version

uncomment if required

In [1]:
#!pip install fastai --upgrade

# Data
Import data

In [2]:
from sklearn.datasets import fetch_20newsgroups
import pandas as pd

In [3]:
from fastai.text.all import *

In [4]:
def twenty_newsgroup_to_df(data_set = 'train'):
    newsgroups_data = fetch_20newsgroups(subset=data_set, 
                                          remove=('headers', 'footers',
                                                  'quotes'))

    df = pd.DataFrame([newsgroups_data.data, 
                       newsgroups_data.target.tolist()]).T
    df.columns = ['text', 'target']

    targets = pd.DataFrame(newsgroups_data.target_names)
    targets.columns = ['title']

    data_set = pd.merge(df, targets, left_on='target', right_index=True)
    #out['date'] = pd.to_datetime('now')
    #out.to_csv('20_newsgroup.csv')
    return data_set


train_data_df = twenty_newsgroup_to_df('train')
test_data_df = twenty_newsgroup_to_df('test')

In [5]:
train_data_df.info(),test_data_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 11314 entries, 0 to 11302
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    11314 non-null  object
 1   target  11314 non-null  object
 2   title   11314 non-null  object
dtypes: object(3)
memory usage: 353.6+ KB
<class 'pandas.core.frame.DataFrame'>
Int64Index: 7532 entries, 0 to 7519
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    7532 non-null   object
 1   target  7532 non-null   object
 2   title   7532 non-null   object
dtypes: object(3)
memory usage: 235.4+ KB


(None, None)

Number of classes in train and test

In [6]:
len(train_data_df.target.unique()),len(train_data_df.title.unique())

(20, 20)

In [7]:
len(test_data_df.target.unique()),len(test_data_df.title.unique())

(20, 20)

## is_validation column

In [8]:
 np.random.seed(1)
 msk = np.random.rand(train_data_df.shape[0]) < 0.25
 train_data_df['is_valid'] = msk
 print(train_data_df.groupby(['is_valid','title'])['text'].count())

is_valid  title                   
False     alt.atheism                 339
          comp.graphics               446
          comp.os.ms-windows.misc     437
          comp.sys.ibm.pc.hardware    436
          comp.sys.mac.hardware       432
          comp.windows.x              435
          misc.forsale                437
          rec.autos                   444
          rec.motorcycles             460
          rec.sport.baseball          424
          rec.sport.hockey            443
          sci.crypt                   455
          sci.electronics             435
          sci.med                     453
          sci.space                   459
          soc.religion.christian      457
          talk.politics.guns          391
          talk.politics.mideast       439
          talk.politics.misc          362
          talk.religion.misc          289
True      alt.atheism                 141
          comp.graphics               138
          comp.os.ms-windows.misc     154

# Initial model using transfer learning

## create data loader from data frame

Defining a seed for reproducible results

In [9]:
def random_seed(seed_value, use_cuda):
    np.random.seed(seed_value) # cpu vars
    torch.manual_seed(seed_value) # cpu  vars
    random.seed(seed_value) # Python
    if use_cuda: 
        torch.cuda.manual_seed(seed_value)
        torch.cuda.manual_seed_all(seed_value) # gpu vars
        torch.backends.cudnn.deterministic = True  #needed
        torch.backends.cudnn.benchmark = False

In [10]:
#random_seed(32, True)
dls = TextDataLoaders.from_df(train_data_df, text_col='text', label_col='title',
                              valid_col='is_valid', seed=1, shuffle_train=True)
dls.show_batch(max_n=3)

Unnamed: 0,text,category
0,"xxbos xxmaj due to the resolution and size it is in 14 parts . \n\n xxmaj this is a uuencoded bitmap . xxunk 256 colors . \n xxmaj the picture is a xxunk xxunk on a desert with blue sky background . \n xxmaj the size is just right for centered wallpaper on a 1024x768 display \n because it leaves a border at the bottom just big enough for icons \n to fit in without being on top of the picture . xxmaj xxunk image \n quality and resolution - i have not seen much better . \n\n xxmaj for those of you who have n't worked with xxunk image files , here \n is how to put it back together . \n\t 1 . save the 14 parts to 14 individual files \n\t 2 . use a text editor to remove the header and xxunk \n\t▁ in each file",comp.os.ms-windows.misc
1,"xxbos xxmaj archive - name : hockey - faq \n\n rec.sport.hockey answers to xxmaj frequently xxmaj asked xxmaj questions and other news : \n▁\n xxmaj contents : \n\n 0 . xxmaj new xxmaj info . \n 1 . xxup nhl \n 2 . xxup nhl xxmaj minor xxmaj leagues \n 3 . xxmaj college xxmaj hockey ( north xxmaj america ) \n 4 . xxmaj other leagues ( e.g. xxmaj europe , xxmaj canada xxmaj cup tournament ) \n 5 . xxmaj e - mail files \n 6 . xxup usenet xxmaj hockey xxmaj pool \n 7 . xxmaj up - coming xxmaj dates \n 8 . xxmaj answers to some frequently asked questions \n 9 . xxmaj miscellaneous \n▁\n▁ xxmaj send comments , suggestions and criticisms regarding this xxup faq list via xxunk \n mail to hamlet@u.washington.edu . \n▁\n▁ xxrep 74 - \n▁\n▁ 0 . xxmaj new xxmaj info",rec.autos
2,"xxbos xxrep 12 - xxmaj part 2 of 14 xxrep 12 - \n xxup m0 . / xxup xxunk / xxup cbn / xxup xxunk / xxup xxunk , xxunk , xxunk [ \n xxup xxunk \ _ -8 \n xxup my _ / xxup sys * / xxup xxunk \ xxup xxunk > / xxup c^ > / xxup cx_s \ xxup xxunk _ / xxup sc _ . / xxup xxunk _ . / xxup c _ / xxup gcx^ / xxup y _ . / \n m \ xxup x_scx^ / \ _ . / xxup c _ / s \ _ . / \ xxup x^ / \ xxup x^ / xxup cx_scx^ / xxup c _ . / xxup cx^ / \ xxup x^ / xxup c _ . / xxup cx^ / xxup cx^ / \n m \ xxup xxunk _ . / \",misc.forsale


## create text learner

use F1 score as metrics and find optimum learning rate

In [None]:
dls = TextDataLoaders.from_folder(untar_data(URLs.IMDB), valid='test')
fscore = F1Score(average='weighted')
#ls_func = CrossEntropyLossFlat()
#random_seed(32, True)
learn = text_classifier_learner(dls, AWD_LSTM, drop_mult=0.5, 
                                #loss_func = ls_func,
                                metrics=[accuracy,fscore])
#learn.fine_tune(2, 1e-2)
learn.lr_find()

In [None]:
learn.fine_tune(1, 1e-2)

Tune the model with suggested learning rate that is min learning rate devided by 10.

In [None]:
#learn.fit_one_cycle(1, 0.001) # min / 10
#learn.fit_one_cycle(1, 2e-2)

In [None]:
#learn.freeze_to(-2)
#learn.fit_one_cycle(1, slice(1e-2/(2.6**4),1e-2))

In [None]:
#learn.freeze_to(-3)
#learn.fit_one_cycle(1, slice(5e-3/(2.6**4),5e-3))

In [None]:
#learn.unfreeze()
#learn.fit_one_cycle(2, slice(1e-3/(2.6**4),1e-3))

In [None]:
#learn.fine_tune(5,0.01)

## Predict on test data

In [None]:
predict_df = test_data_df.text.apply(learn.predict)
#predict_df.head()
predict_df_class = [x[0] for x in predict_df.values]
predict_df_prob = [max(x[2].tolist()) for x in predict_df.values]
print(predict_df_class[:10])
print(predict_df_prob[:10])

## Classification report

In [None]:
from sklearn.metrics import classification_report

In [None]:
act_class = [str(c) for c in test_data_df.title.values]
len(np.unique(act_class))

In [None]:
print(classification_report(act_class, predict_df_class))