<a href="https://colab.research.google.com/github/sayanbanerjee32/feedback-prize-effectiveness/blob/main/all_text_concat_ulmfit_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [25]:
# Environment setup: upgrade fastai unconditionally, then install fastkaggle
# only when importing it fails (i.e. it is not already available).
!pip install -Uq fastai
try: import fastkaggle
except ModuleNotFoundError:
    !pip install -Uq fastkaggle

# !pip install -Uq 'timm>=0.6.2.dev'
!pip install -Uq pynvml
# Star import: later cells use `setup_comp`, `iskaggle` and `push_notebook` without
# importing them by name — presumably they are provided by fastkaggle.
from fastkaggle import *

In [26]:
import os
from pathlib import Path
import fastai
fastai.__version__

'2.7.9'

In [27]:
# Environment-dependent configuration (Kaggle vs Colab).
# The notebook treats itself as running on Colab whenever the
# KAGGLE_KERNEL_RUN_TYPE environment variable is missing or empty.
# is_colab = True
comp = 'feedback-prize-effectiveness'
is_colab = not os.environ.get('KAGGLE_KERNEL_RUN_TYPE', '')
# Writable directory used for tokenised data and saved models.
# (On Kaggle a trailing '/models' component was considered but is left out.)
model_save_path = (
    Path('/content') / f'{comp}_out' / 'models'
    if is_colab
    else Path('/kaggle/working') / comp
)

In [28]:
# import colab libraries
# On Colab only: mount Google Drive at /content/drive. A later cell copies the
# Kaggle API credentials file from that mount.
if is_colab:
    from google.colab import drive
    drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [29]:
# The Kaggle API client expects this file to be in ~/.kaggle,
# so lets move it there.
if is_colab:
    !mkdir ~/.kaggle
    !cp /content/drive/MyDrive/Kaggle_api_auth/kaggle.json ~/.kaggle/

mkdir: cannot create directory ‘/root/.kaggle’: File exists


In [30]:
# This permissions change avoids a warning on Kaggle tool startup.
# Mode 600 restricts the credentials file to read/write by its owner only.
if is_colab:
    !chmod 600 ~/.kaggle/kaggle.json

In [31]:
# Resolve the competition data directory. `setup_comp` is not defined in this
# notebook (presumably from the fastkaggle star import) and presumably also
# downloads the data when it is missing — confirm against the fastkaggle docs.
path = setup_comp(comp)
path

Path('feedback-prize-effectiveness')

In [32]:
from fastai.text.all import *
# Fix the random seed before any data splitting or training happens.
set_seed(32)

## Language model for original text

In [33]:
# In this cell the fastai TextDataLoaders helper functions are overridden so that they accept
# an output directory, which keeps the tokenised data from being written to the data input
# directory; in Kaggle, the data input directory is read-only.
from fastai.text.core import _tokenize_files
@delegates(_tokenize_files)
def tokenize_folderOP(path, extensions=None, folders=None,
                      output_dir=None, skip_if_exists=True, **kwargs):
    """Tokenize the text files found under `path`, writing into `output_dir`.

    Stand-in for `fastai.text.core.tokenize_folder` that forwards
    `output_dir` to `_tokenize_files`. `extensions` defaults to ['.txt'];
    `folders` is passed on to `get_files`; any remaining keyword arguments
    go to `_tokenize_files`, whose return value is returned unchanged.
    """
    path = Path(path)
    extensions = ifnone(extensions, ['.txt'])
    files = get_files(path, extensions=extensions, recurse=True, folders=folders)

    def _destination(idx, out_dir):
        # Mirror the file's position relative to `path` below the output directory.
        return out_dir / files[idx].relative_to(path)

    return _tokenize_files(_destination, files, path, output_dir,
                           skip_if_exists=skip_if_exists, **kwargs)

class TokenizerOP(Tokenizer):
    """`fastai.text.core.Tokenizer` subclass whose `from_folder` accepts an
    `output_dir` and tokenizes through `tokenize_folderOP`."""
    def __init__(self, tok, rules=None, counter=None,
                 lengths=None, mode=None, sep=' '):
        # Pure pass-through to the parent constructor.
        super().__init__(tok, rules=rules, counter=counter,
                         lengths=lengths, mode=mode, sep=sep)

    @classmethod
    @delegates(tokenize_folderOP, keep=True)
    def from_folder(cls, path, tok=None, rules=None, output_dir = None, **kwargs):
        """Tokenize the folder `path` into `output_dir` and build a tokenizer
        from the counter and lengths pickles found in the resulting directory."""
        path = Path(path)
        tok = WordTokenizer() if tok is None else tok
        # Existing tokenised output is never reused: skip_if_exists is forced to False.
        output_dir = tokenize_folderOP(path, tok=tok, rules=rules,
                                       skip_if_exists=False,
                                       output_dir=output_dir, **kwargs)
        counter = load_pickle(output_dir/fn_counter_pkl)
        lengths = load_pickle(output_dir/fn_lengths_pkl)
        tokenizer = cls(tok, counter=counter, lengths=lengths,
                        rules=rules, mode='folder')
        tokenizer.path = path
        tokenizer.output_dir = output_dir
        return tokenizer


class TextBlockOP(TextBlock):
    "`TextBlock` variant whose `from_folder` takes a user-defined output directory"
    def __init__(self, tok_tfm, vocab=None, is_lm=False,
                 seq_len=72, backwards=False, **kwargs):
        # Pure pass-through to the parent constructor.
        super().__init__(tok_tfm, vocab=vocab, is_lm=is_lm,
                         seq_len=seq_len, backwards=backwards, **kwargs)

    @classmethod
    @delegates(TokenizerOP.from_folder, keep=True)
    def from_folder(cls, path, vocab=None, is_lm=False, seq_len=72,
                    backwards=False, min_freq=3, max_vocab=60000,output_dir =None,
                    **kwargs):
        "Build a `TextBlock` from `path`, tokenizing through `TokenizerOP` into `output_dir`"
        tok_tfm = TokenizerOP.from_folder(path, output_dir=output_dir, **kwargs)
        return cls(tok_tfm,
                   vocab=vocab,
                   is_lm=is_lm,
                   seq_len=seq_len,
                   backwards=backwards,
                   min_freq=min_freq,
                   max_vocab=max_vocab)
        
class TextDataLoadersOP(TextDataLoaders):
    "`TextDataLoaders` variant whose `from_folder` factory accepts an `output_dir` for tokenised data"
    @classmethod
    @delegates(DataLoaders.from_dblock)
    def from_folder(cls, path, train='train', valid='valid', valid_pct=None,
                    seed=None, vocab=None, text_vocab=None, is_lm=False,
                    tok_tfm=None, seq_len=72, splitter=None,
                    backwards=False, output_dir =None, **kwargs):
        "Same role as `TextDataLoaders.from_folder`, but accepts `output_dir` and hands it to `TextBlockOP.from_folder`"
        # Without `valid_pct` the train/valid folders define the split.
        split_by_folder = valid_pct is None
        if splitter is None:
            if split_by_folder:
                splitter = GrandparentSplitter(train_name=train, valid_name=valid)
            else:
                splitter = RandomSplitter(valid_pct, seed=seed)

        text_block = TextBlockOP.from_folder(path, text_vocab, is_lm, seq_len, backwards,
                                             tok=tok_tfm, output_dir=output_dir)
        blocks = [text_block]
        if not is_lm:
            # Classification additionally needs a label block.
            blocks.append(CategoryBlock(vocab=vocab))

        if split_by_folder:
            get_items = partial(get_text_files, folders=[train, valid])
        else:
            get_items = get_text_files

        get_y = None if is_lm else parent_label
        dblock = DataBlock(blocks=blocks,
                           get_items=get_items,
                           splitter=splitter,
                           get_y=get_y)
        return cls.from_dblock(dblock, path, path=path, seq_len=seq_len, **kwargs)


In [34]:
# Directory under the writable model_save_path that receives the tokenised essays.
tok_path = model_save_path / 'train_tok'
tok_path.mkdir(parents=True, exist_ok=True)
# Language-model DataLoaders over the text files in path/'train'; valid_pct=0.1
# selects a random 10% hold-out (RandomSplitter branch of from_folder).
dls_lm = TextDataLoadersOP.from_folder(path / 'train', is_lm=True,
                                       valid_pct=0.1, output_dir = tok_path)
# TextDataLoaders.from_folder(path / 'train', is_lm=True, valid_pct=0.1, )
dls_lm.show_batch(max_n=5)

Unnamed: 0,text,text_
0,xxbos xxmaj my principal says we all have to participate in at least one after school activity . i totally agree with him . xxmaj one of the reasons why is so we can make more friends and be more comfortable xxunk people . xxmaj the reason why we need to have friends and be more comfortable around people is because friends are always a important part of a persons life .,xxmaj my principal says we all have to participate in at least one after school activity . i totally agree with him . xxmaj one of the reasons why is so we can make more friends and be more comfortable xxunk people . xxmaj the reason why we need to have friends and be more comfortable around people is because friends are always a important part of a persons life . xxmaj
1,". \n\n xxmaj those are three reasons why i would talk to multiple different source 's and get there advice instead of just talking to only one source and getting a xxunk amount of advice compared to the amount of advice i would get talking to multiple source 's . xxbos xxmaj facial action coding xxunk should be used because it is a good way to help students and people , xxmaj","\n\n xxmaj those are three reasons why i would talk to multiple different source 's and get there advice instead of just talking to only one source and getting a xxunk amount of advice compared to the amount of advice i would get talking to multiple source 's . xxbos xxmaj facial action coding xxunk should be used because it is a good way to help students and people , xxmaj first"
2,i do nt want that in our country . xxmaj you may be thinking what do i mean by i do nt want that in our country what i mean by that is that if there is problems i do nt want that in our country cause are country is already free so i do nt want it to have problems . xxmaj and then if the xxmaj united xxmaj states fight,do nt want that in our country . xxmaj you may be thinking what do i mean by i do nt want that in our country what i mean by that is that if there is problems i do nt want that in our country cause are country is already free so i do nt want it to have problems . xxmaj and then if the xxmaj united xxmaj states fight it
3,not to your control . but i was hoping that everyone could be happy . but both does sound alot . if we ca n't do both we can at least do one that will benefit everyone . i know i am just a student but we can make a big difference in the community . we could probably make it better . you decide which one is better . and hopefully,to your control . but i was hoping that everyone could be happy . but both does sound alot . if we ca n't do both we can at least do one that will benefit everyone . i know i am just a student but we can make a big difference in the community . we could probably make it better . you decide which one is better . and hopefully let
4,"\n\n xxmaj next , the figure similar to a "" face "" is to big to be a real face . xxmaj for xxunk enormous head nearly two miles from end to end . xxunk ) xxmaj this quote shows that the head is to big to be a real face . xxmaj like , what face is two miles long from end to end ? \n\n xxmaj last but not least","xxmaj next , the figure similar to a "" face "" is to big to be a real face . xxmaj for xxunk enormous head nearly two miles from end to end . xxunk ) xxmaj this quote shows that the head is to big to be a real face . xxmaj like , what face is two miles long from end to end ? \n\n xxmaj last but not least ,"


In [35]:
# Persist the language-model vocabulary; the inference script needs it.
vocab_file = model_save_path / 'dls_lm_vocab.pickle'
with vocab_file.open('wb') as vocab_fh:
    pickle.dump(dls_lm.vocab, vocab_fh)

In [36]:
# AWD_LSTM language-model learner reporting accuracy and perplexity, switched to
# mixed precision with to_fp16(). Its `path` is the competition data directory,
# which is why the encoder-saving cell redirects `learn_lm.path` on Kaggle.
learn_lm = language_model_learner(dls_lm, AWD_LSTM, metrics=[accuracy, Perplexity()], path=path, wd=0.1).to_fp16()

In [None]:
# One epoch at learning rate 1e-2 before the model is unfrozen in the next cell.
learn_lm.fit_one_cycle(1, 1e-2)

epoch,train_loss,valid_loss,accuracy,perplexity,time


In [None]:
# Unfreeze the model and train for 10 more epochs at a lower learning rate (1e-3).
learn_lm.unfreeze()
learn_lm.fit_one_cycle(10, 1e-3)

In [None]:
model_save_path.mkdir(parents=True, exist_ok=True)
# `iskaggle` is not defined in this notebook; presumably it comes from `from fastkaggle import *`.
if iskaggle:
    # hack to save encoder in a writable location
    # The learner was created with path=path (the competition data directory), so its
    # path is pointed at model_save_path before saving under a relative name.
    # NOTE(review): the classifier cell later looks under model_save_path/'models' on
    # Kaggle, so the relative name presumably resolves inside fastai's model directory.
    learn_lm.path = model_save_path
    learn_lm.save_encoder('finetuned_enc')
    # learn_lm.save('finetuned_lm')
else:
    # Off Kaggle an explicit location under model_save_path is passed instead.
    learn_lm.save_encoder(model_save_path / 'finetuned_enc')

## Text classification

In [None]:
def file_read(file_path, encoding='utf-8'):
    """Return the whole content of a text file as one string.

    Parameters
    ----------
    file_path : str or path-like
        Location of the file to read.
    encoding : str, default 'utf-8'
        Text encoding used to decode the file. It is passed explicitly so the
        result does not depend on the platform's default locale encoding.

    Returns
    -------
    str
        The decoded file content.
    """
    with open(file_path, 'r', encoding=encoding) as text_file:
        return text_file.read()

In [None]:
# pre-process text - add all columns
df = pd.read_csv(path/'train.csv')
# Rows that share an essay_id would otherwise trigger one disk read each, so
# every distinct essay file is read once and the text is mapped onto the rows.
essay_text_by_id = {essay_id: file_read(path / 'train' / f'{essay_id}.txt')
                    for essay_id in df['essay_id'].unique()}
df['essay_text'] = df['essay_id'].map(essay_text_by_id)
df.head()

In [None]:
# decide sequence length
# Whitespace-separated word counts of the full essay and of the discourse text.
def _word_count(txt):
    "Number of whitespace-separated words in `txt`"
    return len(txt.split())

df['seq_length_essay'] = df['essay_text'].map(_word_count)
df['seq_length_dis'] = df['discourse_text'].map(_word_count)
df['seq_length_essay'].describe(), df['seq_length_dis'].describe()

In [None]:
# Replace the discourse text inside its essay (the context) with the placeholder __MASKED__.
def _mask_discourse(row):
    "Essay text of `row` with every occurrence of its discourse text replaced by '__MASKED__'"
    essay = row.essay_text.strip()
    discourse = row.discourse_text.strip()
    return essay.replace(discourse, '__MASKED__')

df['masked_ess_txt'] = df[['essay_text','discourse_text']].apply(_mask_discourse, axis = 1)
df['seq_length_mask_ess'] = df['masked_ess_txt'].map(lambda txt: len(txt.split()))
df['masked_ess_txt'].head(), df['seq_length_mask_ess'].describe()

In [None]:
# function to truncate discourse text and context text
# this is still in progress
def trunc_text(text, num_words, unique_centre_tok = None):
    """Truncate `text` to at most `num_words` whitespace-separated words.

    Parameters
    ----------
    text : str
        Text to truncate. Words are obtained with `str.split()`, so runs of
        whitespace collapse to single spaces in the result.
    num_words : int
        Maximum number of words to keep.
    unique_centre_tok : str, optional
        Marker expected to occur once in `text`. When given and `text` is too
        long, the kept window is placed so that the marker sits as close to
        its middle as the text boundaries allow, instead of keeping the head.
        A word counts as the marker when it contains it, so a marker with
        punctuation attached (e.g. "__MASKED__.") is still found.

    Returns
    -------
    str
        The kept words joined by single spaces. If the marker is not present,
        the first `num_words` words are returned (head truncation).
    """
    words = text.split()
    if len(words) <= num_words:
        return ' '.join(words)

    centre = None
    if unique_centre_tok is not None:
        centre = next((idx for idx, word in enumerate(words)
                       if unique_centre_tok in word), None)
    if centre is None:
        # No marker requested, or marker missing: keep the head of the text.
        return ' '.join(words[:num_words])

    # Size the window by num_words (not by the text length) so the marker
    # always falls inside it, then clamp the window to the text boundaries.
    start = centre - num_words // 2
    start = max(0, min(start, len(words) - num_words))
    return ' '.join(words[start:start + num_words])

# Quick check of the centre-token mode on a short sentence (7 words cut to 5).
trunc_text("let's see where we are __MASKED__ going.", 5, unique_centre_tok = "__MASKED__")

In [None]:
# Combine all text columns into the single classifier input `all_text`.
# Alternative without truncation, kept for reference:
# df['all_text'] = 'CONTEXT: ' + df.essay_text + '; DISCOURSE: ' + df.discourse_text + '; TYPE: ' + df.discourse_type

# Truncate first (masked essay to 512 words, discourse to 64 words), then concatenate.
df['essay_text_trunc'] = df['masked_ess_txt'].map(lambda essay: trunc_text(essay, 512))
df['discourse_text_trunc'] = df['discourse_text'].map(lambda discourse: trunc_text(discourse, 64))
df['all_text'] = ('CONTEXT: ' + df['essay_text_trunc']
                  + '; TYPE: ' + df['discourse_type']
                  + '; DISCOURSE: ' + df['discourse_text_trunc'])
df['seq_length_all'] = df['all_text'].map(lambda txt: len(txt.split()))
df['seq_length_all'].describe()

In [None]:
# create data loaders
# Classification DataLoaders over the combined `all_text` column with
# `discourse_effectiveness` as the label. The language-model vocabulary is reused
# (text_vocab=dls_lm.vocab), presumably so the fine-tuned encoder loaded later
# sees the same token ids — confirm.
# No validation split is specified here, so the from_df defaults apply.
dls = TextDataLoaders.from_df(df, text_col='all_text',
                              label_col='discourse_effectiveness',
                              seq_len=650,
                              text_vocab=dls_lm.vocab)
dls.show_batch(max_n=3)

In [None]:
# AWD_LSTM text classifier reporting accuracy and weighted F1, in mixed precision.
# NOTE(review): backwards=True is passed here, but neither dls_lm nor dls is built
# with backwards=True and the encoder loaded below replaces the pretrained one —
# confirm whether a backward model is really intended.
learn = text_classifier_learner(dls, AWD_LSTM,
                                drop_mult=0.5,
                                backwards=True,
                                metrics=[accuracy,F1Score(average='weighted')]).to_fp16()
# load encoder from language model
# On Kaggle the fine-tuned encoder is expected one level down, in a 'models'
# sub-directory of model_save_path. The suffix is appended only once so that
# re-running this cell does not keep nesting 'models/models/...'.
if iskaggle and model_save_path.name != 'models':
    model_save_path = model_save_path / 'models'
learn = learn.load_encoder(model_save_path / 'finetuned_enc')

In [None]:
# Run the learning-rate finder, asking for suggestions from both `valley` and `slide`.
learn.lr_find(suggest_funcs=(valley, slide))

In [None]:
# Fine-tune the classifier: 10 epochs, learning rate argument 0.01.
learn.fine_tune(10, 0.01)

In [None]:
model_save_path.mkdir(parents=True, exist_ok=True)
# learn.export(f'{model_save_path}/all_col_concat_learner.pkl')
# Save the trained classifier under model_save_path.
# NOTE(review): the name ends in '.pkl', but Learner.save presumably appends its own
# extension — confirm the resulting file name before loading it elsewhere.
learn.save(f'{model_save_path}/all_text_concat_ulmfit_save.pkl')

## Test submission

In [None]:
test_df = pd.read_csv(path/'test.csv')
# Pre-process the test texts: read each essay, mask the discourse inside it,
# truncate both parts, then concatenate them into the `text` column.
def _mask_test_discourse(row):
    "Essay text of `row` with every occurrence of its discourse text replaced by '__MASKED__'"
    essay = row.essay_text.strip()
    discourse = row.discourse_text.strip()
    return essay.replace(discourse, '__MASKED__')

test_df['essay_text'] = test_df['essay_id'].map(lambda essay_id: file_read(path / 'test' / f'{essay_id}.txt'))
test_df['masked_ess_txt'] = test_df[['essay_text','discourse_text']].apply(_mask_test_discourse, axis = 1)
test_df['essay_text_trunc'] = test_df['masked_ess_txt'].map(lambda essay: trunc_text(essay, 512))
test_df['discourse_text_trunc'] = test_df['discourse_text'].map(lambda discourse: trunc_text(discourse, 64))
test_df['text'] = ('CONTEXT: ' + test_df['essay_text_trunc']
                   + '; TYPE: ' + test_df['discourse_type']
                   + '; DISCOURSE: ' + test_df['discourse_text_trunc'])

In [None]:
# test_df.rename(columns = {'discourse_text':'text'}, inplace = True)
# Inspect the columns, dtypes and non-null counts of the prepared test frame.
test_df.info()

In [None]:
# Test DataLoader derived from the training DataLoaders.
# NOTE(review): the combined test text lives in column 'text' while training used
# text_col='all_text'; presumably test_dl reads 'text' — confirm with show_batch below.
tst_dl = dls.test_dl(test_df)
tst_dl.show_batch()

In [None]:
# Predict on the test DataLoader. The second return value is discarded; with
# with_decoded=True a third value is returned (bound to `idxs`, presumably the
# decoded class indices).
probs,_,idxs = learn.get_preds(dl=tst_dl, with_decoded=True)
probs

In [None]:
# column names for probabilities
# One column per entry of dls.vocab[1] (presumably the label vocabulary, in the
# same order as the columns of `probs` — confirm).
probs_df = pd.DataFrame(probs.numpy(),columns = dls.vocab[1])
probs_df

In [None]:
# Attach the discourse ids (aligned on the DataFrame index) and write the submission file.
# NOTE(review): discourse_id ends up as the last column of submission.csv — confirm
# the column order the competition expects.
probs_df["discourse_id"] = test_df["discourse_id"]
probs_df.to_csv('submission.csv', index=False)
!head submission.csv

In [None]:
# not working for this competition
# if not iskaggle:
#     from kaggle import api
#     api.competition_submit_cli('submission.csv', 'initial', comp)

In [None]:
# Off Kaggle only: publish this notebook with `push_notebook` (presumably from the
# fastkaggle star import), attached to the competition, public, with GPU enabled.
# NOTE(review): the user name 'saan' and the Drive path of the notebook file are hard-coded.
if not iskaggle:
    push_notebook('saan', comp,
                  title='Feedback effeciveness with essay text using ulmfit-backward',
                  file='/content/drive/MyDrive/Colab Notebooks/all_text_concat_ulmfit_model.ipynb',
                  competition=comp, private=False, gpu=True)