# **Getting the Dataset**

In [0]:
from fastai.text import *
import html

In [0]:
# Notebook plumbing: (re)load the autoreload extension, set it to mode 2
# (re-import modules before each cell runs), and draw matplotlib figures inline.
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [0]:
# Colab-only: mount Google Drive under /content/gdrive. A later cell copies the
# IMDB archive out of "My Drive" from this mount point.
from google.colab import drive
drive.mount('/content/gdrive')

In [0]:
!mkdir ~/data


In [0]:
#!cat /content/gdrive/My\ Drive/aclImdb_v1.tar.gz

#!mkdir ~/.data
!cp /content/gdrive/My\ Drive/aclImdb_v1.tar.gz ~/data/aclImdb_v1.tar.gz


In [0]:
ls ~/data


In [0]:
DATA_PATH = Path('data/')
DATA_PATH.mkdir(exist_ok = True)

In [0]:
!tar -xvf ~/data/aclImdb_v1.tar.gz -C {DATA_PATH}

In [0]:
PATH = Path('data/aclImdb/')

# **Standardize format**

In [0]:
# Output directories: CLASS_PATH receives the classifier CSVs and classes.txt,
# LM_PATH receives the language-model CSVs and cached arrays.
CLASS_PATH, LM_PATH = Path('data/imdb_clas/'), Path('data/imdb_lm/')

CLASS_PATH.mkdir(exist_ok=True)
LM_PATH.mkdir(exist_ok=True)

In [0]:
# Sub-directory names; a text's label is the index of its directory in this list.
CLASSES = ['neg', 'pos', 'unsup']

def get_texts(path):
  """Read every review under ``path/<class>/`` for each class in CLASSES.

  Args:
    path: directory (pathlib.Path) containing one sub-directory per class.
      A class whose sub-directory holds no matching files contributes nothing.

  Returns:
    (texts, labels): two numpy arrays of equal length; ``labels[i]`` is the
    index into CLASSES of the directory ``texts[i]`` was read from.
  """
  texts,labels = [],[]
  for idx,label in enumerate(CLASSES):
    # sorted(): glob order depends on the filesystem, so without it the seeded
    # shuffle applied afterwards would not be reproducible across machines.
    for fname in sorted((path/label).glob('*.*')):
      # read_text opens, reads and closes the file (the bare open().read()
      # used before left every handle to the garbage collector).
      texts.append(fname.read_text(encoding = 'utf-8'))
      labels.append(idx)

  return np.array(texts), np.array(labels)

# Load both splits with the path-based get_texts defined in this cell.
# (A later cell rebinds the name get_texts to a DataFrame-based function.)
(trn_texts, trn_labels), (val_texts, val_labels) = (
  get_texts(PATH/split) for split in ('train', 'test')
)

In [0]:
len(trn_texts), len(val_texts)

In [0]:
np.random.seed(42)
trn_idx = np.random.permutation(len(trn_texts))
val_idx = np.random.permutation(len(val_texts))

In [0]:
trn_texts = trn_texts[trn_idx]
val_texts = val_texts[val_idx]

trn_labels = trn_labels[trn_idx]
val_labels = val_labels[val_idx]

In [0]:
col_names = ['labels','text']

In [0]:
df_trn = pd.DataFrame({'text':trn_texts,'labels':trn_labels}, columns = col_names)
df_val = pd.DataFrame({'text':val_texts,'labels':val_labels}, columns = col_names)

In [0]:
for o in CLASSES:
  print (o)


In [0]:
df_trn[df_trn['labels'] != 2].to_csv(CLASS_PATH/'train.csv', header = False, index = 'False')
df_val.to_csv(CLASS_PATH/'test.csv', header = 'False', index = 'False')

(CLASS_PATH/'classes.txt').open('w', encoding = 'utf-8').writelines(f'{o}\n' for o in CLASSES)

In [0]:
# Print the classes file written above ({CLASS_PATH} is interpolated by IPython).
! cat {CLASS_PATH}/'classes.txt'

In [0]:
from sklearn import model_selection

In [0]:
trn_texts, val_texts = model_selection.train_test_split(np.concatenate([trn_texts,val_texts]), test_size = 0.1)

In [0]:
len(trn_texts, val_texts)

In [0]:
# The dict key must be 'text' to match col_names (['labels','text']); with the
# key 'texts' the selected 'text' column would be entirely NaN. The column list
# is col_names - 'column_names' was never defined.
# The label column is a constant 0 placeholder.
df_trn = pd.DataFrame({'text':trn_texts, 'labels':[0]*len(trn_texts)}, columns = col_names)
df_val = pd.DataFrame({'text':val_texts, 'labels':[0]*len(val_texts)}, columns = col_names)

df_trn.to_csv(LM_PATH/'train.csv',header = False, index = False)
df_val.to_csv(LM_PATH/'test.csv', header = False, index = False)

# **Language model tokens**

In [0]:
# Rows per chunk when the LM CSVs are read back with pd.read_csv(..., chunksize=chunksize).
chunksize=24000

In [0]:
# Matches runs of two or more spaces.
re1 = re.compile(r'  +')

def fixup(x):
    """Clean up one raw text string.

    Applies a fixed sequence of literal substitutions (order matters: each
    one sees the output of the previous), then HTML-unescapes the result and
    collapses every run of two or more spaces into a single space.
    """
    substitutions = (
        ('#39;', "'"),
        ('amp;', '&'),
        ('#146;', "'"),
        ('nbsp;', ' '),
        ('#36;', '$'),
        ('\\n', "\n"),
        ('quot;', "'"),
        ('<br />', "\n"),
        ('\\"', '"'),
        ('<unk>', 'u_n'),
        (' @.@ ', '.'),
        (' @-@ ', '-'),
        ('\\', ' \\ '),
    )
    for old, new in substitutions:
        x = x.replace(old, new)
    unescaped = html.unescape(x)
    return re1.sub(' ', unescaped)

In [0]:
# Marker tokens placed in front of each document / field. Nothing in the
# visible notebook assigns them, so they are defined here rather than relying
# on the star import. NOTE(review): 'xbos'/'xfld' follow the fastai 0.7 IMDB
# example - confirm they match any downstream classifier preprocessing.
BOS = 'xbos'  # beginning-of-document tag
FLD = 'xfld'  # data-field tag

def get_texts(df, n_lbls=1):
    """Tokenize one DataFrame chunk read with ``header=None``.

    NOTE: this rebinds ``get_texts``; the earlier path-based loader of the
    same name is no longer reachable after this cell runs.

    Args:
        df: DataFrame with integer column names; the first ``n_lbls`` columns
            are labels, every remaining column is a text field.
        n_lbls: number of leading label columns.

    Returns:
        (tok, labels): the tokenizer output for the chunk, and a list with one
        int64 label row per document.
    """
    labels = df.iloc[:,range(n_lbls)].values.astype(np.int64)
    texts = f'\n{BOS} {FLD} 1 ' + df[n_lbls].astype(str)
    # Additional text columns are numbered 2, 3, ... (the previous i-n_lbls
    # tagged the second field as 1 again, colliding with the first field).
    for i in range(n_lbls+1, len(df.columns)): texts += f' {FLD} {i-n_lbls+1} ' + df[i].astype(str)
    texts = list(texts.apply(fixup).values)

    # Tokenization is delegated to the fastai tokenizer, fed the texts
    # partitioned by partition_by_cores.
    tok = Tokenizer().proc_all_mp(partition_by_cores(texts))
    return tok, list(labels)

In [0]:
def get_all(df, n_lbls):
    """Tokenize every chunk yielded by ``df`` and concatenate the results.

    Args:
        df: iterable of DataFrame chunks (e.g. a chunked ``pd.read_csv`` reader).
        n_lbls: number of leading label columns, forwarded to ``get_texts``.

    Returns:
        (tok, labels): flat lists accumulated over all chunks, in chunk order.
    """
    all_tokens, all_labels = [], []
    for chunk_no, chunk in enumerate(df):
        print(chunk_no)  # progress: index of the chunk being processed
        chunk_tokens, chunk_labels = get_texts(chunk, n_lbls)
        all_tokens.extend(chunk_tokens)
        all_labels.extend(chunk_labels)
    return all_tokens, all_labels

In [0]:
df_trn = pd.read_csv(LM_PATH/'train.csv', header=None, chunksize=chunksize)
df_val = pd.read_csv(LM_PATH/'test.csv', header=None, chunksize=chunksize)

In [0]:
tok_trn, trn_labels = get_all(df_trn, 1)
tok_val, val_labels = get_all(df_val, 1)

In [0]:
(LM_PATH/'tmp').mkdir(exist_ok=True)

In [0]:
np.save(LM_PATH/'tmp'/'tok_trn.npy', tok_trn)
np.save(LM_PATH/'tmp'/'tok_val.npy', tok_val)

In [0]:
tok_trn = np.load(LM_PATH/'tmp'/'tok_trn.npy')
tok_val = np.load(LM_PATH/'tmp'/'tok_val.npy')

In [0]:
freq = Counter(p for o in tok_trn for p in o)
freq.most_common(25)

In [0]:
max_vocab = 60000
min_freq = 2

In [0]:
itos = [o for o,c in freq.most_common(max_vocab) if c>min_freq]
itos.insert(0, '_pad_')
itos.insert(0, '_unk_')

In [0]:
stoi = collections.defaultdict(lambda:0, {v:k for k,v in enumerate(itos)})
len(itos)

In [0]:
trn_lm = np.array([[stoi[o] for o in p] for p in tok_trn])
val_lm = np.array([[stoi[o] for o in p] for p in tok_val])

In [0]:
np.save(LM_PATH/'tmp'/'trn_ids.npy', trn_lm)
np.save(LM_PATH/'tmp'/'val_ids.npy', val_lm)
pickle.dump(itos, open(LM_PATH/'tmp'/'itos.pkl', 'wb'))

In [0]:
trn_lm = np.load(LM_PATH/'tmp'/'trn_ids.npy')
val_lm = np.load(LM_PATH/'tmp'/'val_ids.npy')
itos = pickle.load(open(LM_PATH/'tmp'/'itos.pkl', 'rb'))

In [0]:
trn_lm = np.load(LM_PATH/'tmp'/'trn_ids.npy')
val_lm = np.load(LM_PATH/'tmp'/'val_ids.npy')
itos = pickle.load(open(LM_PATH/'tmp'/'itos.pkl', 'rb'))

# **wikitext103 conversion**

In [0]:
# ! wget -nH -r -np -P {PATH} http://files.fast.ai/models/wt103/

In [0]:
em_sz,nh,nl = 400,1150,3

In [0]:
PRE_PATH = PATH/'models'/'wt103'
PRE_LM_PATH = PRE_PATH/'fwd_wt103.h5'

In [0]:
wgts = torch.load(PRE_LM_PATH, map_location=lambda storage, loc: storage)

In [0]:
enc_wgts = to_np(wgts['0.encoder.weight'])
row_m = enc_wgts.mean(0)

In [0]:
itos2 = pickle.load((PRE_PATH/'itos_wt103.pkl').open('rb'))
stoi2 = collections.defaultdict(lambda:-1, {v:k for k,v in enumerate(itos2)})

In [0]:
# Build the embedding matrix for the IMDB vocabulary: one row per entry of
# itos, so vs must equal len(itos).
new_w = np.zeros((vs, em_sz), dtype=np.float32)
for i,w in enumerate(itos):
    r = stoi2[w]
    if r < 0:
        # Token unknown to the pretrained vocabulary: use the mean embedding.
        new_w[i] = row_m
    else:
        new_w[i] = enc_wgts[r]

In [0]:
# Overwrite the three weight entries with the remapped matrix. The first one
# wraps new_w directly; the other two each get their own copy.
wgts.update({
    '0.encoder.weight': T(new_w),
    '0.encoder_with_dropout.embed.weight': T(np.copy(new_w)),
    '1.decoder.weight': T(np.copy(new_w)),
})

# Language model

In [0]:
# wd is passed to learner.fit as wds; bptt and bs go to the data loaders.
wd, bptt, bs = 1e-7, 70, 52
# Optimizer factory: Adam with betas (0.8, 0.99).
opt_fn = partial(optim.Adam, betas=(0.8, 0.99))

In [0]:
# Each split is flattened into one long id stream before being handed to the
# loader (train built first, then validation).
trn_dl, val_dl = (
    LanguageModelLoader(np.concatenate(ids), bs, bptt) for ids in (trn_lm, val_lm)
)
# Second positional argument is 1 - presumably the pad index ('_pad_' is
# itos[1]); vs is the vocabulary size. TODO confirm against LanguageModelData.
md = LanguageModelData(PATH, 1, vs, trn_dl, val_dl, bs=bs, bptt=bptt)

In [0]:
drops = np.array([0.25, 0.1, 0.2, 0.02, 0.15])*0.7

In [0]:
learner= md.get_model(opt_fn, em_sz, nh, nl, 
    dropouti=drops[0], dropout=drops[1], wdrop=drops[2], dropoute=drops[3], dropouth=drops[4])

learner.metrics = [accuracy]
learner.freeze_to(-1)

In [0]:
lr=1e-3
lrs = lr

In [0]:
learner.fit(lrs/2, 1, wds=wd, use_clr=(32,2), cycle_len=1)

In [0]:
# Checkpoint the learner under the name 'lm_last_ft'.
learner.save('lm_last_ft')

In [0]:
# Restore that checkpoint (allows resuming from here without re-running the fit above).
learner.load('lm_last_ft')

In [0]:
# Presumably makes all layers trainable again after freeze_to(-1) - confirm against the fastai docs.
learner.unfreeze()

In [0]:
learner.lr_find(start_lr=lrs/10, end_lr=lrs*10, linear=True)

In [0]:
learner.sched.plot()

In [0]:
learner.fit(lrs, 1, wds=wd, use_clr=(20,10), cycle_len=15)

In [0]:
# Save the fine-tuned language model under the name 'lm1'.
learner.save('lm1')

In [0]:
# Save the encoder separately as 'lm1_enc' (presumably for reuse by a downstream classifier - confirm).
learner.save_encoder('lm1_enc')

In [0]:
# Plot the loss recorded by the scheduler.
learner.sched.plot_loss()