# imports

In [1]:
import torch
import numpy as np
import pandas as pd
from tqdm import tqdm
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
#import fastText
#from fastText import load_model
import gc
import re
tqdm.pandas()
from gensim.models import KeyedVectors
from fastprogress import master_bar, progress_bar
from pathlib import Path
from collections import defaultdict
from collections import Iterable
import math
from torch import LongTensor,Tensor
from torch.nn import CrossEntropyLoss
#from fastai import *
import warnings
from typing import *
from dataclasses import dataclass,field
from torch import optim
import os
from abc import abstractmethod
from functools import partial
import torch.nn.functional as F

In [2]:
Model = nn.Module
Floats = Union[float, Collection[float]]
ArgStar = Collection[Any]
Tensors = Union[Tensor, Collection['Tensors']]
Rank0Tensor = NewType('OneEltTensor', Tensor)
PathOrStr = Union[Path,str]
TItem = TypeVar('TItem')
TfmCallable = Callable[[TItem],TItem]
TfmList = Union[TfmCallable, Collection[TfmCallable]]
ModuleList = Collection[nn.Module]
SplitFuncOrIdxList = Union[Callable, Collection[ModuleList]]
class ItemBase():
    "All transformable dataset items use this type"
    @property
    @abstractmethod
    def device(self): pass
    @property
    @abstractmethod
    def data(self): pass

ItemsList = Collection[Union[Tensor,ItemBase,'ItemsList',float,int]]
def num_cpus()->int:
    "Get number of cpus"
    try:                   return len(os.sched_getaffinity(0))
    except AttributeError: return os.cpu_count()
default_cpus = min(16, num_cpus())
StartOptEnd=Union[float,Tuple[float,float]]
bn_types = (nn.BatchNorm1d, nn.BatchNorm2d, nn.BatchNorm3d)
ModuleList = Collection[nn.Module]
ParamList = Collection[nn.Parameter]
def is_tuple(x:Any)->bool: return isinstance(x, tuple)

# prep data

In [3]:
df = pd.read_csv('train.csv')

In [4]:
pd.set_option('display.max_colwidth', -1)

In [5]:
df.head(2)

Unnamed: 0,event_id,game_session,timestamp,event_data,installation_id,event_count,event_code,game_time,title,type,world
0,27253bdc,45bb1e1b6b50c07b,2019-09-06T17:53:46.937Z,"{""event_code"": 2000, ""event_count"": 1}",0001e90f,1,2000,0,Welcome to Lost Lagoon!,Clip,NONE
1,27253bdc,17eeb7f223665f53,2019-09-06T17:54:17.519Z,"{""event_code"": 2000, ""event_count"": 1}",0001e90f,1,2000,0,Magma Peak - Level 1,Clip,MAGMAPEAK


In [6]:
df['world'].unique()

array(['NONE', 'MAGMAPEAK', 'TREETOPCITY', 'CRYSTALCAVES'], dtype=object)

In [7]:
df_ttop_cty = df.query('world== "TREETOPCITY"')

In [8]:
df_cryst_rulz = df_ttop_cty[df_ttop_cty['title'] == 'Crystals Rule']

In [9]:
vocab = df_cryst_rulz.event_code.unique()
vocab.sort()

In [10]:
len(vocab)

15

In [11]:
vocab

array([2000, 2010, 2020, 2030, 3010, 3020, 3021, 3110, 3120, 3121, 4010,
       4020, 4050, 4070, 4090])

In [12]:
vocab_ = df.event_code.unique()

In [13]:
len(vocab_)

42

In [14]:
vocab_

array([2000, 3010, 3110, 4070, 4090, 4030, 4035, 4021, 4020, 4010, 2080,
       2083, 2040, 2020, 2030, 3021, 3121, 2050, 3020, 3120, 2060, 2070,
       4031, 4025, 5000, 5010, 2081, 2025, 4022, 2035, 4040, 4100, 2010,
       4110, 4045, 4095, 4220, 2075, 4230, 4235, 4080, 4050])

In [15]:
grps = df_cryst_rulz.groupby("game_session")
len(grps)

3145

In [16]:
codes_by_session = defaultdict(lambda : [])
for name,grp in grps:
    grp_ = grp.sort_values(by='event_count').reset_index()
    codes_by_session[name] = grp_['event_code'].values

In [17]:
len(codes_by_session)

3145

In [18]:
data = []
for key,value in codes_by_session.items():
    dat = np.array([9998])
    dat = np.concatenate((dat,value))
    dat = np.append(dat,9999)
    data.append(dat)

In [19]:
len(data)

3145

In [20]:
vocab2index = defaultdict(lambda : -1)

In [21]:
for i,code in enumerate(vocab):
    vocab2index[code] = i

In [22]:
vocab2index[9998] = 15 # start of events id
vocab2index[9999] = 16 # end of event ids

In [23]:
index2vocab = { i:v for v,i in vocab2index.items()}

###### lens = []
for arr in data:
    lens.append(len(arr))

###### min(lens)

###### data_pad = []
for arr in data:
    len_ = len(arr)
    if len_<2376:
        append_sz = 2376-len_
        z = np.zeros(append_sz)
        zz = np.append(arr,z)
        data_pad.append(zz)
    elif len_==2376:
        data_pad.append(arr)
    else:
        print("error")

In [24]:
len(data)

3145

In [26]:
data[0]

array([9998, 2000, 4010, 3010, 3110, 2020, 3010, 3110, 4050, 4070, 4020,
       3010, 3110, 3010, 3110, 3010, 3110, 3021, 3121, 2030, 2020, 3010,
       3110, 4020, 3010, 3110, 3010, 3110, 3010, 3110, 3020, 4070, 3120,
       4020, 3010, 3110, 3010, 3110, 3021, 3121, 2030, 2020, 3010, 3110,
       4020, 3010, 3110, 3010, 3110, 3021, 3121, 2030, 2020, 3010, 3110,
       4020, 3010, 3110, 3010, 3110, 3010, 3110, 3021, 3121, 2030, 2020,
       3010, 4070, 3110, 4020, 3010, 3110, 3010, 3110, 3010, 3110, 3021,
       3121, 2030, 2020, 3010, 3110, 4020, 3010, 3110, 3010, 3110, 3010,
       3110, 3010, 3110, 3020, 4070, 4070, 4070, 4070, 3120, 4020, 3010,
       4070, 3110, 3010, 3110, 3010, 3110, 3021, 3121, 2030, 2020, 3010,
       3110, 4020, 3010, 3110, 3010, 3110, 3010, 3110, 3010, 3110, 3020,
       4070, 3120, 4020, 3010, 3110, 3010, 3110, 3010, 3110, 3010, 3110,
       3010, 3110, 3021, 3121, 2030, 2020, 3010, 4070, 3110, 4020, 3010,
       4070, 3110, 3010, 3110, 3010, 3110, 3010, 31

In [27]:
#vocab2index[0] = 0
#index2vocab[0] = 0

In [28]:
data_ind = []
for i in range(len(data)):
    dat = []
    for val in data[i]:   
        dat.append(vocab2index[int(val)])
    data_ind.append(dat)

In [29]:
len(data),len(data_ind)

(3145, 3145)

In [30]:
rnd_ind = np.random.permutation(range(len(data)))

In [31]:
cut = int(0.75*len(data))
rnd_ind_trn = rnd_ind[:cut]
rnd_ind_tst = rnd_ind[cut:]

In [32]:
data_ind = np.array(data_ind)

In [33]:
data_trn = data_ind[rnd_ind_trn]
data_tst = data_ind[rnd_ind_tst]
len(data_trn),len(data_tst)

(2358, 787)

In [34]:
data_trn_ = np.concatenate(data_trn)
data_tst_ = np.concatenate(data_tst)

In [35]:
len(data_trn_),len(data_tst_)

(343425, 116717)

# code

## data

In [36]:
class LanguageModelLoader():
    "Creates a dataloader with bptt slightly changing."
    
    def __init__(self, nums:np.ndarray, bs:int=64, bptt:int=70, backwards:bool=False):
        self.bs,self.bptt,self.backwards = bs,bptt,backwards
        self.data = self.batchify(nums)
        self.first,self.i,self.iter = True,0,0
        self.n = len(self.data)

    def __iter__(self):
        self.i,self.iter = 0,0
        while self.i < self.n-1 and self.iter<len(self):
            if self.first and self.i == 0: self.first,seq_len = False,self.bptt + 25
            else:
                bptt = self.bptt if np.random.random() < 0.95 else self.bptt / 2.
                seq_len = max(5, int(np.random.normal(bptt, 5)))
            res = self.get_batch(self.i, seq_len)
            self.i += seq_len
            self.iter += 1
            yield res

    def __len__(self) -> int: return (self.n-1) // self.bptt

    def batchify(self, data:np.ndarray) -> LongTensor:
        "Splits the data in batches."
        print(data.shape)
        nb = data.shape[0] // self.bs
        data = np.array(data[:nb*self.bs]).reshape(self.bs, -1).T
        print(data.shape)
        if self.backwards: data=data[::-1]
        return LongTensor(data)

    def get_batch(self, i:int, seq_len:int) -> LongTensor:
        "Gets a batch of length `seq_len`"
        seq_len = min(seq_len, len(self.data) - 1 - i)
        return self.data[i:i+seq_len], self.data[i+1:i+1+seq_len].contiguous().view(-1)

In [37]:
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        embed_size = 50
        lstm_hidden_size = 120
        gru_hidden_size = 60
        #self.gru_hidden_size = gru_hidden_size
        self.embedding = nn.Embedding(17, embed_size)
        #self.embedding.weight = nn.Parameter(torch.tensor(embedding_matrix, dtype=torch.float32))
        self.embedding.weight.requires_grad = False
        #self.embedding_dropout = nn.Dropout2d(0.1)
        self.lstm = nn.LSTM(embed_size, lstm_hidden_size, bidirectional=False, batch_first=False)
        #self.gru = nn.GRU(lstm_hidden_size*2, gru_hidden_size, bidirectional=True, batch_first=True)
        self.linear = nn.Linear(lstm_hidden_size, 60)
        self.relu = nn.ReLU()
        #self.dropout = nn.Dropout(0.1)
        self.out = nn.Linear(60, 17)
    def forward(self, x):
        h_embedding = self.embedding(x)
        #h_embedding = torch.unsqueeze(h_embedding.transpose(1, 2), 2)
        #h_embedding = torch.squeeze(self.embedding_dropout(h_embedding)).transpose(1, 2)
        h_lstm, _ = self.lstm(h_embedding)
        #h_gru, hh_gru = self.gru(h_lstm)
        #hh_gru = hh_gru.view(-1, self.gru_hidden_size*2)
        #avg_pool = torch.mean(h_gru, 1)
        #max_pool, _ = torch.max(h_gru, 1)
        #conc = torch.cat((hh_gru, avg_pool, max_pool), 1)
        conc = self.relu(self.linear(h_lstm))
        #conc = self.dropout(conc)
        out = self.out(conc)
        return out

In [38]:
default_device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [39]:
class Callback():
    _order = 0
    def set_runner(self, run): self.run = run
    def __getattr__(self, k): return getattr(self.run, k)
    @property
    def name(self):
        name = re.sub(r'Callback$', '', self.__class__.__name__)
        return camel2snake(name or 'callback')
    def __call__(self, cb_name):
        f = getattr(self, cb_name, None)
        if f and f(): return True
        return False
class TrainEvalCallback(Callback):
    def begin_fit(self):
        self.run.n_epochs = 0
        self.run.n_iter = 0
    def after_batch(self):
        if not self.in_train: return
        self.run.n_epochs+=1./self.iters
        self.run.n_iter+=1
    def begin_epoch(self):
        self.run.n_epochs = self.epoch
        self.model.train()
        self.run.in_train = True
    def begin_validate(self):
        self.model.eval()
        self.run.in_train = False

class CancelTrainException(Exception): pass
class CancelEpochException(Exception): pass
class CancelBatchException(Exception): pass

ListOrItem = Union[Collection[Any],int,float,str]
OptListOrItem = Optional[ListOrItem]
def listify(p:OptListOrItem=None, q:OptListOrItem=None):
    "Makes `p` same length as `q`"
    if p is None: p=[]
    elif not isinstance(p, Iterable): p=[p]
    n = q if type(q)==int else len(p) if q is None else len(q)
    if len(p)==1: p = p * n
    assert len(p)==n, f'List len mismatch ({len(p)} vs {n})'
    return list(p)
class Runner():
    def __init__(self, cbs=None, cb_funcs=None):
        cbs = listify(cbs)
        for cbf in listify(cb_funcs):
            cb = cbf()
            setattr(self, cb.name, cb)
            cbs.append(cb)
        self.stop, self.cbs = False, [TrainEvalCallback()] + cbs
    @property
    def opt(self): return self.learn.opt
    @property
    def model(self): return self.learn.model
    @property
    def loss_func(self): return self.learn.loss_func
    @property
    def data(self): return self.learn.data
    
    def one_batch(self, xb, yb):
        try: 
            self.xb, self.yb = xb, yb
            self('begin_batch')
            self.pred = self.model(self.xb)
            self('after_pred')
            self.loss = self.loss_func(self.pred, self.yb)
            self('after_loss')
            if not self.in_train: return
            self.loss.backward()
            self('after_backward')
            self.opt.step()
            self('after_step')
            self.opt.zero_grad()
        except CancelBatchException: self('after_cancel_batch')
        finally: self('after_batch')
    
    def all_batches(self, dl):
        self.iters = len(dl)
        try:
            for xb, yb in progress_bar(dl, leave=False): self.one_batch(xb, yb)
        except CancelEpochException: self('after_cancel_epoch')
    def fit(self, epochs, learn):
        self.epochs, self.learn, self.loss = epochs, learn, torch.tensor(0.)
        try: 
            for cb in self.cbs: cb.set_runner(self)
            self('begin_fit')
            for epoch in range(epochs):
                self.epoch = epoch
                if not self('begin_epoch'): self.all_batches(self.data.train_dl)
                with torch.no_grad():
                    if not self('begin_validate'): self.all_batches(self.data.valid_dl)
                self('after_epoch')
        except CancelTrainException: self('after_cancel_train')
        finally:
            self('after_fit')
            self.learn = None
    def __call__(self, cb_name):
        res = False
        for cb in sorted(self.cbs, key=lambda x: x._order): res = cb(cb_name) and res
        return res
#class Learner():
#    def __init__(self, model, opt, loss_func, data):
#        self.model, self.opt, self.loss_func, self.data = model, opt, loss_func, data
def get_model(data, lr=0.005):
    # ravi error
    model = Net(embedding_matrix).to(device)
    return model, torch.optim.Adam(model.parameters(), lr)
#class DataBunch():
#    def __init__(self, train_dl, valid_dl):
#        self.train_dl, self.valid_dl = train_dl, valid_dl
#    @property
#    def train_ds(self): return self.train_dl.dataset
#    
#    @property
#    def valid_ds(self): return self.valid_dl.dataset
def data_collate(batch:ItemsList)->Tensor:
    "Convert `batch` items to tensor data"
    return torch.utils.data.dataloader.default_collate(to_data(batch))

@dataclass
class DeviceDataLoader():
    "Binds a `DataLoader` to a `torch.device`"
    dl: DataLoader
    device: torch.device
    tfms: List[Callable]=None
    collate_fn: Callable=data_collate
    def __post_init__(self):
        self.dl.collate_fn=self.collate_fn # are we using this?
        self.tfms = listify(self.tfms)

    def __len__(self)->int: return len(self.dl)
    def __getattr__(self,k:str)->Any: return getattr(self.dl, k)

    def add_tfm(self,tfm:Callable)->None:    self.tfms.append(tfm)
    def remove_tfm(self,tfm:Callable)->None: self.tfms.remove(tfm)

    def proc_batch(self,b:Tensor)->Tensor:
        "Proces batch `b` of `TensorImage`"
        b = to_device(b, self.device)
        for f in listify(self.tfms): b = f(b)
        return b

    def __iter__(self):
        "Process and returns items from `DataLoader`"
        self.gen = map(self.proc_batch, self.dl)
        return iter(self.gen)

    @classmethod
    def create(cls, dataset:Dataset, bs:int=1, shuffle:bool=False, device:torch.device=default_device,
               tfms:TfmList=tfms, num_workers:int=default_cpus, collate_fn:Callable=data_collate, **kwargs:Any):
        "Create DeviceDataLoader from `dataset` with `batch_size` and `shuffle`: processs using `num_workers`"
        return cls(DataLoader(dataset, batch_size=bs, shuffle=shuffle, num_workers=num_workers, **kwargs),
                   device=device, tfms=tfms, collate_fn=collate_fn)

class DataBunch():
    "Bind `train_dl`,`valid_dl` and`test_dl` to `device`. tfms are DL tfms (normalize). `path` is for models."
    def __init__(self, train_dl:DataLoader, valid_dl:DataLoader, test_dl:Optional[DataLoader]=None,
                 device:torch.device=None, tfms:Optional[Collection[Callable]]=None, path:PathOrStr='.'):
        "Bind `train_dl`,`valid_dl` and`test_dl` to `device`. tfms are DL tfms (normalize). `path` is for models."
        self.device = default_device if device is None else device
        self.train_dl = DeviceDataLoader(train_dl, self.device, tfms=tfms)
        self.valid_dl = DeviceDataLoader(valid_dl, self.device, tfms=tfms)
        self.test_dl  = DeviceDataLoader(test_dl,  self.device, tfms=tfms) if test_dl else None
        self.path = Path(path)

    @classmethod
    def create(cls, train_ds, valid_ds, test_ds=None,
               path='.', bs=64, ds_tfms=None, num_workers=default_cpus,
               tfms=None, device=None, size=None, **kwargs)->'DataBunch':
        "`DataBunch` factory. `bs` batch size, `ds_tfms` for `Dataset`, `tfms` for `DataLoader`"
        datasets = [train_ds,valid_ds]
        if test_ds is not None: datasets.append(test_ds)
        if ds_tfms: datasets = transform_datasets(*datasets, tfms=ds_tfms, size=size, **kwargs)
        dls = [DataLoader(*o, num_workers=num_workers) for o in
               zip(datasets, (bs,bs*2,bs*2), (True,False,False))]
        return cls(*dls, path=path, device=device, tfms=tfms)

    def __getattr__(self,k)->Any: return getattr(self.train_ds, k)
    def holdout(self, is_test:bool=False)->DeviceDataLoader:
        "Returns correct holdout `Dataset` for test vs validation (`is_test`)"
        return self.test_dl if is_test else self.valid_dl

    @property
    def train_ds(self)->Dataset: return self.train_dl.dl.dataset
    @property
    def valid_ds(self)->Dataset: return self.valid_dl.dl.dataset
    
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

class AvgStats():
    def __init__(self, metrics, in_train): self.metrics, self.in_train = listify(metrics), in_train
    def reset(self):
        self.tot_loss, self.count = 0., 0
        self.tot_mets = [0.]*len(self.metrics)
    @property
    def all_stats(self): return [self.tot_loss.item()] + self.tot_mets
    @property
    def avg_stats(self): return [o/self.count for o in self.all_stats]
    
    def __repr__(self):
        if not self.count: return ''
        return f"{'train' if self.in_train else 'valid'}: {self.avg_stats}"
    def accumulate(self, run):
        bn = run.xb.shape[0]
        self.tot_loss+=run.loss*bn
        self.count+=bn
        for i, m in enumerate(self.metrics):
            self.tot_mets[i]+=m(run.pred, run.yb)*bn
class AvgStatsCallBack(Callback):
    def __init__(self, metrics):
        self.train_stats, self.valid_stats = AvgStats(metrics, True), AvgStats(metrics, False)
    def begin_epoch(self):
        self.train_stats.reset()
        self.valid_stats.reset()
    def after_loss(self):
        stats = self.train_stats if self.in_train else self.valid_stats
        with torch.no_grad(): stats.accumulate(self.run)
    def after_epoch(self):
        print(self.train_stats)
        print(self.valid_stats)
class Recorder(Callback):
    def begin_fit(self):
        self.lrs = [[] for _ in self.opt.param_groups]
        self.losses = []
    def after_batch(self):
        if not self.in_train: return
        for pg, lr in zip(self.opt.param_groups, self.lrs): lr.append(pg['lr'])
        self.losses.append(self.loss.detach().cpu())
    
    def plot_lr(self, pgid=-1): plt.plot(self.lrs[pgid])
    def plot_loss(self, skip_last=0): plt.plot(self.losse[:len(self.losses)-skip_last])
    def plot(self, skip_last=0, pgid=-1):
        losses = [o.item() for o in self.losses]
        lrs = self.lrs[pgid]
        n = len(losses)-skip_last
        plt.xscale('log')
        plt.plot(lrs[:n], losses[:n])
        
        
def roc(out, y):
    score = roc_auc_score(y.cpu().detach().numpy(), out.cpu().detach().numpy())
    return score
class ParamScheduler(Callback):
    _order = 1
    def __init__(self, pname, sched_funcs): self.pname, self.sched_funcs = pname, sched_funcs
    def begin_fit(self):
        if not isinstance(self.sched_funcs, (list, tuple)):
            self.sched_funcs = [self.sched_funcs]*len(self.opt.param_groups)
    def set_param(self):
        assert len(self.opt.param_groups)==len(self.sched_funcs)
        for pg, f in zip(self.opt.param_groups, self.sched_funcs):
            pg[self.pname] = f(self.n_epochs/self.epochs)
    def begin_batch(self):
        if self.in_train: self.set_param()
class LR_Find(Callback):
    _order = 1
    def __init__(self, max_iter=100, min_lr=1e-6, max_lr=10):
        self.max_iter, self.min_lr, self.max_lr = max_iter, min_lr, max_lr
        self.best_loss = 1e9
    def begin_batch(self):
        if not self.in_train: return
        pos = self.n_iter/self.max_iter
        lr = self.min_lr*(self.max_lr/self.min_lr)**pos
        for pg in self.opt.param_groups: pg['lr'] = lr
    def after_step(self):
        if self.n_iter>=self.max_iter or self.loss>self.best_loss*10:
            raise CancelTrainException()
        if self.loss<self.best_loss: self.best_loss = self.loss

In [40]:
def annealer(f):
    def _inner(start, end): return partial(f, start, end)
    return _inner
@annealer
def sched_lin(start, end, pos): return start + pos*(end-start)
@annealer
def sched_cos(start, end, pos): return start + (1+math.cos(math.pi*(1-pos)))*(end-start)/2

@annealer
def sched_no(start, end, pos): return start

@annealer
def sched_exp(start, end, pos): return start*(end/start)**pos

def combine_scheds(pcts, scheds):
    assert sum(pcts)==1.
    pcts = torch.tensor([0] + listify(pcts))
    assert torch.all(pcts>=0)
    pcts = torch.cumsum(pcts, 0)
    def _inner(pos):
        idx = (pos>=pcts).nonzero().max()
        actual_pos = (pos-pcts[idx])/(pcts[idx+1]-pcts[idx])
        return scheds[idx](actual_pos)
    return _inner

## model

In [39]:
flatten_model=lambda l: sum(map(flatten_model,l.children()),[]) if num_children(l) else [l]
def children(m:nn.Module)->ModuleList:
    "Get children of module"
    return list(m.children())
def num_children(m:nn.Module)->int:
    "Get number of child modules in module"
    return len(children(m))
def ifnone(a:bool,b:Any):
    "`a` if its not None, otherwise `b`"
    return b if a is None else a
def is_listy(x:Any)->bool: return isinstance(x, (tuple,list))
def to_device(b:Tensors, device:torch.device):
    "Ensure `b` is on `device`"
    device = ifnone(device, default_device)
    if is_listy(b): return [to_device(o, device) for o in b]
    return b.to(device)


AdamW = partial(optim.Adam, betas=(0.9,0.99)) 
default_lr = slice(3e-3)
default_wd = 1e-2
@dataclass
class Learner():
    "Object that wraps together some data, a model, a loss function and an optimizer"
    data:DataBunch
    model:nn.Module
    opt_fn:Callable=AdamW
    loss_fn:Callable=F.cross_entropy
    metrics:Collection[Callable]=None
    true_wd:bool=True
    bn_wd:bool=True
    wd:Floats=default_wd
    train_bn:bool=True
    path:str = None
    model_dir:str = 'models'
    callback_fns:Collection[Callable]=None
    callbacks:Collection[Callback]=field(default_factory=list)
    layer_groups:Collection[nn.Module]=None
    def __post_init__(self)->None:
        "Setup path,metrics, callbacks and ensure model directory exists"
        self.path = Path(ifnone(self.path, self.data.path))
        (self.path/self.model_dir).mkdir(parents=True, exist_ok=True)
        self.model = self.model.to(self.data.device)
        self.metrics=listify(self.metrics)
        if not self.layer_groups: self.layer_groups = [nn.Sequential(*flatten_model(self.model))]
        self.callbacks = listify(self.callbacks)
        self.callback_fns = [Recorder] + listify(self.callback_fns)

    def lr_range(self, lr:Union[float,slice])->np.ndarray:
        "Build learning rate schedule"
        if not isinstance(lr,slice): return lr
        if lr.start: res = even_mults(lr.start, lr.stop, len(self.layer_groups))
        else: res = [lr.stop/3]*(len(self.layer_groups)-1) + [lr.stop]
        return np.array(res)

    def fit(self, epochs:int, lr:Union[Floats,slice]=default_lr,
            wd:Floats=None, callbacks:Collection[Callback]=None)->None:
        "fit the model on this learner with `lr` learning rate, `wd` weight decay for `epochs` with `callbacks`"
        lr = self.lr_range(lr)
        if wd is None: wd = self.wd
        self.create_opt(lr, wd)
        callbacks = [cb(self) for cb in self.callback_fns] + listify(callbacks)
        fit(epochs, self.model, self.loss_fn, opt=self.opt, data=self.data, metrics=self.metrics,
            callbacks=self.callbacks+callbacks)

    def create_opt(self, lr:Floats, wd:Floats=0.)->None:
        "create optimizer with `lr` learning rate and `wd` weight decay"
        self.opt = OptimWrapper.create(self.opt_fn, lr, self.layer_groups, wd=wd, true_wd=self.true_wd, bn_wd=self.bn_wd)

    def split(self, split_on:SplitFuncOrIdxList)->None:
        "split the model at `split_on`"
        if isinstance(split_on,Callable): self.layer_groups = split_on(self.model)
        else: self.layer_groups = split_model(self.model, split_on)

    def freeze_to(self, n:int)->None:
        "freeze layers up to layer `n`"
        for g in self.layer_groups[:n]:
            for l in g:
                if not self.train_bn or not isinstance(l, bn_types): requires_grad(l, False)
        for g in self.layer_groups[n:]: requires_grad(g, True)

    def freeze(self)->None:
        "freeze up to last layer"
        assert(len(self.layer_groups)>1)
        self.freeze_to(-1)

    def unfreeze(self):
        "unfreeze entire model"
        self.freeze_to(0)
    def __del__(self): del(self.model, self.data)
    def save(self, name:PathOrStr):
        "save model with `name` to `self.model_dir`"
        torch.save(self.model.state_dict(), self.path/self.model_dir/f'{name}.pth')
    def load(self, name:PathOrStr):
        "load model `name` from `self.model_dir"
        self.model.load_state_dict(torch.load(self.path/self.model_dir/f'{name}.pth'))

def fit_one_cycle(learn:Learner, cyc_len:int,
                  max_lr:Union[Floats,slice]=default_lr, moms:Tuple[float,float]=(0.95,0.85),
                  div_factor:float=25., pct_start:float=0.3, wd:float=None, **kwargs)->None:
    "Fits a model following the 1cycle policy"
    max_lr = learn.lr_range(max_lr)
    cbs = [OneCycleScheduler(learn, max_lr, moms=moms, div_factor=div_factor,
                             pct_start=pct_start, **kwargs)]
    learn.fit(cyc_len, max_lr, wd=wd, callbacks=cbs)


def lr_find(learn:Learner, start_lr:float=1e-5, end_lr:float=10, num_it:int=100, **kwargs:Any):
    "Explore lr from `start_lr` to `end_lr` over `num_it` iterations of `learn`"
    cb = LRFinder(learn, start_lr, end_lr, num_it)
    a = int(np.ceil(num_it/len(learn.data.train_dl)))
    learn.fit(a, start_lr, callbacks=[cb], **kwargs)
    
Learner.fit_one_cycle = fit_one_cycle
Learner.lr_find = lr_find
        
def dropout_mask(x:Tensor, sz:Collection[int], p:float):
    "Returns a dropout mask of the same type as x, size sz, with probability p to cancel an element."
    return x.new(*sz).bernoulli_(1-p).div_(1-p)

class RNNDropout(nn.Module):
    "Dropout that is consistent on the seq_len dimension"
    
    def __init__(self, p:float=0.5):
        super().__init__()
        self.p=p

    def forward(self, x:Tensor) -> Tensor:
        if not self.training or self.p == 0.: return x
        m = dropout_mask(x.data, (1, x.size(1), x.size(2)), self.p)
        return x * m
    
class WeightDropout(nn.Module):
    "A module that warps another layer in which some weights will be replaced by 0 during training."
    
    def __init__(self, module:Model, weight_p:float, layer_names:Collection[str]=['weight_hh_l0']):
        super().__init__()
        self.module,self.weight_p,self.layer_names = module,weight_p,layer_names
        for layer in self.layer_names:
            #Makes a copy of the weights of the selected layers.
            w = getattr(self.module, layer)
            self.register_parameter(f'{layer}_raw', nn.Parameter(w.data))
    
    def _setweights(self):
        "Applies dropout to the raw weights"
        for layer in self.layer_names:
            raw_w = getattr(self, f'{layer}_raw')
            self.module._parameters[layer] = F.dropout(raw_w, p=self.weight_p, training=self.training)
            
    def forward(self, *args:ArgStar):
        self._setweights()
        with warnings.catch_warnings():
            #To avoid the warning that comes because the weights aren't flattened.
            warnings.simplefilter("ignore")
            return self.module.forward(*args)
    
    def reset(self):
        if hasattr(self.module, 'reset'): self.module.reset()
            
class EmbeddingDropout(nn.Module):
    "Applies dropout in the embedding layer by zeroing out some elements of the embedding vector."
    
    def __init__(self, emb:Model, embed_p:float):
        super().__init__()
        self.emb,self.embed_p = emb,embed_p
        self.pad_idx = self.emb.padding_idx
        if self.pad_idx is None: self.pad_idx = -1

    def forward(self, words:LongTensor, scale:Optional[float]=None) -> Tensor:
        if self.training and self.embed_p != 0:
            size = (self.emb.weight.size(0),1)
            mask = dropout_mask(self.emb.weight.data, size, self.embed_p)
            masked_embed = self.emb.weight * mask
        else: masked_embed = self.emb.weight
        if scale: masked_embed.mul_(scale)
        return F.embedding(words, masked_embed, self.pad_idx, self.emb.max_norm,
                           self.emb.norm_type, self.emb.scale_grad_by_freq, self.emb.sparse)
    
def repackage_var(h:Tensors) -> Tensors:
    "Detaches h from its history."
    return h.detach() if type(h) == torch.Tensor else tuple(repackage_var(v) for v in h)

class RNNCore(nn.Module):
    "AWD-LSTM/QRNN inspired by https://arxiv.org/abs/1708.02182"

    initrange=0.1

    def __init__(self, vocab_sz:int, emb_sz:int, n_hid:int, n_layers:int, pad_token:int, bidir:bool=False,
                 hidden_p:float=0.2, input_p:float=0.6, embed_p:float=0.1, weight_p:float=0.5, qrnn:bool=False):
        
        super().__init__()
        self.bs,self.qrnn,self.ndir = 1, qrnn,(2 if bidir else 1)
        self.emb_sz,self.n_hid,self.n_layers = emb_sz,n_hid,n_layers
        self.encoder = nn.Embedding(vocab_sz, emb_sz, padding_idx=pad_token)
        self.encoder_dp = EmbeddingDropout(self.encoder, embed_p)
        if self.qrnn:
            #Using QRNN requires cupy: https://github.com/cupy/cupy
            from qrnn import QRNNLayer
            self.rnns = [QRNNLayer(emb_sz if l == 0 else n_hid, (n_hid if l != n_layers - 1 else emb_sz)//self.ndir,
                                   save_prev_x=True, zoneout=0, window=2 if l == 0 else 1, output_gate=True, 
                                   use_cuda=torch.cuda.is_available()) for l in range(n_layers)]
            if weight_p != 0.:
                for rnn in self.rnns:
                    rnn.linear = WeightDropout(rnn.linear, weight_p, layer_names=['weight'])
        else:
            self.rnns = [nn.LSTM(emb_sz if l == 0 else n_hid, (n_hid if l != n_layers - 1 else emb_sz)//self.ndir,
                1, bidirectional=bidir) for l in range(n_layers)]
            if weight_p != 0.: self.rnns = [WeightDropout(rnn, weight_p) for rnn in self.rnns]
        self.rnns = torch.nn.ModuleList(self.rnns)
        self.encoder.weight.data.uniform_(-self.initrange, self.initrange)
        self.input_dp = RNNDropout(input_p)
        self.hidden_dps = nn.ModuleList([RNNDropout(hidden_p) for l in range(n_layers)])

    def forward(self, input:LongTensor) -> Tuple[Tensor,Tensor]:
        sl,bs = input.size()
        if bs!=self.bs:
            self.bs=bs
            self.reset()
        raw_output = self.input_dp(self.encoder_dp(input))
        new_hidden,raw_outputs,outputs = [],[],[]
        for l, (rnn,hid_dp) in enumerate(zip(self.rnns, self.hidden_dps)):
            raw_output, new_h = rnn(raw_output, self.hidden[l])
            new_hidden.append(new_h)
            raw_outputs.append(raw_output)
            if l != self.n_layers - 1: raw_output = hid_dp(raw_output)
            outputs.append(raw_output)
        self.hidden = repackage_var(new_hidden)
        return raw_outputs, outputs

    def one_hidden(self, l:int) -> Tensor:
        "Returns one hidden state"
        nh = (self.n_hid if l != self.n_layers - 1 else self.emb_sz)//self.ndir
        return self.weights.new(self.ndir, self.bs, nh).zero_()

    def reset(self):
        "Resets the hidden states"
        [r.reset() for r in self.rnns if hasattr(r, 'reset')]
        self.weights = next(self.parameters()).data
        if self.qrnn: self.hidden = [self.one_hidden(l) for l in range(self.n_layers)]
        else: self.hidden = [(self.one_hidden(l), self.one_hidden(l)) for l in range(self.n_layers)]
            
class LinearDecoder(nn.Module):
    "To go on top of a RNN_Core module"
    
    initrange=0.1
    
    def __init__(self, n_out:int, n_hid:int, output_p:float, tie_encoder:Model=None, bias:bool=True):
        super().__init__()
        self.decoder = nn.Linear(n_hid, n_out, bias=bias)
        self.decoder.weight.data.uniform_(-self.initrange, self.initrange)
        self.output_dp = RNNDropout(output_p)
        if bias: self.decoder.bias.data.zero_()
        if tie_encoder: self.decoder.weight = tie_encoder.weight

    def forward(self, input:Tuple[Tensor,Tensor]) -> Tuple[Tensor,Tensor,Tensor]:
        raw_outputs, outputs = input
        output = self.output_dp(outputs[-1])
        decoded = self.decoder(output.view(output.size(0)*output.size(1), output.size(2)))
        return decoded, raw_outputs, outputs

In [40]:
class SequentialRNN(nn.Sequential):
    "A sequential module that passes the reset call to its children."
    def reset(self):
        for c in self.children():
            if hasattr(c, 'reset'): c.reset()

In [41]:
def get_language_model(vocab_sz:int, emb_sz:int, n_hid:int, n_layers:int, pad_token:int, tie_weights:bool=True, 
                       qrnn:bool=False, bias:bool=True, output_p:float=0.4, hidden_p:float=0.2, input_p:float=0.6, 
                       embed_p:float=0.1, weight_p:float=0.5) -> Model:
    "To create a full AWD-LSTM"
    rnn_enc = RNNCore(vocab_sz, emb_sz, n_hid=n_hid, n_layers=n_layers, pad_token=pad_token, qrnn=qrnn,
                 hidden_p=hidden_p, input_p=input_p, embed_p=embed_p, weight_p=weight_p)
    enc = rnn_enc.encoder if tie_weights else None
    return SequentialRNN(rnn_enc, LinearDecoder(vocab_sz, emb_sz, output_p, tie_encoder=enc, bias=bias))

In [42]:
@dataclass
class GradientClipping(Callback):
    "To do gradient clipping during training."
    learn:Learner
    clip:float

    def on_backward_end(self, **kwargs):
        if self.clip:  nn.utils.clip_grad_norm_(self.learn.model.parameters(), self.clip)
            
@dataclass
class RNNTrainer(Callback):
    "`Callback` that regroups lr adjustment to seq_len, AR and TAR"
    learn:Learner
    bptt:int
    alpha:float=0.
    beta:float=0.
    adjust:bool=True
    
    def on_loss_begin(self, last_output:Tuple[Tensor,Tensor,Tensor], **kwargs):
        #Save the extra outputs for later and only returns the true output.
        self.raw_out,self.out = last_output[1],last_output[2]
        return last_output[0]
    
    def on_backward_begin(self, last_loss:Rank0Tensor, last_input:Tensor, last_output:Tensor, **kwargs):
        #Adjusts the lr to the bptt selected
        if self.adjust: self.learn.opt.lr *= last_input.size(0) / self.bptt
        #AR and TAR
        if self.alpha != 0.:  last_loss += (self.alpha * self.out[-1].pow(2).mean()).sum()
        if self.beta != 0.:
            h = self.raw_out[-1]
            if len(h)>1: last_loss += (self.beta * (h[1:] - h[:-1]).pow(2).mean()).sum()
        return last_loss
    
@dataclass
class OneCycleScheduler(Callback):
    "Manages 1-Cycle style traing as outlined in Leslie Smith's [paper](https://arxiv.org/pdf/1803.09820.pdf)"
    learn:Learner
    lr_max:float
    moms:Floats=(0.95,0.85)
    div_factor:float=25.
    pct_start:float=0.5

    def __post_init__(self): self.moms=tuple(listify(self.moms,2))

    def steps(self, *steps_cfg:StartOptEnd):
        "Build anneal schedule for all of the parameters"
        return [Stepper(step, n_iter, func=func)
                for (step,(n_iter,func)) in zip(steps_cfg, self.phases)]

    def on_train_begin(self, n_epochs:int, **kwargs:Any)->None:
        "Initialize our optimization params based on our annealing schedule"
        n = len(self.learn.data.train_dl) * n_epochs
        a1 = int(n * self.pct_start)
        a2 = n-a1
        self.phases = ((a1, annealing_linear), (a2, annealing_cos))
        low_lr = self.lr_max/self.div_factor
        self.lr_scheds = self.steps((low_lr, self.lr_max), (self.lr_max, low_lr/1e4))
        self.mom_scheds = self.steps(self.moms, (self.moms[1], self.moms[0]))
        self.opt = self.learn.opt
        self.opt.lr,self.opt.mom = self.lr_scheds[0].start,self.mom_scheds[0].start
        self.idx_s = 0

    def on_batch_end(self, **kwargs:Any)->None:
        "Take one step forward on the annealing schedule for the optim params"
        if self.idx_s >= len(self.lr_scheds): return Trrue
        self.opt.lr = self.lr_scheds[self.idx_s].step()
        self.opt.mom = self.mom_scheds[self.idx_s].step()
        # when the current schedule is complete we move onto the next
        # schedule. (in 1-cycle there are two schedules)
        if self.lr_scheds[self.idx_s].is_done:
            self.idx_s += 1

def split_bn_bias(layer_groups:ModuleList)->ModuleList:
    "Sort each layer in  `layer_groups` into batchnorm (`bn_types`) and non-batchnorm groups"
    split_groups = []
    for l in layer_groups:
        l1,l2 = [],[]
        for c in l.children():
            if isinstance(c, bn_types): l2.append(c)
            else:                       l1.append(c)
        split_groups += [nn.Sequential(*l1), nn.Sequential(*l2)]
    return split_groups
def trainable_params(m:nn.Module)->ParamList:
    "Return list of trainable params in `m`"
    res = filter(lambda p: p.requires_grad, m.parameters())
    return res
class OptimWrapper():
    "Basic wrapper around an optimizer to simplify HP changes"
    def __init__(self, opt:optim.Optimizer, wd:Floats=0., true_wd:bool=False, bn_wd:bool=True)->None:
        self.opt,self.true_wd,self.bn_wd = opt,true_wd,bn_wd
        self.opt_keys = list(self.opt.param_groups[0].keys())
        self.opt_keys.remove('params')
        self.read_defaults()
        self.wd = wd

    @classmethod
    def create(cls, opt_fn:Union[type,Callable], lr:Union[float,Tuple,List],
               layer_groups:ModuleList, **kwargs:Any)->optim.Optimizer:
        "Create an optim.Optimizer from `opt_fn` with `lr`. Set lr on `layer_groups``"
        split_groups = split_bn_bias(layer_groups)
        opt = opt_fn([{'params': trainable_params(l), 'lr':0} for l in split_groups])
        opt = cls(opt, **kwargs)
        opt.lr = listify(lr, layer_groups)
        return opt

    def __repr__(self)->str:
        return f'OptimWrapper over {repr(self.opt)}.\nTrue weight decay: {self.true_wd}'

    #Pytorch optimizer methods
    def step(self)->None:
        "Set weight decay and step optimizer"
        # weight decay outside of optimizer step (AdamW)
        if self.true_wd:
            for lr,wd,pg1,pg2 in zip(self._lr,self._wd,self.opt.param_groups[::2],self.opt.param_groups[1::2]):
                for p in pg1['params']: p.data.mul_(1 - wd*lr)
                if self.bn_wd:
                    for p in pg2['params']: p.data.mul_(1 - wd*lr)
            self.set_val('weight_decay', listify(0, self._wd))
        self.opt.step()

    def zero_grad(self)->None:
        "Clear optimizer gradients"
        self.opt.zero_grad()

    #Hyperparameters as properties
    @property
    def lr(self)->float:
        "Get learning rate"
        return self._lr[-1]

    @lr.setter
    def lr(self, val:float)->None:
        "Set learning rate"
        self._lr = self.set_val('lr', listify(val, self._lr))

    @property
    def mom(self)->float:
        "Get momentum"
        return self._mom[-1]

    @mom.setter
    def mom(self, val:float)->None:
        "Set momentum"
        if 'momentum' in self.opt_keys: self.set_val('momentum', listify(val, self._mom))
        elif 'betas' in self.opt_keys:  self.set_val('betas', (listify(val, self._mom), self._beta))
        self._mom = listify(val, self._mom)

    @property
    def beta(self)->float:
        "get beta"
        return None if self._beta is None else self._beta[-1]

    @beta.setter
    def beta(self, val:float)->None:
        "Set beta (or alpha as makes sense for give optimizer)"
        if val is None: return
        if 'betas' in self.opt_keys:    self.set_val('betas', (self._mom, listify(val, self._beta)))
        elif 'alpha' in self.opt_keys:  self.set_val('alpha', listify(val, self._beta))
        self._beta = listify(val, self._beta)

    @property
    def wd(self)->float:
        "Get weight decay"
        return self._wd[-1]

    @wd.setter
    def wd(self, val:float)->None:
        "Set weight decay"
        if not self.true_wd: self.set_val('weight_decay', listify(val, self._wd), bn_groups=self.bn_wd)
        self._wd = listify(val, self._wd)

    #Helper functions
    def read_defaults(self)->None:
        "Read the values inside the optimizer for the hyper-parameters"
        self._beta = None
        if 'lr' in self.opt_keys: self._lr = self.read_val('lr')
        if 'momentum' in self.opt_keys: self._mom = self.read_val('momentum')
        if 'alpha' in self.opt_keys: self._beta = self.read_val('alpha')
        if 'betas' in self.opt_keys: self._mom,self._beta = self.read_val('betas')
        if 'weight_decay' in self.opt_keys: self._wd = self.read_val('weight_decay')

    def set_val(self, key:str, val:Any, bn_groups:bool=True)->Any:
        "Set the values inside the optimizer dictionary at the key"
        if is_tuple(val): val = [(v1,v2) for v1,v2 in zip(*val)]
        for v,pg1,pg2 in zip(val,self.opt.param_groups[::2],self.opt.param_groups[1::2]):
            pg1[key] = v
            if bn_groups: pg2[key] = v
        return val

    def read_val(self, key:str) -> Union[List[float],Tuple[List[float],List[float]]]:
        "Read a hyper-parameter key in the optimizer dictionary."
        val = [pg[key] for pg in self.opt.param_groups[::2]]
        if is_tuple(val[0]): val = [o[0] for o in val], [o[1] for o in val]
        return val


# work

In [43]:
trn_dl = LanguageModelLoader(data_trn_,bs=128,bptt=80)
tst_dl = LanguageModelLoader(data_tst_,bs=128,bptt=80)

(342834,)
(2678, 128)
(117308,)
(916, 128)


In [44]:
data = DataBunch(trn_dl,tst_dl)

In [45]:
model = Net()

In [46]:
index2vocab

{0: 2000,
 1: 2010,
 2: 2020,
 3: 2030,
 4: 3010,
 5: 3020,
 6: 3021,
 7: 3110,
 8: 3120,
 9: 3121,
 10: 4010,
 11: 4020,
 12: 4050,
 13: 4070,
 14: 4090,
 15: 9998,
 16: 9999}

In [47]:
emb_sz, nh, nl = 50, 120, 2
vocab_size = len(index2vocab)
model = get_language_model(vocab_size, emb_sz, nh, nl, 0, input_p=0.6, output_p=0.4, weight_p=0.5, 
                           embed_p=0.1, hidden_p=0.2)
learn = Learner(data, model)

In [48]:
bs,bptt = 20,10
learn.opt_fn = partial(optim.Adam, betas=(0.8,0.99))
learn.callbacks.append(RNNTrainer(learn, bptt, alpha=2, beta=1))
learn.callback_fns = [partial(GradientClipping, clip=0.12)]

In [49]:
fit_one_cycle(learn, 1, 5e-3, (0.8,0.7), wd=1.2e-6)

NameError: name 'fit' is not defined

In [125]:
d[0].shape,d[1].shape

(torch.Size([70, 128]), torch.Size([8960]))

In [126]:
o = model(d[0])

In [130]:
o.shape,d[1].shape

(torch.Size([70, 128, 17]), torch.Size([8960]))

In [132]:
l = CrossEntropyLoss()

In [135]:
d1 = d[1].view(70,-1)

In [137]:
l(o.view(-1,17),d[1])

tensor(2.7939, grad_fn=<NllLossBackward>)

In [88]:
def train_and_eval():
    #test_preds = np.zeros(len(test_x))
    #for fold, (train_idx, valid_idx) in enumerate(splits):
        #print('Fold:', fold)
    torch.cuda.empty_cache()
    learn = get_learner(train_idx, valid_idx)
    gc.collect()
    run = Runner(cb_funcs=cbfs)
    learn.model.train()
    run.fit(4, learn)
    learn.model.eval()
    test_preds_fold = np.zeros(len(test_dl.dataset))
    for i, (x_batch,) in enumerate(test_dl):
        with torch.no_grad():
            y_pred = learn.model(x_batch).detach()
        test_preds_fold[i*batch_size:(i+1)*batch_size] = sigmoid(y_pred.cpu().numpy())[:, 0]
    test_preds+=test_preds_fold/len(splits)
    del(learn)
    gc.collect()
    print(f'Test {fold} added')
    print('Training Completed')
    return test_preds

In [41]:
train_and_eval()

NameError: name 'get_learner' is not defined

In [None]:
def train_and_eval():
    test_preds = np.zeros(len(test_x))
    for fold, (train_idx, valid_idx) in enumerate(splits):
        print('Fold:', fold)
        torch.cuda.empty_cache()
        learn = get_learner(train_idx, valid_idx)
        gc.collect()
        run = Runner(cb_funcs=cbfs)
        learn.model.train()
        run.fit(4, learn)
        learn.model.eval()
        test_preds_fold = np.zeros(len(test_dl.dataset))
        for i, (x_batch,) in enumerate(test_dl):
            with torch.no_grad():
                y_pred = learn.model(x_batch).detach()
            test_preds_fold[i*batch_size:(i+1)*batch_size] = sigmoid(y_pred.cpu().numpy())[:, 0]
        test_preds+=test_preds_fold/len(splits)
        del(learn)
        gc.collect()
        print(f'Test {fold} added')
    print('Training Completed')
    return test_preds