In [None]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import random
import math

In [None]:
# https://www.kaggle.com/gemartin/load-data-reduce-memory-usage
    
def reduce_mem_usage(df):
    """Downcast the columns of ``df`` in place to reduce memory usage.

    * ``object`` / pandas string columns are converted to ``category``.
    * Signed and unsigned integer columns are cast to the narrowest integer
      type of the same signedness whose range contains the column's min/max
      (bounds inclusive, so a column whose max is exactly 127 is ``int8``).
    * Float columns are cast to the narrowest of float16/float32/float64 whose
      finite range contains the column's min/max.  Note that float16 keeps
      only about three significant decimal digits.
    * Every other dtype (bool, category, datetime, nullable extension types)
      is left untouched, which also makes the function safe to call twice.

    Columns that are empty or entirely NaN keep their dtype.

    Args:
        df: DataFrame to shrink.  It is modified in place.

    Returns:
        The same DataFrame object, for call chaining.
    """
    signed_types = (np.int8, np.int16, np.int32, np.int64)
    unsigned_types = (np.uint8, np.uint16, np.uint32, np.uint64)
    float_types = (np.float16, np.float32, np.float64)

    for col in df.columns:
        col_type = df[col].dtype

        if col_type == object or isinstance(col_type, pd.StringDtype):
            df[col] = df[col].astype('category')
            continue

        # Only plain numpy integer/float dtypes are downcast; min()/max() or
        # astype() are not meaningful for the remaining dtypes.
        if not isinstance(col_type, np.dtype) or col_type.kind not in 'iuf':
            continue

        c_min, c_max = df[col].min(), df[col].max()

        if col_type.kind == 'f':
            candidates, limits = float_types, np.finfo
        elif col_type.kind == 'i':
            candidates, limits = signed_types, np.iinfo
        else:
            candidates, limits = unsigned_types, np.iinfo

        # Candidates are ordered narrowest first.  NaN min/max (empty or
        # all-NaN column) fails every comparison, leaving the column as is.
        for candidate in candidates:
            bounds = limits(candidate)
            if bounds.min <= c_min and c_max <= bounds.max:
                df[col] = df[col].astype(candidate)
                break

    return df

In [None]:
def create_embeddings(cats, df):
    """Build one ``nn.Embedding`` per categorical column.

    The embedding width follows the rule of thumb "cardinality divided by 2,
    but no bigger than 50"
    (https://forums.fast.ai/t/size-of-embedding-for-categorical-variables/42608/4),
    with a floor of 1 so that a single-valued column still gets a usable
    (non-empty) embedding.

    Args:
        cats: iterable of categorical column names present in ``df``.
        df: DataFrame whose ``nunique()`` per column gives the cardinality.

    Returns:
        ``(emb_layers, summ)`` where ``emb_layers`` is an ``nn.ModuleList`` of
        embeddings in the order of ``cats`` and ``summ`` is the total
        embedding width (a plain ``int``).
    """
    emb_dims = {}
    for cat in cats:
        cardin = int(df[cat].nunique())
        emb_dims[cat] = (cardin, max(1, min(50, cardin // 2)))

    emb_layers = nn.ModuleList(
        nn.Embedding(n_codes, width) for n_codes, width in emb_dims.values()
    )
    summ = sum(width for _, width in emb_dims.values())
    return emb_layers, summ


def cat_transform(cats,df):
    """Replace each categorical column with integer codes, then downcast.

    Codes are assigned in order of first appearance (0, 1, 2, ...).  Missing
    values receive their own code, one past the last real label, so the column
    can always be cast to an integer dtype.

    The codes are computed in a single pass with ``pd.factorize``.  Rewriting
    labels one at a time in place can collide when a freshly written code
    equals a label that has not been processed yet (e.g. ``[1, 0]`` would end
    up as ``[1, 1]``).

    Args:
        cats: iterable of column names to encode.
        df: DataFrame to encode.  It is modified in place.

    Returns:
        The DataFrame after encoding and ``reduce_mem_usage``.
    """
    for cat in cats:
        codes, uniques = pd.factorize(df[cat])
        # factorize marks missing values with -1; give them a real code.
        codes = np.where(codes == -1, len(uniques), codes)
        df[cat] = codes.astype('int')
    df = reduce_mem_usage(df)
    return df


def create_dataloaders(df, target, bs, valid_idx):
    """Split ``df`` into train/validation loaders.

    Rows whose index label is in ``valid_idx`` form the validation set; all
    other rows form the training set.  Column ``target`` is the label, every
    other column is a feature.

    Returns:
        ``(train_dl, valid_dl)`` — two ``Dataloader`` objects with batch size
        ``bs``.
    """
    in_valid = df.index.isin(valid_idx)
    is_feature = df.columns != target

    def _as_tensors(part):
        # Features and labels of one split, converted via their numpy values.
        feats = part.loc[:, is_feature]
        labels = part.loc[:, target]
        return torch.from_numpy(feats.values), torch.from_numpy(labels.values)

    x_train, y_train = _as_tensors(df.loc[~in_valid, :])
    x_valid, y_valid = _as_tensors(df.loc[in_valid, :])

    train_ds = Dataset(x_train, y_train)
    valid_ds = Dataset(x_valid, y_valid)
    return Dataloader(train_ds, bs), Dataloader(valid_ds, bs)
    
    
class Dataset():
    """Pairs a feature container ``x`` with a label container ``y``.

    Indexing (with an int or a slice) is forwarded to both containers and the
    two results are returned as a tuple.
    """

    def __init__(self, x, y):
        self.x, self.y = x, y

    def __len__(self):
        # The length is defined by the features only.
        n_items = len(self.x)
        return n_items

    def __getitem__(self, i):
        features = self.x[i]
        labels = self.y[i]
        return features, labels
    
    
class Dataloader():
    """Yields consecutive slices of ``ds`` of at most ``bs`` items each."""

    def __init__(self, ds, bs):
        self.ds, self.bs = ds, bs

    def __iter__(self):
        total = len(self.ds)
        for start in range(0, total, self.bs):
            stop = start + self.bs
            yield self.ds[start:stop]

    def __len__(self):
        # Number of batches: full batches plus one for any remainder.
        full, leftover = divmod(len(self.ds), self.bs)
        return full + (1 if leftover else 0)
    


# def accuracy(preds,targs): return (preds.round().int()==targs).float().mean()
def accuracy(preds,targs):
    """Fraction of rows whose arg-max over the last dim of ``preds`` equals ``targs``."""
    predicted = preds.argmax(dim=-1)
    hits = predicted == targs
    return hits.float().mean()

def fit(epochs, loss_func, acc_metric):
    """Train and validate the module-level ``model`` for ``epochs`` epochs.

    Relies on the notebook globals ``model``, ``opt``, ``train_dl`` and
    ``valid_dl``.  After every epoch the validation loss and accuracy are
    printed.

    The validation figures are weighted by batch size, so a shorter final
    batch no longer counts as much as a full one.  This assumes ``loss_func``
    and ``acc_metric`` each return a per-batch mean (the default for
    ``F.cross_entropy``) — TODO confirm for custom losses.  An empty
    validation loader prints ``nan`` instead of raising ZeroDivisionError.

    Args:
        epochs: number of passes over ``train_dl``.
        loss_func: callable ``(preds, targets) -> scalar loss``.
        acc_metric: callable ``(preds, targets) -> scalar metric``.
    """
    for epoch in range(epochs):
        for x_batch,y_batch in train_dl:
            preds = model(x_batch.float())
            loss = loss_func(preds, y_batch.long())
            loss.backward()
            opt.step()
            opt.zero_grad()
        with torch.no_grad():
            tot_loss, tot_acc, n_seen = 0.0, 0.0, 0
            for x_b,y_b in valid_dl:
                predicts = model(x_b.float())
                n_b = len(x_b)
                tot_loss += float(loss_func(predicts,y_b.long())) * n_b
                tot_acc += float(acc_metric(predicts,y_b)) * n_b
                n_seen += n_b
            avg_loss = tot_loss/n_seen if n_seen else float('nan')
            avg_acc = tot_acc/n_seen if n_seen else float('nan')
            print(f'Epoch {epoch} -> loss: {avg_loss:.13f}  accuracy: {avg_acc:.13f}')

In [None]:
class Config():
    """Hyper-parameters for the tabular experiment.

    All settings are stored as instance attributes so that other code can
    read them.  Previously only ``lr`` was kept; the rest were assigned to
    locals and discarded.
    """

    def __init__(self, cols):
        """
        Args:
            cols: value stored as ``inp_layers`` (the input width).
        """
        self.lr = 0.1            # learning rate
        self.inp_layers = cols   # input width
        self.hid_layers = 2      # hidden width
        self.out_layers = 2      # output width
        self.bs = 64             # batch size
        self.targ = 'Survived'   # target column name

In [None]:
class tabular_model(nn.Module):
    """Embedding + MLP network for a DataFrame with categorical and continuous columns.

    Each categorical column is passed through its own embedding; the embedding
    outputs are concatenated with the continuous columns and fed to a small
    fully connected network.
    """

    def __init__(self, df, cats, targs, config):
        """
        Args:
            df: DataFrame used to size the embeddings and count the columns.
            cats: list of categorical column names.
            targs: target column name (or a list of names); excluded from the
                continuous columns.
            config: object optionally providing ``lr``, ``hid_layers`` and
                ``out_layers``; missing attributes fall back to 0.1 / 2 / 2.
        """
        # Must run before any sub-module is assigned to ``self``.
        super().__init__()
        self.config = config
        self.embeds,self.summ = create_embeddings(cats, df)

        targ_names = [targs] if isinstance(targs, str) else list(targs)
        self.cats_len = len(cats)
        self.conts_len = len([c for c in df.columns
                              if c not in cats and c not in targ_names])

        lr = getattr(config, 'lr', 0.1)
        nh = getattr(config, 'hid_layers', 2)
        outp = getattr(config, 'out_layers', 2)
        inp = int(self.summ) + self.conts_len
        # Stored as ``layers``: an attribute called ``model`` would be hidden
        # by the ``model`` method defined on the class.
        self.layers = tabular_model.model(self.embeds, inp, nh, outp, lr)
        # NOTE(review): the optimizer type is not visible in this notebook;
        # plain SGD is an assumption — TODO confirm.
        self.opt = optim.SGD(self.parameters(), lr=lr)

    def forward(self, cats, conts):
        """Embed ``cats`` column by column, append ``conts``, run the MLP.

        Args:
            cats: integer codes, one column per embedding (unused when the
                model has no categorical columns).
            conts: continuous features (unused when the model has none).
        """
        parts = []
        if self.cats_len != 0:
            parts.extend(emb(cats[:,idx].long()) for idx,emb in enumerate(self.embeds))
        if self.conts_len != 0:
            parts.append(conts.float())
        x = torch.cat(parts, 1)
        return self.layers(x)

    @staticmethod
    def model(embs, inp,nh,outp,lr):
        """Return the fully connected part: Linear(inp, nh) -> ReLU -> Linear(nh, outp).

        ``embs`` and ``lr`` are accepted for signature compatibility only; the
        embeddings are applied in ``forward`` because an ``nn.ModuleList``
        cannot be called as a layer inside ``nn.Sequential``.
        """
        return nn.Sequential(nn.Linear(inp,nh),nn.ReLU(),nn.Linear(nh,outp))

In [None]:
# NOTE(review): in the visible notebook `df` is first assigned in a later cell,
# so this cell fails on a fresh kernel (Restart & Run All).
# reduce_mem_usage modifies the frame it is given and returns it, so `test`
# and `df` refer to the same object afterwards.
test = reduce_mem_usage(df)

In [None]:
# Display the frame returned by reduce_mem_usage in the previous cell.
test

In [None]:
# Print the column/dtype summary of `df`.
# NOTE(review): like the cells above, this needs `df` to exist already.
df.info()

In [None]:
# Load the Titanic training CSV without the 'Name' and 'Ticket' columns.
# NOTE(review): the next cell starts with this exact same load.
df = pd.read_csv('../input/titanic/train.csv').drop(['Name','Ticket'],axis=1)

In [None]:
# End-to-end Titanic run: load, encode, split, build the network, train.
# This cell previously referenced `feats`, `target`, `batch_size` and `a_model`,
# none of which are defined anywhere in the visible notebook.

# Fixed seeds so the validation split and the weight init are reproducible.
random.seed(42)
torch.manual_seed(42)

target = 'Survived'
batch_size = 64

df = pd.read_csv('../input/titanic/train.csv').drop(['Name','Ticket'],axis=1)
df.loc[df['Age'].isna(),'Age'] = round(df['Age'].mean())
# String categoricals may contain missing values; give them an explicit label
# so that every row can be integer-encoded (no-op when nothing is missing).
df[['Cabin','Embarked']] = df[['Cabin','Embarked']].fillna('Missing')

len_feats = df.columns.size
config = Config(len_feats)

valid_idx = random.sample(list(df.index), int(0.33*len(df)))
categories = ['PassengerId','Survived','Sex','Pclass','Cabin','Embarked']
df = cat_transform(categories,df)
emb_layers,summ = create_embeddings(categories,df)
train_dl, valid_dl = create_dataloaders(df, target, batch_size, valid_idx)
rows,cols = df.shape

loss = F.cross_entropy
# fit() feeds every non-target column straight into the model, so the input
# width is the column count minus the target column.
n_inputs = cols - 1
model = nn.Sequential(nn.Linear(n_inputs,2),nn.ReLU(),nn.Linear(2,2))
# NOTE(review): the intended optimizer is not visible in the notebook; SGD with
# the original lr=0.1 is an assumption — TODO confirm.
opt = optim.SGD(model.parameters(), lr=0.1)
fit(10, loss, accuracy)

In [None]:
# Display the `model` object created in the previous cell.
model

In [None]:
# Build (and display) a Sequential with the embedding list as its first element.
# NOTE(review): `emb_layers` is an nn.ModuleList, which is a container rather
# than a callable layer — presumably this Sequential cannot be run forward; verify.
nn.Sequential(emb_layers,nn.Linear(10,2),nn.ReLU(),nn.Linear(2,2))

In [None]:
# Display the embedding layers returned by create_embeddings.
emb_layers

# FastAI version

In [None]:
# fastai baseline on the frame currently held in `df`.
# NOTE(review): star import placed mid-notebook; it brings many fastai names into
# the global namespace.
from fastai.tabular.all import *

# NOTE(review): this redefines `accuracy`, replacing the arg-max based version
# defined earlier in the notebook for every cell that runs after this one.
def accuracy(preds,targs):
    return (preds.round().int()==targs).float().mean()

# Validation set: the rows from position round(0.8 * len(df)) to the end.
valid_idx = list(df.iloc[round((len(df)-len(df)*0.2)):,:].index)
# NOTE(review): 'Name' and 'Ticket' are dropped by the Titanic load cells earlier
# in this notebook, and 'Survived' is also passed as y_names below — confirm
# which frame and which column roles are intended here.
cat_names = ['Survived','Name','Sex','Age','SibSp','Parch','Ticket','Fare','Cabin','Embarked']
# Continuous columns: every column of df that is not listed in cat_names.
cont_names = list(df.loc[:,[item not in cat_names for item in list(df.columns)]].columns)
procs = [Categorify, FillMissing, Normalize]
dls = TabularDataLoaders.from_df(df, procs=procs, cat_names=cat_names, cont_names=cont_names, 
                                 y_names='Survived', valid_idx=valid_idx, bs=32)
learn = tabular_learner(dls, metrics=[accuracy])
learn.lr_find()

In [None]:
# Train the learner from the previous cell: fit_one_cycle with first argument 10
# and lr_max=0.006.
learn.fit_one_cycle(10,lr_max=0.006)

In [None]:
# fastai run on a frame that has a 'Class' column.
# NOTE(review): presumably this expects the credit-card frame that is loaded in a
# later cell of this notebook — verify the intended run order.
# NOTE(review): repeated star import (already present in an earlier cell).
from fastai.tabular.all import *

# Validation set: the rows from position round(0.8 * len(df)) to the end.
valid_idx = list(df.iloc[round((len(df)-len(df)*0.2)):,:].index)
# NOTE(review): 'Class' is passed both as a categorical input and as y_names — confirm.
cat_names = ['Class']
# Continuous columns: every column except the last one.
cont_names = list(df.iloc[:,:-1].columns)
procs = [Categorify, FillMissing, Normalize]
dls = TabularDataLoaders.from_df(df, procs=procs, cat_names=cat_names, cont_names=cont_names, 
                                 y_names="Class", valid_idx=valid_idx, bs=64)
learn = tabular_learner(dls, y_range=torch.tensor([1,2]), metrics=[accuracy])
learn.lr_find()

In [None]:
# Show the training loop of the current `learn` object.
learn.show_training_loop()

In [None]:
# Fetch predictions and targets from the learner and display both.
preds,targs = learn.get_preds()
preds,targs

In [None]:
# Load the credit-card fraud data and make a random 80/20 train/validation split.
df = pd.read_csv('../input/creditcardfraud/creditcard.csv')

train_idx = random.sample(list(df.index),round(len(df)*.8))
train_idx.sort()
# Membership mask computed once and reused for both validation slices.
in_train = df.index.isin(train_idx)
# The last column is the label; all other columns are features.
x_train = df.iloc[train_idx,:-1]
y_train = df.iloc[train_idx,-1]
x_valid = df.iloc[~in_train,:-1]
y_valid = df.iloc[~in_train,-1]
x_row,x_col = x_train.shape

# Positives in the whole frame (unchanged meaning of `pos`).
pos = df[df['Class']==1]['Class'].count()
# Positives inside the training rows only.  The percentage previously divided
# the whole-frame count by the training-set size, which overstated it.
train_pos = (df['Class'].iloc[train_idx]==1).sum()

print(f'Number of positives: {pos}')
print(f'Percentage of training set: {round(train_pos/len(y_train)*100,5)}%')