In [None]:
import gc
import pandas as pd
import numpy as np
import sklearn as skl
import seaborn as sns
import lightgbm as lgb
import torch
import scipy
import opendatasets as od
import math
import torchmetrics
import utils.lgbm as lgbm
import importlib
import joblib
import category_encoders
import utils.mlp as mlp
import utils.embedding_pipeline as embedding_pipeline
import utils.early_stopping as early_stopping
import utils.mlp_pipeline as mlp_pipeline
import pytorch_tabular



In [None]:
import pickle
import io

class CPU_Unpickler(pickle.Unpickler):
    """Unpickler that makes pickled torch storages deserialize onto the CPU.

    Only the lookup of ``torch.storage._load_from_bytes`` is intercepted; every
    other class/function lookup is delegated to ``pickle.Unpickler`` unchanged.
    """

    def find_class(self, module, name):
        """Resolve ``module.name`` for the unpickler.

        Returns a loader that calls ``torch.load(..., map_location='cpu')`` when
        the pickle asks for ``torch.storage._load_from_bytes``; otherwise returns
        whatever the base class resolves.
        """
        is_storage_loader = (module, name) == ('torch.storage', '_load_from_bytes')
        if not is_storage_loader:
            return super().find_class(module, name)

        def _load_bytes_on_cpu(b):
            # Wrap the raw bytes in a file-like object and remap all storages to CPU.
            return torch.load(io.BytesIO(b), map_location='cpu')

        return _load_bytes_on_cpu


In [None]:
# Pick the compute device once: use CUDA when torch reports it as available,
# otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print("Using {} device".format(device))


In [None]:
# SECURITY: a Kaggle API key used to be hard-coded in this cell. Credentials must
# never be stored in the notebook; treat the previously committed key as leaked
# and revoke/rotate it in the Kaggle account settings.
# Supply credentials at run time instead: opendatasets prompts for the Kaggle
# username/key, or reads them from a `kaggle.json` file kept OUT of version control.
od.download("https://www.kaggle.com/c/avazu-ctr-prediction/")

# Only the first 1 million rows of the training file are loaded.
ctr_data=pd.read_csv("./avazu-ctr-prediction/train.gz",nrows=1_000_000)


In [None]:
def find_optimal_recall(fpr,tpr,threshold,rate):
    """Return the entry of ``tpr`` at the position where ``fpr`` is closest to ``rate``.

    The matching entry of ``threshold`` is printed as a side effect. When several
    positions are equally close, the first one is used (``np.argmin`` semantics).

    Parameters
    ----------
    fpr, tpr, threshold : index-aligned sequences of equal length
        Presumably the outputs of a ROC-curve routine -- confirm against callers.
    rate : float
        Target false-positive rate to match.

    Returns
    -------
    The element of ``tpr`` at the selected position.
    """
    target_fpr = rate * np.ones(len(fpr))
    distance_to_target = abs(fpr - target_fpr)
    closest = np.argmin(distance_to_target)
    print(threshold[closest])
    return tpr[closest]

In [None]:
# Features: every column from 'C1' through 'C21' (label slice, both ends included).
# Target: the `click` column.
X = ctr_data.loc[:, 'C1':'C21']
Y = ctr_data['click']

# The raw frame is no longer needed once features and target are extracted.
del ctr_data

# Sequential (unshuffled) 60% / 20% / 20% split expressed as ranges of row labels.
# NOTE(review): `.loc` with these ranges assumes the default 0..n-1 integer index -- confirm.
n_rows = len(Y)
train_end = math.floor(n_rows * 0.6)
val_end = math.floor(n_rows * 0.8)
training_range = range(train_end)
validation_range = range(train_end, val_end)
testing_range = range(val_end, n_rows)

# Establish continuous and categorical features: every feature column is treated
# as categorical, so the set of continuous columns is empty.
cat_cols = X.columns
cont_cols = X.columns.difference(cat_cols)

xtrain, ytrain = X.loc[training_range], Y.loc[training_range]
xval, yval = X.loc[validation_range], Y.loc[validation_range]
xtest, ytest = X.loc[testing_range], Y.loc[testing_range]

# Project helper; argument order is (train, test, val) and the results are
# unpacked in that same order.
x_train, x_test, x_val = embedding_pipeline.aggregate_low_card_BAF(xtrain, xtest, xval, cat_cols)

# Release the full feature/target objects now that the splits exist.
del X, Y
gc.collect()


26

# Parameter Tuning

In [None]:
# Random search over LightGBM hyper-parameters: one model is fitted per sampled
# parameter set and persisted to its own file.

# Reload the helper module that the imports cell already bound as `lgbm`
# (`import utils.lgbm as lgbm`), instead of importing a second top-level module
# named `lgbm` that shadows it.
lgbm = importlib.reload(lgbm)
aux = lgbm  # kept only so that any later cell still referring to `aux` keeps working
param_list = lgbm.lgbm_param_sampler(20, 7, 'cpu')
print(len(param_list))
print(param_list[0])

# `enumerate` supplies the file index; previously the counter was never incremented,
# so every model was written to the same 'lgbm_30.pkl' and overwrote the previous one.
for i, params in enumerate(param_list):
    # The estimator comes from lightgbm itself (imported as `lgb`).
    # NOTE(review): the original looked `LGBMClassifier` up on the helper module;
    # confirm the helper does not define a customised class under that name.
    model = lgb.LGBMClassifier(n_jobs=10, **params)  # Instantiate LGBM Model.

    # Fit model to training data, early-stopping on the validation split.
    # `categorical_features` was never defined in this notebook; the categorical
    # column list is `cat_cols`.
    # NOTE(review): LightGBM needs these columns as int/category dtype -- confirm
    # the frame's dtypes before running.
    # NOTE(review): `early_stopping_rounds` as a fit() argument was removed in
    # LightGBM 4.0 (use `callbacks=[lgb.early_stopping(5)]` there), and 'loss' is
    # not a documented LightGBM metric name -- confirm both against the installed version.
    model.fit(
        xtrain,
        ytrain,
        categorical_feature=list(cat_cols),
        eval_set=[(xval, yval)],
        early_stopping_rounds=5,
        eval_metric='loss',
    )
    # Obtain predictions in test data (positive-class probability).
    predictions = model.predict_proba(xtest)[:, 1]

    joblib.dump(model, 'lgbm_3{}.pkl'.format(i))

In [None]:
# Use only for MLP models with label encoding.

# Keep `x1`/`x2` as references to the untouched input frames and do all encoding
# on copies. Previously `x1`/`x2` were plain aliases of the frames being mutated
# in place, so they did not preserve the un-encoded data.
x1 = xtrain
x2 = xtest
xtrain = xtrain.copy()
xtest = xtest.copy()

for col in xtrain.select_dtypes(object).columns:
    le = skl.preprocessing.LabelEncoder()
    xtrain[col] = le.fit_transform(xtrain[col])

    # Test categories never seen during fitting are routed to a dedicated
    # '<unknown>' class appended after the fitted classes. `isin` does the
    # membership test in one vectorised pass instead of scanning the class array
    # once per row.
    unseen = ~xtest[col].isin(le.classes_)
    le.classes_ = np.append(le.classes_, '<unknown>')
    xtest[col] = le.transform(xtest[col].mask(unseen, '<unknown>'))

for col in xtrain.select_dtypes(float).columns:
    # `Series.mean()` skips NaN. The previous `np.average` returns NaN as soon as
    # the column contains one, which turned this imputation into a no-op.
    # The training mean is also used for the test split so that both splits are
    # imputed with the same, train-derived statistic.
    train_mean = xtrain[col].mean()
    xtrain[col] = xtrain[col].fillna(train_mean)
    xtest[col] = xtest[col].fillna(train_mean)

# Any value still missing after the steps above is replaced with the sentinel -1.
xtrain = xtrain.fillna(-1)
xtest = xtest.fillna(-1)

# Categorical Encodings

In [None]:
# Every column of `xtrain` is treated as categorical, which leaves the set of
# continuous columns empty.
cat_cols = xtrain.columns
cont_cols = xtrain.columns.difference(cat_cols)

# Encoder configuration collected in one place before construction.
encoder_options = {
    'cols': cat_cols,
    'handle_unknown': 'value',
    'handle_missing': 'value',
}
target_encoder = category_encoders.target_encoder.TargetEncoder(**encoder_options)

# Fit on the training split (features + target), then apply the fitted encoder
# to the test split.
xtrain_encoded = target_encoder.fit_transform(xtrain, ytrain)
xtest_encoded = target_encoder.transform(xtest)

# Categorical Embeddings

In [None]:
from scipy.stats import uniform
from sklearn.model_selection import ParameterSampler

# Random search over MLP-with-embeddings configurations: one pipeline is fitted
# per sampled parameter set, each paired with a sampled `alpha` that drives the
# embedding dimensions.
embedding_pipeline = importlib.reload(embedding_pipeline)
param_list = mlp.mlp_param_sampler(20, len(xtrain.columns), 7, device)

# This dataset has no continuous features, so the continuous inputs are empty tensors.
normalization = 'None'
xtrain_aux_cont = torch.tensor([])
xval_aux_cont = torch.tensor([])

xtrain_aux_cat = mlp_pipeline.normalization_transform(x_train, 'None', [])
xval_aux_cat = mlp_pipeline.normalization_transform(x_val, 'None', [])
method = 'variational'

# 20 values of `alpha` drawn uniformly from [1.5, 3] with a fixed seed.
alpha_sampler = ParameterSampler(
    {'alpha': uniform(loc=1.5, scale=3 - 1.5)},
    n_iter=20,
    random_state=np.random.RandomState(7),
)
alphas = list(alpha_sampler)

# `i` is both the resume point and the index used in the output file name;
# set it to a later value to skip already-trained configurations.
i = 0
while i < len(param_list):
    params = param_list[i]
    alpha = alphas[i]['alpha']

    dims = embedding_pipeline.get_emb_dim_var(x_train, cat_cols, alpha)
    dims = torch.tensor(dims, dtype=int).to(device)
    print(dims)

    # Fit pipeline
    print('iteration: ', i)
    model = embedding_pipeline.pipeline_no_cat(
        device,
        xtrain_aux_cat,
        xtrain_aux_cont,
        xval_aux_cat,
        xval_aux_cont,
        ytrain,
        yval,
        params,
        method,
        dims,
    )

    # Save the model
    # NOTE(review): absolute Colab/Drive path -- only valid where that drive is mounted.
    joblib.dump(model, '/content/gdrive/MyDrive/Colab Notebooks/data/mlp_3_emb/mlp_simple{}{}.pkl'.format(method, i))

    i += 1

gc.collect()

In [None]:
import tabtransformer
tabtransformer = importlib.reload(tabtransformer)

# Random search over TabTransformer configurations: for each sampled parameter
# set, fit a model and persist the model plus its validation/test probabilities.
cat_cols = [
    'C1', 'banner_pos', 'site_id', 'site_domain', 'site_category', 'app_id',
    'app_domain', 'app_category', 'device_id', 'device_ip', 'device_model',
    'device_type', 'device_conn_type', 'C14', 'C15', 'C16', 'C17', 'C18',
    'C19', 'C20', 'C21',
]

params = tabtransformer.tabtransformer_param_sampler(20, 7, device)

# NOTE(review): absolute Colab/Drive path -- only valid where that drive is mounted.
output_dir = '/content/gdrive/MyDrive/aux/CTR_tabtransformer'

for i in range(20):
    param = params[i]
    # NOTE(review): the accelerator is hard-coded to 'gpu' here even though
    # `device` is passed to the sampler above -- confirm this is intended on
    # CPU-only machines.
    model = tabtransformer.transformer_pipeline(
        param, cat_cols, [], ['click'], xtrain, xval, ytrain, yval, 'gpu'
    )
    joblib.dump(model, f'{output_dir}/tabtransformerCTR_{i}')

    # Positive-class probabilities on the validation and test splits.
    predict_val = model.predict(xval)['1_probability']
    predict_test = model.predict(xtest)['1_probability']

    joblib.dump(predict_val, f'{output_dir}/yval_{i}')
    joblib.dump(predict_test, f'{output_dir}/yhat_{i}')

