# Neural model for classification

### Libraries import

In [None]:
import os
import pickle
from typing import Callable, Dict, List, Set, Tuple

import matplotlib.pyplot as plt
import numpy as np  # linear algebra
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
from dotenv import load_dotenv

from tqdm.notebook import tqdm 
import matplotlib.pyplot as plt
from pathlib import Path
import random
import sklearn.preprocessing
import category_encoders as ce

import optuna
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.profiler import profile, record_function, ProfilerActivity

In [None]:
from sklearn.metrics import classification_report
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from utils.preprocessing import labelEncodeCats, remove_categories_not_in_both, remove_outliers, CATEGORICAL_TO_DROP, NUMERICAL_NON_COUNTERS, NUMERICAL_TO_DROP
from utils.normalized_cross_entropy_loss import normalized_cross_entropy_loss
from utils.preprocessing import (
    encode_counters,
    remove_categories_not_in_both,
    remove_outliers,
    trigonometric_date_encoding,
)

### Random seed

In [None]:
# One seed shared by every random number generator the notebook relies on.
seed = 1234

np.random.seed(seed)
random.seed(seed)
# PyTorch has its own generator (weight initialisation, dropout masks,
# torch.randperm used for batching); without seeding it runs are not repeatable.
torch.manual_seed(seed)

### Dataset loading

In [None]:
# Parquet files holding the train/validation split and the test split.
# Built with pathlib so the values really are the `Path` objects the
# annotations promise (os.path.join returned plain strings).
TRAIN_VAL_DATA_PATH: Path = Path("data") / "train_val_Enc_Counters.parquet"
TEST_DATA_PATH: Path = Path("data") / "test_val_Enc_Counters.parquet"

# f_2 .. f_32 are handled as categoricals, f_1 as an integer, in both splits.
feature_dtypes = {f"f_{i}": "category" for i in range(2, 33)}
feature_dtypes["f_1"] = "int"

df = pd.read_parquet(TRAIN_VAL_DATA_PATH).reset_index(drop=True)
df = df.astype(feature_dtypes)

# The two target columns exist only in the train/validation frame.
df = df.astype({"is_clicked": "int", "is_installed": "int"})

df_test = pd.read_parquet(TEST_DATA_PATH).reset_index(drop=True)
df_test = df_test.astype(feature_dtypes)

In [None]:
# Flag-like features f_33 .. f_41.
boolean_columns: List[str] = [f"f_{idx}" for idx in range(33, 42)]

# Cast the flag columns to real booleans in both splits
# (per the original note, CatBoost raises an error otherwise).
for frame in (df, df_test):
    for flag in boolean_columns:
        frame[flag] = frame[flag].astype(bool)

# Union of the features judged useless and those discarded by backward selection.
# NOTE: these three lists replace the names of the same spelling imported
# from utils.preprocessing at the top of the notebook.
CATEGORICAL_TO_DROP: list = [
    "f_7", "f_9", "f_11",
    "f_23", "f_24", "f_25", "f_26", "f_27", "f_28", "f_29",
]

NUMERICAL_TO_DROP: list = ["f_55", "f_59", "f_64", "f_65", "f_66"]

# Numerical features that are not counters.
NUMERICAL_NON_COUNTERS: List[str] = [
    "f_43", "f_51", "f_58", "f_59",
    "f_64", "f_65", "f_66", "f_67", "f_68", "f_69", "f_70",
]

In [None]:
from utils.notebook_utils import collapse_binary

# Set this flag to True to add the two collapsed binary features
# ("f_394041" and "f_33457") to the categorical column list.
activate_collapse_binary = False

categorical_columns: List[str] = [f"f_{i}" for i in range(2, 32 + 1)]
if activate_collapse_binary:
    categorical_columns = categorical_columns + ["f_394041", "f_33457"]
categorical_columns = [
    name for name in categorical_columns if name not in CATEGORICAL_TO_DROP
]

# f_42 .. f_79 minus the dropped features, then split into the plain
# numerical columns and the counter columns.
kept_value_columns: List[str] = [
    name
    for name in (f"f_{i}" for i in range(42, 79 + 1))
    if name not in NUMERICAL_TO_DROP
]
numerical_columns: List[str] = [
    name for name in kept_value_columns if name in NUMERICAL_NON_COUNTERS
]
counter_columns: List[str] = [
    name for name in kept_value_columns if name not in NUMERICAL_NON_COUNTERS
]

### Other models predictions download

In [None]:
def download_s3_folder(bucket, s3_folder, local_dir=None):
    """
    Download every object stored under a key prefix of an S3 bucket.

    Args:
        bucket: bucket object exposing ``objects.filter(Prefix=...)`` and
            ``download_file(key, target)`` (e.g. a boto3 ``s3.Bucket``)
        s3_folder: the folder path (key prefix) in the s3 bucket
        local_dir: a relative or absolute directory path in the local file
            system; when None each object is saved under its own key
    """
    for obj in bucket.objects.filter(Prefix=s3_folder):
        if local_dir is None:
            target = obj.key
        else:
            target = os.path.join(local_dir, os.path.relpath(obj.key, s3_folder))

        # A key without any directory part has an empty dirname, and
        # os.makedirs("") raises; exist_ok also removes the check-then-create race.
        parent = os.path.dirname(target)
        if parent:
            os.makedirs(parent, exist_ok=True)

        # Keys ending in "/" are folder placeholders: nothing to download.
        if obj.key.endswith('/'):
            continue
        bucket.download_file(obj.key, target)

This model can work either as a standalone network trained on the dataset or as an aggregator of other models' predictions (hybrid model).

In [None]:
# Switch read by the download/merge cell below: True downloads the other
# models' predictions and merges them into the dataset, False uses `df` as is.
intended_as_hybrid = True

In [None]:
import boto3
from sklearn.preprocessing import OneHotEncoder

# Credentials are read from the environment (load_dotenv runs first).
load_dotenv()
aws_access_key_id = os.getenv("AWS_ACCESS_KEY_ID", "default")
aws_secret_access_key = os.getenv("AWS_SECRET_ACCESS_KEY", "default")
regiorn_name = "eu-west-1"
bucket_name = "challenge23"

s3_config = boto3.session.Config(region_name=regiorn_name)
s3 = boto3.resource(
    "s3",
    aws_access_key_id=aws_access_key_id,
    aws_secret_access_key=aws_secret_access_key,
    config=s3_config,
)
bucket = s3.Bucket(bucket_name)

if not intended_as_hybrid:
    # Plain network: train directly on the raw dataframe.
    final_dataset = df
else:
    # Fetch every prediction file stored under the "hybrid_predictions" prefix.
    download_s3_folder(bucket, "hybrid_predictions", ".")

    def _load_predictions(csv_path):
        """Read one tab-separated prediction file."""
        return pd.read_csv(csv_path, sep="\t")

    # get predictions as df
    catboost_cat_preds = _load_predictions("incremental/catboost_categorical_trainval_incremental.csv")
    catboost_preds = _load_predictions("incremental/catboost_trainval_incremental.csv")
    catboost_w_preds = _load_predictions("incremental/catboost_weighted_trainval_incremental.csv")
    light_gbm_preds = _load_predictions("incremental/light_trainval_incremental.csv")
    light_gbm_clicked_preds = _load_predictions("incremental/light_trainval_is_clicked_incremental.csv")
    nn_click_preds = _load_predictions("incremental/nn_clickpredictions_trainval_day.csv")
    nn_preds = _load_predictions("incremental/nn_trainval_incremental.csv")
    xgb_catasnum_preds = _load_predictions("incremental/xgb_catasnum_trainval_incremental.csv")
    xgb_num_preds = _load_predictions("incremental/xgboost_numerical_trainval_incremental.csv")

    # The cluster id is one-hot encoded and replaced by its indicator columns.
    clustering_preds = _load_predictions("incremental/kmeans_unsupervised_trainval.csv")
    one_hot_encoder = OneHotEncoder(handle_unknown='ignore')
    cluster_matrix = one_hot_encoder.fit_transform(clustering_preds[["kmeans_unsupervised"]])
    encoded_clustering = pd.DataFrame(cluster_matrix.toarray())
    clustering_preds_new = pd.concat(
        [clustering_preds, encoded_clustering], axis=1
    ).drop(columns=['kmeans_unsupervised'])

    # merging preds: every prediction frame is joined on the row id "f_0"
    final_dataset = df
    for prediction_frame in (
        catboost_cat_preds,
        catboost_preds,
        catboost_w_preds,
        light_gbm_preds,
        light_gbm_clicked_preds,
        nn_click_preds,
        nn_preds,
        xgb_catasnum_preds,
        xgb_num_preds,
        clustering_preds_new,
    ):
        final_dataset = final_dataset.merge(prediction_frame, on="f_0")

    # The one-hot frame has integer column labels; make every label a string.
    final_dataset.columns = final_dataset.columns.astype(str)


### Preprocessing

In [None]:
def preprocess_data_nn(
    df_train: pd.DataFrame, df_val: pd.DataFrame, Y_train: pd.DataFrame, Y_val: pd.DataFrame
) -> Tuple[np.ndarray, np.ndarray]:
    """
    Turn the raw train/validation frames into scaled feature matrices.

    Every statistic (categorical encoder, counter modes, means/stds, min/max)
    is fitted on the training split only and then applied to the validation
    split.

    Args:
        df_train: training features
        df_val: validation features
        Y_train: training target, handed to the CatBoost encoder
        Y_val: validation target; accepted for symmetry but never read here

    Returns:
        The (train, validation) outputs of ``MinMaxScaler`` -- NumPy arrays
        under scikit-learn's default output configuration, not DataFrames.
    """
    categorical_columns: List[str] = [
        col
        for col in (f"f_{i}" for i in range(2, 32 + 1))
        if col not in CATEGORICAL_TO_DROP
    ]
    kept_value_columns: List[str] = [
        col
        for col in (f"f_{i}" for i in range(42, 79 + 1))
        if col not in NUMERICAL_TO_DROP
    ]
    numerical_columns: List[str] = [
        col for col in kept_value_columns if col in NUMERICAL_NON_COUNTERS
    ]
    counter_columns: List[str] = [
        col for col in kept_value_columns if col not in NUMERICAL_NON_COUNTERS
    ]

    df_train = df_train.drop(columns=CATEGORICAL_TO_DROP + NUMERICAL_TO_DROP)
    df_val = df_val.drop(columns=CATEGORICAL_TO_DROP + NUMERICAL_TO_DROP)

    # NOTE(review): applied unconditionally, whatever the notebook-level
    # `activate_collapse_binary` flag says -- confirm this is intended.
    df_train = collapse_binary(df_train, dropOriginal=True)
    df_val = collapse_binary(df_val, dropOriginal=True)

    # Categorical features -> numbers, encoder fitted on the training split.
    cb_encoder = ce.CatBoostEncoder()
    cb_encoder.fit(df_train[categorical_columns], Y_train)
    df_train[categorical_columns] = cb_encoder.transform(df_train[categorical_columns])
    df_val[categorical_columns] = cb_encoder.transform(df_val[categorical_columns])

    # Counters: the mins/steps found on the training split are re-used for validation.
    df_train, mins_train, steps_train = encode_counters(
        df=df_train,
        columns=counter_columns,
        mins=None,
        steps=None,
    )
    df_val, _, _ = encode_counters(
        df=df_val,
        columns=counter_columns,
        mins=mins_train,
        steps=steps_train,
    )

    # DataFrame.mode() returns a DataFrame (one row per tied mode). Passing
    # that to fillna aligns on row labels, so only rows labelled 0, 1, ...
    # would be filled. Taking the first row gives a Series indexed by column
    # name, which fills every row of the matching columns.
    counter_modes: pd.Series = df_train[counter_columns].mode().iloc[0]
    df_train = df_train.fillna(counter_modes)
    df_val = df_val.fillna(counter_modes)

    for col in counter_columns:
        n_zeros: int = (df_train[col] == 0).sum()
        if n_zeros > df_train.shape[0] * 0.95:
            # Counter is zero in more than 95% of the training rows:
            # keep only the zero / non-zero information.
            df_train[col] = np.where(df_train[col].values, 1, 0)
            df_train = df_train.astype({col: "bool"})
            df_val[col] = np.where(df_val[col].values, 1, 0)
            df_val = df_val.astype({col: "bool"})
        else:
            # The +0.5 shift keeps log() finite for zero counts.
            df_train[col] = np.log(df_train[col] + 0.5)
            df_val[col] = np.log(df_val[col] + 0.5)

    # Outlier handling uses the training means/stds with a coefficient of 4.
    means: pd.Series = df_train[numerical_columns].mean()
    stds: pd.Series = df_train[numerical_columns].std()
    df_train = remove_outliers(
        df=df_train,
        columns=numerical_columns,
        coefficient=4,
        means=means,
        stds=stds,
    )
    df_val = remove_outliers(
        df=df_val,
        columns=numerical_columns,
        coefficient=4,
        means=means,
        stds=stds,
    )

    # Standardise with statistics recomputed after the outlier step.
    means_no_outliers: pd.Series = df_train[numerical_columns].mean()
    stds_no_outliers: pd.Series = df_train[numerical_columns].std()
    df_train.loc[:, numerical_columns] = (
        df_train.loc[:, numerical_columns] - means_no_outliers
    ) / stds_no_outliers
    df_val.loc[:, numerical_columns] = (
        df_val.loc[:, numerical_columns] - means_no_outliers
    ) / stds_no_outliers

    # Remaining gaps get the column mean. The columns are already
    # standardised at this point, so that mean is 0 -- filling with the
    # raw-scale mean would insert values on the wrong scale.
    df_train.loc[:, numerical_columns] = df_train.loc[:, numerical_columns].fillna(0.0)
    df_val.loc[:, numerical_columns] = df_val.loc[:, numerical_columns].fillna(0.0)

    # Final rescaling of every column to the training min/max range.
    scaler = MinMaxScaler()
    df_train = scaler.fit_transform(df_train)
    df_val = scaler.transform(df_val)

    print("Preprocessing ended")

    return df_train, df_val

### Network structure

In [None]:
class Net(nn.Module):
    """Fully connected network that ends in a single-unit output layer.

    `layer_sizes` lists the width of each hidden layer, `activation` is a
    callable applied after every hidden layer, `dropout_rate` configures the
    dropout applied after each activation and `input_features` is the width
    of the input.
    """

    def __init__(self, layer_sizes, activation, dropout_rate, input_features):
        super().__init__()

        # First hidden layer maps the input onto layer_sizes[0].
        self.layers = nn.ModuleList([nn.Linear(input_features, layer_sizes[0])])
        print("layer_sizes ", layer_sizes)
        # The remaining hidden layers chain consecutive widths.
        self.layers.extend(
            [
                nn.Linear(width_in, width_out)
                for width_in, width_out in zip(layer_sizes, layer_sizes[1:])
            ]
        )
        self.activation = activation
        self.dropout = nn.Dropout(dropout_rate)
        # Output layer with size 1
        self.output = nn.Linear(layer_sizes[-1], 1)

    def forward(self, x):
        hidden = x
        # Activation and dropout follow every hidden layer; the output
        # layer returns its raw value (no activation, no dropout).
        for linear in self.layers:
            hidden = self.dropout(self.activation(linear(hidden)))
        return self.output(hidden)

In [None]:
def train(model, X_train, y_train, X_val, y_val, optimizer, criterion, device, trial, num_epochs, batch_size, patience):
    """
    Train `model` with shuffled mini-batches and early stopping.

    After every epoch the validation loss is computed with `evaluate`; each
    time it improves the weights are written to 'best_model.pth'. Training
    stops once the loss has failed to improve for `patience` consecutive
    epochs, or after `num_epochs` epochs.

    Args:
        model: network whose output goes through a sigmoid before the loss
        X_train, y_train: training tensors, indexed along dimension 0
        X_val, y_val: validation tensors forwarded to `evaluate`
        optimizer: optimizer updating the parameters of `model`
        criterion: loss applied to (sigmoid(model(x)), y)
        device: forwarded to `evaluate`
        trial: accepted for interface compatibility, not used
        num_epochs: maximum number of epochs
        batch_size: number of samples per mini-batch
        patience: epochs without improvement tolerated before stopping

    Returns:
        The lowest validation loss observed.
    """
    best_val_loss = np.inf
    patience_counter = 0
    n_samples = X_train.size()[0]

    for epoch in range(num_epochs):
        print(f"Epoch {epoch} started")

        # evaluate() puts the model in eval mode at the end of each epoch,
        # so training mode (and with it dropout) is re-enabled every epoch.
        model.train()

        # A fresh random order of the samples for every epoch.
        permutation = torch.randperm(n_samples)
        for start in range(0, n_samples, batch_size):
            optimizer.zero_grad()

            indices = permutation[start:start + batch_size]
            batch_x, batch_y = X_train[indices], y_train[indices]

            outputs = torch.sigmoid(model(batch_x))
            loss = criterion(outputs, batch_y)
            loss.backward()
            optimizer.step()

        # Calculate validation loss
        val_loss = evaluate(model, X_val, y_val, criterion, device)

        print(f"Loss epoch {epoch}: {val_loss}")

        # Check if we need to save the model
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            patience_counter = 0
            torch.save(model.state_dict(), 'best_model.pth')  # Save the model
        else:
            patience_counter += 1
            if patience_counter >= patience:
                print("Early stopping due to lack of improvement in validation loss.")
                break

        print(f"Epoch {epoch} finished")

    return best_val_loss

In [None]:
def evaluate(model, X_val, y_val, criterion, device):
    """
    Return the loss of `model` on the validation tensors.

    The model is switched to evaluation mode (and left in it) and the whole
    validation set is scored in one forward pass without gradient tracking.
    `device` is accepted but not used.
    """
    model.eval()
    with torch.no_grad():
        # Apply sigmoid to outputs
        probabilities = torch.sigmoid(model(X_val))
        batch_loss = criterion(probabilities, y_val)
        n_rows = X_val.size(0)
        weighted_loss = 0 + batch_loss.item() * n_rows
    return weighted_loss / n_rows

In [None]:
class CustomDataset(torch.utils.data.Dataset):
    """Dataset pairing each entry of `data` with the entry of `targets` at the same index."""

    def __init__(self, data, targets):
        self.data = data
        self.targets = targets

    def __len__(self):
        # The length is taken from `data`.
        return len(self.data)

    def __getitem__(self, index):
        return self.data[index], self.targets[index]

### Objective function to tune

In [None]:
def objective(trial):
    """
    Optuna objective: sample hyper-parameters, train a `Net`, return its loss.

    Rows of `final_dataset` with f_1 < 65 are used for training and rows
    with f_1 >= 65 for validation; the target is "is_installed". The value
    returned is the best validation loss reported by `train`.
    """
    # Define the search space for hyperparameters
    num_layers = trial.suggest_int('num_layers', 1, 7)
    layer_sizes = [
        trial.suggest_int(f'hidden_size_{depth}', 16, 256, log=True)
        for depth in range(num_layers)
    ]
    activation_function = trial.suggest_categorical('activation_function', ['relu', 'swish', 'elu'])
    learning_rate = trial.suggest_float('learning_rate', 1e-5, 1e-3, log=True)
    weight_decay = trial.suggest_float('weight_decay', 1e-6, 1e-3, log=True)
    dropout_rate = trial.suggest_float('dropout_rate', 0.0, 0.3)
    batch_size = trial.suggest_categorical('batch_size',[32, 64, 128, 256])

    def swish(x):
        # swish is defined here by hand: x * sigmoid(x)
        return x * torch.sigmoid(x)

    activation = {
        'relu': torch.nn.functional.relu,
        'elu': torch.nn.functional.elu,
        'swish': swish,
    }[activation_function]

    # Split by the f_1 column: earlier values train, later ones validate.
    val_day = 65
    train_df = final_dataset[final_dataset["f_1"] < val_day]
    val_df = final_dataset[final_dataset["f_1"] >= val_day]

    target_columns = ["is_clicked", "is_installed"]
    y_train = train_df[["is_installed"]]
    y_val = val_df[["is_installed"]]
    X_train, X_val = preprocess_data_nn(
        train_df.drop(columns=target_columns),
        val_df.drop(columns=target_columns),
        y_train,
        y_val,
    )

    def as_device_tensor(values):
        # float32 tensor placed on the notebook-level device
        return torch.tensor(values, dtype=torch.float32).to(device)

    X_train = as_device_tensor(X_train)
    y_train = as_device_tensor(y_train.to_numpy())
    X_val = as_device_tensor(X_val)
    y_val = as_device_tensor(y_val.to_numpy())

    # Create the model and move it to the selected device
    model = Net(layer_sizes, activation, dropout_rate, X_train.shape[1]).to(device)

    # Define the loss function and the optimizer
    criterion = nn.BCELoss()
    optimizer = optim.SGD(model.parameters(), lr=learning_rate, weight_decay=weight_decay)

    print("Num_epochs: ", num_epochs)

    # Train the model; the best validation loss is the objective value
    return train(
        model, X_train, y_train, X_val, y_val,
        optimizer, criterion, device, trial, num_epochs, batch_size, patience=5,
    )

In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# Maximum number of training epochs per trial.
num_epochs = 100

In [None]:
def optuna_callback(study, trial):
    """Print the number, value and parameters of `trial`, then the best value and best trial of `study`."""
    # Each line is rendered lazily so the attributes are read, and the lines
    # printed, strictly one after the other.
    report = (
        lambda: f"Trial number: {trial.number}",
        lambda: f"Trial value (loss): {trial.value}",
        lambda: f"Trial parameters: {trial.params}",
        lambda: f"Best value so far: {study.best_value}",
        lambda: f"Best trial so far: {study.best_trial}",
    )
    for render in report:
        print(render())

### Tuning

In [None]:
# Storage URL for the study. A relative SQLite file needs three slashes
# ("sqlite:///optuna.db"); with only two, "optuna.db" is parsed as a host
# name and the URL is rejected.
OPTUNA_STORAGE: str = os.getenv("OPTUNA_STORAGE", "sqlite:///optuna.db")

# load_if_exists lets the cell be re-run against the same storage: the
# existing "Hivemind" study is resumed instead of raising a duplicate-study error.
study = optuna.create_study(
    direction='minimize',
    sampler=optuna.samplers.TPESampler(seed=777),
    study_name="Hivemind",
    storage=OPTUNA_STORAGE,
    load_if_exists=True,
)
study.optimize(objective, n_trials=200)

In [None]:
# Report the best trial found by the study: its loss, then each parameter.
trial = study.best_trial
print('Best loss: ', trial.value)
print('Best parameters: ')
best_params = trial.params
for key in best_params:
    value = best_params[key]
    print(f"  {key}: {value}")