In [None]:
# Basic
import numpy as np
import pandas as pd
import torch
from pathlib import Path

# Models
from fastai.tabular.all import *
from fastai.callback.fp16 import *
from fastai.metrics import RocAuc
from sklearn.metrics import f1_score, precision_score, accuracy_score, roc_auc_score
from sklearn.model_selection import train_test_split
import lightgbm as lgb

# Resampling
from imblearn.over_sampling import RandomOverSampler

# Report whether PyTorch can see a CUDA device in this session.
cuda_available = torch.cuda.is_available()
print("GPU Available:", cuda_available)

In [None]:
# Show the directory layout under /kaggle (-d: directories only).
! tree -d /kaggle
# List the competition input files with their sizes (-s) in human-readable units (-h).
!ls -sh /kaggle/input/analytics-olympiad-2023

In [None]:
%%time
### Set the Path of dataset for easy retrieval ###
path = Path("/kaggle/input/analytics-olympiad-2023")
train = pd.read_csv(path/"train.csv")
test = pd.read_csv(path/"test.csv")

In [None]:
# Identifier / name columns that are removed from both frames before modelling.
drop_cols = ["customer_id","firstname","lastname"]

# Rebind instead of dropping in place: with errors="ignore" the cell can be
# re-run safely (an in-place drop raises KeyError on the second run because
# the columns are already gone).
train = train.drop(columns=drop_cols, errors="ignore")
test = test.drop(columns=drop_cols, errors="ignore")

# Shapes after the drop, as (rows, columns).
print(train.shape)
print(test.shape)

In [None]:
# Preview the first rows of the training frame.
train.head()

In [None]:
# Preview the first rows of the test frame.
test.head()

In [None]:
### Columns that are present in TRAIN SET but not in TEST SET ###
print("Columns that are present in TRAIN SET but not in TEST SET\n")

# Build the lookup once; the original rebuilt list(test.columns) on every iteration.
test_columns = set(test.columns)

# Iterate the train columns in their original order so the printout is unchanged.
for col in train.columns:
    if col not in test_columns:
        print(col)

# Experiment 1: Simple Base Model
## `primary_close_flag` as Target

In [None]:
%%time
train_primary = train.copy(deep=True)
train_primary = train.drop(["final_close_flag"], axis=1)
test = test.copy(deep=True)

target_name = "primary_close_flag"
train_primary.shape

### Oversampling

In [None]:
%%time
from imblearn.over_sampling import RandomOverSampler
# from imblearn.under_sampling import RandomUnderSampler

X = train_primary.drop(target_name, axis=1)
y = train_primary[[target_name]]

# Assuming you have your training data in `dls.train_ds` and the labels in `dls.train_ds`
ros = RandomOverSampler(random_state=0)
# rus = RandomUnderSampler(random_state=0)

# Resample the training data to balance the classes
X_resampled, y_resampled = ros.fit_resample(X, y)
train_primary_ros = pd.concat([X_resampled, y_resampled], axis=1)
# X_resampled, y_resampled = rus.fit_resample(X_resampled, y_resampled)

print(train_primary_ros[target_name].value_counts(normalize=True))
print(train_primary_ros.shape)

In [None]:
# Heuristic column typing: at most 10 distinct values -> categorical, otherwise
# continuous. The target column is classified too at this point.
# NOTE(review): a text column with more than 10 distinct values would be treated
# as continuous here — confirm no such column exists in the data.
unique_counts = train_primary_ros.nunique()

cat_names = [name for name, count in unique_counts.items() if count <= 10]
cont_names = [name for name, count in unique_counts.items() if count > 10]

len(cat_names), len(cont_names)

In [None]:
# The target must not be fed to the model as a feature: take it out of whichever
# list the cardinality heuristic put it in, and report where it was found.
for feature_list, message in (
    (cat_names, "Removed from cat_names"),
    (cont_names, "Removed from cont_names"),
):
    if target_name in feature_list:
        feature_list.remove(target_name)
        print(message)

In [None]:
# Dtypes and non-null counts of the columns treated as categorical.
train_primary_ros[cat_names].info()

In [None]:
# Dtypes and non-null counts of the columns treated as continuous.
train_primary_ros[cont_names].info()

In [None]:
# Hold out 20% of the ORIGINAL rows for validation *before* oversampling.
# Splitting an already-oversampled frame puts duplicates of the same
# minority-class rows on both sides of the split, which leaks training data
# into validation and inflates the reported metrics.
# The seed makes the split reproducible across runs.
train_idx, valid_idx = RandomSplitter(valid_pct=0.2, seed=42)(range_of(train_primary))
train_rows = train_primary.iloc[list(train_idx)]
valid_rows = train_primary.iloc[list(valid_idx)]

# Balance only the training rows; validation keeps the real class distribution.
X_train_ros, y_train_ros = RandomOverSampler(random_state=0).fit_resample(
    train_rows.drop(columns=[target_name]), train_rows[[target_name]]
)
train_rows_ros = pd.concat([X_train_ros, y_train_ros], axis=1)

# Stack the balanced training rows on top of the validation rows and express
# the split as positional indices into the combined frame.
train_valid = pd.concat([train_rows_ros, valid_rows], ignore_index=True)
n_train = len(train_rows_ros)
splits = (list(range(n_train)), list(range(n_train, len(train_valid))))

# Preprocessing steps applied by TabularPandas, in this order.
data_preprocessing = [FillMissing, Categorify, Normalize]

# TabularPandas object for preprocessing
to = TabularPandas(train_valid,
                   procs=data_preprocessing,
                   cat_names=cat_names,
                   cont_names=cont_names,
                   y_names=target_name,
                   y_block = CategoryBlock(),
                   splits=splits)

# Create DataLoaders for training and validation
dls = to.dataloaders(bs=1024, shuffle=True)
dls.show_batch()

In [None]:
# Shapes of the processed training features and targets.
dls.train.xs.shape, dls.train.ys.shape

In [None]:
# Shapes of the processed validation features and targets.
dls.valid.xs.shape, dls.valid.ys.shape

### Train Model

In [None]:
from fastai.callback.tracker import SaveModelCallback

# Checkpoint callback: tracks the metric reported as 'roc_auc_score' and writes
# the weights under the name 'best_model'.
checkpoint_cb = SaveModelCallback(monitor='roc_auc_score', fname='best_model')

# Two hidden layers (500 and 250 units) with weight decay 0.1; accuracy and
# binary ROC AUC are reported during training.
learn = tabular_learner(
    dls,
    layers=[500, 250],
    metrics=[accuracy, RocAucBinary()],
    wd=0.1,
    cbs=[checkpoint_cb],
)

# Learning-rate range test; its result is only displayed, not stored.
learn.lr_find()

In [None]:
# Train for 10 epochs with the one-cycle schedule. The maximum learning rate is
# given as a hand-picked slice (5e-4 to 5e-3); the lr_find() output from the
# previous cell is not consumed programmatically.
learn.fit_one_cycle(n_epoch=10, lr_max=slice(5e-4,5e-3))

### Inference

In [None]:
# Build a test DataLoader from the learner's DataLoaders so the test frame goes
# through the same preprocessing — presumably with the statistics fitted on the
# training data; confirm against the fastai docs.
test_dls = learn.dls.test_dl(test)

In [None]:
# get_preds returns a (predictions, targets) pair; the second element is not
# used afterwards and is only bound to unpack the pair.
predicted_probabilites, target = learn.get_preds(dl=test_dls)

# Position of the highest-scoring class in every row.
# NOTE(review): this is an index into the class vocabulary; it equals the label
# only if the labels are 0..k-1 in sorted order — confirm.
predicted_class = torch.argmax(predicted_probabilites, dim=1)

# Quick look at the first 20 predictions.
print(predicted_class[:20])

In [None]:
# Start the submission frame with the predictions for `primary_close_flag`.
# Convert the tensor to a NumPy array explicitly rather than relying on pandas
# to coerce a torch tensor, so the column is a plain integer column.
# NOTE(review): the frame carries no identifier column (customer_id was dropped
# from `test` earlier) — confirm this matches the expected submission format.
submission = pd.DataFrame(data=predicted_class.cpu().numpy(), columns=["primary_close_flag"])
submission

## `final_close_flag` as Target

In [None]:
%%time
train_final = train.copy(deep=True)
train_final = train.drop(["primary_close_flag"], axis=1)
test = test.copy(deep=True)

target_name = "final_close_flag"
train_final.shape

### Oversampling

In [None]:
%%time
from imblearn.over_sampling import RandomOverSampler
# from imblearn.under_sampling import RandomUnderSampler

X = train_final.drop(target_name, axis=1)
y = train_final[[target_name]]

# Assuming you have your training data in `dls.train_ds` and the labels in `dls.train_ds`
ros = RandomOverSampler(random_state=0)
# rus = RandomUnderSampler(random_state=0)

# Resample the training data to balance the classes
X_resampled, y_resampled = ros.fit_resample(X, y)

train_final_ros = pd.concat([X_resampled, y_resampled], axis=1)
# X_resampled, y_resampled = rus.fit_resample(X_resampled, y_resampled)

print(train_final_ros[target_name].value_counts(normalize=True))
print(train_final_ros.shape)

In [None]:
# Heuristic column typing: at most 10 distinct values -> categorical, otherwise
# continuous. The target column is classified too at this point.
# NOTE(review): a text column with more than 10 distinct values would be treated
# as continuous here — confirm no such column exists in the data.
unique_counts = train_final_ros.nunique()

cat_names = [name for name, count in unique_counts.items() if count <= 10]
cont_names = [name for name, count in unique_counts.items() if count > 10]

len(cat_names), len(cont_names)

In [None]:
# The target must not be fed to the model as a feature: take it out of whichever
# list the cardinality heuristic put it in, and report where it was found.
for feature_list, message in (
    (cat_names, "Removed from cat_names"),
    (cont_names, "Removed from cont_names"),
):
    if target_name in feature_list:
        feature_list.remove(target_name)
        print(message)

In [None]:
# Hold out 20% of the ORIGINAL rows for validation *before* oversampling.
# Splitting an already-oversampled frame puts duplicates of the same
# minority-class rows on both sides of the split, which leaks training data
# into validation and inflates the reported metrics.
# The seed makes the split reproducible across runs.
train_idx, valid_idx = RandomSplitter(valid_pct=0.2, seed=42)(range_of(train_final))
train_rows = train_final.iloc[list(train_idx)]
valid_rows = train_final.iloc[list(valid_idx)]

# Balance only the training rows; validation keeps the real class distribution.
X_train_ros, y_train_ros = RandomOverSampler(random_state=0).fit_resample(
    train_rows.drop(columns=[target_name]), train_rows[[target_name]]
)
train_rows_ros = pd.concat([X_train_ros, y_train_ros], axis=1)

# Stack the balanced training rows on top of the validation rows and express
# the split as positional indices into the combined frame.
train_valid = pd.concat([train_rows_ros, valid_rows], ignore_index=True)
n_train = len(train_rows_ros)
splits = (list(range(n_train)), list(range(n_train, len(train_valid))))

# Preprocessing steps applied by TabularPandas, in this order.
data_preprocessing = [FillMissing, Categorify, Normalize]

# TabularPandas object for preprocessing
to = TabularPandas(train_valid,
                   procs=data_preprocessing,
                   cat_names=cat_names,
                   cont_names=cont_names,
                   y_names=target_name,
                   y_block = CategoryBlock(),
                   splits=splits)

# Create DataLoaders for training and validation
dls = to.dataloaders(bs=1024, shuffle=True)
dls.show_batch()

In [None]:
# Shapes of the processed training features and targets.
dls.train.xs.shape, dls.train.ys.shape

In [None]:
# Shapes of the processed validation features and targets.
dls.valid.xs.shape, dls.valid.ys.shape

### Train Model

In [None]:
from fastai.callback.tracker import SaveModelCallback

# Checkpoint callback: tracks the metric reported as 'roc_auc_score' and writes
# the weights under the name 'best_model'.
# NOTE(review): the same checkpoint name is used by the earlier experiment, so
# that file is overwritten here.
checkpoint_cb = SaveModelCallback(monitor='roc_auc_score', fname='best_model')

# Two hidden layers (500 and 250 units) with weight decay 0.1; accuracy and
# binary ROC AUC are reported during training.
learn = tabular_learner(
    dls,
    layers=[500, 250],
    metrics=[accuracy, RocAucBinary()],
    wd=0.1,
    cbs=[checkpoint_cb],
)

# Learning-rate range test; its result is only displayed, not stored.
learn.lr_find()

In [None]:
%%time
# Train for 10 epochs with the one-cycle schedule. The maximum learning rate is
# given as a hand-picked slice (1e-4 to 1e-2); the lr_find() output from the
# previous cell is not consumed programmatically.
learn.fit_one_cycle(n_epoch=10, lr_max=slice(1e-4,1e-2))

In [None]:
# Display the learner's model summary.
learn.summary()

### Inference

In [None]:
# Build a test DataLoader from the learner's DataLoaders so the test frame goes
# through the same preprocessing — presumably with the statistics fitted on the
# training data; confirm against the fastai docs.
test_dls = learn.dls.test_dl(test)

In [None]:
# get_preds returns a (predictions, targets) pair; the second element is not
# used afterwards and is only bound to unpack the pair.
predicted_probabilites, target = learn.get_preds(dl=test_dls)

# Position of the highest-scoring class in every row.
# NOTE(review): this is an index into the class vocabulary; it equals the label
# only if the labels are 0..k-1 in sorted order — confirm.
predicted_class = torch.argmax(predicted_probabilites, dim=1)

# Quick look at the first 20 predictions.
print(predicted_class[:20])

In [None]:
# Add the `final_close_flag` predictions to the submission frame created by the
# first experiment. Convert the tensor to a NumPy array explicitly rather than
# relying on pandas to coerce a torch tensor.
submission["final_close_flag"] = predicted_class.cpu().numpy()
submission

### Export for Submission

In [None]:
# Write the submission to the working directory; index=False keeps the row
# index out of the CSV.
submission.to_csv("my_submission3.csv",index=False)

---