In [None]:
# Basic
import numpy as np
import pandas as pd
import torch
from pathlib import Path

# Models
from fastai.tabular.all import *
from fastai.callback.fp16 import *
from fastai.metrics import RocAuc
from sklearn.metrics import f1_score, precision_score, accuracy_score, roc_auc_score
from sklearn.model_selection import train_test_split
import lightgbm as lgb

# Resampling
from imblearn.over_sampling import RandomOverSampler

# Report whether PyTorch can see a CUDA device in this session
print("GPU Available:", torch.cuda.is_available())

In [None]:
# Show the directory layout under /kaggle (directories only),
# then list the competition files together with their sizes.
! tree -d /kaggle
!ls -sh /kaggle/input/analytics-olympiad-2023

In [None]:
### Folder that holds the competition CSV files ###
path = Path("/kaggle/input/analytics-olympiad-2023")

### Read the train and test splits from that folder ###
train = pd.read_csv(path.joinpath("train.csv"))
test = pd.read_csv(path.joinpath("test.csv"))

### Report (rows, columns): train on the first line, test on the second ###
print(train.shape, test.shape, sep="\n")

In [None]:
### Preview the first two rows of the train set ###
train.head(2)

In [None]:
### Preview the first two rows of the test set ###
test.head(2)

In [None]:
### Columns that are present in TRAIN SET but not in TEST SET ###
print("Columns that are present in TRAIN SET but not in TEST SET\n")

# Collect the train-only column names first (train column order), then print one per line
test_columns = list(test.columns)
train_only_columns = [name for name in train.columns if name not in test_columns]

for col in train_only_columns:
    print(col)

In [None]:
### Extract Categorical and Numerical Column names
def extract_cat_num_cols(df, max_categories=10):
    """Split the columns of ``df`` into categorical and numerical names.

    A column is treated as categorical when it has fewer than
    ``max_categories`` distinct non-null values; every other column is
    treated as numerical.

    Args:
        df: DataFrame whose columns are classified.
        max_categories: Exclusive upper bound on the number of distinct
            values for a column to count as categorical. Defaults to 10.

    Returns:
        Tuple ``(categorical_columns, numerical_columns)`` of column-name
        lists, each following the column order of ``df``.
    """
    categorical_columns = []
    numerical_columns = []

    for col in df.columns:
        # Cardinality is measured on the frame that was passed in, so the
        # helper gives correct results for any DataFrame.
        if int(df[col].nunique()) < max_categories:
            categorical_columns.append(col)
        else:
            numerical_columns.append(col)

    return categorical_columns, numerical_columns

print("Done!!!")

# Experiment 1
* `primary_close_flag` as target

## Dropping the unused columns

In [None]:
# Experiment 1 predicts "primary_close_flag", so the other label
# ("final_close_flag") and the identifier/name columns leave the training frame.
target_name = "primary_close_flag"
experiment1_drop_cols = ["final_close_flag", "customer_id", "firstname", "lastname"]
train_val_df = train.drop(columns=experiment1_drop_cols)

# test_df is a second name for the same object as `test` (no copy is made here)
test_df = test

In [None]:
### Split the columns of train_val_df into categorical and numerical name lists
cat_names, cont_names = extract_cat_num_cols(train_val_df)  

# Report how many columns fell into each group, with the first five names of each
print(f"\nCategorical Columns --- {len(cat_names)} --- \n{cat_names[:5]} ...& more")
print(f"\nNumerical Columns --- {len(cont_names)} --- \n{cont_names[:5]} ...& more")

In [None]:
# The target must not be passed to the model as a feature.
# Filtering (instead of list.remove) keeps this cell safe to re-run: a second
# execution simply finds nothing to drop rather than raising ValueError.
cat_names = [name for name in cat_names if name != target_name]

In [None]:
### Continuous columns must all carry a numerical dtype; list each one to verify
print("Continuous Columns must have numerical Dtype")
for col, dtype in train_val_df[cont_names].dtypes.items():
    print(col, "---|||", dtype)

In [None]:
# Total number of feature columns (categorical + continuous)
len(cat_names) + len(cont_names)

In [None]:
# Feature columns fed to the model, categorical first
feature_names = cat_names + cont_names

# Train set 80% & Validation set 20%, split BEFORE oversampling.
# Oversampling first would copy minority rows onto both sides of the split
# and inflate every validation metric. The seed makes the split reproducible.
train_idx, valid_idx = RandomSplitter(valid_pct=0.2, seed=42)(range_of(train_val_df))
train_part = train_val_df.iloc[list(train_idx)]
valid_part = train_val_df.iloc[list(valid_idx)]

# Perform random oversampling on the training rows only
X_resampled, y_resampled = RandomOverSampler(random_state=42).fit_resample(
    train_part[feature_names], train_part[target_name]
)

# Balanced training rows come first, the untouched validation rows follow
resampled_train_df = pd.concat([X_resampled, y_resampled], axis=1)
resampled_df = pd.concat(
    [resampled_train_df, valid_part[feature_names + [target_name]]],
    ignore_index=True,
)
print(resampled_train_df[target_name].value_counts(normalize=True))

# Positional (train, validation) indices into resampled_df
n_train = len(resampled_train_df)
splits = (list(range(n_train)), list(range(n_train, len(resampled_df))))

# Preprocessing Techniques
data_preprocessing = [FillMissing, Categorify, Normalize]

# TabularPandas object for preprocessing
to = TabularPandas(resampled_df,
                   procs=data_preprocessing,
                   cat_names=cat_names,
                   cont_names=cont_names,
                   y_names=target_name,
                   splits=splits)

# Extract the preprocessed training and validation features and targets
X_train = to.train.xs
y_train = to.train.ys

X_val = to.valid.xs
y_val = to.valid.ys

# Sanity check: train + validation rows should equal len(resampled_df)
print(X_train.shape[0] + X_val.shape[0])

# Combine the preprocessed training features and target
preprocessed_train_df = pd.concat([X_train, y_train[target_name]], axis=1)

# Combine the preprocessed validation features and target
preprocessed_valid_df = pd.concat([X_val, y_val[target_name]], axis=1)

# Concatenate the preprocessed training and validation dataframes
preprocessed_df = pd.concat([preprocessed_train_df, preprocessed_valid_df], ignore_index=True)

###########################################################################

In [None]:
# Create a TabularPandas object for test data (no target, no splits)
# NOTE(review): the procs are passed as classes to a fresh TabularPandas, so they
# appear to be set up on test_df itself rather than reusing what was fitted on the
# training data — confirm this is intended.
to_test = TabularPandas(test_df, 
                        procs=data_preprocessing, 
                        cat_names=cat_names,
                        cont_names=cont_names,
                        splits=None)

# Processed feature columns of the test set
preprocessed_test_df = to_test.xs

## Two DataLoaders

This way we can pass a different batch size into each `TabDataLoader`, along with options such as `shuffle` and `drop_last`.

In [None]:
# DataLoaders over the train/validation TabularPandas object (batch size 1024, shuffled)
dls = to.dataloaders(bs=1024, shuffle=True)
# DataLoaders over the test TabularPandas object (batch size 1024, not shuffled)
test_dls = to_test.dataloaders(bs=1024, shuffle=False)

# Display a sample batch from dls
dls.show_batch()

# TabularLearner

## Categorical Values

When dealing with our categorical data, we create what is called an **embedding matrix**. This allows for a higher dimensionality for relationships between the different categorical cardinalities. Finding the best size ratio was done through experiments by Jeremy on the Rossmann dataset.

In [None]:
# Categorical column names held by the TabularPandas object
to.cat_names

The embedding size for each categorical variable is the smaller of 600 and $1.6 \times \text{cardinality}^{0.56}$.

In [None]:
# Number of distinct values in the "loans_next_payment_summary" column
to["loans_next_payment_summary"].nunique()

If you notice, we had 8 in `(8, 10)` there; this is to take one more category for any missing categorical values that may show up.

## TabularLearner
And now we can build our model!

With the first being our categorical variables and the second being our numericals.

Now let's make our model. We'll need the size of our embeddings, the number of continuous variables, the number of outputs, and the number and size of the fully connected layers we want to use:

In [None]:
## Tabular Learner (Model) built on the Experiment 1 DataLoaders, tracking accuracy
# NOTE(review): no y_block was given to TabularPandas, so fastai infers the task type
# from the target dtype — confirm the target is treated as a classification label,
# since `accuracy` is only meaningful in that case.
learn = tabular_learner(dls, 
                        metrics=[accuracy])

In [None]:
# Print the learner's summary
learn.summary()

Now that we know the background, let's do that a bit quicker:

In [None]:
# Run the learning-rate finder to help choose lr_max for training
learn.lr_find()

In [None]:
# Early stopping is currently disabled; the configuration is kept for reference:
# early_stop = EarlyStoppingCallback(monitor='valid_loss', 
#                                    min_delta=0.001, 
#                                    patience=3)


### Train for 10 epochs with the one-cycle policy, lr_max given as slice(1e-4, 5e-3)
learn.fit_one_cycle(n_epoch=10, lr_max=slice(1e-4,5e-3))

In [None]:
### Validation and Evaluation

# Model outputs and true labels for the validation set
preds_proba, targets = learn.get_preds()

# The first output column is the score that gets thresholded and ranked
scores = preds_proba[:, 0]

# Hard 0/1 predictions: 1 wherever the score exceeds 0.5
preds = (scores > 0.5).to(torch.int)

# Threshold-based metrics use the hard predictions; ROC AUC uses the raw scores
accuracy_val = accuracy_score(targets, preds)
roc_auc_val = roc_auc_score(targets, scores)
f1_val = f1_score(targets, preds)
precision_val = precision_score(targets, preds)

# Print the results
print(f"Validation Accuracy: {accuracy_val:.4f}")
print(f"Validation ROC AUC: {roc_auc_val:.4f}")
print(f"Validation F1 Score: {f1_val:.4f}")
print(f"Validation Precision: {precision_val:.4f}")

In [None]:
# Build the inference frame from `test` without the identifier/name columns.
# Deriving a new frame (instead of dropping in place) leaves `test` untouched and
# keeps the cell re-runnable: an in-place drop raises KeyError on a second run
# because the columns are already gone.
test_df = test.drop(columns=["customer_id", "firstname", "lastname"])

In [None]:
# Columns currently present in the inference frame
test_df.columns

In [None]:
# Export the model for inference
learn.export('learner1')

# Load the model for inference; this rebinds `learn` to the loaded learner
learn = load_learner('learner1')

# Build a test DataLoader for test_df from the loaded learner's DataLoaders
test_dl = learn.dls.test_dl(test_df)

In [None]:
# Keep the model outputs for the test DataLoader (the second return value is unused).
# Using a dedicated name avoids accidentally reading the validation-set `preds`.
test_preds_proba, _ = learn.get_preds(dl=test_dl)

# Get the predicted class labels with the same 0.5 threshold used for validation
pred_labels = (test_preds_proba[:, 0] > 0.5).to(torch.int)

# Display the predicted class labels
print(pred_labels)

In [None]:
# Inspect the first DataLoader held by test_dls
test_dls[0]

In [None]:
# Model outputs for the test DataLoader (the second return value is unused)
preds_proba, _ = learn.get_preds(dl=test_dl)

# 1 where the first output column exceeds 0.5, else 0
preds = (preds_proba[:, 0] > 0.5).to(torch.int)

# Store the predictions on `test` as the primary_close_flag column and preview them
test["primary_close_flag"] = preds
test["primary_close_flag"].head()

# Experiment 2
* `final_close_flag` as target

In [None]:
### Experiment 2 predicts "final_close_flag"
target_name = "final_close_flag"

### Keep the train columns that also exist in the test set, plus the target:
### the other label and the identifier/name columns are removed.
train_val_df = train.drop(columns=["primary_close_flag", "customer_id", "firstname", "lastname"])

# test_df is a second name for the same object as `test` (no copy is made here)
test_df = test

### Extract Categorical & Numerical Columns
cat_names, cont_names = extract_cat_num_cols(train_val_df)

print(f"\nCategorical Columns --- {len(cat_names)} --- \n{cat_names[:5]} ...& more")
print(f"\nNumerical Columns --- {len(cont_names)} --- \n{cont_names[:5]} ...& more")

# The target is not a feature: take it out of the categorical list (mutates the list)
cat_names.remove(target_name)

### Continuous columns must all be numerical; list each dtype to verify
for col, dtype in train_val_df[cont_names].dtypes.items():
    print(col, "---|||", dtype)

In [None]:
# Feature columns fed to the model, categorical first
feature_names = cat_names + cont_names

# Train set 80% & Validation set 20%, split BEFORE oversampling.
# Oversampling first would copy minority rows onto both sides of the split
# and inflate every validation metric. The seed makes the split reproducible.
train_idx, valid_idx = RandomSplitter(valid_pct=0.2, seed=42)(range_of(train_val_df))
train_part = train_val_df.iloc[list(train_idx)]
valid_part = train_val_df.iloc[list(valid_idx)]

# Perform random oversampling on the training rows only
X_resampled, y_resampled = RandomOverSampler(random_state=42).fit_resample(
    train_part[feature_names], train_part[target_name]
)

# Balanced training rows come first, the untouched validation rows follow
resampled_train_df = pd.concat([X_resampled, y_resampled], axis=1)
resampled_df = pd.concat(
    [resampled_train_df, valid_part[feature_names + [target_name]]],
    ignore_index=True,
)
print(resampled_train_df[target_name].value_counts(normalize=True))

# Positional (train, validation) indices into resampled_df
n_train = len(resampled_train_df)
splits = (list(range(n_train)), list(range(n_train, len(resampled_df))))

# Preprocessing Techniques
data_preprocessing = [FillMissing, Categorify, Normalize]

# TabularPandas object for preprocessing
to = TabularPandas(resampled_df,
                   procs=data_preprocessing,
                   cat_names=cat_names,
                   cont_names=cont_names,
                   y_names=target_name,
                   splits=splits)

# Extract the preprocessed training and validation features and targets
X_train = to.train.xs
y_train = to.train.ys

X_val = to.valid.xs
y_val = to.valid.ys


# Combine the preprocessed training features and target
preprocessed_train_df = pd.concat([X_train, y_train[target_name]], axis=1)

# Combine the preprocessed validation features and target
preprocessed_valid_df = pd.concat([X_val, y_val[target_name]], axis=1)

# Concatenate the preprocessed training and validation dataframes
preprocessed_df = pd.concat([preprocessed_train_df, preprocessed_valid_df], ignore_index=True)

###########################################################################

# Create a TabularPandas object for test data
to_test = TabularPandas(test_df,
                        procs=data_preprocessing,
                        cat_names=cat_names,
                        cont_names=cont_names,
                        splits=None)

preprocessed_test_df = to_test.xs

In [None]:
# Build the DataLoaders from the Experiment 2 TabularPandas object. Without this,
# `dls` still refers to the Experiment 1 loaders, so the learner created from it
# would train on the wrong data and target.
dls = to.dataloaders(bs=1024, shuffle=True)

dls.show_batch()

In [None]:
### Tabular Learner (Model) built from `dls`, tracking accuracy and ROC AUC,
### with weight decay 0.01
learn = tabular_learner(dls, 
                        metrics=[accuracy,RocAuc()],
                        wd=0.01)

# Print the learner's summary
learn.summary()

In [None]:
# Run the learning-rate finder to help choose lr_max for training
learn.lr_find()

In [None]:
### Train for 10 epochs with the one-cycle policy, lr_max given as slice(1e-3, 1e-2)
learn.fit_one_cycle(n_epoch=10, lr_max=slice(1e-3,1e-2))

In [None]:
### Validation and Evaluation

# Model outputs and true labels for the validation set
preds_proba, targets = learn.get_preds()

# The first output column is the score that gets thresholded and ranked
scores = preds_proba[:, 0]

# Hard 0/1 predictions: 1 wherever the score exceeds 0.5
preds = (scores > 0.5).to(torch.int)

# Threshold-based metrics use the hard predictions; ROC AUC uses the raw scores
accuracy_val = accuracy_score(targets, preds)
roc_auc_val = roc_auc_score(targets, scores)
f1_val = f1_score(targets, preds)
precision_val = precision_score(targets, preds)

# Print the results
print(f"Validation Accuracy: {accuracy_val:.4f}")
print(f"Validation ROC AUC: {roc_auc_val:.4f}")
print(f"Validation F1 Score: {f1_val:.4f}")
print(f"Validation Precision: {precision_val:.4f}")

In [None]:
# Export the model for inference
learn.export('learner2')

# Load the exported model and actually use it for inference, as Experiment 1 does.
# Previously the loaded learner was bound to `model` and then ignored; `model`
# is kept as an alias so both names refer to the loaded learner.
model = load_learner('learner2')
learn = model

# Build a test DataLoader for test_df from the loaded learner's DataLoaders
test_dl = learn.dls.test_dl(test_df)

In [None]:
# Model outputs for the test DataLoader (the second return value is unused)
preds_proba, _ = learn.get_preds(dl=test_dl)

# 1 where the first output column exceeds 0.5, else 0
preds = (preds_proba[:, 0] > 0.5).to(torch.int)

# Store the predictions on `test` as the final_close_flag column and preview them
test["final_close_flag"] = preds
test["final_close_flag"].head()

# Submission Tabular Learner

In [None]:
# Keep only the two predicted flag columns for the submission and preview them
# NOTE(review): no identifier column is included — confirm the expected submission format.
submission_df = test[["primary_close_flag","final_close_flag"]]
submission_df.head()

In [None]:
# Write the submission to CSV without the index column
submission_df.to_csv('my_submission_final.csv', index=False)

In [None]:
# Show the full file tree under /kaggle
! tree /kaggle