In [1]:
!pip install -q -U autogluon

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
aiobotocore 2.13.0 requires aiohttp<4.0.0,>=3.9.2, but you have aiohttp 3.9.1 which is incompatible.
aiobotocore 2.13.0 requires botocore<1.34.107,>=1.34.70, but you have botocore 1.29.165 which is incompatible.
albumentations 1.4.0 requires scikit-image>=0.21.0, but you have scikit-image 0.20.0 which is incompatible.
preprocessing 0.1.13 requires nltk==3.2.4, but you have nltk 3.8.1 which is incompatible.
spopt 0.6.0 requires shapely>=2.0.1, but you have shapely 1.8.5.post1 which is incompatible.
torchaudio 2.1.2+cpu requires torch==2.1.2, but you have torch 2.3.1 which is incompatible.
torchtext 0.16.2+cpu requires torch==2.1.2, but you have torch 2.3.1 which is incompatible.
ydata-profiling 4.6.4 requires numpy<1.26,>=1.16.0, but you have numpy 1.26.4 which is incompatible.[0m[31m
[0m

In [2]:
from autogluon.tabular import TabularPredictor, TabularDataset
import pandas as pd
import numpy as np
import pickle
import shutil

# Configs

In [3]:
TARGET = 'Response'
TIME_LIMIT = 3600 * 10

TRAIN_PATH = '/kaggle/input/playground-series-s4e7/train.csv'
TEST_PATH = '/kaggle/input/playground-series-s4e7/test.csv'

# Loading and Processing Data

In [4]:
class DataLoader:
    def __init__(self, path):
        self.path = path

    @staticmethod
    def encode_categorical_features(dataframe):  
        print("--- Encoding categorical features")
        dataframe = dataframe.copy()
        
        gender_mapping = {"Male": 0, "Female": 1}
        vehicle_age_mapping = {"< 1 Year": 0, "1-2 Year": 1, "> 2 Years": 2}
        vehicle_damage_mapping = {'No': 0, 'Yes': 1}
        
        dataframe["Gender"] = dataframe["Gender"].map(gender_mapping)
        dataframe["Vehicle_Age"] = dataframe["Vehicle_Age"].map(vehicle_age_mapping)
        dataframe['Vehicle_Damage'] = dataframe['Vehicle_Damage'].map(vehicle_damage_mapping)
        
        return dataframe
    
    @staticmethod
    def convert_data_types(dataframe):  
        print("--- Converting data types")
        dataframe = dataframe.copy()
        
        dataframe["Region_Code"] = dataframe["Region_Code"].astype(int) 
        dataframe["Annual_Premium"] = dataframe["Annual_Premium"].astype(int) 
        dataframe["Policy_Sales_Channel"] = dataframe["Policy_Sales_Channel"].astype(int) 
        
        return dataframe
    
    @staticmethod
    def add_features(dataframe):  
        print("--- Adding new features")
        dataframe = dataframe.copy()
        
        # Reference: https://www.kaggle.com/code/rohanrao/automl-grand-prix-1st-place-solution
        dataframe['Previously_Insured_Annual_Premium'] = pd.factorize(dataframe['Previously_Insured'].astype(str) + dataframe['Annual_Premium'].astype(str))[0]
        dataframe['Previously_Insured_Vehicle_Age'] = pd.factorize(dataframe['Previously_Insured'].astype(str) + dataframe['Vehicle_Age'].astype(str))[0]
        dataframe['Previously_Insured_Vehicle_Damage'] = pd.factorize(dataframe['Previously_Insured'].astype(str) + dataframe['Vehicle_Damage'].astype(str))[0]
        dataframe['Previously_Insured_Vintage'] = pd.factorize(dataframe['Previously_Insured'].astype(str) + dataframe['Vintage'].astype(str))[0]
        
        return dataframe
    
    @staticmethod
    def reduce_mem_usage(dataframe):
        # Reference: https://www.kaggle.com/competitions/playground-series-s4e7/discussion/516103#2899151
        
        print("--- Reducing memory usage")
        dataframe = dataframe.copy()
        initial_mem_usage = dataframe.memory_usage().sum() / 1024**2
        
        for col in dataframe.columns:
            col_type = dataframe[col].dtype

            if col_type.name in ['category', 'object']:
                raise ValueError(f"Column '{col}' is of type '{col_type.name}'")

            c_min = dataframe[col].min()
            c_max = dataframe[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    dataframe[col] = dataframe[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    dataframe[col] = dataframe[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    dataframe[col] = dataframe[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    dataframe[col] = dataframe[col].astype(np.int64)
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    dataframe[col] = dataframe[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    dataframe[col] = dataframe[col].astype(np.float32)
                else:
                    dataframe[col] = dataframe[col].astype(np.float64)

        final_mem_usage = dataframe.memory_usage().sum() / 1024**2
        print('------ Memory usage before: {:.2f} MB'.format(initial_mem_usage))
        print('------ Memory usage after: {:.2f} MB'.format(final_mem_usage))
        print('------ Decreased memory usage by {:.1f}%'.format(100 * (initial_mem_usage - final_mem_usage) / initial_mem_usage))
        print(f"\nDataset size: {dataframe.shape}\n\n")

        return dataframe


    def load(self):
        print(f"Loading data from {self.path}")
        dataframe = pd.read_csv(self.path, index_col="id")
        
        dataframe = self.encode_categorical_features(dataframe)
        dataframe = self.convert_data_types(dataframe)
        dataframe = self.add_features(dataframe)
        dataframe = self.reduce_mem_usage(dataframe)
        
        return TabularDataset(dataframe)

In [5]:
train = DataLoader(TRAIN_PATH).load()
test = DataLoader(TEST_PATH).load()

Loading data from /kaggle/input/playground-series-s4e7/train.csv
--- Encoding categorical features
--- Converting data types
--- Adding new features
--- Reducing memory usage
------ Memory usage before: 1404.39 MB
------ Memory usage after: 351.10 MB
------ Decreased memory usage by 75.0%

Dataset size: (11504798, 15)


Loading data from /kaggle/input/playground-series-s4e7/test.csv
--- Encoding categorical features
--- Converting data types
--- Adding new features
--- Reducing memory usage
------ Memory usage before: 877.75 MB
------ Memory usage after: 226.75 MB
------ Decreased memory usage by 74.2%

Dataset size: (7669866, 14)




# Training

In [6]:
predictor = TabularPredictor(
    label=TARGET,
    eval_metric='roc_auc',
    verbosity=2
).fit(
    train_data=train,
    time_limit=TIME_LIMIT, 
    presets='best_quality'
)

No path specified. Models will be saved in: "AutogluonModels/ag-20240703_184453"
Verbosity: 2 (Standard Logging)
AutoGluon Version:  1.1.1
Python Version:     3.10.13
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #1 SMP Thu Jun 27 20:43:36 UTC 2024
CPU Count:          4
Memory Avail:       29.61 GB / 31.36 GB (94.4%)
Disk Space Avail:   19.50 GB / 19.52 GB (99.9%)
Presets specified: ['best_quality']
Setting dynamic_stacking from 'auto' to True. Reason: Enable dynamic_stacking when use_bag_holdout is disabled. (use_bag_holdout=False)
Stack configuration (auto_stack=True): num_stack_levels=1, num_bag_folds=8, num_bag_sets=1
DyStack is enabled (dynamic_stacking=True). AutoGluon will try to determine whether the input data is affected by stacked overfitting and enable or disable stacking as a consequence.
	This is used to identify the optimal `num_stack_levels` value. Copies of AutoGluon will be fit on subsets of the data. Then holdout validation data is used to 

In [7]:
predictor.leaderboard().style.background_gradient('RdYlGn', subset=['score_val'])

Unnamed: 0,model,score_val,eval_metric,pred_time_val,fit_time,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,LightGBMXT_BAG_L2,0.880765,roc_auc,5795.993498,23779.364881,1158.438339,7214.802805,2,True,7
1,WeightedEnsemble_L3,0.880765,roc_auc,5800.419006,23828.161579,4.425508,48.796698,3,True,9
2,LightGBM_BAG_L2,0.879145,roc_auc,4731.452361,17738.542772,93.897202,1173.980696,2,True,8
3,WeightedEnsemble_L2,0.877849,roc_auc,4639.394775,16459.692617,4.248143,34.54168,2,True,6
4,LightGBM_BAG_L1,0.873157,roc_auc,271.986627,2000.109174,271.986627,2000.109174,1,True,4
5,LightGBMXT_BAG_L1,0.871993,roc_auc,3977.139974,14332.783266,3977.139974,14332.783266,1,True,3
6,KNeighborsUnif_BAG_L1,0.758307,roc_auc,193.432872,53.089103,193.432872,53.089103,1,True,1
7,KNeighborsDist_BAG_L1,0.751273,roc_auc,192.587159,39.169394,192.587159,39.169394,1,True,2
8,CatBoost_BAG_L1,0.726313,roc_auc,2.408528,139.411138,2.408528,139.411138,1,True,5


In [8]:
oof_predictions = predictor.predict_proba_oof()
with open('autogluon_oof_pred_probs.pkl', 'wb') as f:
    pickle.dump(oof_predictions.values, f)

# Inference

In [9]:
test_pred_probs = predictor.predict_proba(test, as_multiclass=True)

In [10]:
with open('autogluon_test_pred_probs.pkl', 'wb') as f:
    pickle.dump(test_pred_probs.values, f)

In [11]:
sub = pd.read_csv('/kaggle/input/playground-series-s4e7/sample_submission.csv')
sub['id'] = sub['id'].astype(np.int32)
sub[TARGET] = test_pred_probs.values[:, 1].astype(np.float16)
sub.to_csv('submission.csv', index=False)
sub.head()

Unnamed: 0,id,Response
0,11504798,0.03772
1,11504799,0.048462
2,11504800,0.05658
3,11504801,0.000111
4,11504802,0.109314


In [12]:
shutil.rmtree('AutogluonModels')