In [5]:
from autogluon.tabular import TabularDataset, TabularPredictor
import sklearn
import pandas as pd
import numpy as np

In [6]:
train = TabularDataset("train.csv")
test = TabularDataset("test.csv")
label = 'Rings'
#predictor = TabularPredictor.load("../MachinLearningGrading/AutogluonModels/ag-20240417_070035")


In [7]:
def get_train_data():
    # get data
    data = pd.read_csv('train.csv')
    X = data.drop(['Rings','id'], axis=1)
    #transform the "sex" column into numerical values
    X['Sex'] = X['Sex'].map({'M': 0, 'F': 1, 'I': 2})

    y = data['Rings']
    return X, y

def get_valid_data():
    # get data
    data = pd.read_csv('test.csv')
    
    X = data.drop(['id'], axis=1)
    X['Sex'] = X['Sex'].map({'M': 0, 'F': 1, 'I': 2})
    return X

In [10]:
from typing import List
# lgbm = LGBMClassifier(verbose=-1)
# cat = CatBoostClassifier(verbose=False)
# et = ExtraTreesClassifier()
from sklearn.ensemble import StackingRegressor,BaggingRegressor
from sklearn.linear_model import LogisticRegression,LinearRegression
from sklearn.svm import SVR
import pandas as pd
import numpy as np
import optuna
from sklearn.model_selection import StratifiedKFold,train_test_split
import pickle
from sklearn.metrics import log_loss
#Base models: XGB, LightGBM, CatBoost, ExtraTrees, RandomForest
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor,early_stopping,log_evaluation
from catboost import CatBoostRegressor
from sklearn.ensemble import ExtraTreesRegressor, RandomForestRegressor
import lightgbm as lgb
import xgboost as xgb
import catboost as cb
import sklearn
from optuna.integration import CatBoostPruningCallback
import sklearn.metrics

class EnsembleModel:
    # EnsembleModel for binary classification, using a list of models,output using voting

    def __init__(self):
        xgbc = XGBRegressor()
        lgbm = LGBMRegressor(verbose=-1)
        cat = CatBoostRegressor(verbose=False)
        et = ExtraTreesRegressor()
        #rf = RandomForestRegressor()
        #svm = SVR()
        self.X, self.y = get_train_data()
        self.valid_X = get_valid_data()
        self.models = [xgbc, lgbm, cat, et]
        self.opt_models = {
            "XGB": (xgbc, None),
            "LGBM": (lgbm, None),
            "CatBoost": (cat, None),
            "ExtraTrees": (et, None),
            #"RandomForest": (rf, None),
            #"SVM":(svm,None)

        }
        self.hyperparams = {
            "XGB": {},
            "LGBM": {},
            "CatBoost": {},
            "ExtraTrees": {},

        }
        self.n_trials = 50
        
    def bagging_fit(self,model):
        #base models
        bag_model = BaggingRegressor(estimator=model,n_estimators=10)
        #bag_model.fit(X,y)
        return bag_model

    def stacking_fit(self):
        # Define the base models
        base_models = [
            ('rf', RandomForestRegressor()),
            ('xgb', XGBRegressor()),
            ('lgbm', LGBMRegressor()),
            ('catboost', CatBoostRegressor(verbose=0)),
            ('et', ExtraTreesRegressor()),
            #('svm', SVC(probability=True))

        ]
        # Define the stacking model
        stacking_model = StackingRegressor(estimators=base_models, final_estimator=LinearRegression())
        stacking_model.fit(self.X, self.y)
        
        #print RMSLE
        print(f" Model: Stacking, RMSLE: {np.sqrt(sklearn.metrics.mean_squared_error(self.y, stacking_model.predict(self.X)))}")

        #multi-level
        # level1 = StackingClassifier(estimators=base_models, final_estimator=LogisticRegression())
        # level2 = StackingClassifier(estimators=[('level1', level1)], final_estimator=LogisticRegression())
        # level2.fit(self.X, self.y)
        # print(f" Model: Multi-level Stacking, log_loss: {log_loss(self.y, level2.predict_proba(self.X))}")
        # print(np.mean(level2.predict(self.valid_X) == self.valid_y))

    def bag_and_stack_fit(self):
        base_models = [
            ('rf', RandomForestRegressor()),
            ('xgb', XGBRegressor()),
            ('lgbm', LGBMRegressor()),
            ('catboost', CatBoostRegressor(verbose=0)),
            ('et', ExtraTreesRegressor()),
            #('svm', SVC(probability=True))

        ]
        # Define the stacking model
        bagged_models = [(name,self.bagging_fit(model)) for name,model in base_models]
        stacking_model = StackingRegressor(estimators=bagged_models, final_estimator=LinearRegression(),cv=5,verbose=1)
        #l2_model = StackingClassifier(estimators=[("level1",stacking_model)], final_estimator=LogisticRegression())
        stacking_model.fit(self.X, self.y)
        sub = pd.DataFrame()
        sub['id'] = pd.read_csv('test.csv')['id']
        sub['Rings'] = stacking_model.predict(self.valid_x)
        sub.to_csv('submission_new_B&S.csv', index=False)
        #print RMSLE
        print(f" Model: B&S, RMSLE: {np.sqrt(sklearn.metrics.mean_squared_error(self.y, stacking_model.predict(self.X)))}")
    def baseline_fit(self):
        
        print("Start baseline training")
        print("===============================")
        for model_name, (model, _) in self.opt_models.items():
            
            model.fit(self.X, self.y)
            #print RMSLE
            print(f" Model: {model_name}, RMSLE: {np.sqrt(sklearn.metrics.mean_squared_error(self.y, model.predict(self.X)))}")
            #print(f" Model: {model_name}, log_loss: {log_loss(self.y, model.predict_proba(self.X))}")

    def validation_test(self):
        print("Start validation test")
        print("===============================")
        print(np.mean(self.predict(self.valid_X) == self.valid_y))
        print(f"  RMSLE: {np.sqrt(sklearn.metrics.mean_squared_error(self.valid_y, self.stacking_model.predict(self.valid_X)))}")
            
        

    def opt_fit(self, X, y):
        print("Start opt training")
        print("===============================")
        # get the best hyperparameters for each model

        train_X, test_X, train_y, test_y = train_test_split(
            X, y, test_size=0.2, random_state=42
        )
        for model_name, (model, model_obj) in self.opt_models.items():
            
            print("Now tuning model: ", model)
            optuna.logging.set_verbosity(optuna.logging.INFO)
            study = (
                optuna.create_study(
                    pruner=optuna.pruners.MedianPruner(n_warmup_steps=10),
                    direction="maximize",
                )
            )

            study.optimize(
                lambda trial: model_obj(trial, train_X, train_y, test_X, test_y),
                n_trials=self.n_trials,
            )
            params = study.best_params
            self.hyperparams[model_name] = params

        # set the best hyperparameters for each model
        for model_name, (model, _) in self.opt_models.items():
            model.set_params(**self.hyperparams[model_name])
        #conduct k-fold
        skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
        for train_index, test_index in skf.split(X, y):
            X_train, X_test = X.iloc[train_index], X.iloc[test_index]
            y_train, y_test = y.iloc[train_index], y.iloc[test_index]
            for model_name, (model, _) in self.opt_models.items():
                model.fit(X_train, y_train)
                
        
        #full data training
        # for model_name, (model, _) in self.opt_models.items():
        #     model.fit(X, y)
        #     print(f" Model: {model_name}, log_loss: {log_loss(y, model.predict_proba(X))}")

    def predict(self,X):
        predictions = []
        for model in self.models:
            predictions.append(model.predict(X))
        return self._voting(predictions)

    def _voting(self, predictions):
        # voting
        predictions = np.array(predictions)
        return np.mean(predictions, axis=0)

In [11]:
new_ens = EnsembleModel()

In [12]:
new_ens.X

Unnamed: 0,Sex,Length,Diameter,Height,Whole weight,Whole weight.1,Whole weight.2,Shell weight
0,1,0.550,0.430,0.150,0.7715,0.3285,0.1465,0.2400
1,1,0.630,0.490,0.145,1.1300,0.4580,0.2765,0.3200
2,2,0.160,0.110,0.025,0.0210,0.0055,0.0030,0.0050
3,0,0.595,0.475,0.150,0.9145,0.3755,0.2055,0.2500
4,2,0.555,0.425,0.130,0.7820,0.3695,0.1600,0.1975
...,...,...,...,...,...,...,...,...
90610,0,0.335,0.235,0.075,0.1585,0.0685,0.0370,0.0450
90611,0,0.555,0.425,0.150,0.8790,0.3865,0.1815,0.2400
90612,2,0.435,0.330,0.095,0.3215,0.1510,0.0785,0.0815
90613,2,0.345,0.270,0.075,0.2000,0.0980,0.0490,0.0700


In [12]:
new_ens.baseline_fit()

Start baseline training
 Model: XGB, RMSLE: 1.6345992488036352
 Model: LGBM, RMSLE: 1.7771473342081994
 Model: CatBoost, RMSLE: 1.7124457017013728
 Model: ExtraTrees, RMSLE: 0.0


In [13]:
new_ens.bag_and_stack_fit()

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001213 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1334
[LightGBM] [Info] Number of data points in the train set: 90615, number of used features: 8
[LightGBM] [Info] Start training from score 9.686685
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000660 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1334
[LightGBM] [Info] Number of data points in the train set: 90615, number of used features: 8
[LightGBM] [Info] Start training from score 9.716394
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000414 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1334
[LightGBM] [Info] Number of data points in the train set: 

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


KeyboardInterrupt: 

In [13]:
valid_x = get_valid_data()

In [95]:
new_ens.predict(valid_x)

array([ 9.84473773,  9.75041905, 10.12277832, ..., 12.1984063 ,
       13.11807686,  8.70629345])

In [14]:
#submmit to kaggle
sub = pd.DataFrame()
sub['id'] = pd.read_csv('test.csv')['id']
sub['Rings'] = new_ens.predict(valid_x)
sub.to_csv('submission_baseline_bo.csv', index=False)

In [3]:
train.drop("id", axis=1, inplace=True)

In [4]:
train

Unnamed: 0,Sex,Length,Diameter,Height,Whole weight,Whole weight.1,Whole weight.2,Shell weight,Rings
0,F,0.550,0.430,0.150,0.7715,0.3285,0.1465,0.2400,11
1,F,0.630,0.490,0.145,1.1300,0.4580,0.2765,0.3200,11
2,I,0.160,0.110,0.025,0.0210,0.0055,0.0030,0.0050,6
3,M,0.595,0.475,0.150,0.9145,0.3755,0.2055,0.2500,10
4,I,0.555,0.425,0.130,0.7820,0.3695,0.1600,0.1975,9
...,...,...,...,...,...,...,...,...,...
90610,M,0.335,0.235,0.075,0.1585,0.0685,0.0370,0.0450,6
90611,M,0.555,0.425,0.150,0.8790,0.3865,0.1815,0.2400,9
90612,I,0.435,0.330,0.095,0.3215,0.1510,0.0785,0.0815,6
90613,I,0.345,0.270,0.075,0.2000,0.0980,0.0490,0.0700,6


In [5]:
predictor = TabularPredictor(label=label,).fit(train,num_gpus=1,auto_stack=True,presets='best_quality')


No path specified. Models will be saved in: "AutogluonModels/ag-20240418_084924"
Presets specified: ['best_quality']
Stack configuration (auto_stack=True): num_stack_levels=1, num_bag_folds=8, num_bag_sets=1
Dynamic stacking is enabled (dynamic_stacking=True). AutoGluon will try to determine whether the input data is affected by stacked overfitting and enable or disable stacking as a consequence.
Detecting stacked overfitting by sub-fitting AutoGluon on the input data. That is, copies of AutoGluon will be sub-fit on subset(s) of the data. Then, the holdout validation data is used to detect stacked overfitting.
Sub-fit(s) time limit is: 3600 seconds.
Starting holdout-based sub-fit for dynamic stacking. Context path is: AutogluonModels/ag-20240418_084924/ds_sub_fit/sub_fit_ho.
Beginning AutoGluon training ... Time limit = 900s
AutoGluon will save models to "AutogluonModels/ag-20240418_084924/ds_sub_fit/sub_fit_ho"
AutoGluon Version:  1.0.0
Python Version:     3.11.8
Operating System:   D

In [8]:
predictor.leaderboard()

Unnamed: 0,model,score_val,eval_metric,pred_time_val,fit_time,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,WeightedEnsemble_L3,-0.184083,RMSLE,12.129352,2449.556508,0.006404,11.871436,3,True,13
1,NeuralNetFastAI_BAG_L2,-0.18424,RMSLE,12.122948,2437.685072,0.527626,1009.712431,2,True,12
2,WeightedEnsemble_L2,-0.186944,RMSLE,4.896739,43.776948,0.007166,11.607411,2,True,11
3,NeuralNetFastAI_BAG_L1,-0.187148,RMSLE,0.321379,27.530098,0.321379,27.530098,1,True,3
4,RandomForestGini_BAG_L1,-0.190113,RMSLE,2.323589,2.015158,2.323589,2.015158,1,True,6
5,RandomForestEntr_BAG_L1,-0.190456,RMSLE,2.244605,2.624281,2.244605,2.624281,1,True,7
6,CatBoost_BAG_L1,-0.192579,RMSLE,0.112025,1383.262495,0.112025,1383.262495,1,True,8
7,ExtraTreesEntr_BAG_L1,-0.197683,RMSLE,2.082329,0.970308,2.082329,0.970308,1,True,10
8,ExtraTreesGini_BAG_L1,-0.197818,RMSLE,1.942887,0.880103,1.942887,0.880103,1,True,9
9,KNeighborsDist_BAG_L1,-0.211252,RMSLE,0.5222,0.063767,0.5222,0.063767,1,True,2


In [6]:
predictor.leaderboard()

Unnamed: 0,model,score_val,eval_metric,pred_time_val,fit_time,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,WeightedEnsemble_L2,0.373801,accuracy,46.042385,2174.508298,0.007006,10.640719,2,True,9
1,WeightedEnsemble_L3,0.373029,accuracy,47.681648,3113.413801,0.006487,10.890823,3,True,11
2,NeuralNetFastAI_BAG_L2,0.372808,accuracy,47.675161,3102.522978,0.609898,938.547141,2,True,10
3,NeuralNetFastAI_BAG_L1,0.370568,accuracy,0.298611,26.451217,0.298611,26.451217,1,True,3
4,LightGBMXT_BAG_L1,0.370303,accuracy,39.170374,44.61013,39.170374,44.61013,1,True,4
5,LightGBM_BAG_L1,0.366727,accuracy,2.091293,5.085716,2.091293,5.085716,1,True,5
6,CatBoost_BAG_L1,0.364531,accuracy,0.064294,2083.120465,0.064294,2083.120465,1,True,8
7,RandomForestEntr_BAG_L1,0.36153,accuracy,2.192798,2.588978,2.192798,2.588978,1,True,7
8,RandomForestGini_BAG_L1,0.360481,accuracy,2.218009,2.011073,2.218009,2.011073,1,True,6
9,KNeighborsUnif_BAG_L1,0.306704,accuracy,0.512351,0.053654,0.512351,0.053654,1,True,1


In [7]:
res = predictor.predict(test,model="WeightedEnsemble_L2")

In [8]:
test['Rings'] = res
test[['id', 'Rings']].to_csv("submission2.csv", index=False)