In [1]:
import numpy as np
import pandas as pd
import os
import lightgbm as lgb
from sklearn.model_selection import train_test_split, KFold
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import f1_score
from lib.preprocess import Preprocess, data_augmentation
from sklearn.manifold import TSNE
from sklearn.utils import compute_sample_weight
# import shap

In [2]:
SEED = 314
datasrc = "data/official/"
data = pd.read_csv(os.path.join(datasrc, "train.csv"), index_col=0)
x_test = pd.read_csv(os.path.join(datasrc, "test.csv"), index_col=0)

train, valid = train_test_split(data, test_size=0.2, random_state=42, stratify=data["health"])

In [3]:
x_train = train.drop("health", axis=1)
y_train = train["health"]
x_valid = valid.drop("health", axis=1)
y_valid = valid["health"]

In [4]:
ignore_columns = [
    "nta_name",
    "boro_ct",
    "spc_latin",
]
# get object columns
object_columns = [col for col in x_train.select_dtypes(include=["object"]).columns.tolist() if col not in ignore_columns]
config = {
    "object_columns": object_columns,
    "is_target_encode": False,
}

In [5]:
preprocess = Preprocess(config)

x_train = pd.DataFrame(preprocess.fit_transform(x_train, y_train), columns=preprocess.get_feature_names_out(), index=x_train.index)
x_valid = pd.DataFrame(preprocess.transform(x_valid), columns=preprocess.get_feature_names_out(), index=x_valid.index)
x_test = pd.DataFrame(preprocess.transform(x_test), columns=preprocess.get_feature_names_out(), index=x_test.index)

x_train.loc[:,"num__tree_dbh_binedby5"] = x_train["num__tree_dbh"].map(lambda x: np.floor(x/5)*5)
x_train.loc[:,"num__tree_dbh_binedby10"] = x_train["num__tree_dbh"].map(lambda x: np.floor(x/10)*10)
x_valid.loc[:,"num__tree_dbh_binedby5"] = x_valid["num__tree_dbh"].map(lambda x: np.floor(x/5)*5)
x_valid.loc[:,"num__tree_dbh_binedby10"] = x_valid["num__tree_dbh"].map(lambda x: np.floor(x/10)*10)
x_test.loc[:,"num__tree_dbh_binedby5"] = x_test["num__tree_dbh"].map(lambda x: np.floor(x/5)*5)
x_test.loc[:,"num__tree_dbh_binedby10"] = x_test["num__tree_dbh"].map(lambda x: np.floor(x/10)*10)

In [6]:
x_train = x_train.fillna(-10)
x_valid = x_valid.fillna(-10)
x_test = x_test.fillna(-10)

In [7]:
y_train.value_counts()

1    12601
0     2828
2      558
Name: health, dtype: int64

In [8]:
x_train_0 = x_train.loc[y_train==0,:]
x_train_1 = x_train.loc[y_train==1,:]
x_train_2 = x_train.loc[y_train==2,:]
len(x_train_0), len(x_train_1), len(x_train_2)

(2828, 12601, 558)

In [9]:
train_0s = train_test_split(x_train_0, test_size=0.5, random_state=SEED)
folds = KFold(n_splits=10, shuffle=True, random_state=SEED, )

In [10]:
def mean_f1score(preds:np.ndarray,eval_data: lgb.Dataset):
    y_true = eval_data.get_label()
    weight = eval_data.get_weight()
    preds = preds.reshape(len(np.unique(y_true)), -1)
    preds = preds.argmax(axis = 0)
    f1 = f1_score(y_true,preds,average='macro',sample_weight=weight)
    return 'f1',f1,True


def get_each_fold_model(index_1):
    valid_probs_list = list()
    for x_train_0_ in train_0s:
        x_fold_train = pd.concat([x_train_0_, x_train_1.iloc[index_1,:], x_train_2])
        y_fold_train = pd.concat([y_train.loc[x_train_0_.index], y_train.loc[x_train_1.iloc[index_1,:].index], y_train.loc[x_train_2.index]])
        x_fold_train, y_fold_train = data_augmentation(x_fold_train, y_fold_train)
        object_columns = x_fold_train.select_dtypes(include=["object"]).columns.tolist()
        lgb_train = lgb.Dataset(x_fold_train, y_fold_train, categorical_feature=object_columns)
        params = {
            'objective': 'multiclass',
            'metric': None,
            'num_class': 3,
            'seed': 42,
            'num_threads': -1,
            "force_col_wise": True,
        }
        model = lgb.train(
            params,
            lgb_train,
            valid_sets=[lgb_train],
            valid_names=['train'],
            num_boost_round=1000,
            # early_stopping_rounds=100,
            # verbose_eval=100,
            feval=mean_f1score,
            callbacks=[
                lgb.early_stopping(stopping_rounds=100, verbose=False),
                # lgb.reset_parameter(learning_rate=lambda current_round: 0.01 * 0.995 ** current_round),
            ]
        )
        valid_probs = model.predict(x_valid)
        pred_valid = pd.DataFrame(valid_probs.argmax(axis=1), columns=["health"], index=x_valid.index)
        # print(pred_valid.value_counts()/len(pred_valid))
        # print("f1 score",f1_score(y_valid, pred_valid, average="macro"))
        # print("-"*50)
        valid_probs_list.append(valid_probs)
    return valid_probs_list

In [11]:
models = list()
valid_probs_list = list()
pred_valid_list = list()
for i,(_, train_index) in enumerate(folds.split(x_train_1)):
    print(f"fold {i}")
    valid_probs0, valid_probs1 = get_each_fold_model(train_index)
    pred_valid0 = pd.DataFrame(valid_probs0.argmax(axis=1), columns=[f"fold{i}_health"], index=x_valid.index)
    pred_valid1 = pd.DataFrame(valid_probs1.argmax(axis=1), columns=[f"fold{i}_health"], index=x_valid.index)
    valid_probs0 = pd.DataFrame(valid_probs0, columns=[f"fold{i}_class0", f"fold{i}_class1", f"fold{i}_class2"], index=x_valid.index)
    valid_probs1 = pd.DataFrame(valid_probs1, columns=[f"fold{i}_class0", f"fold{i}_class1", f"fold{i}_class2"], index=x_valid.index)
    # valid_probs_list.append(valid_probs0.copy())
    # valid_probs_list.append(valid_probs1.copy())
    pred_valid_list.append(pred_valid0.copy())
    pred_valid_list.append(pred_valid1.copy())
    # models.append(model)
    # models.append(model)

fold 0
Before SMOTE:  3233
After SMOTE:  4242
[LightGBM] [Info] Total Bins 6445
[LightGBM] [Info] Number of data points in the train set: 4242, number of used features: 37
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612




Before SMOTE:  3233
After SMOTE:  4242
[LightGBM] [Info] Total Bins 6457
[LightGBM] [Info] Number of data points in the train set: 4242, number of used features: 37
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612




fold 1
Before SMOTE:  3232
After SMOTE:  4242
[LightGBM] [Info] Total Bins 6459
[LightGBM] [Info] Number of data points in the train set: 4242, number of used features: 37
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612




Before SMOTE:  3232
After SMOTE:  4242
[LightGBM] [Info] Total Bins 6466
[LightGBM] [Info] Number of data points in the train set: 4242, number of used features: 37
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612




fold 2
Before SMOTE:  3232
After SMOTE:  4242
[LightGBM] [Info] Total Bins 6464
[LightGBM] [Info] Number of data points in the train set: 4242, number of used features: 37
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612




Before SMOTE:  3232
After SMOTE:  4242
[LightGBM] [Info] Total Bins 6465
[LightGBM] [Info] Number of data points in the train set: 4242, number of used features: 37
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612




fold 3
Before SMOTE:  3232
After SMOTE:  4242
[LightGBM] [Info] Total Bins 6454
[LightGBM] [Info] Number of data points in the train set: 4242, number of used features: 37
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612




Before SMOTE:  3232
After SMOTE:  4242
[LightGBM] [Info] Total Bins 6436
[LightGBM] [Info] Number of data points in the train set: 4242, number of used features: 37
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612




fold 4
Before SMOTE:  3232
After SMOTE:  4242
[LightGBM] [Info] Total Bins 6444
[LightGBM] [Info] Number of data points in the train set: 4242, number of used features: 37
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612




Before SMOTE:  3232
After SMOTE:  4242
[LightGBM] [Info] Total Bins 6457
[LightGBM] [Info] Number of data points in the train set: 4242, number of used features: 37
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612




fold 5
Before SMOTE:  3232
After SMOTE:  4242
[LightGBM] [Info] Total Bins 6449
[LightGBM] [Info] Number of data points in the train set: 4242, number of used features: 37
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612




Before SMOTE:  3232
After SMOTE:  4242
[LightGBM] [Info] Total Bins 6461
[LightGBM] [Info] Number of data points in the train set: 4242, number of used features: 37
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612




fold 6
Before SMOTE:  3232
After SMOTE:  4242
[LightGBM] [Info] Total Bins 6464
[LightGBM] [Info] Number of data points in the train set: 4242, number of used features: 37
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612




Before SMOTE:  3232
After SMOTE:  4242
[LightGBM] [Info] Total Bins 6473
[LightGBM] [Info] Number of data points in the train set: 4242, number of used features: 37
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612




fold 7
Before SMOTE:  3232
After SMOTE:  4242
[LightGBM] [Info] Total Bins 6424
[LightGBM] [Info] Number of data points in the train set: 4242, number of used features: 37
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612




Before SMOTE:  3232
After SMOTE:  4242
[LightGBM] [Info] Total Bins 6412
[LightGBM] [Info] Number of data points in the train set: 4242, number of used features: 37
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612




fold 8
Before SMOTE:  3232
After SMOTE:  4242
[LightGBM] [Info] Total Bins 6452
[LightGBM] [Info] Number of data points in the train set: 4242, number of used features: 37
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612




Before SMOTE:  3232
After SMOTE:  4242
[LightGBM] [Info] Total Bins 6457
[LightGBM] [Info] Number of data points in the train set: 4242, number of used features: 37
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612




fold 9
Before SMOTE:  3232
After SMOTE:  4242
[LightGBM] [Info] Total Bins 6443
[LightGBM] [Info] Number of data points in the train set: 4242, number of used features: 37
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612




Before SMOTE:  3232
After SMOTE:  4242
[LightGBM] [Info] Total Bins 6470
[LightGBM] [Info] Number of data points in the train set: 4242, number of used features: 37
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612




In [12]:
# folds_valid_probs = pd.concat(valid_probs_list, axis=1)
folds_valid_preds = pd.concat(pred_valid_list, axis=1)
folds_predict = folds_valid_preds.apply(lambda x: x.value_counts()/len(x), axis=1).idxmax(axis=1)
folds_predict.value_counts()/len(folds_predict)

0    0.556668
1    0.375281
2    0.068051
dtype: float64

In [13]:
f1_score(y_valid, folds_predict, average="macro")

0.30436063123340756