In [None]:
import time
import warnings

import numpy as np
import pandas as pd
from lightgbm import LGBMRegressor

# LGBM

In [None]:
# The EDA is largely based on the discussion in this post: https://www.kaggle.com/c/liberty-mutual-fire-peril/discussion/10194

def read_data(folder):
    """Load the train, test and sample-submission tables from ``folder``.

    Parameters
    ----------
    folder : str
        Directory containing ``train.csv.zip``, ``test.csv.zip`` and
        ``sampleSubmission.csv.zip``.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train, test, sub)``, in that order.
    """
    archive_names = ("train.csv.zip", "test.csv.zip", "sampleSubmission.csv.zip")
    # Archives are read in the order listed, which is also the order of the result.
    train, test, sub = (
        pd.read_csv(f"{folder}/{archive}") for archive in archive_names
    )
    return train, test, sub

def preprocess(train, test):
    """Split the target off the training frame and keep only the selected features.

    Neither ``train`` nor ``test`` is modified.

    Parameters
    ----------
    train : pandas.DataFrame
        Training data; must contain a ``"target"`` column.
    test : pandas.DataFrame
        Test data.

    Returns
    -------
    tuple
        ``(X_train, y_train, X_test)``: the reduced training features, a copy
        of the ``"target"`` column, and the reduced test features.

    Raises
    ------
    KeyError
        If ``train`` has no ``"target"`` column, or if either frame lacks one
        of the columns required by ``_kazanova_feature_selection``.
    """
    y_train = train["target"].copy()
    # drop() returns a new frame, so the caller's `train` is left untouched.
    X_train = _kazanova_feature_selection(train.drop(columns="target"))
    # Column selection already yields a new frame; no extra copy of `test` is needed.
    X_test = _kazanova_feature_selection(test)
    return X_train, y_train, X_test

def _kazanova_feature_selection(df):
    reduced = df.copy()
    VARS = [f"var{i}" for i in range(1, 18)]
    CRIMES = ["crimeVar2", "crimeVar4", "crimeVar7"]
    GEODEM = ["geodemVar1"]
    WEATHER_NUMS = [1, 4, 6, 41, 43, 77, 79, 113, 147, 149, 181, 199, 209, 227]
    WEATHER = [f"weatherVar{num}" for num in WEATHER_NUMS]
    USABLE_VARIABLES = VARS + CRIMES + GEODEM + WEATHER
    return reduced[USABLE_VARIABLES]

def preprocess_lgbm(X_train, X_test):
    """Prepare feature frames for LightGBM: numeric coercion plus one-hot encoding.

    ``var7``, ``var8`` and ``var9`` are converted with ``pd.to_numeric``;
    entries that cannot be parsed become ``NaN``. All remaining non-numeric
    columns are one-hot encoded with ``pd.get_dummies``. The encoded test frame
    is then reindexed to the encoded training columns so both frames share the
    same columns in the same order: dummy columns seen only in test are
    dropped, and dummy columns seen only in train are added to test filled
    with 0. The input frames are not modified.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Training features; must contain ``var7``, ``var8`` and ``var9``.
    X_test : pandas.DataFrame
        Test features; must contain ``var7``, ``var8`` and ``var9``.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train_encoded, test_encoded)`` with identical column labels.

    Raises
    ------
    KeyError
        If either frame lacks one of ``var7``, ``var8`` or ``var9``.
    """
    numeric_keys = ["var7", "var8", "var9"]
    train_encoded = X_train.copy()
    test_encoded = X_test.copy()
    for key in numeric_keys:
        train_encoded[key] = pd.to_numeric(train_encoded[key], errors="coerce")
        test_encoded[key] = pd.to_numeric(test_encoded[key], errors="coerce")

    train_encoded = pd.get_dummies(train_encoded)
    # Encoding the two frames independently can yield different dummy columns
    # when a category appears in only one of them; aligning to the training
    # layout keeps fit and predict on the same feature set.
    test_encoded = pd.get_dummies(test_encoded).reindex(
        columns=train_encoded.columns, fill_value=0
    )
    return train_encoded, test_encoded

if __name__ == "__main__":
    # Load the raw tables from the local "input" folder.
    print("Starting loading data")
    train, test, submission = read_data("input")
    print("Finished loading data")

    # Split off the target and keep only the selected feature columns.
    X_train_p, y_train_p, X_test_p = preprocess(train, test)

    # Encode for LightGBM first, then fill missing values: var7-var9 entries
    # that are not numeric are coerced to NaN inside preprocess_lgbm and are
    # replaced by 0 here together with every other missing value.
    X_train, X_test = preprocess_lgbm(X_train_p, X_test_p)
    X_train = X_train.fillna(0)
    X_test = X_test.fillna(0)
    y_train = y_train_p.copy()


In [None]:
def make_sub(model, name):
    """Fit ``model`` on the training data and write its test predictions to ``name``.

    Reads the module-level ``X_train``, ``y_train`` and ``X_test``. The
    predictions are stored in the ``"target"`` column of the module-level
    ``submission`` frame (which is modified in place), and that frame is then
    saved as CSV without its index.

    Parameters
    ----------
    model : object
        Estimator exposing ``fit(X, y)`` and ``predict(X)``.
    name : str
        Path of the CSV file to write.
    """
    model.fit(X_train, y_train)
    # Predictions go straight into the shared submission frame before saving.
    submission["target"] = model.predict(X_test)
    submission.to_csv(name, index=False)

# Best LGBM (Private Gini: 0.31866)
# Hyperparameters are kept exactly as recorded for the score above.
# NOTE(review): LightGBM only applies `subsample` when `subsample_freq` > 0,
# and `max_depth` 3 caps a tree below `num_leaves` 46 -- confirm both are intended.
params = {
    'max_depth': 3,
    'verbose': -1,
    'random_state': 314,
    'n_estimators': 500,
    'min_child_samples': 922,
    'learning_rate': 0.02490956806161569,
    'num_leaves': 46,
    'colsample_bytree': 0.15790102015824806,
    'reg_alpha': 0.00010520322254335312,
    'objective': 'regression',
    'reg_lambda': 739.5182966148433,
    'subsample': 0.9219752127591361
}
lgbm = LGBMRegressor(**params)
# Plain string: the file name has no placeholders, so no f-string is needed.
name = "best_lgbm.csv"
# Fits on the full training set and writes the submission file to `name`.
make_sub(lgbm, name)