In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for fname in files:
        print(os.path.join(folder, fname))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# June 2021 TPS - Model 3: LightGBM with tuning
I previously tried XGBoost with tuning, with poor results. You can see that attempt in older versions of this notebook; rather than publish a separate notebook for each attempt, I have left my previous attempts in the prior versions.

See my [previous notebook](https://www.kaggle.com/jdunavin/june-2021-tps-eda-and-modeling) for EDA and a basic model.

## Load and clean the data

In [None]:
# Read the competition's training and test tables into DataFrames.
train_X, test_X = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/tabular-playground-series-jun-2021/train.csv",
        "/kaggle/input/tabular-playground-series-jun-2021/test.csv",
    )
)

In [None]:
# Duplicate check on the feature columns only (id and target are ignored).
# keep=False flags every member of a duplicate group, so all copies are
# dropped rather than guessing which one carries the right target.
dupes = train_X.drop(columns=['id', 'target']).duplicated(keep=False)
print(dupes.value_counts())

# Index labels of the flagged rows, then remove those rows from the training set.
dropthese = dupes.index[dupes.to_numpy()].tolist()
train_X = train_X.drop(index=dropthese)

## Load some libraries

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from lightgbm import LGBMClassifier
from sklearn.model_selection import StratifiedKFold # For creating folds
from sklearn.metrics import log_loss # Evaluation metrics

import optuna

Not a lot of commentary on the earlier XGBoost attempt (see the prior versions of this notebook) because it turned out much worse. Not sure what, if anything, I might have done to make that turn out better. Let's try the non-random-forest version.

## LightGBM
Still have quite a bit of learning to do on this one.

In [None]:
# Placeholder fold id; every row is overwritten with its real fold below.
train_X["kfold"] = -1

# Shuffle the rows once with a fixed seed, then renumber them 0..n-1 so the
# positional indices returned by the splitter can be used as .loc labels.
df = train_X.sample(frac=1, random_state=14000605).reset_index(drop=True)
y = df["target"]

# Stratify on the target and record, for each row, the fold in which it is
# part of the validation split.
kf = StratifiedKFold(n_splits=5)
for fold_id, (_train_idx, valid_idx) in enumerate(kf.split(X=df, y=y)):
    df.loc[valid_idx, "kfold"] = fold_id

In [None]:
# My previous best guess - redone with stratified k-fold.
model_lgb = LGBMClassifier(
    objective = 'multiclass',
    reg_lambda = 10,
    learning_rate = 0.1,
    max_depth = 4,
    seed = 14000605,
    colsample_bytree = 0.5,
    # NOTE(review): LightGBM only performs row subsampling when subsample_freq
    # (bagging_freq) is non-zero; it is not set here, so this value is probably
    # inert - confirm before relying on it.
    subsample = 0.9,
    # NOTE(review): LightGBM documents is_unbalance for the binary and
    # multiclassova objectives only, so it is probably ignored with
    # 'multiclass' - confirm.
    is_unbalance = True
    )

# Number of folds is read from the kfold column instead of being hard-coded.
n_folds = df.kfold.nunique()

# The test features do not depend on the fold, so build them once.
X_test = test_X.drop(["id"], axis=1)

logloss = []
lgbm_pred = 0  # running mean of the per-fold test probabilities
for f in range(n_folds): # Looping around the folds

    #Splitting the data into train and validation set
    in_valid = df.kfold == f
    train = df[~in_valid].reset_index(drop=True)
    valid = df[in_valid].reset_index(drop=True)

    #Creating X_train and y_train
    X_train = train.drop(["id","target", "kfold"], axis=1)
    y_train = train.target
    X_valid = valid.drop(["id","target", "kfold"], axis=1)
    y_valid = valid.target

    #Fitting the model (refit from scratch on this fold's training split)
    model_lgb.fit(X_train,y_train)

    #Predicting for valid and test datasets
    valid_preds = model_lgb.predict_proba(X_valid)
    lgbm_pred += model_lgb.predict_proba(X_test)/n_folds

    #Calculating log loss
    logloss.append(log_loss(y_valid,valid_preds))

# Per-fold validation log loss, followed by the mean across folds.
print(logloss)
print(sum(logloss)/len(logloss))

In [None]:
# Build the submission from lgbm_pred, the test-set probabilities averaged over
# all cross-validation folds, instead of re-predicting with model_lgb, which
# after the CV loop only holds the model fitted on the last fold.
# Column labels come from the fitted model so they match the column order of
# predict_proba.
class_labels = list(model_lgb.classes_)
lgbprods = lgbm_pred # used for submission

# Index the rows by the real test ids rather than a hard-coded offset.
submission = pd.DataFrame(
    lgbprods,
    columns=class_labels,
    index=pd.Index(test_X['id'], name='id'),
)
submission.to_csv('submission_lgbtune.csv')

# Last expression of the cell so the preview is actually displayed.
submission.head()

In [None]:
def optimize(trial):
    """Optuna objective: mean cross-validated log loss of one LightGBM configuration.

    Samples a set of LGBMClassifier hyperparameters from ``trial``, then for
    each fold id in the ``kfold`` column of the global ``df`` fits the model on
    the remaining rows and scores its predicted probabilities on the held-out
    rows.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyperparameters.

    Returns
    -------
    float
        Mean of the per-fold validation log losses (lower is better).
    """
    # suggest_float replaces the deprecated suggest_loguniform / suggest_uniform
    # while sampling from the same ranges and distributions.
    param = {
        'lambda_l1': trial.suggest_float('lambda_l1', 1e-8, 10.0, log=True),
        'lambda_l2': trial.suggest_float('lambda_l2', 1e-8, 10.0, log=True),
        'num_leaves': trial.suggest_int('num_leaves', 2, 256),
        'feature_fraction': trial.suggest_float('feature_fraction', 0.4, 1.0),
        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.4, 1.0),
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 7),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
        'objective': 'multiclass',
        "random_state" : 42}

    model = LGBMClassifier(**param)
    non_feature_cols = ["id", "target", "kfold"]

    logloss = []
    for f in range(df.kfold.nunique()):
        train = df[df.kfold != f].reset_index(drop=True)
        valid = df[df.kfold == f].reset_index(drop=True)

        X_train = train.drop(non_feature_cols, axis=1)
        y_train = train["target"]
        X_valid = valid.drop(non_feature_cols, axis=1)
        y_valid = valid["target"]

        # fit() retrains from scratch, so the same estimator is reused per fold.
        model.fit(X_train, y_train)
        pred = model.predict_proba(X_valid)
        logloss.append(log_loss(y_valid, pred))

    return np.mean(logloss)

In [None]:
# Minimise the mean cross-validated log loss returned by optimize().
# The sampler is seeded so the sequence of suggested hyperparameters is
# repeatable between runs (TPESampler is Optuna's default sampler; only the
# seed is added here).
study = optuna.create_study(
    direction="minimize",
    sampler=optuna.samplers.TPESampler(seed=14000605),
)
study.optimize(optimize, n_trials=15)

In [None]:
# Show the hyperparameters of the trial with the lowest objective value.
print(study.best_params)

In [None]:
# Best guess after learning new tricks
# NOTE(review): these values look like they were copied from an earlier Optuna
# run; they are hard-coded and will not track study.best_params from the
# current session - confirm that is intended.
model_lgb2 = LGBMClassifier(
    lambda_l1= 4.2986828029788605e-08, 
    lambda_l2= 2.290124579576187, 
    num_leaves= 6, 
    feature_fraction= 0.9832712273137326, 
    bagging_fraction= 0.41331491154011984, 
    bagging_freq= 1, 
    min_child_samples= 73,
    seed = 14000605
    )

# Number of folds is read from the kfold column instead of being hard-coded.
n_folds = df.kfold.nunique()

# The test features do not depend on the fold, so build them once.
X_test = test_X.drop(["id"], axis=1)

logloss = []
lgbm_pred = 0  # running mean of the per-fold test probabilities
for f in range(n_folds): # Looping around the folds

    #Splitting the data into train and validation set
    in_valid = df.kfold == f
    train = df[~in_valid].reset_index(drop=True)
    valid = df[in_valid].reset_index(drop=True)

    #Creating X_train and y_train
    X_train = train.drop(["id","target", "kfold"], axis=1)
    y_train = train.target
    X_valid = valid.drop(["id","target", "kfold"], axis=1)
    y_valid = valid.target

    #Fitting the model (refit from scratch on this fold's training split)
    model_lgb2.fit(X_train,y_train)

    #Predicting for valid and test datasets
    valid_preds = model_lgb2.predict_proba(X_valid)
    lgbm_pred += model_lgb2.predict_proba(X_test)/n_folds

    #Calculating log loss
    logloss.append(log_loss(y_valid,valid_preds))

# Per-fold validation log loss, followed by the mean across folds.
print(logloss)
print(sum(logloss)/len(logloss))

In [None]:
# Build the submission from lgbm_pred, the test-set probabilities averaged over
# all cross-validation folds, instead of re-predicting with model_lgb2, which
# after the CV loop only holds the model fitted on the last fold.
# (The previous unused predict() call also read best_iteration_ from model_lgb,
# i.e. from the wrong model; it has been removed.)
# Column labels come from the fitted model so they match the column order of
# predict_proba.
class_labels = list(model_lgb2.classes_)
lgbprods = lgbm_pred # used for submission

# Index the rows by the real test ids rather than a hard-coded offset.
submission = pd.DataFrame(
    lgbprods,
    columns=class_labels,
    index=pd.Index(test_X['id'], name='id'),
)
submission.to_csv('submission_lgboptuna.csv')

# Last expression of the cell so the preview is actually displayed.
submission.head()