## LightGBM

In [None]:
import numpy as np
import pandas as pd

In [None]:
def reduce_mem_usage(df, use_float16=True):
    """Downcast the columns of *df* in place to shrink its memory footprint.

    * Signed / unsigned integer columns are cast to the narrowest integer
      type of the same signedness whose range contains the observed min and
      max (bounds inclusive).
    * Float columns are cast to float16, float32 or float64 depending on
      their observed range.
    * Object and pandas string columns are converted to ``category``.
    * Every other dtype (bool, datetime, timedelta, categorical, nullable
      extension types, ...) is left untouched.

    Memory usage before and after the conversion is printed to stdout.

    Args:
        df: DataFrame to optimise; its columns are reassigned in place.
        use_float16: Allow float columns to be stored as float16.  float16
            keeps only about three significant decimal digits, so pass
            ``False`` when that precision loss is not acceptable.

    Returns:
        The same DataFrame object that was passed in.
    """
    signed_types = (np.int8, np.int16, np.int32, np.int64)
    unsigned_types = (np.uint8, np.uint16, np.uint32, np.uint64)

    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    for col in df.columns:
        col_type = df[col].dtype

        if col_type == object or isinstance(col_type, pd.StringDtype):
            df[col] = df[col].astype('category')
            continue

        # Only plain NumPy integer/float columns have a min/max that can be
        # compared against iinfo/finfo limits; skip everything else instead
        # of turning it into floats or raising.
        if not isinstance(col_type, np.dtype) or col_type.kind not in 'iuf':
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if col_type.kind == 'f':
            f16 = np.finfo(np.float16)
            f32 = np.finfo(np.float32)
            # NaN or infinite extremes fail both range checks and therefore
            # end up as float64.
            if use_float16 and f16.min < c_min and c_max < f16.max:
                target = np.float16
            elif f32.min < c_min and c_max < f32.max:
                target = np.float32
            else:
                target = np.float64
            df[col] = df[col].astype(target)
        else:
            candidates = signed_types if col_type.kind == 'i' else unsigned_types
            for candidate in candidates:
                limits = np.iinfo(candidate)
                # Inclusive bounds: a value equal to the type's min/max fits.
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    # Guard the ratio so a frame reporting zero bytes does not divide by zero.
    saved_pct = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print('Decreased by {:.1f}%'.format(saved_pct))

    return df


def import_data(file):
    """Read a CSV file into a DataFrame and shrink its memory footprint.

    Args:
        file: Path or file-like object accepted by ``pandas.read_csv``.

    Returns:
        The loaded DataFrame after ``reduce_mem_usage`` has downcast it.
    """
    # ``keep_date_col`` only matters when ``parse_dates`` merges several
    # columns into one, which ``parse_dates=True`` never does; the keyword is
    # deprecated since pandas 2.2, so it is no longer passed.
    df = pd.read_csv(file, parse_dates=True)
    return reduce_mem_usage(df)

In [None]:
# Load the competition train and test CSVs through import_data, which also
# runs reduce_mem_usage on each frame.
df_train = import_data('../input/tabular-playground-series-dec-2021/train.csv')
df_test = import_data('../input/tabular-playground-series-dec-2021/test.csv')

In [None]:
# Move 'Id' from the columns into the index of the test frame, then preview
# the first rows.
df_test = df_test.set_index('Id')
df_test.head()

In [None]:
# Same for the training frame: 'Id' becomes the index, then preview.
df_train = df_train.set_index('Id')
df_train.head()

In [None]:
# Total number of missing values over all columns of the training frame.
df_train.isnull().sum().sum()

In [None]:
# Number of training rows per Cover_Type label.
df_train['Cover_Type'].value_counts()

In [None]:
# Drop every row whose Cover_Type is 5.
# NOTE(review): presumably that class is too rare for the stratified splits
# used later — confirm against the value counts above.
df_train = df_train[df_train['Cover_Type'] != 5]
df_train

In [None]:
# Feature matrix: every column except the target.
X = df_train.drop('Cover_Type', axis=1)
X.head()

In [None]:
# Target vector: the Cover_Type label of each training row.
y = df_train['Cover_Type']
y

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

In [None]:
# 80/20 hold-out split, stratified on the target.
# NOTE(review): no random_state is passed, so the split differs on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, stratify = y)


In [None]:
import lightgbm as lgb

In [None]:
# Wrap the hold-out split in LightGBM Dataset objects.
# NOTE(review): train_data / test_data are not referenced again in the cells
# visible here — confirm whether they are still needed.
train_data = lgb.Dataset(X_train, label=y_train)
test_data = lgb.Dataset(X_test, label=y_test)

In [None]:
# Distinct Cover_Type labels left in the training frame.
df_train['Cover_Type'].unique()

## Optuna

In [None]:
import optuna

In [None]:
def _accuracy_eval(preds, eval_data):
    """LightGBM ``feval`` that reports multiclass accuracy (higher is better).

    Args:
        preds: Per-class scores for the evaluation set, either as a 2-D
            ``(n_rows, n_classes)`` array or as the flat class-major vector
            that older LightGBM releases pass for multiclass objectives.
        eval_data: The ``lgb.Dataset`` being evaluated.

    Returns:
        The ``(name, value, is_higher_better)`` tuple LightGBM expects.
    """
    labels = eval_data.get_label()
    if preds.ndim == 1:
        # Flat class-major layout -> (n_rows, n_classes).
        preds = preds.reshape(-1, labels.size).T
    accuracy = float(np.mean(np.argmax(preds, axis=1) == labels))
    return "accuracy", accuracy, True


def objective(trial):
    """Optuna objective: train LightGBM once and return validation accuracy.

    Uses the notebook-level ``X`` and ``y``.  A fixed 80/20 stratified split
    is used so that every trial is scored on the same validation rows.

    Args:
        trial: The ``optuna.trial.Trial`` supplying hyper-parameter values.

    Returns:
        Accuracy of the trained booster on the validation split.

    Raises:
        optuna.TrialPruned: Raised by the pruning callback when the trial's
            intermediate accuracy is judged unpromising.
    """
    train_x, valid_x, train_y, valid_y = train_test_split(
        X, y, test_size=0.20, stratify=y, random_state=42
    )
    dtrain = lgb.Dataset(train_x, label=train_y)
    dvalid = lgb.Dataset(valid_x, label=valid_y)

    param = {
        # NOTE(review): the LightGBM docs list is_unbalance as applying to
        # binary / multiclassova objectives only — confirm it has any effect
        # with 'multiclass'.
        "is_unbalance": True,
        'objective': 'multiclass',
        # Labels must lie in [0, num_class); 8 is kept from the original so
        # the argmax index equals the Cover_Type label.
        "num_class": 8,
        'metric': "multi_logloss",
        'verbosity': -1,
        "num_threads": -1,
        # 'n_estimators' is an alias of 'num_iterations'; passing both with
        # different values is ambiguous, so only one is kept.
        'num_iterations': 1000,
        "learning_rate": trial.suggest_float('learning_rate', 0.01, 0.2),
        'boosting_type': trial.suggest_categorical('boosting', ['gbdt']),
        'lambda_l1': trial.suggest_float('lambda_l1', 1e-8, 10.0, log=True),
        'lambda_l2': trial.suggest_float('lambda_l2', 1e-8, 10.0, log=True),
        'num_leaves': trial.suggest_int('num_leaves', 2, 256),
        'max_depth': trial.suggest_int('max_depth', 2, 15),
        'feature_fraction': trial.suggest_float('feature_fraction', 0.4, 1.0),
        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.4, 1.0),
        # bagging_fraction is ignored unless bagging_freq is non-zero.
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 7),
    }

    # The study maximises accuracy, so the pruner must watch a
    # higher-is-better metric rather than multi_logloss.
    pruning_callback = optuna.integration.LightGBMPruningCallback(trial, "accuracy")

    gbm = lgb.train(
        param,
        dtrain,
        valid_sets=[dvalid],
        feval=_accuracy_eval,
        callbacks=[pruning_callback],
    )
    y_pred = gbm.predict(valid_x)
    return accuracy_score(valid_y, np.argmax(y_pred, axis=1))

In [None]:
# Create a study that maximises the value returned by objective (validation
# accuracy) and run it with a budget of 100 trials and a 600 s timeout.
study = optuna.create_study(direction="maximize")

study.optimize(objective, n_trials=100, timeout=600)

In [None]:
# Report how many trials the study holds and the objective value of the best.
print("Number of finished trials: {}".format(len(study.trials)))

print("Best trial:")
trial = study.best_trial

print("  Value: {}".format(trial.value))

In [None]:
# Tabular view of the study's trials.
study.trials_dataframe()

In [None]:
# Fit the final booster with hand-entered hyper-parameters and print its
# accuracy on a 20 % stratified validation split.  The split is seeded so the
# printed score is reproducible between runs.
train_x, valid_x, train_y, valid_y = train_test_split(
    X, y, test_size=0.20, stratify=y, random_state=42
)
dtrain = lgb.Dataset(train_x, label=train_y)
dtest = lgb.Dataset(valid_x, label=valid_y)

# NOTE(review): these values look copied from an earlier tuning run rather
# than read from study.best_params — confirm they are still the intended ones.
params = {
    'learning_rate': 0.05992665960877217,
    'boosting': 'gbdt',
    'lambda_l2': 2.537222734343992,
    'num_leaves': 225,
    'max_depth': 10,
    "is_unbalance": True,
    'objective': 'multiclass',
    # Labels must lie in [0, num_class); 8 is kept so the argmax index equals
    # the Cover_Type label.
    "num_class": 8,
    'metric': "multi_logloss",
    'verbosity': -1,
    "num_threads": -1,
    'num_iterations': 1000,
}

gbm = lgb.train(params, dtrain, valid_sets=[dtest])
y_pred = gbm.predict(valid_x)
# One vectorised argmax over the class axis instead of a Python loop per row.
test_preds = np.argmax(y_pred, axis=1)
print(accuracy_score(valid_y, test_preds))

In [None]:
# Score the test frame with the final booster; the next cell takes the argmax
# of each row, so this is expected to hold one score per class per row.
pred = gbm.predict(df_test)

In [None]:
# Turn the per-class scores into a label per row with a single vectorised
# argmax.  axis=1 also makes an accidental re-run of this cell fail loudly
# (pred is 1-D by then) instead of silently mapping every prediction to 0.
pred = np.argmax(pred, axis=1)

In [None]:
# Start from the sample submission and overwrite its Cover_Type column.
# NOTE(review): pred is assigned positionally, which assumes the sample
# submission lists rows in the same order as test.csv — confirm.
df_subb = pd.read_csv('../input/tabular-playground-series-dec-2021/sample_submission.csv')
df_subb['Cover_Type'] = pred

In [None]:
# Make 'Id' the index so to_csv writes it as the first column without an
# extra positional index column.
df_subb = df_subb.set_index('Id')

In [None]:
# Display the submission frame for a visual check.
df_subb

In [None]:
# Write the submission file to the working directory.
df_subb.to_csv('lgb_subb.csv')