In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount and echo the full path of every file found.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

from sklearn.model_selection import train_test_split

import seaborn as sns
import warnings
warnings.filterwarnings(action='ignore')
# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Competition data directory (trailing slash kept because file names are
# appended with plain string concatenation) and the CSV file names inside it.
path = "/kaggle/input/tabular-playground-series-dec-2021/"
train, test, submission = "train.csv", "test.csv", "sample_submission.csv"

In [None]:
# Load the training CSV; the bare expression below renders its first rows.
train_df = pd.read_csv(f"{path}{train}")
train_df.head()

In [None]:
# Number of training rows per Cover_Type class.
train_df["Cover_Type"].value_counts()

In [None]:
def check_missing_col(df):
    """Print every column of ``df`` that contains missing values.

    For each column with at least one NaN/None the column name and its
    missing-value count are printed. If no column has missing values
    (including the case of a frame with no columns at all) a single
    "no missing cols" line is printed instead.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    None
        The report is written to stdout only.
    """
    cnt_missing_col = 0
    for col, values in df.items():
        # Vectorised count; the builtin sum() would iterate row by row in Python.
        missing_values = int(values.isna().sum())
        if missing_values >= 1:
            cnt_missing_col += 1
            print(f"null column is : {col}")
            print(f"total missing value : {missing_values}")

    # Decided after the loop so a frame without columns is reported as well.
    if cnt_missing_col == 0:
        print("no missing cols")

In [None]:
# Report which training columns, if any, contain missing values.
check_missing_col(train_df)

In [None]:
# The gap between class frequencies is huge, so class weights are used to
# balance the classes when the model is trained.
class_counts = train_df["Cover_Type"].value_counts().sort_values()
class_counts.plot(kind="barh", color="#FF7F50")

In [None]:
# Print pandas' summary of the training frame (DataFrame.info).
train_df.info()

In [None]:
from sklearn.preprocessing import MinMaxScaler

# NOTE(review): these column names do not look like features of the Cover_Type
# data loaded in this notebook, and no visible cell reads this list — confirm
# whether it is a leftover from another notebook.
continuous_names = ['Culmen Length (mm)', 'Culmen Depth (mm)', 'Flipper Length (mm)', 'Delta 15 N (o/oo)', 'Delta 13 C (o/oo)']


# NOTE(review): an earlier note said PowerTransformer gave the best performance,
# but the scaler instantiated here is MinMaxScaler — confirm which is intended.
# Shared scaler instance used by scale().
scaler = MinMaxScaler()

def scale(df, fit=True):
    """Scale every column of ``df`` with the module-level ``scaler``.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature frame to scale; all columns are passed to the scaler.
    fit : bool, default True
        When True the shared ``scaler`` is (re)fitted on ``df`` before
        transforming, which is the original behaviour. Pass False to reuse
        the statistics learned by an earlier call, e.g. for validation or
        test features, so they are scaled exactly like the training data.

    Returns
    -------
    pandas.DataFrame
        Scaled values with the same column labels and row index as ``df``.
    """
    scaled_values = scaler.fit_transform(df) if fit else scaler.transform(df)
    # Keep the caller's row index so the result stays aligned with any labels
    # or ids that still carry the original index.
    return pd.DataFrame(data=scaled_values, columns=df.columns, index=df.index)

In [None]:
# Target column, then the scaled feature matrix (everything except the row id
# and the target), followed by a fixed-seed 80/20 train/validation split.
Y = train_df['Cover_Type']
X = scale(train_df.drop(columns=['Id', 'Cover_Type']))
X_train, x_val, y_train, y_val = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Display the scaled training features produced by the split.
X_train

In [None]:
from sklearn.utils import class_weight
# 'balanced' class weights for the training labels. `classes` and `y` must be
# passed by keyword: recent scikit-learn releases reject them as positional
# arguments with a TypeError.
classes = np.unique(y_train)
class_weights = class_weight.compute_class_weight('balanced',
                                                  classes=classes,
                                                  y=y_train)
# print(class_weights)
# Build the {label: weight} mapping from the computed values instead of pasting
# numbers by hand, so it always matches the labels present in y_train.
weights = dict(zip(classes.tolist(), class_weights.tolist()))

In [None]:
from lightgbm import LGBMClassifier
# lgbm1 -> LightGBM classifier with hand-set hyper-parameters and per-class weights

lgb_params = dict(
    n_estimators=10000,          # Upper bound on the number of boosting iterations.
    random_state=42,             # Seed, so experiments can be replicated.
    learning_rate=0.1,           # Model learning rate.
    subsample=0.95,              # Fraction of rows sampled (without resampling) for a tree.
    subsample_freq=1,            # How often (in iterations) the row subsampling is applied.
    colsample_bytree=0.75,       # Fraction of features randomly selected for each tree.
    reg_alpha=0.5,               # L1 regularization.
    reg_lambda=0.5,              # L2 regularization.
    min_child_weight=1e-3,       # Minimal sum of hessian in one leaf; guards against over-fitting.
    min_child_samples=32,        # Minimal number of rows in one leaf; guards against over-fitting.
    objective='multiclass',      # Softmax objective function.
    metric='multi_logloss',      # Log loss for multi-class classification.
    device_type='gpu',           # Train on the GPU.
)

lgbm1 = LGBMClassifier(class_weight=weights, **lgb_params)

In [None]:
# LightGBM deprecated the `early_stopping_rounds` and `verbose` arguments of
# fit() in 3.3 and removed them in 4.0; the callbacks below are the supported
# equivalents (stop after 50 rounds without improvement, log every round).
from lightgbm import early_stopping, log_evaluation

lgbm1.fit(
    X_train, y_train,
    eval_set=[(x_val, y_val)],
    eval_metric='logloss',
    callbacks=[
        early_stopping(stopping_rounds=50),
        log_evaluation(period=1),
    ],
)

In [None]:
# NOTE(review): despite the `_pred` suffix this holds the value returned by
# score() on the validation split, not predictions — presumably mean accuracy
# per the scikit-learn classifier convention; confirm.
lgbm1_pred = lgbm1.score(x_val, y_val)
lgbm1_pred

In [None]:
# Load the test CSV; the bare expression below renders its first rows.
test_df = pd.read_csv(f"{path}{test}")
test_df.head()

In [None]:
# Report which test columns, if any, contain missing values.
check_missing_col(test_df)

In [None]:
test_df = test_df.drop(columns=["Id"])
# Reuse the scaler already fitted on the training features. Calling scale()
# here would refit it on the test set, so test features would be scaled with
# different statistics than the data the model was trained on.
test_df = pd.DataFrame(data=scaler.transform(test_df), columns=test_df.columns)

In [None]:
# Predict with the fitted model on the scaled test features and display the result.
lgbm1_test = lgbm1.predict(test_df)
lgbm1_test

In [None]:
# Load the sample submission as the template for the output file.
submit = pd.read_csv(f"{path}{submission}")
submit.head()

In [None]:
# Overwrite the template's Cover_Type column with the model's test predictions.
submit["Cover_Type"] = lgbm1_test

In [None]:
# Write the submission to the current working directory, without the index column.
submit.to_csv("submission.csv", index=False)

In [None]:
# Read the written file back and show how many rows were predicted per class.
result = pd.read_csv("submission.csv")
result["Cover_Type"].value_counts()

In [None]:
# Final confirmation message once the preceding cells have run.
print("submission.csv file created")