In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Gather the full path of every file found under /kaggle/input, then print one path per line.
input_file_paths = [
    os.path.join(folder, file_name)
    for folder, _, file_names in os.walk('/kaggle/input')
    for file_name in file_names
]
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the train, test and sample-submission CSV files into pandas DataFrames.
df_train, df_test, df_sample_submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/tabular-playground-series-dec-2021/train.csv',
        '/kaggle/input/tabular-playground-series-dec-2021/test.csv',
        '/kaggle/input/tabular-playground-series-dec-2021/sample_submission.csv',
    )
)

In [None]:
# Summary statistics for the columns of the training frame.
train_summary = df_train.describe()
train_summary

In [None]:
# Count the missing cells in each dataset.
# `.isna().sum()` gives the number of NaN cells per column and the second
# `.sum()` totals them. (`.isna().any().sum()` would instead count how many
# *columns* contain at least one NaN, which does not match the message below.)
missing_values_train = df_train.isna().sum().sum()
missing_values_test = df_test.isna().sum().sum()
print(f'There are {missing_values_train} missing values in the train dataset')
print(f'There are {missing_values_test} missing values in the test dataset')

In [None]:
# Rank every column by its Pearson correlation with the target, lowest first.
# corrwith() computes one correlation per column against Cover_Type, instead of
# building the full column-by-column correlation matrix only to read a single
# column out of it.
df_train.corrwith(df_train['Cover_Type']).sort_values()

In [None]:
# Null accuracy baseline: the fraction of rows that belong to the most frequent
# Cover_Type class, i.e. the accuracy of always predicting that class.
class_shares = df_train['Cover_Type'].value_counts(normalize=True)
class_shares.iloc[:1]

In [None]:
# Number of rows per Cover_Type class, to see how imbalanced the target is.
rows_by_cover_type = df_train.groupby('Cover_Type')
rows_by_cover_type.size()

In [None]:
# Keep only the rows whose Cover_Type is not 5.
# NOTE(review): the reason for excluding class 5 is not stated in this notebook —
# presumably it is too rare to learn from; confirm against the class counts.
is_cover_type_5 = df_train['Cover_Type'] == 5
df_train = df_train[~is_cover_type_5]

# Data Preprocessing

In [None]:
# Shorter aliases for the (filtered) training frame and the test frame.
# These are plain references, not copies.
train, test = df_train, df_test

In [None]:
# Target vector, selected by column name.
y = train['Cover_Type'].copy()

# Training features: every column except the first and the last one.
# Assumes the first column holds the ids and the last column is the target.
feature_columns = slice(1, -1)
X = train.iloc[:, feature_columns].copy()

# Test features: every column after the first (id) column.
test_X = test.iloc[:, 1:].copy()

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for validation; the fixed random_state makes the split repeatable.
split_parts = train_test_split(X, y, test_size=0.2, random_state=2)
X_train, X_validate, Y_train, Y_validate = split_parts

# Report the shape of each features/target pair.
for split_label, split_features, split_target in (
    ('Train set:', X_train, Y_train),
    ('Validation set:', X_validate, Y_validate),
):
    print(split_label, split_features.shape, split_target.shape)

In [None]:
# Display the training feature split as the cell's rich output
# (pandas abbreviates long frames when rendering them).
X_train

# Modeling

## LGBM

In [None]:
# Create and train the LightGBM multiclass model.
# early_stopping / log_evaluation are the callback replacements for the
# `early_stopping_rounds=` and `verbose=` keyword arguments of fit(), which were
# deprecated in LightGBM 3.3 and removed in 4.0 (callbacks need LightGBM >= 3.3).
from lightgbm import LGBMClassifier, early_stopping, log_evaluation

lgb_params = {
    'objective' : 'multiclass',
    'metric' : 'multi_logloss',
    'device' : 'gpu',
}

lgbmmodel = LGBMClassifier(**lgb_params)

# NOTE(review): no n_estimators is set, so the library default number of boosting
# rounds applies (100 per the LightGBM docs). With a patience of 200 rounds,
# training can then never stop early; the callback only records the best
# iteration. Raise n_estimators if real early stopping is intended.
lgbmmodel.fit(
    X_train, Y_train,
    eval_set=[(X_validate, Y_validate)],
    callbacks=[
        early_stopping(stopping_rounds=200),  # patience, measured on eval_set
        log_evaluation(period=1),             # log the metric every iteration
    ],
)

# Mean accuracy on the training data: score() of a classifier returns accuracy, not R^2.
lgbmmodel.score(X_train, Y_train)

In [None]:
# Display the sample submission frame (read from sample_submission.csv) to
# inspect the layout the submission file is expected to follow.
df_sample_submission

In [None]:
# Build the submission from a *copy* of the sample submission, so that
# df_sample_submission itself is left unmodified (plain assignment would only
# create a second name for the same frame and overwrite its Cover_Type column).
df_lgbm_submission = df_sample_submission.copy()

# Replace the Cover_Type column with the model's predictions for the test features.
# Assumes the sample submission rows are in the same order as the rows of test.csv.
df_lgbm_submission['Cover_Type'] = lgbmmodel.predict(test_X).astype('int')

# Write the submission file without the DataFrame index.
df_lgbm_submission.to_csv("submission.csv",index=False)