In [2]:
# import libraries

import sys
import os
import time
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

# Show every file available under the Kaggle input directory, one full path
# per line, so the dataset locations used by later cells are visible.
input_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# Record the interpreter version alongside the results.
print(sys.version)

/kaggle/input/30-days-of-ml/sample_submission.csv
/kaggle/input/30-days-of-ml/train.csv
/kaggle/input/30-days-of-ml/test.csv
3.7.10 | packaged by conda-forge | (default, Feb 19 2021, 16:07:37) 
[GCC 9.3.0]


In [3]:
# load train & test data

from sklearn.model_selection import train_test_split

X = pd.read_csv('/kaggle/input/30-days-of-ml/train.csv', index_col='id')
X_test = pd.read_csv('/kaggle/input/30-days-of-ml/test.csv', index_col='id')

# Pull the label out of the feature frame: afterwards X holds only features
# and y holds the 'target' column.
y = X.pop('target')

# break off validation set from training data (80 % train / 20 % validation,
# fixed random_state so the split is repeatable)

X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    train_size=0.8,
    test_size=0.2,
    random_state=0,
)

# separate categoricals & numericals by column dtype

cat_cols = X_train.select_dtypes(include=['object']).columns.tolist()
num_cols = X_train.select_dtypes(include=['int64', 'float64']).columns.tolist()

In [4]:
# Preprocess features: standardise the numeric columns, ordinal-encode the
# categorical columns, then join both parts back together per split.

from sklearn.preprocessing import OrdinalEncoder, StandardScaler


def _to_frame(values, template):
    """Wrap a transformed array in a DataFrame with `template`'s index and columns."""
    return pd.DataFrame(values, index=template.index, columns=template.columns)


# scale num data with StandardScaler
# The scaler is fitted on the training split only and then applied unchanged
# to the validation and test splits.

sc = StandardScaler()
num_train, num_valid, num_test = X_train[num_cols], X_valid[num_cols], X_test[num_cols]

scaled_train = _to_frame(sc.fit_transform(num_train), num_train)
scaled_valid = _to_frame(sc.transform(num_valid), num_valid)
scaled_test = _to_frame(sc.transform(num_test), num_test)

# encode cat data with OrdinalEncoder
# Same pattern: categories are learned from the training split only.

ordinal_encoder = OrdinalEncoder()
cat_train, cat_valid, cat_test = X_train[cat_cols], X_valid[cat_cols], X_test[cat_cols]

encoded_train = _to_frame(ordinal_encoder.fit_transform(cat_train), cat_train)
encoded_valid = _to_frame(ordinal_encoder.transform(cat_valid), cat_valid)
encoded_test = _to_frame(ordinal_encoder.transform(cat_test), cat_test)

# merge each data (numeric columns first, then categorical columns)

x_train = pd.concat([scaled_train, encoded_train], axis=1)
x_valid = pd.concat([scaled_valid, encoded_valid], axis=1)
x_test = pd.concat([scaled_test, encoded_test], axis=1)

In [9]:
%%time

# build lgbm model

import lightgbm as lgb

print('lightgbm version: %s' % lgb.__version__)

# Wrap the preprocessed frames for LightGBM. Despite its name, `test_ds` holds
# the validation split (x_valid / y_valid), not the competition test set. It
# is tied to `train_ds` through `reference`, which is how the LightGBM docs
# say validation data should be constructed.
train_ds = lgb.Dataset(x_train, label=y_train)
test_ds = lgb.Dataset(x_valid, label=y_valid, reference=train_ds)

# define parameters
#
# Two entries from the earlier parameter set were removed because they cannot
# influence this model:
#   * scale_pos_weight - documented as used only by the binary / multiclassova
#     objectives; the objective here is 'regression'.
#   * cat_smooth - documented as applying to categorical features; no
#     categorical_feature is passed to lgb.Dataset and the encoded columns are
#     plain numeric output of OrdinalEncoder.

params = {'objective': 'regression',
          'boosting': 'gbdt',
          'metric': 'rmse',
          'force_col_wise': True,
          'learning_rate': 0.0042,
          'max_depth': 4,
          'max_bin': 1500,
          'num_leaves': 12,
          'min_data_in_leaf': 580,
          'feature_fraction': 0.2,
          'bagging_fraction': 0.7,
          'bagging_freq': 3,
          'reg_alpha': 10,
          'reg_lambda': 10,
          'seed': 2021}

# Train for at most 50000 rounds, logging the validation RMSE every 100 rounds
# and stopping once it has not improved for 200 consecutive rounds.
# NOTE(review): `verbose_eval` / `early_stopping_rounds` match the lightgbm
# 3.2.1 reported above; newer major releases expect callbacks instead, so
# revisit this call before upgrading the library.
model = lgb.train(params, train_ds, 50000, test_ds, verbose_eval=100,
                  early_stopping_rounds=200)

lightgbm version: 3.2.1
[LightGBM] [Info] Total Bins 21056
[LightGBM] [Info] Number of data points in the train set: 240000, number of used features: 24
[LightGBM] [Info] Start training from score 8.242729
Training until validation scores don't improve for 200 rounds
[100]	valid_0's rmse: 0.745951
[200]	valid_0's rmse: 0.744328
[300]	valid_0's rmse: 0.742768
[400]	valid_0's rmse: 0.741472
[500]	valid_0's rmse: 0.740205
[600]	valid_0's rmse: 0.739146
[700]	valid_0's rmse: 0.738157
[800]	valid_0's rmse: 0.73725
[900]	valid_0's rmse: 0.736459
[1000]	valid_0's rmse: 0.735787
[1100]	valid_0's rmse: 0.73515
[1200]	valid_0's rmse: 0.734559
[1300]	valid_0's rmse: 0.734046
[1400]	valid_0's rmse: 0.733546
[1500]	valid_0's rmse: 0.733034
[1600]	valid_0's rmse: 0.732567
[1700]	valid_0's rmse: 0.732147
[1800]	valid_0's rmse: 0.731737
[1900]	valid_0's rmse: 0.731353
[2000]	valid_0's rmse: 0.731003
[2100]	valid_0's rmse: 0.730696
[2200]	valid_0's rmse: 0.730384
[2300]	valid_0's rmse: 0.730085
[2400]	

In [10]:
from sklearn.metrics import mean_squared_error

# Score the trained model on the hold-out validation split.
predict_valid = model.predict(x_valid)

# Take the square root explicitly rather than passing `squared=False`: that
# keyword was deprecated and later removed from scikit-learn, whereas
# sqrt(MSE) yields the same RMSE on every release.
rmse = np.sqrt(mean_squared_error(y_valid, predict_valid))
print('root mean squared error: ', rmse)

root mean squared error:  0.7186930818080799


In [11]:
# Predict on the preprocessed competition test features.
preds_test = model.predict(x_test)

# Build the submission table: the test frame's index supplies the 'id' column
# and the predictions supply 'target'.
submission_columns = {
    'id': X_test.index,
    'target': preds_test,
}
output = pd.DataFrame(submission_columns)

# Write without the DataFrame's own row index so the file has exactly the
# two columns above.
output.to_csv('submission.csv', index=False)