In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Directory holding the competition CSV files, relative to the working directory.
data_path = '../input/tabular-playground-series-aug-2021'

# Hand the directory and the file name to os.path.join as separate components so
# that it inserts the path separator itself.
train = pd.read_csv(os.path.join(data_path, 'train.csv'))
test = pd.read_csv(os.path.join(data_path, 'test.csv'))
sample_submission = pd.read_csv(os.path.join(data_path, 'sample_submission.csv'))
print(train.shape, test.shape)

In [None]:
# Preview the first rows of the training frame.
train.head()

In [None]:
# Number of distinct values in each training column, keyed by column name.
num_unique = {column: train[column].nunique() for column in train.columns}

In [None]:
# Reorder the mapping so the columns with the fewest distinct values come first,
# then display the three lowest-cardinality entries.
num_unique = {
    column: count
    for column, count in sorted(num_unique.items(), key=lambda item: item[1])
}
list(num_unique.items())[:3]

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Kernel density estimate of the target column.
sns.kdeplot(train['loss'])

In [None]:
# Summary statistics for the training frame.
train.describe()

## Modeling

In [None]:
# Split the target off the feature frame. `pop` mutates `train` in place, so the
# guard keeps this cell safe to re-run: once 'loss' has been removed, a second
# run leaves `train` and `y` as they are instead of raising KeyError.
if 'loss' in train.columns:
    y = train.pop('loss')

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for validation. The seed is fixed so that the split,
# and every score computed from it, is the same on each run.
x_train, x_valid, y_train, y_valid = train_test_split(
    train, y, train_size=0.8, random_state=42
)


## Linear Regression

In [None]:
from sklearn.linear_model import LinearRegression

# Baseline model: linear regression fitted on the training split.
lin_reg = LinearRegression()
lin_reg.fit(X=x_train, y=y_train)

In [None]:
from sklearn.metrics import mean_squared_error

# RMSE of the linear model on the split it was fitted on. The arguments are
# passed by keyword so the ground truth and the prediction cannot be swapped.
pred = lin_reg.predict(x_train)
np.sqrt(mean_squared_error(y_true=y_train, y_pred=pred))


In [None]:
# RMSE of the linear model on the hold-out split. The arguments are passed by
# keyword so the ground truth and the prediction cannot be swapped.
pred = lin_reg.predict(x_valid)
np.sqrt(mean_squared_error(y_true=y_valid, y_pred=pred))

## XGBoost

In [None]:
%%time
from xgboost import XGBRegressor
from sklearn.model_selection import KFold, cross_validate

# Gradient-boosted regressor. n_estimators is an upper bound: the early-stopping
# settings below are meant to end training once the hold-out RMSE stops improving.
reg = XGBRegressor(n_estimators=5000,
                   max_depth=5,
                   objective="reg:squarederror",
                   tree_method="gpu_hist",
                   learning_rate=0.1,
                   gamma=10)

# Keyword arguments forwarded to reg.fit() for every fold: the hold-out split is
# the evaluation set monitored for early stopping.
eval_set = [(x_valid, y_valid)]
fit_params = {
    "eval_set": eval_set,
    "eval_metric": "rmse",
    "early_stopping_rounds": 100,
    "verbose": False,
}

kfold = KFold(n_splits=5, shuffle=True, random_state=42)
# fit_params must be handed to cross_validate explicitly; without it the
# early-stopping configuration above is never used and each fold trains all
# n_estimators trees.
# NOTE(review): n_jobs=-1 fits the folds in parallel while tree_method is
# "gpu_hist" - confirm the GPU can host that many concurrent fits.
scores = cross_validate(
    reg,
    x_train,
    y_train,
    cv=kfold,
    scoring="neg_mean_squared_error",
    return_estimator=True,
    n_jobs=-1,
    fit_params=fit_params,
)

In [None]:
# Square root of the mean fold MSE. The scorer reports negated MSE, hence the
# sign flip before averaging.
rmse = np.sqrt(np.negative(scores["test_score"]).mean())
print(f"Base RMSE: {rmse:.5f}")

In [None]:
# Rank the cross-validation folds by test score (highest first) and keep the
# estimator that belongs to the top-ranked fold.
cv_results = pd.DataFrame(scores)
ranked = cv_results.sort_values("test_score", ascending=False)
final_xgb = ranked["estimator"].iloc[0]
final_xgb

In [None]:
# Fit the selected estimator on the full training split (x_train, y_train).
final_xgb.fit(x_train,y_train)

In [None]:
# RMSE of the refitted XGBoost model on the hold-out split. The arguments are
# passed by keyword so the ground truth and the prediction cannot be swapped.
np.sqrt(mean_squared_error(y_true=y_valid, y_pred=final_xgb.predict(x_valid)))

In [None]:
# Gain-based importance scores taken from the fitted model's underlying booster.
importance = final_xgb.get_booster().get_score(importance_type = 'gain')

In [None]:
# Names of the three features with the largest gain, listed in ascending order of gain.
[feature for feature, _gain in sorted(importance.items(), key=lambda item: item[1])][-3:]

## LightGBM


In [None]:
import lightgbm as lgbm

In [None]:
def calc_model_importance(model, feature_names=None, importance_type='gain'):
    """Build a one-column importance table for a fitted model.

    Args:
        model: Object exposing ``feature_importance(importance_type=...)``.
        feature_names: Optional labels used as the table index; when omitted the
            default integer index is used.
        importance_type: Forwarded unchanged to ``model.feature_importance``.

    Returns:
        DataFrame with a single ``importance`` column, sorted in ascending
        order of importance.
    """
    raw_importance = model.feature_importance(importance_type=importance_type)
    table = pd.DataFrame(raw_importance, index=feature_names, columns=['importance'])
    return table.sort_values('importance')
def plot_importance(importance_df, title='',
                    save_filepath=None, figsize=(8, 12)):
    """Draw an importance table as a horizontal bar chart.

    Args:
        importance_df: DataFrame to draw with ``DataFrame.plot.barh``.
        title: Chart title; no title is set when this is empty.
        save_filepath: When given, the figure is saved to this path instead of
            being shown.
        figsize: Figure size passed to ``plt.subplots``.
    """
    _, axis = plt.subplots(figsize=figsize)
    importance_df.plot.barh(ax=axis)
    if title:
        plt.title(title)
    plt.tight_layout()
    if save_filepath is not None:
        plt.savefig(save_filepath)
    else:
        plt.show()
    # Release the figure once it has been shown or written out.
    plt.close()

In [None]:
# LightGBM training parameters shared by every cross-validation fold.
params = {
    "objective": "rmse",
    "metric": "rmse",
    "boosting_type": "gbdt",
    'early_stopping_rounds': 30,
    'learning_rate': 0.1,
    'lambda_l2': 1,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.8,
    # Row subsampling only takes effect when bagging_freq is non-zero; without
    # it the bagging_fraction above is ignored.
    'bagging_freq': 1,
    'n_jobs': -1,
    'verbose': -1,
}

In [None]:
# 5-fold splitter for the LightGBM runs, plus the containers the training loop
# fills in: fitted boosters, the running mean RMSE, and per-fold gain importances.
kfold = KFold(n_splits=5, shuffle=True, random_state=43)
models, gain_importance_list = [], []
scores = 0.0


In [None]:

# Reset the accumulators so that re-running this cell starts from a clean slate
# instead of appending to, and double-counting, the results of a previous run.
models = []
scores = 0.0
gain_importance_list = []
n_folds = kfold.get_n_splits()

for fold, (train_idx, val_idx) in enumerate(kfold.split(train, y)):
    print('Fold: ', fold)
    # KFold yields positional indices, so rows are selected with .iloc. The
    # fold-local names leave the earlier x_train / y_train hold-out split intact.
    x_fold, y_fold = train.iloc[train_idx], y.iloc[train_idx]
    x_val, y_val = train.iloc[val_idx], y.iloc[val_idx]
    lgbm_train = lgbm.Dataset(x_fold, y_fold)
    lgbm_val = lgbm.Dataset(x_val, y_val, reference=lgbm_train)
    # NOTE(review): 'id' is declared categorical here - if it is only a row
    # identifier it probably should not be a model feature at all; confirm.
    model = lgbm.train(params=params,
                       train_set=lgbm_train,
                       valid_sets=[lgbm_train, lgbm_val],
                       num_boost_round=5000,
                       verbose_eval=100,
                       categorical_feature=['id'])
    y_pred = model.predict(x_val, num_iteration=model.best_iteration)
    RMSE = np.sqrt(mean_squared_error(y_true=y_val, y_pred=y_pred))
    print('RMSE:', RMSE)
    # Accumulate the mean over however many folds the splitter produces.
    scores += RMSE / n_folds
    models.append(model)
    print("*" * 100)

    feature_names = x_fold.columns.values.tolist()
    gain_importance_df = calc_model_importance(
        model, feature_names=feature_names, importance_type='gain')
    gain_importance_list.append(gain_importance_df)

print('Mean CV RMSE:', scores)


## Observe feature importance

In [None]:
def calc_mean_importance(importance_df_list):
    """Average per-fold feature importances, matching rows by feature.

    Each frame must have an ``importance`` column and be indexed by feature.
    The frames may list the features in different orders
    (``calc_model_importance`` sorts each one by its own importance values), so
    the values are aligned on the index before averaging rather than combined
    by row position.

    Args:
        importance_df_list: Non-empty sequence of DataFrames with an
            ``importance`` column.

    Returns:
        DataFrame with a single ``importance`` column holding the mean
        importance of each feature, sorted in ascending order of that mean.

    Raises:
        ValueError: If ``importance_df_list`` is empty.
    """
    if len(importance_df_list) == 0:
        raise ValueError('importance_df_list must contain at least one DataFrame')
    # Concatenating along columns aligns every fold on the feature index.
    per_fold = pd.concat(
        [df['importance'] for df in importance_df_list],
        axis=1,
        keys=list(range(len(importance_df_list))),
    )
    mean_importance = per_fold.mean(axis=1)
    # Stable ascending sort so the bar chart built from the result stays ordered.
    return mean_importance.sort_values(kind='mergesort').to_frame('importance')

# Average the per-fold gain importances, chart them, then export the table with
# the feature labels moved out of the index into a 'feature_names' column.
mean_gain_df = calc_mean_importance(gain_importance_list)
plot_importance(mean_gain_df, title='Model feature importance by gain')
mean_gain_df = mean_gain_df.reset_index()
mean_gain_df = mean_gain_df.rename(columns={'index': 'feature_names'})
mean_gain_df.to_csv('gain_importance_mean.csv', index=False)

## Make prediction

In [None]:
# Average the test-set predictions of all fold models, each taken at its best
# iteration.
n_models = len(models)
targets = np.zeros(len(test))
for booster in models:
    targets += booster.predict(test, num_iteration=booster.best_iteration) / n_models

In [None]:
# Store the averaged predictions in the submission frame's 'loss' column and
# write it to submission.csv without the index.
sample_submission['loss'] = targets
sample_submission.to_csv('submission.csv', index = False)