In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import pandas as pd
import numpy as np
import random as rnd
import seaborn as sns
import category_encoders as ce
import matplotlib.pyplot as plt
%matplotlib inline

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere below the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
def reduce_mem_usage(df, allow_float16=True):
    """Downcast the columns of ``df`` in place to smaller dtypes and report the savings.

    * Signed / unsigned integer columns are narrowed to the smallest integer
      type of the same signedness that can hold the column's min and max
      (type limits are inclusive).
    * Float columns are narrowed to float16 or float32 when the value range
      allows it; a float column is never widened.
    * Object / string columns are converted to ``category``.
    * Every other dtype (bool, datetime, existing categoricals, nullable
      extension types, ...) is left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to optimise. It is modified in place.
    allow_float16 : bool, default True
        When True (the historical behaviour) floats may be stored as float16,
        which keeps only about three significant decimal digits. Pass False
        to stop at float32 when that precision loss matters.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    # Candidate target types, narrowest first.
    signed_types = (np.int8, np.int16, np.int32, np.int64)
    unsigned_types = (np.uint8, np.uint16, np.uint32, np.uint64)
    float_types = (np.float16, np.float32) if allow_float16 else (np.float32,)

    for col in df.columns:
        col_type = df[col].dtype
        # Only plain numpy dtypes carry a reliable `kind`; extension dtypes
        # (category, nullable ints, string[...]) are handled separately below.
        kind = col_type.kind if isinstance(col_type, np.dtype) else None

        if kind in ('i', 'u'):
            c_min, c_max = df[col].min(), df[col].max()
            candidates = signed_types if kind == 'i' else unsigned_types
            for candidate in candidates:
                limits = np.iinfo(candidate)
                # Inclusive bounds: a value equal to the type limit still fits.
                # An empty column yields NaN here, fails both comparisons and
                # is therefore left as it is.
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        elif kind == 'f':
            c_min, c_max = df[col].min(), df[col].max()
            for candidate in float_types:
                if np.dtype(candidate).itemsize >= col_type.itemsize:
                    break  # nothing narrower left to try; never widen a column
                limits = np.finfo(candidate)
                # All-NaN or infinite columns fail these comparisons and keep their dtype.
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        elif kind == 'O' or (kind is None and pd.api.types.is_string_dtype(col_type)):
            df[col] = df[col].astype('category')
        # Anything else (bool, datetime, timedelta, category, ...) is deliberately skipped.

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    # Guard the percentage against a zero-sized starting footprint.
    decrease = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print('Decreased by {:.1f}%'.format(decrease))

    return df


def import_data(file):
    """Read a CSV file into a DataFrame and shrink its memory footprint.

    Parameters
    ----------
    file : str or path-like or file-like
        Anything accepted by ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The loaded frame after ``reduce_mem_usage`` has downcast its columns.
    """
    # `keep_date_col` was dropped: it only has an effect when `parse_dates`
    # combines several columns into one, which `parse_dates=True` never does,
    # and the argument is deprecated in recent pandas releases.
    df = pd.read_csv(file, parse_dates=True)
    return reduce_mem_usage(df)

In [None]:
# Load the training set through import_data (read_csv followed by reduce_mem_usage)
# and preview its first rows.
train_df = import_data("/kaggle/input/tabular-playground-series-feb-2021/train.csv")
train_df.head()

In [None]:
# Load the test set the same way as the training set and preview its first rows.
test_df = import_data("/kaggle/input/tabular-playground-series-feb-2021/test.csv")
test_df.head()

In [None]:
# Remove the `id` column from both frames.
# The drop stays in place on purpose (later cells alias these same objects);
# errors='ignore' makes the cell safe to re-run after `id` is already gone.
train_df.drop(columns=['id'], inplace=True, errors='ignore')
test_df.drop(columns=['id'], inplace=True, errors='ignore')

In [None]:
def replace_outliers(data,feat):
    """Overwrite IQR outliers in the given columns with the column median.

    For every column named in ``feat``, values strictly below
    ``Q1 - 1.5 * IQR`` or strictly above ``Q3 + 1.5 * IQR`` are replaced by
    the median of that column (computed before any replacement).

    ``data`` is modified in place and the same object is returned.
    """
    for column in feat:
        values = data[column]
        lower_q, upper_q = values.quantile(0.25), values.quantile(0.75)
        spread = upper_q - lower_q
        center = values.median()
        too_low = values < lower_q - 1.5*spread
        too_high = values > upper_q + 1.5*spread
        data.loc[too_low | too_high, column] = center
    return data

In [None]:
# Column names used throughout the notebook: cat0..cat9 and cont0..cont13.
cat_feat = ['cat' + str(idx) for idx in range(10)]
features = ['cont' + str(idx) for idx in range(14)]
print(cat_feat, features)

In [None]:
# `train` / `test` are second names for the very same DataFrame objects, not copies:
# any in-place change made through `train_df` / `test_df` is visible through them too.
train, test = train_df, test_df

In [None]:
# Correlation heatmap of the numeric columns.
# Only numeric columns are passed to corr(): on pandas >= 2.0 corr() no longer
# silently skips non-numeric columns (the cat* columns are presumably
# string-backed categoricals at this point) and raises instead.
corr = train_df.select_dtypes(include='number').corr()
fig, ax = plt.subplots(figsize=(14,10))
sns.heatmap(corr,vmax=0.9,cmap='viridis',square=True,ax=ax)
ax.set_title("Correlation between numeric columns")
plt.show()

In [None]:
# Median-replace IQR outliers: continuous features plus the target on the training
# frame, continuous features only on the test frame. replace_outliers edits its
# argument in place and returns it, so the names still refer to the same objects.
features_ = [*features, 'target']
train_df = replace_outliers(train_df, features_)
test_df = replace_outliers(test_df, features)

In [None]:
# Histogram + KDE of the training target.
sns.set_style("dark")
sns.set_color_codes(palette = 'deep')
f,ax = plt.subplots(figsize = (9,8))

# sns.distplot is deprecated; histplot(kde=True, stat='density') with
# kde_kws cut=3 is the replacement recipe from seaborn's migration guide.
# The cast to float64 protects the KDE in case the target was downcast to
# float16 when the data was loaded (presumably it was — see reduce_mem_usage).
sns.histplot(
    train_df['target'].astype('float64'),
    kde=True,
    stat='density',
    kde_kws=dict(cut=3),
    color='b',
    ax=ax,
)
ax.xaxis.grid(False)
ax.set(ylabel="Value")
ax.set(xlabel="Target")
ax.set(title="Target Distribution")
sns.despine(trim = True, left = True)
plt.show()

In [None]:
import optuna
from lightgbm import LGBMRegressor
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

In [None]:
# More aliases (no copy is made): the label encoding applied through
# `train_new` / `test_new` also changes `train_df` / `test_df`.
train_new, test_new = train_df, test_df

In [None]:
# Integer-encode every categorical column. Each encoder is fitted on the training
# column only and then applied to both frames, so the codes agree between them.
for col in cat_feat:
    le = LabelEncoder().fit(train_new[col])
    train_new[col] = le.transform(train_new[col])
    test_new[col] = le.transform(test_new[col])

In [None]:
# Feature matrix (categorical + continuous columns) and target used by the Optuna objective.
data = train_new.loc[:, cat_feat + features]
target = train_new.loc[:, 'target']

In [None]:
def objective(trial,data=data,target=target):
    """Optuna objective: hold-out RMSE of an LGBMRegressor with trial-suggested hyper-parameters.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyper-parameters.
    data : pandas.DataFrame
        Feature matrix; defaults to the module-level ``data`` bound when the
        function is defined.
    target : pandas.Series
        Regression target; defaults to the module-level ``target``.

    Returns
    -------
    float
        RMSE on a fixed 20% hold-out split (``random_state=33``). The same
        split is also the early-stopping evaluation set.
    """
    # Imported locally so this function does not depend on another cell's imports.
    from lightgbm import early_stopping

    train_x, test_x, train_y, test_y = train_test_split(data, target, test_size=0.2,random_state=33)
    param = {
        'metric': 'rmse',
        'random_state': 33,
        'n_estimators': 20000,
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 5),
        # suggest_float(..., log=True) replaces the deprecated suggest_loguniform.
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-3, 10.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-3, 10.0, log=True),
        'colsample_bytree': trial.suggest_categorical('colsample_bytree', [0.3,0.4,0.5,0.6,0.7,0.8,0.9, 1.0]),
        'subsample': trial.suggest_categorical('subsample', [0.4,0.5,0.6,0.7,0.8,1.0]),
        'learning_rate': trial.suggest_categorical('learning_rate', [0.006,0.008,0.01,0.014,0.017,0.02]),
        'max_depth': trial.suggest_categorical('max_depth', [10,20,100]),
        'num_leaves' : trial.suggest_int('num_leaves', 2, 1000),
        'min_child_samples': trial.suggest_int('min_child_samples', 1, 300),
        # The trial parameter keeps its historical name 'min_data_per_groups':
        # the cell that builds the final params pops exactly that key from
        # study.best_params and stores it as 'cat_smooth'.
        'cat_smooth' : trial.suggest_int('min_data_per_groups', 1, 100)
    }
    model = LGBMRegressor(**param)

    # The early_stopping_rounds / verbose arguments of fit() were removed in
    # LightGBM 4.0; the callback is the supported way to stop early and silently.
    model.fit(
        train_x,
        train_y,
        eval_set=[(test_x, test_y)],
        callbacks=[early_stopping(stopping_rounds=100, verbose=False)],
    )

    preds = model.predict(test_x)

    # np.sqrt(MSE) instead of the deprecated `squared=False` argument.
    rmse = float(np.sqrt(mean_squared_error(test_y, preds)))

    return rmse

In [None]:
# Run the hyper-parameter search. The sampler is seeded so that the sequence of
# suggested parameters is reproducible from one run of the notebook to the next.
study = optuna.create_study(
    direction='minimize',
    sampler=optuna.samplers.TPESampler(seed=33),
)
study.optimize(objective, n_trials = 50)
print('Number of finished trials:', len(study.trials))
print('Best trial:', study.best_trial.params)

In [None]:
# Show the study's trials as a DataFrame (Optuna's tabular export of the search history).
study.trials_dataframe()

In [None]:
# Final-model parameters: the best trial's values plus the fixed settings that the
# objective used but did not tune.
params = {
    **study.best_params,
    'random_state': 33,
    'n_estimators': 20000,
    'metric': 'rmse',
}
# The objective sampled LightGBM's `cat_smooth` under the trial name
# 'min_data_per_groups'; move it to the key the model actually understands.
params['cat_smooth'] = params.pop('min_data_per_groups')
print(params)

# Output recorded from an earlier run:
# {'reg_alpha': 0.001954724361880276, 'reg_lambda': 0.7268234402385888, 'colsample_bytree': 0.3, 'subsample': 1.0, 'learning_rate': 0.006, 'max_depth': 10, 'num_leaves': 65, 'min_child_samples': 217, 'random_state': 48, 'n_estimators': 20000, 'metric': 'rmse', 'cat_smooth': 41}

In [None]:
# Hand-fixed parameter set. Assigning it here replaces whatever `params` held
# before (including the values derived from the Optuna study).
# NOTE(review): there is no `bagging_freq` / `subsample_freq` entry, unlike the
# tuned search space — LightGBM may ignore `subsample` without it; confirm.
params = dict(
    reg_alpha=6.147694913504962,
    reg_lambda=0.002457826062076097,
    colsample_bytree=0.3,
    subsample=0.8,
    learning_rate=0.0005,
    max_depth=20,
    num_leaves=111,
    min_child_samples=285,
    random_state=33,
    n_estimators=320000,
    metric='rmse',
    cat_smooth=39,
)

In [None]:
# 10-fold cross-validated training. Each fold's model predicts the test set and
# the predictions are averaged; the per-fold validation RMSE is kept in `rmse`.
# After the loop `model` is the model fitted on the last fold only.
from lightgbm import early_stopping  # callback API; fit(early_stopping_rounds=...) was removed in LightGBM 4.0

columns = cat_feat + features
# Slice the frames once instead of re-selecting the columns inside every fold.
X_all, y_all, X_test = train[columns], train['target'], test[columns]
preds = np.zeros(test.shape[0])
kf = KFold(n_splits = 10,random_state=33,shuffle=True)
rmse=[]  # list contains rmse for each fold
for n, (trn_idx, test_idx) in enumerate(kf.split(X_all, y_all), start=1):
    X_tr,X_val = X_all.iloc[trn_idx],X_all.iloc[test_idx]
    y_tr,y_val = y_all.iloc[trn_idx],y_all.iloc[test_idx]
    model = LGBMRegressor(**params)
    model.fit(
        X_tr,
        y_tr,
        eval_set=[(X_val,y_val)],
        callbacks=[early_stopping(stopping_rounds=800, verbose=False)],
    )
    preds += model.predict(X_test)/kf.n_splits
    # np.sqrt(MSE) instead of the deprecated `squared=False` argument.
    rmse.append(np.sqrt(mean_squared_error(y_val, model.predict(X_val))))
    print(n,rmse[-1])


# 1 0.8361832715654893
# 2 0.8379744971037101
# 3 0.8356927176770227
# 4 0.8399920239400532
# 5 0.8398865061968892

In [None]:
# Submission template. The absolute /kaggle/input path matches the one used to
# load train.csv and test.csv, so the cell does not depend on the working directory.
output = pd.read_csv('/kaggle/input/tabular-playground-series-feb-2021/sample_submission.csv')

In [None]:
# Average of the per-fold validation RMSE values collected in `rmse`.
np.mean(rmse)

In [None]:
# Top-10 feature importances of `model` (the model left over from the last CV fold).
# plot_importance is LightGBM's own function, so LightGBM is imported directly
# instead of reaching it through optuna.integration (which needs an extra
# package on recent Optuna releases). The `lgb` alias is kept.
import lightgbm as lgb
lgb.plot_importance(model, max_num_features=10, figsize=(10,10))
plt.show()

In [None]:
# Write the fold-averaged test predictions to submission_<i>.csv.
i = 7  # submission counter used in the file name
output['target'] = preds
output.to_csv('submission_{}.csv'.format(i), index=False)