# Imports

In [13]:
# All competition files live in the same GitHub directory; derive each URL from it.
_RAW_DATA_DIR = (
    'https://raw.githubusercontent.com/sidt-ai/data-science-competitions/main/'
    'hackerearth/texas_inst_hiring/price_of_iron/data/raw'
)

train_url = f'{_RAW_DATA_DIR}/train.csv'
test_url = f'{_RAW_DATA_DIR}/test.csv'
sub_url = f'{_RAW_DATA_DIR}/sample_submission.csv'

In [14]:
%%capture
!python3 -m pip install --quiet --upgrade sklearn
!python3 -m pip install --quiet --upgrade optuna
!python3 -m pip install --quiet --upgrade lightgbm

In [35]:
import time
import gc
gc.enable()

import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
# The bare key "precision" is ambiguous in pandas >= 1.4 (it also matches
# "styler.format.precision") and raises OptionError; use the full option key.
pd.set_option("display.precision", 4)

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
sns.set(style="darkgrid")

from sklearn.metrics import r2_score, mean_squared_error
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split

from lightgbm import LGBMRegressor

import optuna
from optuna.samplers import TPESampler
from optuna.integration import LightGBMPruningCallback

SEED = 2311

In [16]:
import torch

# torch is used only to detect whether a CUDA device is visible; the flag is
# read later when the LightGBM parameters are assembled.
GPU = torch.cuda.is_available()

In [17]:
# Download the training set, the test set and the sample submission, in that order.
train, test, sub = (
    pd.read_csv(csv_url) for csv_url in (train_url, test_url, sub_url)
)

# Data Summary

In [18]:
# Preview the first rows of the training data.
train.head()

Unnamed: 0,ID,RI,QA-check,Color-check,Purity-Level,Depth-percentage,Width-of-iron-top-width,Axis-x,Axis-y,Axis-z,Price-in-dollars
0,uid_36947,1.01,Premium,H,VS1,61.9,57.0,6.43,6.39,3.97,4779
1,uid_36834,0.41,Premium,E,VS1,59.8,61.0,4.85,4.79,2.88,1153
2,uid_39061,2.18,Premium,H,SI2,62.3,59.0,8.37,8.26,5.18,11579
3,uid_39213,0.71,Very Good,F,VS2,59.5,58.0,5.82,5.87,,2918
4,uid_15924,0.7,Ideal,E,SI2,61.0,55.0,5.76,,3.52,2332


In [45]:
# NOTE(review): this cell carries execution count 45, i.e. it was last run after
# the ordinal-encoding cell, so the saved output shows encoded category codes
# instead of the raw labels. Restart the kernel and run all cells to refresh it.
test.head()

Unnamed: 0,ID,RI,QA-check,Color-check,Purity-Level,Depth-percentage,Width-of-iron-top-width,Axis-x,Axis-y,Axis-z
0,uid_38808,0.31,3.0,1.0,2.0,62.3,56.0,4.4,4.37,2.73
1,uid_36032,0.31,2.0,0.0,3.0,62.8,53.0,4.37,4.35,2.74
2,uid_28576,0.4,2.0,0.0,7.0,62.8,56.0,4.68,4.72,2.95
3,uid_51190,1.51,2.0,2.0,2.0,62.5,57.0,7.27,7.32,4.56
4,uid_10963,0.26,4.0,3.0,4.0,62.4,56.0,4.08,4.1,2.55


In [19]:
# Column dtypes and non-null counts of the training data.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40455 entries, 0 to 40454
Data columns (total 11 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   ID                       40455 non-null  object 
 1   RI                       40455 non-null  float64
 2   QA-check                 40455 non-null  object 
 3   Color-check              40455 non-null  object 
 4   Purity-Level             36606 non-null  object 
 5   Depth-percentage         40455 non-null  float64
 6   Width-of-iron-top-width  40455 non-null  float64
 7   Axis-x                   36614 non-null  float64
 8   Axis-y                   36612 non-null  float64
 9   Axis-z                   36625 non-null  float64
 10  Price-in-dollars         40455 non-null  int64  
dtypes: float64(6), int64(1), object(4)
memory usage: 3.4+ MB


In [20]:
# Column dtypes and non-null counts of the test data.
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13485 entries, 0 to 13484
Data columns (total 10 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   ID                       13485 non-null  object 
 1   RI                       13485 non-null  float64
 2   QA-check                 13485 non-null  object 
 3   Color-check              13485 non-null  object 
 4   Purity-Level             12209 non-null  object 
 5   Depth-percentage         13485 non-null  float64
 6   Width-of-iron-top-width  13485 non-null  float64
 7   Axis-x                   13353 non-null  float64
 8   Axis-y                   11612 non-null  float64
 9   Axis-z                   13352 non-null  float64
dtypes: float64(6), object(4)
memory usage: 1.0+ MB


In [21]:
# Model inputs: every test-set column except the row identifier.
features = [column for column in test.columns if column != 'ID']

# The column to predict, and the inputs that hold categories rather than measurements.
target = 'Price-in-dollars'
cat_features = ['QA-check', 'Color-check', 'Purity-Level']

In [22]:
# Learn the category -> ordinal code mapping from the training data only, then
# apply that same mapping to both frames (the columns are overwritten in place).
ord_encoder = OrdinalEncoder().fit(train[cat_features])
train[cat_features] = ord_encoder.transform(train[cat_features])
test[cat_features] = ord_encoder.transform(test[cat_features])

# Hyperparameter tuning

In [23]:
# Settings shared by every LightGBM model built in this notebook.
base_params = dict(n_estimators=5000, seed=SEED)

# Request GPU training only when a device was detected.
if GPU:
    base_params.update(device_type='gpu')

In [28]:
def objective(trial, xtrain, ytrain, xval, yval, base_params):
    """Optuna objective: fit one LightGBM model and return its validation RMSE.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyperparameters and, through the pruning
        callback, to report intermediate validation scores.
    xtrain, ytrain
        Training features and target.
    xval, yval
        Hold-out features and target; used for early stopping, for pruning and
        for the returned score.
    base_params : dict
        Fixed ``LGBMRegressor`` keyword arguments, combined with the sampled ones.

    Returns
    -------
    float
        Root mean squared error of the fitted model on ``(xval, yval)``.

    Notes
    -----
    Reads the notebook-level ``features`` and ``cat_features`` lists.
    """
    # The import cell only brings in LGBMRegressor, so the callback factories
    # are imported locally.
    from lightgbm import early_stopping, log_evaluation

    param_grid = {
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'num_leaves': trial.suggest_int('num_leaves', 50, 1000, step=25),
        'max_depth': trial.suggest_int('max_depth', 3, 12),
        # NOTE(review): a 1000-row minimum per leaf is very restrictive for a
        # training split of this size -- confirm the range is intentional.
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 1000, 10000, step=100),
        'min_gain_to_split': trial.suggest_float('min_gain_to_split', 0, 15),
        'lambda_l1': trial.suggest_float('lambda_l1', 0, 100, step=5),
        'lambda_l2': trial.suggest_float('lambda_l2', 0, 100, step=5),
        # Upper bound is 0.9, not 0.95: with step=0.1 starting at 0.3 Optuna
        # truncates the range to 0.9 anyway (and warns about it).
        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.3, 0.9, step=0.1),
        # Single-choice categorical: fixes bagging_freq while keeping it in best_params.
        'bagging_freq': trial.suggest_categorical('bagging_freq', [1]),
        'feature_fraction': trial.suggest_float('feature_fraction', 0.3, 0.9, step=0.1),
    }

    model = LGBMRegressor(**base_params, **param_grid)

    # `early_stopping_rounds=` and `verbose=` are no longer accepted by
    # LGBMRegressor.fit in LightGBM 4; the equivalent callbacks are used instead.
    model.fit(
        xtrain, ytrain,
        eval_set=[(xval, yval)],
        eval_metric='rmse',
        feature_name=features,
        categorical_feature=cat_features,
        callbacks=[
            LightGBMPruningCallback(trial, 'rmse'),
            early_stopping(stopping_rounds=200),
            log_evaluation(period=250),
        ],
    )

    predictions = model.predict(xval)
    # `squared=False` was removed from mean_squared_error in recent scikit-learn
    # releases; taking the root explicitly works on every version.
    return float(mean_squared_error(yval, predictions) ** 0.5)
    

In [29]:
# Seed the sampler with the notebook-wide SEED so the hyperparameter search
# does not start from a different random state on every run.
study = optuna.create_study(direction='minimize',
                            sampler=TPESampler(seed=SEED),
                            study_name='TI_Q12')

[32m[I 2022-01-30 15:52:51,863][0m A new study created in memory with name: TI_Q12[0m


In [30]:
# Hold out 20% of the training rows as the validation set for the tuning study.
xtrain, xval, ytrain, yval = train_test_split(
    train.loc[:, features],
    train.loc[:, target],
    random_state=SEED,
    shuffle=True,
    test_size=0.2,
)

In [None]:
def _tuning_objective(trial):
    """Bind the fixed train/validation split and base settings to `objective`."""
    return objective(trial, xtrain, ytrain, xval, yval, base_params)


# Run 50 trials of the hyperparameter search.
study.optimize(_tuning_objective, n_trials=50)

In [32]:
# Keep the winning hyperparameters for the final model and report them.
best_params = study.best_params

print(f'Best score (RMSE): {study.best_value:.5f}')
print('Best params:')
for param_name in best_params:
    print(f'\t{param_name}: {best_params[param_name]}')

Best score (RMSE): 3133.70864
Best params:
	learning_rate: 0.26581452208624495
	num_leaves: 775
	max_depth: 5
	min_data_in_leaf: 1500
	min_gain_to_split: 1.8428544441562709
	lambda_l1: 85.0
	lambda_l2: 25.0
	bagging_fraction: 0.6000000000000001
	bagging_freq: 1
	feature_fraction: 0.8


# Cross-validation + Inference

In [37]:
# Number of cross-validation folds, and the shuffled, seeded splitter that builds them.
N_SPLITS = 5
kf = KFold(N_SPLITS, random_state=SEED, shuffle=True)

In [38]:
# Tag every training row with the id of the fold in which it is held out.
# -1 is a placeholder that every row overwrites, since each row is in exactly
# one validation split.
train['fold'] = -1
for fold_id, split in enumerate(kf.split(X=train, y=train[target])):
    holdout_rows = split[1]
    train.loc[holdout_rows, 'fold'] = fold_id

In [39]:
def custom_cross_val_predict(train, test, model, features, cat_features):
    """Fit ``model`` once per fold and average the resulting test predictions.

    Parameters
    ----------
    train : pandas.DataFrame
        Training data holding the feature columns, the target column and a
        ``fold`` column with the fold id of every row.
    test : pandas.DataFrame
        Data to predict; must contain the feature columns.
    model
        Estimator exposing LightGBM's scikit-learn ``fit``/``predict`` API.
        The same instance is refit for every fold.
    features : list of str
        Columns used as model inputs.
    cat_features : list of str
        Columns passed to LightGBM as categorical features.

    Returns
    -------
    numpy.ndarray
        Mean over the folds of the predictions made for ``test``.

    Notes
    -----
    Reads the notebook-level ``N_SPLITS`` and ``target``. Prints the R2 score
    and timing of each fold followed by a summary over all folds.
    """
    # The import cell only brings in LGBMRegressor, so the callback factories
    # are imported locally.
    from lightgbm import early_stopping, log_evaluation

    test_preds = []
    scores = []

    cv_start = time.time()

    for fold in range(N_SPLITS):
        print('-' * 40)

        # Rows tagged with the current fold id are held out; the rest are fitted on.
        xtrain = train[train.fold != fold].reset_index(drop=True)
        ytrain = xtrain[target]

        xval = train[train.fold == fold].reset_index(drop=True)
        yval = xval[target]

        fold_start = time.time()

        # `early_stopping_rounds=` and `verbose=` are no longer accepted by
        # LGBMRegressor.fit in LightGBM 4; the equivalent callbacks are used.
        model.fit(
            xtrain[features], ytrain,
            eval_set=[(xval[features], yval)],
            eval_metric='rmse',
            feature_name=features,
            categorical_feature=cat_features,
            callbacks=[
                early_stopping(stopping_rounds=100),
                log_evaluation(period=250),
            ],
        )

        val_preds = model.predict(xval[features])
        r2 = r2_score(yval, val_preds)
        scores.append(r2)

        fold_end = time.time()

        # Spacing inside the messages is kept exactly as previously printed.
        print(f'Fold #{fold}: R2_Score = {r2:.5f} '
              f'    [Time: {fold_end - fold_start:.2f}s]')

        test_preds.append(model.predict(test[features]))

    # Taken once, after the last fold, instead of being reassigned every iteration.
    cv_end = time.time()

    print(f'Average R2_Score = {np.mean(scores):.5f} '
          f'  with std. dev. = {np.std(scores):.5f}')

    print(f'[Total time: {cv_end - cv_start:.2f}s]')

    # One column per fold -> row-wise mean gives the averaged prediction.
    test_preds = np.mean(np.column_stack(test_preds), axis=1)

    return test_preds

In [40]:
# Final estimator: the fixed base settings combined with the tuned hyperparameters.
model = LGBMRegressor(**base_params, **best_params)

In [42]:
# Train once per fold and collect the fold-averaged predictions for the test set.
test_preds = custom_cross_val_predict(train, test, model, features, cat_features)

----------------------------------------
Training until validation scores don't improve for 100 rounds.
[250]	valid_0's l2: 9.86346e+06	valid_0's rmse: 3140.62
Early stopping, best iteration is:
[235]	valid_0's l2: 9.85305e+06	valid_0's rmse: 3138.96
Fold #0: R2_Score = 0.60206     [Time: 1.80s]
----------------------------------------
Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[149]	valid_0's l2: 9.4194e+06	valid_0's rmse: 3069.1
Fold #1: R2_Score = 0.61177     [Time: 1.42s]
----------------------------------------
Training until validation scores don't improve for 100 rounds.
[250]	valid_0's l2: 1.05521e+07	valid_0's rmse: 3248.41
Early stopping, best iteration is:
[276]	valid_0's l2: 1.05418e+07	valid_0's rmse: 3246.81
Fold #2: R2_Score = 0.58275     [Time: 2.01s]
----------------------------------------
Training until validation scores don't improve for 100 rounds.
[250]	valid_0's l2: 1.02724e+07	valid_0's rmse: 3205.06
Early s

In [44]:
sub = pd.DataFrame({'ID': test.ID, target: test_preds})
sub[target] = test_preds
sub.to_csv('sub.csv', index=False)

!head sub.csv

ID,Price-in-dollars
uid_38808,581.0077430502481
uid_36032,643.7372493098501
uid_28576,1634.43699531078
uid_51190,11890.049464269956
uid_10963,357.6624786222192
uid_36990,4245.000683763527
uid_29238,2967.5579088399127
uid_21955,12428.632171434707
uid_36295,4784.5875124810955
