In [None]:
# Mount Google Drive at /content/drive (Colab only) so the project folder used below is reachable.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
!pip install flaml
!pip install catboost

## 0. Import Packages

In [None]:
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import sklearn
import sys
import seaborn as sns
import random as rn
import os
import scipy.stats as stats
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

from sklearn.model_selection import GridSearchCV, cross_val_score, RepeatedKFold
from sklearn import metrics

from sklearn.linear_model import ElasticNet
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
from mlxtend.regressor import StackingCVRegressor
from sklearn.multioutput import MultiOutputRegressor

from collections import Counter
from flaml import AutoML

import warnings
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Suppress every warning for the rest of the session (applies to all categories).
warnings.filterwarnings(action='ignore')

In [None]:
# Switch the working directory to the project folder on the mounted Drive,
# so the relative 'dataset/...' paths used in later cells resolve.
%cd '/content/drive/MyDrive/Autonomous_driving_antenna/'

## 1. Load dataset

In [None]:
# reproducibility
def set_seed(seed_num):
    """Seed the random sources used by this notebook.

    Seeds NumPy's global RNG and the stdlib ``random`` module (imported as
    ``rn``), then exports the seed as the ``PYTHONHASHSEED`` environment
    variable.

    Args:
        seed_num: Integer seed applied to every source above.
    """
    for seeder in (np.random.seed, rn.seed):
        seeder(seed_num)
    os.environ['PYTHONHASHSEED'] = str(seed_num)


# Single seed shared by the estimators defined later in the notebook.
seed_num = 42
set_seed(seed_num)

In [None]:
# Load the training table from the project's dataset folder (path is relative to the working directory).
train = pd.read_csv('dataset/train.csv')

# Preview the first rows.
train.head()

In [None]:
# Split the training table into model inputs and targets by column-name prefix.
# The patterns are anchored ('^X_' / '^Y_') so only columns that *start* with the
# prefix are selected; an unanchored 'X' or 'Y' would also match any column that
# merely contains that letter somewhere in its name.
train_x_df = train.filter(regex=r'^X_')   # input
train_y_df = train.filter(regex=r'^Y_')   # output

In [None]:
# Preview the first rows of the input-feature frame.
train_x_df.head()

Unnamed: 0,X_01,X_02,X_03,X_04,X_05,X_06,X_07,X_08,X_09,X_10,...,X_47,X_48,X_49,X_50,X_51,X_52,X_53,X_54,X_55,X_56
0,70.544,103.32,67.47,1,101.892,74.983,29.45,62.38,245.71,0.0,...,1,1,9706.03,137.043591,135.359219,147.837968,134.313475,125.605427,136.721425,125.028256
1,69.524,103.321,65.17,1,101.944,72.943,28.73,61.23,233.61,0.0,...,1,1,10423.43,133.736691,135.979817,149.924692,123.630583,127.893337,143.322659,124.877308
2,72.583,103.32,64.07,1,103.153,72.943,28.81,105.77,272.2,0.0,...,1,1,10948.53,132.805112,131.055355,146.814592,128.93907,127.012195,140.395688,122.238232
3,71.563,103.32,67.57,1,101.971,77.022,28.92,115.21,255.36,0.0,...,1,1,15007.03,134.13876,133.239422,139.720132,132.260824,130.723186,147.624829,134.875225
4,69.524,103.32,63.57,1,101.981,70.904,29.68,103.38,241.46,0.0,...,1,1,11051.03,142.72897,136.620022,134.853555,134.760252,125.647793,139.331105,123.272762


In [None]:
# Preview the first rows of the target frame.
train_y_df.head()

Unnamed: 0,Y_01,Y_02,Y_03,Y_04,Y_05,Y_06,Y_07,Y_08,Y_09,Y_10,Y_11,Y_12,Y_13,Y_14
0,2.056,1.456,1.68,10.502,29.632,16.083,4.276,-25.381,-25.529,-22.769,23.792,-25.47,-25.409,-25.304
1,1.446,1.184,1.268,18.507,33.179,16.736,3.229,-26.619,-26.523,-22.574,24.691,-26.253,-26.497,-26.438
2,1.251,0.665,0.782,14.082,31.801,17.08,2.839,-26.238,-26.216,-22.169,24.649,-26.285,-26.215,-26.37
3,1.464,1.079,1.052,16.975,34.503,17.143,3.144,-25.426,-25.079,-21.765,24.913,-25.254,-25.021,-25.345
4,0.983,0.646,0.689,15.047,32.602,17.569,3.138,-25.376,-25.242,-21.072,25.299,-25.072,-25.195,-24.974


In [None]:
# Sanity check: both frames must have the same number of rows.
print(train_x_df.shape, train_y_df.shape)

(39607, 56) (39607, 14)


In [None]:
# Count missing values per input column.
print(train_x_df.isnull().sum())

X_01    0
X_02    0
X_03    0
X_04    0
X_05    0
X_06    0
X_07    0
X_08    0
X_09    0
X_10    0
X_11    0
X_12    0
X_13    0
X_14    0
X_15    0
X_16    0
X_17    0
X_18    0
X_19    0
X_20    0
X_21    0
X_22    0
X_23    0
X_24    0
X_25    0
X_26    0
X_27    0
X_28    0
X_29    0
X_30    0
X_31    0
X_32    0
X_33    0
X_34    0
X_35    0
X_36    0
X_37    0
X_38    0
X_39    0
X_40    0
X_41    0
X_42    0
X_43    0
X_44    0
X_45    0
X_46    0
X_47    0
X_48    0
X_49    0
X_50    0
X_51    0
X_52    0
X_53    0
X_54    0
X_55    0
X_56    0
dtype: int64


In [None]:
# Count missing values per target column.
print(train_y_df.isnull().sum())

Y_01    0
Y_02    0
Y_03    0
Y_04    0
Y_05    0
Y_06    0
Y_07    0
Y_08    0
Y_09    0
Y_10    0
Y_11    0
Y_12    0
Y_13    0
Y_14    0
dtype: int64


In [None]:
# Convert the input and target frames to plain NumPy arrays for the estimators below.
train_x, train_y = (np.array(frame) for frame in (train_x_df, train_y_df))

# The row counts of the two arrays should agree.
print(train_x.shape, train_y.shape)

(39607, 56) (39607, 14)


## 2. Modeling - FLAML (Fast AutoML)

In [None]:
# MODEL_TIME_BUDGET = 60*5
# MODEL_METRIC = 'rmse'
# MODEL_TASK = "regression"
# MODEL_LIST = ["lgbm"]

# params = {
#     "time_budget": MODEL_TIME_BUDGET,  
#     "metric": MODEL_METRIC,
#     "estimator_list": MODEL_LIST, 
#     "task": MODEL_TASK,
#     "seed":seed_num,
# }

# auto_lgbm = MultiOutputRegressor(AutoML(**params))
# auto_lgbm.fit(train_x, train_y)

In [None]:
# MODEL_TIME_BUDGET = 60*5
# MODEL_METRIC = 'rmse'
# MODEL_TASK = "regression"
# MODEL_LIST = ["catboost"]

# params = {
#     "time_budget": MODEL_TIME_BUDGET,  
#     "metric": MODEL_METRIC,
#     "estimator_list": MODEL_LIST, 
#     "task": MODEL_TASK,
#     "seed":seed_num,
# }

# auto_cat = MultiOutputRegressor(AutoML(**params))
# auto_cat.fit(train_x, train_y)

In [None]:
# MODEL_TIME_BUDGET = 60*5
# MODEL_METRIC = 'rmse'
# MODEL_TASK = "regression"
# MODEL_LIST = ["xgboost"]

# params = {
#     "time_budget": MODEL_TIME_BUDGET,  
#     "metric": MODEL_METRIC,
#     "estimator_list": MODEL_LIST, 
#     "task": MODEL_TASK,
#     "seed":seed_num,
# }

# auto_xgb = MultiOutputRegressor(AutoML(**params))
# auto_xgb.fit(train_x, train_y)

In [None]:
# MODEL_TIME_BUDGET = 60*5
# MODEL_METRIC = 'rmse'
# MODEL_TASK = "regression"
# MODEL_LIST = ["rf"]

# params = {
#     "time_budget": MODEL_TIME_BUDGET,  
#     "metric": MODEL_METRIC,
#     "estimator_list": MODEL_LIST, 
#     "task": MODEL_TASK,
#     "seed":seed_num,
# }

# auto_rf = MultiOutputRegressor(AutoML(**params))
# auto_rf.fit(train_x, train_y)

In [None]:
# MODEL_TIME_BUDGET = 60*5
# MODEL_METRIC = 'rmse'
# MODEL_TASK = "regression"
# MODEL_LIST = ["extra_tree"]

# params = {
#     "time_budget": MODEL_TIME_BUDGET,  
#     "metric": MODEL_METRIC,
#     "estimator_list": MODEL_LIST, 
#     "task": MODEL_TASK,
#     "seed":seed_num,
# }

# auto_ext = MultiOutputRegressor(AutoML(**params))
# auto_ext.fit(train_x, train_y)

In [None]:
# Fixed LightGBM hyper-parameters, gathered in one mapping for readability.
# NOTE(review): presumably the result of the disabled FLAML search above — confirm.
lgbm_params = dict(
    colsample_bytree=0.5263478696871312,
    learning_rate=0.01522625637146587,
    max_bin=1023,
    min_child_samples=23,
    n_estimators=2950,
    num_leaves=434,
    reg_alpha=0.013442380441791843,
    reg_lambda=0.0440062359548784,
    seed=seed_num,
    verbose=-1,
)

# MultiOutputRegressor fits one LGBMRegressor per target column.
lightgbm = MultiOutputRegressor(LGBMRegressor(**lgbm_params))

lightgbm.fit(train_x, train_y)

MultiOutputRegressor(estimator=LGBMRegressor(colsample_bytree=0.5263478696871312,
                                             learning_rate=0.01522625637146587,
                                             max_bin=1023, min_child_samples=23,
                                             n_estimators=2950, num_leaves=434,
                                             reg_alpha=0.013442380441791843,
                                             reg_lambda=0.0440062359548784,
                                             seed=42, verbose=-1))

In [None]:
# Fixed XGBoost hyper-parameters, gathered in one mapping for readability.
# NOTE(review): presumably the result of the disabled FLAML search above — confirm.
xgb_params = dict(
    colsample_bylevel=0.4192302659044743,
    colsample_bytree=0.7180716573534248,
    grow_policy='lossguide',
    learning_rate=0.013198731967539607,
    max_depth=0,
    max_leaves=140,
    min_child_weight=35.072279762160896,
    n_estimators=803,
    n_jobs=-1,
    reg_alpha=0.0009765625,
    reg_lambda=0.6342055478585719,
    subsample=0.827351358517848,
    tree_method='hist',
    seed=seed_num,
    use_label_encoder=False,
    verbosity=0,
)

# MultiOutputRegressor fits one XGBRegressor per target column.
xgboost = MultiOutputRegressor(XGBRegressor(**xgb_params))

xgboost.fit(train_x, train_y)

MultiOutputRegressor(estimator=XGBRegressor(colsample_bylevel=0.4192302659044743,
                                            colsample_bytree=0.7180716573534248,
                                            grow_policy='lossguide',
                                            learning_rate=0.013198731967539607,
                                            max_depth=0, max_leaves=140,
                                            min_child_weight=35.072279762160896,
                                            n_estimators=803, n_jobs=-1,
                                            reg_alpha=0.0009765625,
                                            reg_lambda=0.6342055478585719,
                                            seed=42,
                                            subsample=0.827351358517848,
                                            tree_method='hist',
                                            use_label_encoder=False,
                                            verbosity=0))

In [None]:
# Fixed random-forest hyper-parameters, gathered in one mapping for readability.
rf_params = dict(
    max_features=0.6347607006852164,
    max_leaf_nodes=1608,
    n_estimators=136,
    n_jobs=-1,
    random_state=seed_num,
)

# MultiOutputRegressor fits one RandomForestRegressor per target column.
rf = MultiOutputRegressor(RandomForestRegressor(**rf_params))

rf.fit(train_x, train_y)

MultiOutputRegressor(estimator=RandomForestRegressor(max_features=0.6347607006852164,
                                                     max_leaf_nodes=1608,
                                                     n_estimators=136,
                                                     n_jobs=-1,
                                                     random_state=42))

In [None]:
# Extra-Trees model (one ExtraTreesRegressor per target column).
# This cell has to run: the prediction cell further down calls `ext.predict(test_x)`
# and raises NameError on a fresh kernel when `ext` has not been fitted here.
ext = MultiOutputRegressor(ExtraTreesRegressor(max_features=0.5788805596313885, max_leaf_nodes=4210,
                    n_estimators=318, n_jobs=-1, random_state = seed_num))

ext.fit(train_x, train_y)

MultiOutputRegressor(estimator=ExtraTreesRegressor(max_features=0.5788805596313885,
                                                   max_leaf_nodes=4210,
                                                   n_estimators=318, n_jobs=-1,
                                                   random_state=42))

In [None]:
# stack_reg = MultiOutputRegressor(StackingCVRegressor(regressors=(lightgbm, xgboost, rf, ext),
#                                 meta_regressor = xgboost,
#                                 use_features_in_secondary=True))

# stack_reg.fit(train_x, train_y)

In [None]:
# Load the test table from the project's dataset folder and report its shape.
test_df = pd.read_csv('dataset/test.csv')

print(test_df.shape)
# Preview the first rows.
test_df.head()

(39608, 57)


Unnamed: 0,ID,X_01,X_02,X_03,X_04,X_05,X_06,X_07,X_08,X_09,...,X_47,X_48,X_49,X_50,X_51,X_52,X_53,X_54,X_55,X_56
0,TEST_00001,68.504,103.321,76.67,1,101.867,73.963,30.51,63.57,239.8,...,1,1,17227.63,138.130429,129.460682,141.50657,133.427229,129.711498,133.138096,121.859684
1,TEST_00002,67.485,103.32,69.37,1,101.992,67.845,28.03,116.99,189.23,...,1,1,17134.53,136.148839,128.266277,145.911745,131.196417,132.41148,133.629025,124.178623
2,TEST_00003,69.524,103.32,68.97,1,101.884,77.022,29.65,205.68,214.93,...,1,1,14860.83,120.447446,119.988804,132.099908,120.450155,130.051708,128.252972,114.475628
3,TEST_00004,69.524,103.32,65.87,1,101.866,73.963,28.15,103.38,180.8,...,1,1,15252.53,133.994695,125.06918,147.507669,123.142653,125.963665,139.666592,126.589253
4,TEST_00005,73.603,103.321,66.67,1,101.891,74.983,29.92,71.2,231.93,...,1,1,10752.23,137.918202,135.116192,138.600473,127.173033,137.252712,134.411335,124.020016


In [None]:
# Build the test matrix from exactly the feature columns, in the same order,
# that the models were trained on. Selecting by the training column list
# (instead of "everything except ID") fails loudly with a KeyError if a feature
# is missing, and cannot silently misalign features if the test file carries
# extra or reordered columns.
test_x = np.array(test_df[train_x_df.columns])

In [None]:
# Per-model predictions on the test features; these arrays are averaged in the blending cells below.
# NOTE(review): every estimator referenced here (including `ext`) must already have been
# fitted by its training cell above, otherwise this cell stops with a NameError.
lgb_p = lightgbm.predict(test_x)
xgb_p = xgboost.predict(test_x)
rf_p = rf.predict(test_x)
ext_p = ext.predict(test_x)

In [None]:
# Blend 1: equal-weight average of all four models (LightGBM, XGBoost, random forest, Extra-Trees).
prediction = (lgb_p + xgb_p  + rf_p + ext_p)/4
print(np.shape(prediction))

(39608, 14)


In [None]:
# Blend 2: equal-weight average of LightGBM, XGBoost and random forest.
prediction2 = (lgb_p + xgb_p  + rf_p)/3
print(np.shape(prediction2))

(39608, 14)


In [None]:
# Blend 3: equal-weight average of LightGBM and XGBoost only; this is the blend written to the submission below.
prediction3 = (lgb_p + xgb_p)/2
print(np.shape(prediction3))

(39608, 14)


In [None]:
# Load the sample submission; its columns define the layout of the file to be written.
submit = pd.read_csv('dataset/sample_submission.csv')

In [None]:
# Copy the LightGBM+XGBoost blend into the submission frame.
# Target columns are picked by name (everything except 'ID') and filled in a single
# assignment, so the mapping does not depend on 'ID' being the first column; the
# positional `idx-1` lookup would shift or wrap around if it were anywhere else.
target_cols = [col for col in submit.columns if col != 'ID']

# Fail early if the predictions do not line up with the submission template.
assert prediction3.shape == (len(submit), len(target_cols)), \
    "prediction3 does not match the submission template's shape"

submit[target_cols] = prediction3

In [None]:
# Write the submission to the working directory without the pandas row index.
submit.to_csv('flaml_xgb_lgb.csv', index=False)