In [1]:
import pandas as pd 
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.io as pio
import seaborn as sns
import numpy as np
pd.set_option('display.max_columns', 700)

In [6]:
df = pd.read_csv('TRAIN_1.csv')
df['order_date'] = pd.to_datetime(df['order_date'], format = '%m/%d/%y')
df['evsd'] = pd.to_datetime(df['evsd'], format = '%m/%d/%y')
df['tag'] = 'train'

print(df.shape)
df.head()

(259287, 12)


Unnamed: 0,LOCATION,d_id,isbn_id,order_id_1,order_date,evsd,row_group_ind,visibility,quantity_ordered,quantity_submitted,quantity_received,tag
0,JFK2,1,25556,3912,2022-09-30,2022-10-04,7,2,4,4,4,train
1,JFK2,1,25556,91302,2022-11-16,2022-11-19,4,1,20,20,20,train
2,JFK2,1,33083,26096,2022-10-07,2022-10-11,4,4,4,4,4,train
3,JFK2,1,33083,138855,2022-10-19,2022-10-22,5,2,4,4,4,train
4,JFK2,1,33083,114502,2022-10-18,2022-10-21,6,1,4,4,4,train


In [7]:
df_test = pd.read_csv('TEST_2.csv', encoding = 'latin1')
df_test.columns = ['LOCATION', 'd_id', 'isbn_id', 'order_id_1', 'order_date', 'evsd',
       'row_group_ind', 'visibility', 'quantity_ordered', 'quantity_submitted',
       'quantity_received']
df_test['order_date'] = pd.to_datetime(df_test['order_date'], format = '%m/%d/%y')
df_test['evsd'] = pd.to_datetime(df_test['evsd'], format = '%m/%d/%y')
df_test['tag'] = 'test'
print(df_test.shape)
df_test.head()

(98310, 12)


Unnamed: 0,LOCATION,d_id,isbn_id,order_id_1,order_date,evsd,row_group_ind,visibility,quantity_ordered,quantity_submitted,quantity_received,tag
0,JFK2,648.0,27561.0,146126.0,2022-12-19,2022-12-21,3.0,2.0,8.0,8.0,8,test
1,JFK2,584.0,23965.0,23086.0,2022-12-23,2022-12-26,2.0,1.0,0.0,6.0,0,test
2,JFK2,639.0,7929.0,67933.0,2022-12-14,2022-12-16,1.0,1.0,16.0,16.0,16,test
3,JFK2,410.0,18868.0,35651.0,2022-12-24,2022-12-31,0.0,6.0,10.0,10.0,10,test
4,JFK2,509.0,31141.0,65889.0,2022-12-20,2022-12-22,7.0,2.0,12.0,12.0,12,test


In [5]:
df = pd.concat([df_train, df_test]).reset_index(drop = True)
print(df.shape)
df.head()

(357597, 12)


Unnamed: 0,LOCATION,d_id,isbn_id,order_id_1,order_date,evsd,row_group_ind,visibility,quantity_ordered,quantity_submitted,quantity_received,tag
0,JFK2,1.0,25556.0,3912.0,2022-09-30,2022-10-04,7.0,2.0,4.0,4.0,4,train
1,JFK2,1.0,25556.0,91302.0,2022-11-16,2022-11-19,4.0,1.0,20.0,20.0,20,train
2,JFK2,1.0,33083.0,26096.0,2022-10-07,2022-10-11,4.0,4.0,4.0,4.0,4,train
3,JFK2,1.0,33083.0,138855.0,2022-10-19,2022-10-22,5.0,2.0,4.0,4.0,4,train
4,JFK2,1.0,33083.0,114502.0,2022-10-18,2022-10-21,6.0,1.0,4.0,4.0,4,train


In [6]:
df.isnull().sum()

LOCATION              1
d_id                  1
isbn_id               1
order_id_1            1
order_date            1
evsd                  1
row_group_ind         1
visibility            1
quantity_ordered      1
quantity_submitted    1
quantity_received     0
tag                   0
dtype: int64

In [7]:
df[df['LOCATION'].isnull()]

Unnamed: 0,LOCATION,d_id,isbn_id,order_id_1,order_date,evsd,row_group_ind,visibility,quantity_ordered,quantity_submitted,quantity_received,tag
357596,,,,,NaT,NaT,,,,,1369032,test


In [8]:
df = df[df['LOCATION'].notnull()].reset_index(drop = True)

In [9]:
# Unique Values in each column (Cardinality)
df.nunique()

LOCATION                 1
d_id                    38
isbn_id               9059
order_id_1            1414
order_date              79
evsd                    81
row_group_ind            8
visibility              25
quantity_ordered       205
quantity_submitted     205
quantity_received      231
tag                      1
dtype: int64

In [10]:
# Data Types
df.dtypes

LOCATION                      object
d_id                           int64
isbn_id                        int64
order_id_1                     int64
order_date            datetime64[ns]
evsd                  datetime64[ns]
row_group_ind                  int64
visibility                     int64
quantity_ordered               int64
quantity_submitted             int64
quantity_received              int64
tag                           object
dtype: object

### Observation:
- Change d_id, isbn_id and order_id_1 to string as they are not numerical variables.

In [11]:
for col in ['d_id', 'isbn_id', 'order_id_1']:
  df[col] = df[col].astype(int).astype(str)

print(df.shape)
df.head(2)

(259287, 12)


Unnamed: 0,LOCATION,d_id,isbn_id,order_id_1,order_date,evsd,row_group_ind,visibility,quantity_ordered,quantity_submitted,quantity_received,tag
0,JFK2,1,25556,3912,2022-09-30,2022-10-04,7,2,4,4,4,train
1,JFK2,1,25556,91302,2022-11-16,2022-11-19,4,1,20,20,20,train


# EDA

In [12]:
df['order_date'].describe()

count                           259287
mean     2022-10-26 14:10:23.540709376
min                2022-09-05 00:00:00
25%                2022-10-09 00:00:00
50%                2022-10-26 00:00:00
75%                2022-11-14 00:00:00
max                2022-11-30 00:00:00
Name: order_date, dtype: object

In [13]:

df['evsd'].describe()

count                           259287
mean     2022-10-30 17:27:44.899513088
min                2022-09-27 00:00:00
25%                2022-10-13 00:00:00
50%                2022-10-31 00:00:00
75%                2022-11-17 00:00:00
max                2022-12-20 00:00:00
Name: evsd, dtype: object

### Observation:
- Data is for 6M orders where order-date from 5th sept 2022 till 31st Dec 2022. Whereas deliver date ranges from 27th Sept 2022.

In [14]:
# How long does order usually take to deliver?
df['order_delivery_days'] = (df['evsd'] - df['order_date']).dt.days

In [15]:
# fig = px.histogram(df, x='d_id')
# fig.show();

In [16]:
# fig = px.box(df, x='d_id', y='order_delivery_days', color_discrete_sequence=['#1f77b4'])
# fig.show();


In [17]:

# fig = px.bar(df, x='d_id', y='order_delivery_days')
# fig.show();

In [18]:
# pio.write_html(fig, file='BarPlot_d_id_Delivery_days.html', auto_open=True)

In [19]:
df.columns

Index(['LOCATION', 'd_id', 'isbn_id', 'order_id_1', 'order_date', 'evsd',
       'row_group_ind', 'visibility', 'quantity_ordered', 'quantity_submitted',
       'quantity_received', 'tag', 'order_delivery_days'],
      dtype='object')

In [20]:
df['order_year'] = df['order_date'].dt.year
df['order_month'] = df['order_date'].dt.month
df['order_day'] = df['order_date'].dt.day
df['order_dayofweek'] = df['order_date'].dt.dayofweek
df['order_dayofyear'] = df['order_date'].dt.dayofyear

df['delivered_year'] = df['evsd'].dt.year
df['delivered_month'] = df['evsd'].dt.month
df['delivered_day'] = df['evsd'].dt.day
df['delivered_dayofweek'] = df['evsd'].dt.dayofweek
df['delivered_dayofyear'] = df['evsd'].dt.dayofyear


In [21]:
df['order_month_sin'] = np.sin(2 * np.pi * df['order_month'] / 12)
df['order_month_cos'] = np.cos(2 * np.pi * df['order_month'] / 12)

df['order_dayofweek_sin'] = np.sin(2 * np.pi * df['order_dayofweek'] / 7)
df['order_dayofweek_cos'] = np.cos(2 * np.pi * df['order_dayofweek'] / 7)

df['delivered_month_sin'] = np.sin(2 * np.pi * df['delivered_month'] / 12)
df['delivered_month_cos'] = np.cos(2 * np.pi * df['delivered_month'] / 12)

df['delivered_dayofweek_sin'] = np.sin(2 * np.pi * df['delivered_dayofweek'] / 7)
df['delivered_dayofweek_cos'] = np.cos(2 * np.pi * df['delivered_dayofweek'] / 7)

df.head()

Unnamed: 0,LOCATION,d_id,isbn_id,order_id_1,order_date,evsd,row_group_ind,visibility,quantity_ordered,quantity_submitted,quantity_received,tag,order_delivery_days,order_year,order_month,order_day,order_dayofweek,order_dayofyear,delivered_year,delivered_month,delivered_day,delivered_dayofweek,delivered_dayofyear,order_month_sin,order_month_cos,order_dayofweek_sin,order_dayofweek_cos,delivered_month_sin,delivered_month_cos,delivered_dayofweek_sin,delivered_dayofweek_cos
0,JFK2,1,25556,3912,2022-09-30,2022-10-04,7.0,2.0,4.0,4.0,4,train,4,2022,9,30,4,273,2022,10,4,1,277,-1.0,-1.83697e-16,-0.433884,-0.900969,-0.866025,0.5,0.781831,0.62349
1,JFK2,1,25556,91302,2022-11-16,2022-11-19,4.0,1.0,20.0,20.0,20,train,3,2022,11,16,2,320,2022,11,19,5,323,-0.5,0.8660254,0.974928,-0.222521,-0.5,0.866025,-0.974928,-0.222521
2,JFK2,1,33083,26096,2022-10-07,2022-10-11,4.0,4.0,4.0,4.0,4,train,4,2022,10,7,4,280,2022,10,11,1,284,-0.866025,0.5,-0.433884,-0.900969,-0.866025,0.5,0.781831,0.62349
3,JFK2,1,33083,138855,2022-10-19,2022-10-22,5.0,2.0,4.0,4.0,4,train,3,2022,10,19,2,292,2022,10,22,5,295,-0.866025,0.5,0.974928,-0.222521,-0.866025,0.5,-0.974928,-0.222521
4,JFK2,1,33083,114502,2022-10-18,2022-10-21,6.0,1.0,4.0,4.0,4,train,3,2022,10,18,1,291,2022,10,21,4,294,-0.866025,0.5,0.781831,0.62349,-0.866025,0.5,-0.433884,-0.900969


In [22]:
# As categorical variable
df['order_month_season'] = df['order_date'].dt.month.map({1: 'winter', 2: 'winter', 3: 'spring', 4: 'spring', 5: 'spring', 6: 'summer', 7: 'summer', 8: 'summer', 9: 'fall', 10: 'fall', 11: 'fall', 12: 'winter'})
df['delivered_month_season'] = df['evsd'].dt.month.map({1: 'winter', 2: 'winter', 3: 'spring', 4: 'spring', 5: 'spring', 6: 'summer', 7: 'summer', 8: 'summer', 9: 'fall', 10: 'fall', 11: 'fall', 12: 'winter'})

# As continuous variable using sine and cosine transformations
df['order_month_sin'] = np.sin(2 * np.pi * df['order_month'] / 12)
df['order_month_cos'] = np.cos(2 * np.pi * df['order_month'] / 12)
df['delivered_month_sin'] = np.sin(2 * np.pi * df['delivered_month'] / 12)
df['delivered_month_cos'] = np.cos(2 * np.pi * df['delivered_month'] / 12)
df.head()

Unnamed: 0,LOCATION,d_id,isbn_id,order_id_1,order_date,evsd,row_group_ind,visibility,quantity_ordered,quantity_submitted,quantity_received,tag,order_delivery_days,order_year,order_month,order_day,order_dayofweek,order_dayofyear,delivered_year,delivered_month,delivered_day,delivered_dayofweek,delivered_dayofyear,order_month_sin,order_month_cos,order_dayofweek_sin,order_dayofweek_cos,delivered_month_sin,delivered_month_cos,delivered_dayofweek_sin,delivered_dayofweek_cos,order_month_season,delivered_month_season
0,JFK2,1,25556,3912,2022-09-30,2022-10-04,7.0,2.0,4.0,4.0,4,train,4,2022,9,30,4,273,2022,10,4,1,277,-1.0,-1.83697e-16,-0.433884,-0.900969,-0.866025,0.5,0.781831,0.62349,fall,fall
1,JFK2,1,25556,91302,2022-11-16,2022-11-19,4.0,1.0,20.0,20.0,20,train,3,2022,11,16,2,320,2022,11,19,5,323,-0.5,0.8660254,0.974928,-0.222521,-0.5,0.866025,-0.974928,-0.222521,fall,fall
2,JFK2,1,33083,26096,2022-10-07,2022-10-11,4.0,4.0,4.0,4.0,4,train,4,2022,10,7,4,280,2022,10,11,1,284,-0.866025,0.5,-0.433884,-0.900969,-0.866025,0.5,0.781831,0.62349,fall,fall
3,JFK2,1,33083,138855,2022-10-19,2022-10-22,5.0,2.0,4.0,4.0,4,train,3,2022,10,19,2,292,2022,10,22,5,295,-0.866025,0.5,0.974928,-0.222521,-0.866025,0.5,-0.974928,-0.222521,fall,fall
4,JFK2,1,33083,114502,2022-10-18,2022-10-21,6.0,1.0,4.0,4.0,4,train,3,2022,10,18,1,291,2022,10,21,4,294,-0.866025,0.5,0.781831,0.62349,-0.866025,0.5,-0.433884,-0.900969,fall,fall


In [23]:
df['order_weekday'] = (df['order_dayofweek'] < 5).astype(int)  # 0 for weekend, 1 for weekday
df['delivered_weekday'] = (df['delivered_dayofweek'] < 5).astype(int)


In [15]:
df.shape

(259287, 12)

In [16]:
df.head(1)

Unnamed: 0,LOCATION,d_id,isbn_id,order_id_1,order_date,evsd,row_group_ind,visibility,quantity_ordered,quantity_submitted,quantity_received,tag
0,JFK2,1,25556,3912,2022-09-30,2022-10-04,7,2,4,4,4,train


In [54]:
(df['d_id'] +'__' + df['isbn_id']).value_counts(dropna = False, normalize = True).head(50)
# Remaining you put as unknown


540__4595     0.001158
524__26550    0.001124
410__24163    0.001119
540__373      0.001116
540__14820    0.001116
410__29416    0.001116
524__7865     0.001102
410__10204    0.001096
524__29771    0.001077
410__19394    0.001077
410__1528     0.001077
524__8453     0.001071
410__28311    0.001057
410__2081     0.001054
410__28446    0.001054
410__13646    0.001054
524__27825    0.001051
524__21625    0.001049
410__26105    0.001043
540__31986    0.001032
524__6790     0.001026
524__19934    0.001024
410__32365    0.001018
410__8551     0.001018
410__17481    0.001012
410__30418    0.000998
410__5310     0.000998
410__31291    0.000979
410__12868    0.000976
410__28595    0.000973
410__1108     0.000970
410__6998     0.000956
410__4002     0.000954
540__29918    0.000948
540__24090    0.000948
524__30821    0.000948
540__15727    0.000948
540__9338     0.000948
540__2044     0.000942
540__15592    0.000937
410__30967    0.000937
410__20003    0.000937
524__20309    0.000937
540__23917 

In [53]:
df['isbn_id'].value_counts(dropna = False, normalize = True).head(10)

isbn_id
4595     0.001158
26550    0.001124
24163    0.001119
373      0.001116
14820    0.001116
29416    0.001116
7865     0.001102
10204    0.001096
29771    0.001077
19394    0.001077
Name: proportion, dtype: float64

In [51]:
df['d_id'].value_counts(dropna = False, normalize = True)

d_id
509    0.382574
584    0.123184
540    0.094204
648    0.060823
7      0.047584
410    0.045951
1      0.042819
643    0.021091
684    0.019609
454    0.019226
110    0.017995
639    0.015613
174    0.014419
524    0.014391
652    0.013622
18     0.008901
136    0.008266
393    0.007525
566    0.007192
472    0.006689
763    0.006502
343    0.005271
218    0.004899
711    0.003591
13     0.001913
302    0.001326
373    0.001270
830    0.001054
440    0.000453
79     0.000419
613    0.000411
216    0.000313
845    0.000277
701    0.000235
127    0.000204
746    0.000067
803    0.000056
462    0.000039
210    0.000017
241    0.000006
Name: proportion, dtype: float64

In [25]:
df.nunique()


LOCATION                      1
d_id                         40
isbn_id                    9492
order_id_1                 1993
order_date                  110
evsd                        106
row_group_ind                 8
visibility                   25
quantity_ordered            218
quantity_submitted          230
quantity_received           264
tag                           2
order_delivery_days          23
order_year                    1
order_month                   4
order_day                    31
order_dayofweek               7
order_dayofyear             110
delivered_year                2
delivered_month               5
delivered_day                31
delivered_dayofweek           7
delivered_dayofyear         106
order_month_sin               4
order_month_cos               4
order_dayofweek_sin           7
order_dayofweek_cos           7
delivered_month_sin           5
delivered_month_cos           5
delivered_dayofweek_sin       7
delivered_dayofweek_cos       7
order_mo

In [26]:
# Create a new column that combines the values of the existing columns
df['unique_id'] = df.apply(lambda row: f"{row['LOCATION']}_{row['d_id']}_{row['isbn_id']}_{row['row_group_ind']}_{row['order_id_1']}", axis=1)

# Drop duplicates based on the new column
df_unique = df.drop_duplicates(subset=['unique_id'])

df_unique.shape, df.shape

((305203, 36), (357596, 36))

In [27]:
print(df_unique.shape)
df_unique.head()

(305203, 36)


Unnamed: 0,LOCATION,d_id,isbn_id,order_id_1,order_date,evsd,row_group_ind,visibility,quantity_ordered,quantity_submitted,quantity_received,tag,order_delivery_days,order_year,order_month,order_day,order_dayofweek,order_dayofyear,delivered_year,delivered_month,delivered_day,delivered_dayofweek,delivered_dayofyear,order_month_sin,order_month_cos,order_dayofweek_sin,order_dayofweek_cos,delivered_month_sin,delivered_month_cos,delivered_dayofweek_sin,delivered_dayofweek_cos,order_month_season,delivered_month_season,order_weekday,delivered_weekday,unique_id
0,JFK2,1,25556,3912,2022-09-30,2022-10-04,7.0,2.0,4.0,4.0,4,train,4,2022,9,30,4,273,2022,10,4,1,277,-1.0,-1.83697e-16,-0.433884,-0.900969,-0.866025,0.5,0.781831,0.62349,fall,fall,1,1,JFK2_1_25556_7.0_3912
1,JFK2,1,25556,91302,2022-11-16,2022-11-19,4.0,1.0,20.0,20.0,20,train,3,2022,11,16,2,320,2022,11,19,5,323,-0.5,0.8660254,0.974928,-0.222521,-0.5,0.866025,-0.974928,-0.222521,fall,fall,1,0,JFK2_1_25556_4.0_91302
2,JFK2,1,33083,26096,2022-10-07,2022-10-11,4.0,4.0,4.0,4.0,4,train,4,2022,10,7,4,280,2022,10,11,1,284,-0.866025,0.5,-0.433884,-0.900969,-0.866025,0.5,0.781831,0.62349,fall,fall,1,1,JFK2_1_33083_4.0_26096
3,JFK2,1,33083,138855,2022-10-19,2022-10-22,5.0,2.0,4.0,4.0,4,train,3,2022,10,19,2,292,2022,10,22,5,295,-0.866025,0.5,0.974928,-0.222521,-0.866025,0.5,-0.974928,-0.222521,fall,fall,1,0,JFK2_1_33083_5.0_138855
4,JFK2,1,33083,114502,2022-10-18,2022-10-21,6.0,1.0,4.0,4.0,4,train,3,2022,10,18,1,291,2022,10,21,4,294,-0.866025,0.5,0.781831,0.62349,-0.866025,0.5,-0.433884,-0.900969,fall,fall,1,1,JFK2_1_33083_6.0_114502


In [28]:
df.columns

Index(['LOCATION', 'd_id', 'isbn_id', 'order_id_1', 'order_date', 'evsd',
       'row_group_ind', 'visibility', 'quantity_ordered', 'quantity_submitted',
       'quantity_received', 'tag', 'order_delivery_days', 'order_year',
       'order_month', 'order_day', 'order_dayofweek', 'order_dayofyear',
       'delivered_year', 'delivered_month', 'delivered_day',
       'delivered_dayofweek', 'delivered_dayofyear', 'order_month_sin',
       'order_month_cos', 'order_dayofweek_sin', 'order_dayofweek_cos',
       'delivered_month_sin', 'delivered_month_cos', 'delivered_dayofweek_sin',
       'delivered_dayofweek_cos', 'order_month_season',
       'delivered_month_season', 'order_weekday', 'delivered_weekday',
       'unique_id'],
      dtype='object')

In [29]:
df[['d_id', 'isbn_id', 'order_id_1', 'order_date', 'evsd', 'row_group_ind']].dtypes

d_id                     object
isbn_id                  object
order_id_1               object
order_date       datetime64[ns]
evsd             datetime64[ns]
row_group_ind           float64
dtype: object

In [30]:
df['row_group_ind'] = df['row_group_ind'].astype(str)

In [31]:
df.columns

Index(['LOCATION', 'd_id', 'isbn_id', 'order_id_1', 'order_date', 'evsd',
       'row_group_ind', 'visibility', 'quantity_ordered', 'quantity_submitted',
       'quantity_received', 'tag', 'order_delivery_days', 'order_year',
       'order_month', 'order_day', 'order_dayofweek', 'order_dayofyear',
       'delivered_year', 'delivered_month', 'delivered_day',
       'delivered_dayofweek', 'delivered_dayofyear', 'order_month_sin',
       'order_month_cos', 'order_dayofweek_sin', 'order_dayofweek_cos',
       'delivered_month_sin', 'delivered_month_cos', 'delivered_dayofweek_sin',
       'delivered_dayofweek_cos', 'order_month_season',
       'delivered_month_season', 'order_weekday', 'delivered_weekday',
       'unique_id'],
      dtype='object')

In [32]:
for col in ['d_id', 'isbn_id', 'order_id_1', 'order_date', 'evsd', 'row_group_ind', 'order_month_season', 'delivered_month_season']:
    temp_dict = dict(df[col].value_counts(normalize = True))
    df[f'{col}_VC'] = df[col].map(temp_dict)
    del temp_dict

In [33]:
req_cols = [col for col in df.columns if col not in ['LOCATION', 'order_date', 'evsd', 'd_id', 
                                                     'isbn_id', 'order_id_1','quantity_received', 'tag',
                                                      'order_date', 'evsd', 'row_group_ind', 'unique_id','target', 
                                                      'delivered_year', 'order_year',  'order_month_season', 'delivered_month_season']]

In [34]:
df[req_cols].head(1)

Unnamed: 0,visibility,quantity_ordered,quantity_submitted,order_delivery_days,order_month,order_day,order_dayofweek,order_dayofyear,delivered_month,delivered_day,delivered_dayofweek,delivered_dayofyear,order_month_sin,order_month_cos,order_dayofweek_sin,order_dayofweek_cos,delivered_month_sin,delivered_month_cos,delivered_dayofweek_sin,delivered_dayofweek_cos,order_weekday,delivered_weekday,d_id_VC,isbn_id_VC,order_id_1_VC,order_date_VC,evsd_VC,row_group_ind_VC,order_month_season_VC,delivered_month_season_VC
0,2.0,4.0,4.0,4,9,30,4,273,10,4,1,277,-1.0,-1.83697e-16,-0.433884,-0.900969,-0.866025,0.5,0.781831,0.62349,1,1,0.042819,0.000506,0.000895,0.009318,0.013834,0.124056,0.725084,0.680735


In [35]:
df['order_month_season'].value_counts(dropna = False)

order_month_season
fall      259287
winter     98309
Name: count, dtype: int64

In [36]:
df.head()

Unnamed: 0,LOCATION,d_id,isbn_id,order_id_1,order_date,evsd,row_group_ind,visibility,quantity_ordered,quantity_submitted,quantity_received,tag,order_delivery_days,order_year,order_month,order_day,order_dayofweek,order_dayofyear,delivered_year,delivered_month,delivered_day,delivered_dayofweek,delivered_dayofyear,order_month_sin,order_month_cos,order_dayofweek_sin,order_dayofweek_cos,delivered_month_sin,delivered_month_cos,delivered_dayofweek_sin,delivered_dayofweek_cos,order_month_season,delivered_month_season,order_weekday,delivered_weekday,unique_id,d_id_VC,isbn_id_VC,order_id_1_VC,order_date_VC,evsd_VC,row_group_ind_VC,order_month_season_VC,delivered_month_season_VC
0,JFK2,1,25556,3912,2022-09-30,2022-10-04,7.0,2.0,4.0,4.0,4,train,4,2022,9,30,4,273,2022,10,4,1,277,-1.0,-1.83697e-16,-0.433884,-0.900969,-0.866025,0.5,0.781831,0.62349,fall,fall,1,1,JFK2_1_25556_7.0_3912,0.042819,0.000506,0.000895,0.009318,0.013834,0.124056,0.725084,0.680735
1,JFK2,1,25556,91302,2022-11-16,2022-11-19,4.0,1.0,20.0,20.0,20,train,3,2022,11,16,2,320,2022,11,19,5,323,-0.5,0.8660254,0.974928,-0.222521,-0.5,0.866025,-0.974928,-0.222521,fall,fall,1,0,JFK2_1_25556_4.0_91302,0.042819,0.000506,0.000604,0.009019,0.011457,0.12546,0.725084,0.680735
2,JFK2,1,33083,26096,2022-10-07,2022-10-11,4.0,4.0,4.0,4.0,4,train,4,2022,10,7,4,280,2022,10,11,1,284,-0.866025,0.5,-0.433884,-0.900969,-0.866025,0.5,0.781831,0.62349,fall,fall,1,1,JFK2_1_33083_4.0_26096,0.042819,0.00035,0.00094,0.008739,0.014061,0.12546,0.725084,0.680735
3,JFK2,1,33083,138855,2022-10-19,2022-10-22,5.0,2.0,4.0,4.0,4,train,3,2022,10,19,2,292,2022,10,22,5,295,-0.866025,0.5,0.974928,-0.222521,-0.866025,0.5,-0.974928,-0.222521,fall,fall,1,0,JFK2_1_33083_5.0_138855,0.042819,0.00035,0.000738,0.010571,0.01198,0.125169,0.725084,0.680735
4,JFK2,1,33083,114502,2022-10-18,2022-10-21,6.0,1.0,4.0,4.0,4,train,3,2022,10,18,1,291,2022,10,21,4,294,-0.866025,0.5,0.781831,0.62349,-0.866025,0.5,-0.433884,-0.900969,fall,fall,1,1,JFK2_1_33083_6.0_114502,0.042819,0.00035,0.000721,0.0121,0.013048,0.124764,0.725084,0.680735


In [78]:
from sklearn.preprocessing import QuantileTransformer

# Instantiate the transformer
qt = QuantileTransformer(output_distribution='uniform')



In [88]:

from sklearn.model_selection import train_test_split
from sklearn.linear_model import PoissonRegressor, LinearRegression
from sklearn.metrics import mean_absolute_error, r2_score
import xgboost as xgb

# Split data into features and target
X = df[req_cols]
y = df['quantity_received']

# Split data into train and test sets
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train = df[df['tag'] == 'train'][req_cols].copy()
X_test = df[df['tag'] == 'test'][req_cols].copy()
y_train = df[df['tag'] == 'train']['quantity_received']
y_test = df[df['tag'] == 'test']['quantity_received']


X_train = qt.fit_transform(X_train)
X_test = qt.transform(X_test)

# Train Poisson regression model
# Define XGBoost model parameters
params = {'learning_rate': 0.09494445071749025, 'reg_lambda': 0.12153018654604152, 'reg_alpha': 0.002926293135127233, 
          'subsample': 0.4759168036681104, 'colsample_bytree': 0.23668139727995788, 'max_depth': 5, 
          'n_estimators': 926}


# {
#     'objective': 'reg:squarederror',
#     'learning_rate': 0.1,
#     'max_depth': 6,
#     'min_child_weight': 1,
#     'gamma': 0,
#     'subsample': 0.8,
#     'colsample_bytree': 0.8,
#     'n_estimators': 100
# }
model = xgb.XGBRegressor(**params, 
            random_state=42,
            objective='reg:squarederror',
            eval_metric='mae')
model.fit(X_train, y_train)

# Predict on test set
y_pred = model.predict(X_test)

# Evaluate model performance on test set
mae = mean_absolute_error(y_test, y_pred)
score = r2_score(y_test, y_pred)
print(f'Mean absolute error: {mae:.4f}')
print(f'Score: {score:.4f}')


Mean absolute error: 4.3982
Score: 0.6886


## Score all Data

In [89]:
df_scored = df.copy()
print(df_scored.shape)
df_scored.head()

(305203, 44)


Unnamed: 0,LOCATION,d_id,isbn_id,order_id_1,order_date,evsd,row_group_ind,visibility,quantity_ordered,quantity_submitted,quantity_received,tag,order_delivery_days,order_year,order_month,order_day,order_dayofweek,order_dayofyear,delivered_year,delivered_month,delivered_day,delivered_dayofweek,delivered_dayofyear,order_month_sin,order_month_cos,order_dayofweek_sin,order_dayofweek_cos,delivered_month_sin,delivered_month_cos,delivered_dayofweek_sin,delivered_dayofweek_cos,order_month_season,delivered_month_season,order_weekday,delivered_weekday,unique_id,d_id_VC,isbn_id_VC,order_id_1_VC,order_date_VC,evsd_VC,row_group_ind_VC,order_month_season_VC,delivered_month_season_VC
0,JFK2,1,25556,3912,2022-09-30,2022-10-04,7.0,2.0,4.0,4.0,4,train,4,2022,9,30,4,273,2022,10,4,1,277,-1.0,-1.83697e-16,-0.433884,-0.900969,-0.866025,0.5,0.781831,0.62349,fall,fall,1,1,JFK2_1_25556_7.0_3912,0.043394,0.000524,0.000888,0.009174,0.014128,0.124416,0.72227,0.68366
1,JFK2,1,25556,91302,2022-11-16,2022-11-19,4.0,1.0,20.0,20.0,20,train,3,2022,11,16,2,320,2022,11,19,5,323,-0.5,0.8660254,0.974928,-0.222521,-0.5,0.866025,-0.974928,-0.222521,fall,fall,1,0,JFK2_1_25556_4.0_91302,0.043394,0.000524,0.000636,0.008424,0.01192,0.125425,0.72227,0.68366
2,JFK2,1,33083,26096,2022-10-07,2022-10-11,4.0,4.0,4.0,4.0,4,train,4,2022,10,7,4,280,2022,10,11,1,284,-0.866025,0.5,-0.433884,-0.900969,-0.866025,0.5,0.781831,0.62349,fall,fall,1,1,JFK2_1_33083_4.0_26096,0.043394,0.000344,0.000914,0.008598,0.01463,0.125425,0.72227,0.68366
3,JFK2,1,33083,138855,2022-10-19,2022-10-22,5.0,2.0,4.0,4.0,4,train,3,2022,10,19,2,292,2022,10,22,5,295,-0.866025,0.5,0.974928,-0.222521,-0.866025,0.5,-0.974928,-0.222521,fall,fall,1,0,JFK2_1_33083_5.0_138855,0.043394,0.000344,0.000763,0.009852,0.012549,0.124933,0.72227,0.68366
4,JFK2,1,33083,114502,2022-10-18,2022-10-21,6.0,1.0,4.0,4.0,4,train,3,2022,10,18,1,291,2022,10,21,4,294,-0.866025,0.5,0.781831,0.62349,-0.866025,0.5,-0.433884,-0.900969,fall,fall,1,1,JFK2_1_33083_6.0_114502,0.043394,0.000344,0.000754,0.012706,0.012988,0.124766,0.72227,0.68366


In [40]:
# df['order_year'].value_counts(dropna = False)


"""
MAE : < 2.5
"""


[['year', 'month', 'day', 'delivered_dayofyear', ]]

# [['order_dayofweek']]

order_year
2022    357596
Name: count, dtype: int64

In [39]:
dict(df['row_group_ind'].value_counts(dropna= False, normalize = True))

{'1.0': 0.12554111343527333,
 '4.0': 0.12546001633127887,
 '2.0': 0.12540129084217944,
 '5.0': 0.1251691853376436,
 '3.0': 0.1249678408035884,
 '6.0': 0.12476369981767134,
 '0.0': 0.12464065593574872,
 '7.0': 0.1240561974966163}

In [None]:
df_new['row_group_ind'].map()

In [90]:
df_scored[req_cols]  = qt.transform(df_scored[req_cols])
df_scored['predicted_orders'] = model.predict(df_scored[req_cols])
df_scored[['quantity_received', 'predicted_orders']]

Unnamed: 0,quantity_received,predicted_orders
0,4,3.831436
1,20,19.770668
2,4,2.959369
3,4,3.552727
4,4,3.674586
...,...,...
357590,24,21.808784
357591,8,7.168389
357592,12,10.871333
357593,132,101.021225


In [91]:
df_scored['quantity_received'].describe()

count    305203.000000
mean         12.700278
std          27.046207
min           0.000000
25%           0.000000
50%           6.000000
75%          12.000000
max        1650.000000
Name: quantity_received, dtype: float64

In [92]:
df_scored['predicted_orders'].describe()

count    305203.000000
mean         12.463552
std          24.910221
min         -30.389456
25%           2.961089
50%           6.903901
75%          11.111802
max         773.371277
Name: predicted_orders, dtype: float64

## Hypertuning


In [96]:
"""Uncomment for tuning the model"""
import optuna
def run(trial):
    learning_rate = trial.suggest_float("learning_rate", 1e-3, 0.1)
    reg_lambda = trial.suggest_loguniform("reg_lambda", 1e-8, 100.0)
    reg_alpha = trial.suggest_loguniform("reg_alpha", 1e-8, 100.0)
    subsample = trial.suggest_float("subsample", 0.1, 1.0)
    colsample_bytree = trial.suggest_float("colsample_bytree", 0.1, 1.0)
    max_depth = trial.suggest_int("max_depth", 3, 6)
    n_estimators = trial.suggest_int("n_estimators", 20, 3000)


    model = xgb.XGBRegressor(
            random_state=42,
            objective='reg:squarederror',
            n_estimators=n_estimators,
            learning_rate=learning_rate,
            reg_lambda=reg_lambda,
            reg_alpha=reg_alpha,
            subsample=subsample,
            colsample_bytree=colsample_bytree,
            max_depth=max_depth,
            eval_metric='mae'
        )
    model.fit(X_train, y_train)
    preds_valid = model.predict(X_test)
    mae_ = mean_absolute_error(y_test, preds_valid)
    r2_sc = r2_score(y_test, preds_valid)
    print("Mean Absolute Error : ", mae_)
    print("R2_score : ", r2_sc)
    return mae_

study = optuna.create_study(direction="minimize")
study.optimize(run, n_trials=20)

# study.best_params

[32m[I 2023-04-28 08:27:37,198][0m A new study created in memory with name: no-name-6633f2c9-b973-4c05-879d-a6a88d4adc0f[0m
  reg_lambda = trial.suggest_loguniform("reg_lambda", 1e-8, 100.0)
  reg_alpha = trial.suggest_loguniform("reg_alpha", 1e-8, 100.0)
[32m[I 2023-04-28 08:30:51,858][0m Trial 0 finished with value: 5.30203999121291 and parameters: {'learning_rate': 0.03309359543919969, 'reg_lambda': 0.2159147341660498, 'reg_alpha': 0.06916918029376977, 'subsample': 0.5038536182347256, 'colsample_bytree': 0.3266281397228815, 'max_depth': 5, 'n_estimators': 2820}. Best is trial 0 with value: 5.30203999121291.[0m


Mean Absolute Error :  5.30203999121291
R2_score :  0.7478631424634002


  reg_lambda = trial.suggest_loguniform("reg_lambda", 1e-8, 100.0)
  reg_alpha = trial.suggest_loguniform("reg_alpha", 1e-8, 100.0)
[32m[I 2023-04-28 08:31:44,537][0m Trial 1 finished with value: 6.824564105550033 and parameters: {'learning_rate': 0.09494445071749025, 'reg_lambda': 0.12153018654604152, 'reg_alpha': 0.002926293135127233, 'subsample': 0.4759168036681104, 'colsample_bytree': 0.23668139727995788, 'max_depth': 5, 'n_estimators': 926}. Best is trial 0 with value: 5.30203999121291.[0m


Mean Absolute Error :  6.824564105550033
R2_score :  0.7252528819371045


  reg_lambda = trial.suggest_loguniform("reg_lambda", 1e-8, 100.0)
  reg_alpha = trial.suggest_loguniform("reg_alpha", 1e-8, 100.0)
[32m[I 2023-04-28 08:32:14,441][0m Trial 2 finished with value: 5.085238951615853 and parameters: {'learning_rate': 0.08539001075999679, 'reg_lambda': 0.0017116086819376096, 'reg_alpha': 9.733517229448778e-07, 'subsample': 0.8486752491235564, 'colsample_bytree': 0.765683289497556, 'max_depth': 6, 'n_estimators': 302}. Best is trial 2 with value: 5.085238951615853.[0m


Mean Absolute Error :  5.085238951615853
R2_score :  0.6582640974257273


  reg_lambda = trial.suggest_loguniform("reg_lambda", 1e-8, 100.0)
  reg_alpha = trial.suggest_loguniform("reg_alpha", 1e-8, 100.0)
[32m[I 2023-04-28 08:34:36,542][0m Trial 3 finished with value: 4.316466609593959 and parameters: {'learning_rate': 0.06002410622268421, 'reg_lambda': 0.9102659582342552, 'reg_alpha': 0.00019889251192707353, 'subsample': 0.3447540708103442, 'colsample_bytree': 0.644771781552105, 'max_depth': 3, 'n_estimators': 2362}. Best is trial 3 with value: 4.316466609593959.[0m


Mean Absolute Error :  4.316466609593959
R2_score :  0.7345140742866197


  reg_lambda = trial.suggest_loguniform("reg_lambda", 1e-8, 100.0)
  reg_alpha = trial.suggest_loguniform("reg_alpha", 1e-8, 100.0)
[32m[I 2023-04-28 08:36:29,075][0m Trial 4 finished with value: 4.014516335495757 and parameters: {'learning_rate': 0.029709652164319054, 'reg_lambda': 6.140086141171898, 'reg_alpha': 0.019604127340872607, 'subsample': 0.25246008848760754, 'colsample_bytree': 0.7080422852270917, 'max_depth': 4, 'n_estimators': 1433}. Best is trial 4 with value: 4.014516335495757.[0m


Mean Absolute Error :  4.014516335495757
R2_score :  0.7506425492580595


  reg_lambda = trial.suggest_loguniform("reg_lambda", 1e-8, 100.0)
  reg_alpha = trial.suggest_loguniform("reg_alpha", 1e-8, 100.0)
[32m[I 2023-04-28 08:41:06,670][0m Trial 5 finished with value: 4.8431481285810944 and parameters: {'learning_rate': 0.06534568249655477, 'reg_lambda': 1.3336415440754149e-05, 'reg_alpha': 0.000345415372188516, 'subsample': 0.844978725545569, 'colsample_bytree': 0.44989422418684, 'max_depth': 6, 'n_estimators': 2945}. Best is trial 4 with value: 4.014516335495757.[0m


Mean Absolute Error :  4.8431481285810944
R2_score :  0.7186509266666582


  reg_lambda = trial.suggest_loguniform("reg_lambda", 1e-8, 100.0)
  reg_alpha = trial.suggest_loguniform("reg_alpha", 1e-8, 100.0)
[32m[I 2023-04-28 08:43:14,007][0m Trial 6 finished with value: 4.817154102344354 and parameters: {'learning_rate': 0.05297263382164796, 'reg_lambda': 0.0009063555368964637, 'reg_alpha': 1.2798673223114063e-08, 'subsample': 0.35112507992555586, 'colsample_bytree': 0.6116591007571499, 'max_depth': 3, 'n_estimators': 1879}. Best is trial 4 with value: 4.014516335495757.[0m


Mean Absolute Error :  4.817154102344354
R2_score :  0.7055969828412461


  reg_lambda = trial.suggest_loguniform("reg_lambda", 1e-8, 100.0)
  reg_alpha = trial.suggest_loguniform("reg_alpha", 1e-8, 100.0)
[32m[I 2023-04-28 08:47:33,274][0m Trial 7 finished with value: 4.788554937946665 and parameters: {'learning_rate': 0.017752000591089934, 'reg_lambda': 2.710613584367079e-06, 'reg_alpha': 2.4821573039019524, 'subsample': 0.29862823089860413, 'colsample_bytree': 0.3345167048863346, 'max_depth': 6, 'n_estimators': 2080}. Best is trial 4 with value: 4.014516335495757.[0m


Mean Absolute Error :  4.788554937946665
R2_score :  0.7591261342754274


  reg_lambda = trial.suggest_loguniform("reg_lambda", 1e-8, 100.0)
  reg_alpha = trial.suggest_loguniform("reg_alpha", 1e-8, 100.0)
[32m[I 2023-04-28 08:49:55,215][0m Trial 8 finished with value: 4.065070158728204 and parameters: {'learning_rate': 0.029324035728187696, 'reg_lambda': 0.6438178323500426, 'reg_alpha': 0.0821740082898111, 'subsample': 0.14134871330258492, 'colsample_bytree': 0.8295613348742853, 'max_depth': 3, 'n_estimators': 2301}. Best is trial 4 with value: 4.014516335495757.[0m


Mean Absolute Error :  4.065070158728204
R2_score :  0.7233241158947401


  reg_lambda = trial.suggest_loguniform("reg_lambda", 1e-8, 100.0)
  reg_alpha = trial.suggest_loguniform("reg_alpha", 1e-8, 100.0)
[32m[I 2023-04-28 08:50:37,802][0m Trial 9 finished with value: 6.206847562617861 and parameters: {'learning_rate': 0.077082421202773, 'reg_lambda': 0.053731762413557996, 'reg_alpha': 0.008959806512862106, 'subsample': 0.12998078892517306, 'colsample_bytree': 0.1113762572136053, 'max_depth': 6, 'n_estimators': 898}. Best is trial 4 with value: 4.014516335495757.[0m


Mean Absolute Error :  6.206847562617861
R2_score :  0.7392738976386494


  reg_lambda = trial.suggest_loguniform("reg_lambda", 1e-8, 100.0)
  reg_alpha = trial.suggest_loguniform("reg_alpha", 1e-8, 100.0)
[32m[I 2023-04-28 08:52:23,091][0m Trial 10 finished with value: 5.448784966336275 and parameters: {'learning_rate': 0.001683717177778158, 'reg_lambda': 55.502350347799705, 'reg_alpha': 49.42282137256974, 'subsample': 0.6723973732262163, 'colsample_bytree': 0.9949414623192445, 'max_depth': 4, 'n_estimators': 1275}. Best is trial 4 with value: 4.014516335495757.[0m


Mean Absolute Error :  5.448784966336275
R2_score :  0.6797171374340709


  reg_lambda = trial.suggest_loguniform("reg_lambda", 1e-8, 100.0)
  reg_alpha = trial.suggest_loguniform("reg_alpha", 1e-8, 100.0)
[32m[I 2023-04-28 08:54:08,304][0m Trial 11 finished with value: 4.085750954056798 and parameters: {'learning_rate': 0.03542790877442886, 'reg_lambda': 41.25654627199507, 'reg_alpha': 0.4560511887435661, 'subsample': 0.147532038106214, 'colsample_bytree': 0.815671183841725, 'max_depth': 4, 'n_estimators': 1836}. Best is trial 4 with value: 4.014516335495757.[0m


Mean Absolute Error :  4.085750954056798
R2_score :  0.7594484502850081


  reg_lambda = trial.suggest_loguniform("reg_lambda", 1e-8, 100.0)
  reg_alpha = trial.suggest_loguniform("reg_alpha", 1e-8, 100.0)
[32m[I 2023-04-28 08:55:09,808][0m Trial 12 finished with value: 4.254570276865067 and parameters: {'learning_rate': 0.03513889075361243, 'reg_lambda': 5.924322568326093, 'reg_alpha': 0.1128754407699121, 'subsample': 0.10981470118206949, 'colsample_bytree': 0.7705293464848719, 'max_depth': 3, 'n_estimators': 1440}. Best is trial 4 with value: 4.014516335495757.[0m


Mean Absolute Error :  4.254570276865067
R2_score :  0.7572186240389522


  reg_lambda = trial.suggest_loguniform("reg_lambda", 1e-8, 100.0)
  reg_alpha = trial.suggest_loguniform("reg_alpha", 1e-8, 100.0)
[32m[I 2023-04-28 08:57:49,328][0m Trial 13 finished with value: 4.791194169543509 and parameters: {'learning_rate': 0.025010535957519203, 'reg_lambda': 6.824506512475626e-08, 'reg_alpha': 10.570979534990876, 'subsample': 0.22461807089542404, 'colsample_bytree': 0.9158758279999428, 'max_depth': 4, 'n_estimators': 2496}. Best is trial 4 with value: 4.014516335495757.[0m


Mean Absolute Error :  4.791194169543509
R2_score :  0.6846157907626407


  reg_lambda = trial.suggest_loguniform("reg_lambda", 1e-8, 100.0)
  reg_alpha = trial.suggest_loguniform("reg_alpha", 1e-8, 100.0)
[32m[I 2023-04-28 08:58:36,173][0m Trial 14 finished with value: 4.3047275109652485 and parameters: {'learning_rate': 0.045580208679295006, 'reg_lambda': 4.269465720292161, 'reg_alpha': 95.5353225451274, 'subsample': 0.2490420291766568, 'colsample_bytree': 0.6795898624820328, 'max_depth': 3, 'n_estimators': 989}. Best is trial 4 with value: 4.014516335495757.[0m


Mean Absolute Error :  4.3047275109652485
R2_score :  0.7616933575832009


  reg_lambda = trial.suggest_loguniform("reg_lambda", 1e-8, 100.0)
  reg_alpha = trial.suggest_loguniform("reg_alpha", 1e-8, 100.0)
[32m[I 2023-04-28 08:58:48,379][0m Trial 15 finished with value: 4.995289434156859 and parameters: {'learning_rate': 0.014966719408486352, 'reg_lambda': 0.015391094671520136, 'reg_alpha': 1.2413569766336248, 'subsample': 0.3809086949632191, 'colsample_bytree': 0.533610399214107, 'max_depth': 4, 'n_estimators': 204}. Best is trial 4 with value: 4.014516335495757.[0m


Mean Absolute Error :  4.995289434156859
R2_score :  0.7495774682087979


  reg_lambda = trial.suggest_loguniform("reg_lambda", 1e-8, 100.0)
  reg_alpha = trial.suggest_loguniform("reg_alpha", 1e-8, 100.0)
[32m[I 2023-04-28 09:00:52,979][0m Trial 16 finished with value: 4.243051719462243 and parameters: {'learning_rate': 0.04593885265112103, 'reg_lambda': 88.0882031076654, 'reg_alpha': 0.009811285188130726, 'subsample': 0.20974407775230022, 'colsample_bytree': 0.875570296468998, 'max_depth': 5, 'n_estimators': 1673}. Best is trial 4 with value: 4.014516335495757.[0m


Mean Absolute Error :  4.243051719462243
R2_score :  0.7392742644368074


  reg_lambda = trial.suggest_loguniform("reg_lambda", 1e-8, 100.0)
  reg_alpha = trial.suggest_loguniform("reg_alpha", 1e-8, 100.0)
[32m[I 2023-04-28 09:03:06,459][0m Trial 17 finished with value: 4.093689252968496 and parameters: {'learning_rate': 0.024155222458682596, 'reg_lambda': 2.301706730001585, 'reg_alpha': 4.662088877381834e-05, 'subsample': 0.40407323084066116, 'colsample_bytree': 0.731074366583236, 'max_depth': 3, 'n_estimators': 2229}. Best is trial 4 with value: 4.014516335495757.[0m


Mean Absolute Error :  4.093689252968496
R2_score :  0.7476057198864594


  reg_lambda = trial.suggest_loguniform("reg_lambda", 1e-8, 100.0)
  reg_alpha = trial.suggest_loguniform("reg_alpha", 1e-8, 100.0)
[32m[I 2023-04-28 09:03:40,280][0m Trial 18 finished with value: 9.070102359674655 and parameters: {'learning_rate': 0.0012930917356519175, 'reg_lambda': 0.4013732538033482, 'reg_alpha': 0.055953531181059404, 'subsample': 0.25423058984118274, 'colsample_bytree': 0.8662101703697235, 'max_depth': 4, 'n_estimators': 529}. Best is trial 4 with value: 4.014516335495757.[0m


Mean Absolute Error :  9.070102359674655
R2_score :  0.44163328405924074


  reg_lambda = trial.suggest_loguniform("reg_lambda", 1e-8, 100.0)
  reg_alpha = trial.suggest_loguniform("reg_alpha", 1e-8, 100.0)
[32m[I 2023-04-28 09:04:29,924][0m Trial 19 finished with value: 5.430407336248827 and parameters: {'learning_rate': 0.04153763734194716, 'reg_lambda': 0.013079948726058456, 'reg_alpha': 0.9995965435106116, 'subsample': 0.11164312307294227, 'colsample_bytree': 0.5426943298548812, 'max_depth': 3, 'n_estimators': 1308}. Best is trial 4 with value: 4.014516335495757.[0m


Mean Absolute Error :  5.430407336248827
R2_score :  0.7386240109592743


In [97]:
"""Uncomment for tuning the model"""
import optuna
def run(trial):
    learning_rate = trial.suggest_float("learning_rate", 1e-3, 0.1)
    reg_lambda = trial.suggest_loguniform("reg_lambda", 1e-8, 100.0)
    reg_alpha = trial.suggest_loguniform("reg_alpha", 1e-8, 100.0)
    subsample = trial.suggest_float("subsample", 0.1, 1.0)
    colsample_bytree = trial.suggest_float("colsample_bytree", 0.1, 1.0)
    max_depth = trial.suggest_int("max_depth", 3, 6)
    n_estimators = trial.suggest_int("n_estimators", 20, 3000)


    model = xgb.XGBRegressor(
            random_state=42,
            objective='reg:squarederror',
            n_estimators=n_estimators,
            learning_rate=learning_rate,
            reg_lambda=reg_lambda,
            reg_alpha=reg_alpha,
            subsample=subsample,
            colsample_bytree=colsample_bytree,
            max_depth=max_depth,
            eval_metric='mae'
        )
    model.fit(X_train, y_train)
    preds_valid = model.predict(X_test)
    mae_ = mean_absolute_error(y_test, preds_valid)
    r2_sc = r2_score(y_test, preds_valid)
    print("Mean Absolute Error : ", mae_)
    print("R2_score : ", r2_sc)
    return mae_

study = optuna.create_study(direction="minimize")
study.optimize(run, n_trials=100)

# study.best_params

[32m[I 2023-04-28 09:04:30,128][0m A new study created in memory with name: no-name-0d3aee4a-db2a-445a-aee6-6a631401074b[0m
  reg_lambda = trial.suggest_loguniform("reg_lambda", 1e-8, 100.0)
  reg_alpha = trial.suggest_loguniform("reg_alpha", 1e-8, 100.0)
[32m[I 2023-04-28 09:04:41,170][0m Trial 0 finished with value: 4.45109362412442 and parameters: {'learning_rate': 0.07511926391905335, 'reg_lambda': 9.155477788500397, 'reg_alpha': 2.0220448757521527e-06, 'subsample': 0.485471495251942, 'colsample_bytree': 0.2838175433464417, 'max_depth': 3, 'n_estimators': 260}. Best is trial 0 with value: 4.45109362412442.[0m


Mean Absolute Error :  4.45109362412442
R2_score :  0.7820779441828376


  reg_lambda = trial.suggest_loguniform("reg_lambda", 1e-8, 100.0)
  reg_alpha = trial.suggest_loguniform("reg_alpha", 1e-8, 100.0)
[32m[I 2023-04-28 09:04:51,238][0m Trial 1 finished with value: 4.37555884175738 and parameters: {'learning_rate': 0.09352998540410909, 'reg_lambda': 0.00036642843752606705, 'reg_alpha': 0.4009758857433482, 'subsample': 0.6332212397539065, 'colsample_bytree': 0.524477195640949, 'max_depth': 3, 'n_estimators': 192}. Best is trial 1 with value: 4.37555884175738.[0m


Mean Absolute Error :  4.37555884175738
R2_score :  0.7225718959237353


  reg_lambda = trial.suggest_loguniform("reg_lambda", 1e-8, 100.0)
  reg_alpha = trial.suggest_loguniform("reg_alpha", 1e-8, 100.0)
[32m[I 2023-04-28 09:06:12,419][0m Trial 2 finished with value: 4.6421249747019795 and parameters: {'learning_rate': 0.08751711247805773, 'reg_lambda': 8.926641410419766, 'reg_alpha': 4.097168185791003e-08, 'subsample': 0.821061008835693, 'colsample_bytree': 0.12700196394387855, 'max_depth': 3, 'n_estimators': 2335}. Best is trial 1 with value: 4.37555884175738.[0m


Mean Absolute Error :  4.6421249747019795
R2_score :  0.7665492732905539


  reg_lambda = trial.suggest_loguniform("reg_lambda", 1e-8, 100.0)
  reg_alpha = trial.suggest_loguniform("reg_alpha", 1e-8, 100.0)
[32m[I 2023-04-28 09:08:14,493][0m Trial 3 finished with value: 4.419209723726743 and parameters: {'learning_rate': 0.03145172256075458, 'reg_lambda': 4.999706038464734e-05, 'reg_alpha': 60.774996801999954, 'subsample': 0.551267338326362, 'colsample_bytree': 0.5609709595493059, 'max_depth': 5, 'n_estimators': 1537}. Best is trial 1 with value: 4.37555884175738.[0m


Mean Absolute Error :  4.419209723726743
R2_score :  0.7323315937345198


  reg_lambda = trial.suggest_loguniform("reg_lambda", 1e-8, 100.0)
  reg_alpha = trial.suggest_loguniform("reg_alpha", 1e-8, 100.0)
[32m[I 2023-04-28 09:11:07,241][0m Trial 4 finished with value: 4.543365449905182 and parameters: {'learning_rate': 0.09888878048825313, 'reg_lambda': 24.855488205229097, 'reg_alpha': 0.5834662929754639, 'subsample': 0.9802581610602823, 'colsample_bytree': 0.5554147201309242, 'max_depth': 3, 'n_estimators': 2672}. Best is trial 1 with value: 4.37555884175738.[0m


Mean Absolute Error :  4.543365449905182
R2_score :  0.7398357016519148


  reg_lambda = trial.suggest_loguniform("reg_lambda", 1e-8, 100.0)
  reg_alpha = trial.suggest_loguniform("reg_alpha", 1e-8, 100.0)
[32m[I 2023-04-28 09:13:38,257][0m Trial 5 finished with value: 5.357332845560095 and parameters: {'learning_rate': 0.06625231145468774, 'reg_lambda': 8.848086435644778e-07, 'reg_alpha': 1.9862217971987475e-07, 'subsample': 0.5544800717464493, 'colsample_bytree': 0.2980344126396455, 'max_depth': 4, 'n_estimators': 1767}. Best is trial 1 with value: 4.37555884175738.[0m


Mean Absolute Error :  5.357332845560095
R2_score :  0.7420422338459665


  reg_lambda = trial.suggest_loguniform("reg_lambda", 1e-8, 100.0)
  reg_alpha = trial.suggest_loguniform("reg_alpha", 1e-8, 100.0)
[32m[I 2023-04-28 09:16:51,756][0m Trial 6 finished with value: 5.1697740533867504 and parameters: {'learning_rate': 0.08370992322094004, 'reg_lambda': 0.006482423000071544, 'reg_alpha': 7.607417108993234e-08, 'subsample': 0.47335763912093665, 'colsample_bytree': 0.5491432977090945, 'max_depth': 6, 'n_estimators': 1628}. Best is trial 1 with value: 4.37555884175738.[0m


Mean Absolute Error :  5.1697740533867504
R2_score :  0.7181880729554668


  reg_lambda = trial.suggest_loguniform("reg_lambda", 1e-8, 100.0)
  reg_alpha = trial.suggest_loguniform("reg_alpha", 1e-8, 100.0)
[32m[I 2023-04-28 09:20:16,777][0m Trial 7 finished with value: 4.399734740061761 and parameters: {'learning_rate': 0.019277007954551645, 'reg_lambda': 3.7228712122524425e-05, 'reg_alpha': 1.5096281335986974e-07, 'subsample': 0.9714907644671561, 'colsample_bytree': 0.44359452647471065, 'max_depth': 5, 'n_estimators': 2662}. Best is trial 1 with value: 4.37555884175738.[0m


Mean Absolute Error :  4.399734740061761
R2_score :  0.7243442076497664


  reg_lambda = trial.suggest_loguniform("reg_lambda", 1e-8, 100.0)
  reg_alpha = trial.suggest_loguniform("reg_alpha", 1e-8, 100.0)
[32m[I 2023-04-28 09:23:11,825][0m Trial 8 finished with value: 5.623420045858467 and parameters: {'learning_rate': 0.09363933420622975, 'reg_lambda': 2.215876134372027e-08, 'reg_alpha': 1.9828507172812145e-08, 'subsample': 0.53214042829062, 'colsample_bytree': 0.6078857109761135, 'max_depth': 3, 'n_estimators': 2867}. Best is trial 1 with value: 4.37555884175738.[0m


Mean Absolute Error :  5.623420045858467
R2_score :  0.6709702157996298


  reg_lambda = trial.suggest_loguniform("reg_lambda", 1e-8, 100.0)
  reg_alpha = trial.suggest_loguniform("reg_alpha", 1e-8, 100.0)
[32m[I 2023-04-28 20:57:38,146][0m Trial 9 finished with value: 4.78350110575384 and parameters: {'learning_rate': 0.023361361840033373, 'reg_lambda': 2.6842680948509104e-08, 'reg_alpha': 2.174301431738352e-06, 'subsample': 0.5837255176982205, 'colsample_bytree': 0.6572166118349376, 'max_depth': 5, 'n_estimators': 2868}. Best is trial 1 with value: 4.37555884175738.[0m


Mean Absolute Error :  4.78350110575384
R2_score :  0.66309180139157


  reg_lambda = trial.suggest_loguniform("reg_lambda", 1e-8, 100.0)
  reg_alpha = trial.suggest_loguniform("reg_alpha", 1e-8, 100.0)
[32m[I 2023-04-28 20:57:42,148][0m Trial 10 finished with value: 4.963308442169709 and parameters: {'learning_rate': 0.055280297416579895, 'reg_lambda': 0.010017499383590443, 'reg_alpha': 0.0075379566623527666, 'subsample': 0.23734176895651116, 'colsample_bytree': 0.839702589285737, 'max_depth': 4, 'n_estimators': 48}. Best is trial 1 with value: 4.37555884175738.[0m


Mean Absolute Error :  4.963308442169709
R2_score :  0.7128506477516869


  reg_lambda = trial.suggest_loguniform("reg_lambda", 1e-8, 100.0)
  reg_alpha = trial.suggest_loguniform("reg_alpha", 1e-8, 100.0)
[32m[I 2023-04-28 20:59:29,658][0m Trial 11 finished with value: 5.141923206292742 and parameters: {'learning_rate': 0.0029367987349087354, 'reg_lambda': 0.00017443901378252373, 'reg_alpha': 0.0008930028522699851, 'subsample': 0.9999053622380785, 'colsample_bytree': 0.36904344791808086, 'max_depth': 6, 'n_estimators': 1099}. Best is trial 1 with value: 4.37555884175738.[0m


Mean Absolute Error :  5.141923206292742
R2_score :  0.7497472931596717


  reg_lambda = trial.suggest_loguniform("reg_lambda", 1e-8, 100.0)
  reg_alpha = trial.suggest_loguniform("reg_alpha", 1e-8, 100.0)
[32m[I 2023-04-28 21:01:05,343][0m Trial 12 finished with value: 4.667377574193389 and parameters: {'learning_rate': 0.04733416631341617, 'reg_lambda': 9.448236366700265e-06, 'reg_alpha': 0.0001951386840519026, 'subsample': 0.7233303225789511, 'colsample_bytree': 0.42905839112183586, 'max_depth': 5, 'n_estimators': 915}. Best is trial 1 with value: 4.37555884175738.[0m


Mean Absolute Error :  4.667377574193389
R2_score :  0.7467026265693292


  reg_lambda = trial.suggest_loguniform("reg_lambda", 1e-8, 100.0)
  reg_alpha = trial.suggest_loguniform("reg_alpha", 1e-8, 100.0)
[32m[I 2023-04-28 21:05:26,624][0m Trial 13 finished with value: 5.8964317790871545 and parameters: {'learning_rate': 0.0643927278916192, 'reg_lambda': 0.0013307672342664915, 'reg_alpha': 0.020823809558739532, 'subsample': 0.8356682139085756, 'colsample_bytree': 0.8214128717417577, 'max_depth': 4, 'n_estimators': 2215}. Best is trial 1 with value: 4.37555884175738.[0m


Mean Absolute Error :  5.8964317790871545
R2_score :  0.6161606793703946


  reg_lambda = trial.suggest_loguniform("reg_lambda", 1e-8, 100.0)
  reg_alpha = trial.suggest_loguniform("reg_alpha", 1e-8, 100.0)
[32m[I 2023-04-28 21:07:28,670][0m Trial 14 finished with value: 4.822415342725868 and parameters: {'learning_rate': 0.04094568027395599, 'reg_lambda': 0.062232081534995216, 'reg_alpha': 2.1835796233173826e-05, 'subsample': 0.7548016187822557, 'colsample_bytree': 0.9876338614755396, 'max_depth': 5, 'n_estimators': 663}. Best is trial 1 with value: 4.37555884175738.[0m


Mean Absolute Error :  4.822415342725868
R2_score :  0.6522874519661634


  reg_lambda = trial.suggest_loguniform("reg_lambda", 1e-8, 100.0)
  reg_alpha = trial.suggest_loguniform("reg_alpha", 1e-8, 100.0)
[32m[I 2023-04-28 21:10:52,191][0m Trial 15 finished with value: 4.931632162857391 and parameters: {'learning_rate': 0.0776307037492587, 'reg_lambda': 4.849245682816249e-06, 'reg_alpha': 0.20238574029532697, 'subsample': 0.8807157111762105, 'colsample_bytree': 0.4394858138331028, 'max_depth': 4, 'n_estimators': 2124}. Best is trial 1 with value: 4.37555884175738.[0m


Mean Absolute Error :  4.931632162857391
R2_score :  0.7190859172954589


  reg_lambda = trial.suggest_loguniform("reg_lambda", 1e-8, 100.0)
  reg_alpha = trial.suggest_loguniform("reg_alpha", 1e-8, 100.0)
[32m[I 2023-04-28 21:13:29,048][0m Trial 16 finished with value: 5.063364859133622 and parameters: {'learning_rate': 0.09819158855403236, 'reg_lambda': 0.00034341496474826554, 'reg_alpha': 8.685371232187952e-05, 'subsample': 0.6763872875166711, 'colsample_bytree': 0.4762805141338949, 'max_depth': 6, 'n_estimators': 1180}. Best is trial 1 with value: 4.37555884175738.[0m


Mean Absolute Error :  5.063364859133622
R2_score :  0.7137483657870682


  reg_lambda = trial.suggest_loguniform("reg_lambda", 1e-8, 100.0)
  reg_alpha = trial.suggest_loguniform("reg_alpha", 1e-8, 100.0)
[32m[I 2023-04-28 21:14:00,344][0m Trial 17 finished with value: 5.830158951581806 and parameters: {'learning_rate': 0.016688006508771308, 'reg_lambda': 6.846872680092222e-07, 'reg_alpha': 7.811267017168039e-06, 'subsample': 0.9073801101619346, 'colsample_bytree': 0.2137514578792299, 'max_depth': 4, 'n_estimators': 591}. Best is trial 1 with value: 4.37555884175738.[0m


Mean Absolute Error :  5.830158951581806
R2_score :  0.7534792072791402


  reg_lambda = trial.suggest_loguniform("reg_lambda", 1e-8, 100.0)
  reg_alpha = trial.suggest_loguniform("reg_alpha", 1e-8, 100.0)
[32m[I 2023-04-28 21:17:11,041][0m Trial 18 finished with value: 4.697925384982993 and parameters: {'learning_rate': 0.041172998421319336, 'reg_lambda': 0.2246398383128713, 'reg_alpha': 6.782115019621906e-07, 'subsample': 0.3642868226790396, 'colsample_bytree': 0.3700396266689391, 'max_depth': 5, 'n_estimators': 1931}. Best is trial 1 with value: 4.37555884175738.[0m


Mean Absolute Error :  4.697925384982993
R2_score :  0.7522455232994153


  reg_lambda = trial.suggest_loguniform("reg_lambda", 1e-8, 100.0)
  reg_alpha = trial.suggest_loguniform("reg_alpha", 1e-8, 100.0)
[32m[I 2023-04-29 15:18:29,975][0m Trial 19 finished with value: 4.98551796727635 and parameters: {'learning_rate': 0.0559156637451384, 'reg_lambda': 4.134558762817977e-05, 'reg_alpha': 3.626373817173261e-05, 'subsample': 0.6563325330397529, 'colsample_bytree': 0.6707737713717916, 'max_depth': 6, 'n_estimators': 2539}. Best is trial 1 with value: 4.37555884175738.[0m


Mean Absolute Error :  4.98551796727635
R2_score :  0.6724445628913924


  reg_lambda = trial.suggest_loguniform("reg_lambda", 1e-8, 100.0)
  reg_alpha = trial.suggest_loguniform("reg_alpha", 1e-8, 100.0)
[32m[I 2023-04-29 15:20:59,067][0m Trial 20 finished with value: 5.625785244983675 and parameters: {'learning_rate': 0.0021799216433986378, 'reg_lambda': 0.0006548952573343785, 'reg_alpha': 1.1659993940803949e-08, 'subsample': 0.8183645303365207, 'colsample_bytree': 0.48306887793246134, 'max_depth': 3, 'n_estimators': 1302}. Best is trial 1 with value: 4.37555884175738.[0m


Mean Absolute Error :  5.625785244983675
R2_score :  0.7277754723430012


  reg_lambda = trial.suggest_loguniform("reg_lambda", 1e-8, 100.0)
  reg_alpha = trial.suggest_loguniform("reg_alpha", 1e-8, 100.0)
[32m[I 2023-04-29 15:23:44,208][0m Trial 21 finished with value: 4.414182957581479 and parameters: {'learning_rate': 0.029723386540009406, 'reg_lambda': 4.5635006789963095e-05, 'reg_alpha': 57.324263689718975, 'subsample': 0.6521685229298144, 'colsample_bytree': 0.5415459716195707, 'max_depth': 5, 'n_estimators': 1530}. Best is trial 1 with value: 4.37555884175738.[0m


Mean Absolute Error :  4.414182957581479
R2_score :  0.7295748556805105


  reg_lambda = trial.suggest_loguniform("reg_lambda", 1e-8, 100.0)
  reg_alpha = trial.suggest_loguniform("reg_alpha", 1e-8, 100.0)
[32m[I 2023-04-29 15:24:34,616][0m Trial 22 finished with value: 4.16206846602308 and parameters: {'learning_rate': 0.030899569144436076, 'reg_lambda': 5.602972645831192e-05, 'reg_alpha': 36.071524272788935, 'subsample': 0.6530854963857263, 'colsample_bytree': 0.4995139480338302, 'max_depth': 5, 'n_estimators': 585}. Best is trial 22 with value: 4.16206846602308.[0m


Mean Absolute Error :  4.16206846602308
R2_score :  0.7524943906343082


  reg_lambda = trial.suggest_loguniform("reg_lambda", 1e-8, 100.0)
  reg_alpha = trial.suggest_loguniform("reg_alpha", 1e-8, 100.0)
[32m[I 2023-04-29 15:25:03,607][0m Trial 23 finished with value: 4.38973422855411 and parameters: {'learning_rate': 0.017163007580703913, 'reg_lambda': 0.0016049584622673757, 'reg_alpha': 5.598771983444693, 'subsample': 0.7553138125327648, 'colsample_bytree': 0.34749782748917635, 'max_depth': 5, 'n_estimators': 393}. Best is trial 22 with value: 4.16206846602308.[0m


Mean Absolute Error :  4.38973422855411
R2_score :  0.7669688466688336


  reg_lambda = trial.suggest_loguniform("reg_lambda", 1e-8, 100.0)
  reg_alpha = trial.suggest_loguniform("reg_alpha", 1e-8, 100.0)
[32m[I 2023-04-29 15:25:26,466][0m Trial 24 finished with value: 4.452039583893925 and parameters: {'learning_rate': 0.03509657806928665, 'reg_lambda': 0.0009257683647484289, 'reg_alpha': 3.5202753065987493, 'subsample': 0.7393864112984442, 'colsample_bytree': 0.33617502750522643, 'max_depth': 4, 'n_estimators': 355}. Best is trial 22 with value: 4.16206846602308.[0m


Mean Absolute Error :  4.452039583893925
R2_score :  0.7589972403517342


  reg_lambda = trial.suggest_loguniform("reg_lambda", 1e-8, 100.0)
  reg_alpha = trial.suggest_loguniform("reg_alpha", 1e-8, 100.0)
[32m[I 2023-04-29 15:25:29,287][0m Trial 25 finished with value: 11.022509490011672 and parameters: {'learning_rate': 0.010970652304149533, 'reg_lambda': 0.0051398126041990455, 'reg_alpha': 3.526588185877139, 'subsample': 0.6270438848786655, 'colsample_bytree': 0.23955194360159904, 'max_depth': 5, 'n_estimators': 35}. Best is trial 22 with value: 4.16206846602308.[0m


Mean Absolute Error :  11.022509490011672
R2_score :  0.12806585527071734


  reg_lambda = trial.suggest_loguniform("reg_lambda", 1e-8, 100.0)
  reg_alpha = trial.suggest_loguniform("reg_alpha", 1e-8, 100.0)
[32m[I 2023-04-29 15:26:34,999][0m Trial 26 finished with value: 4.370567402891388 and parameters: {'learning_rate': 0.02699229892641566, 'reg_lambda': 0.00016100356387594708, 'reg_alpha': 11.879967461336992, 'subsample': 0.7158436554054788, 'colsample_bytree': 0.3801427786438073, 'max_depth': 6, 'n_estimators': 668}. Best is trial 22 with value: 4.16206846602308.[0m


Mean Absolute Error :  4.370567402891388
R2_score :  0.7590264123876233


  reg_lambda = trial.suggest_loguniform("reg_lambda", 1e-8, 100.0)
  reg_alpha = trial.suggest_loguniform("reg_alpha", 1e-8, 100.0)
[32m[I 2023-04-29 15:28:01,032][0m Trial 27 finished with value: 4.386246209050042 and parameters: {'learning_rate': 0.026550288093056526, 'reg_lambda': 0.00021019593476522382, 'reg_alpha': 30.75457392537315, 'subsample': 0.6964318965838936, 'colsample_bytree': 0.39584776073201683, 'max_depth': 6, 'n_estimators': 797}. Best is trial 22 with value: 4.16206846602308.[0m


Mean Absolute Error :  4.386246209050042
R2_score :  0.7593866114558462


  reg_lambda = trial.suggest_loguniform("reg_lambda", 1e-8, 100.0)
  reg_alpha = trial.suggest_loguniform("reg_alpha", 1e-8, 100.0)
[32m[I 2023-04-29 15:29:12,606][0m Trial 28 finished with value: 4.409622182285922 and parameters: {'learning_rate': 0.03913916647192389, 'reg_lambda': 6.844263241858075e-06, 'reg_alpha': 0.2564504982036819, 'subsample': 0.6161134313129298, 'colsample_bytree': 0.4999867077318635, 'max_depth': 6, 'n_estimators': 485}. Best is trial 22 with value: 4.16206846602308.[0m


Mean Absolute Error :  4.409622182285922
R2_score :  0.7398960823438403


  reg_lambda = trial.suggest_loguniform("reg_lambda", 1e-8, 100.0)
  reg_alpha = trial.suggest_loguniform("reg_alpha", 1e-8, 100.0)
[32m[I 2023-04-29 15:29:44,094][0m Trial 29 finished with value: 5.452584033303764 and parameters: {'learning_rate': 0.047418805026620575, 'reg_lambda': 0.06741119800221529, 'reg_alpha': 15.307158610539174, 'subsample': 0.48505318634446026, 'colsample_bytree': 0.28656194863005585, 'max_depth': 6, 'n_estimators': 244}. Best is trial 22 with value: 4.16206846602308.[0m


Mean Absolute Error :  5.452584033303764
R2_score :  0.758917240880584


  reg_lambda = trial.suggest_loguniform("reg_lambda", 1e-8, 100.0)
  reg_alpha = trial.suggest_loguniform("reg_alpha", 1e-8, 100.0)
[32m[I 2023-04-29 15:30:49,041][0m Trial 30 finished with value: 4.66906052827371 and parameters: {'learning_rate': 0.03302518646850048, 'reg_lambda': 0.00023026454461969335, 'reg_alpha': 1.412580595281687, 'subsample': 0.44361583014676687, 'colsample_bytree': 0.41351119007989745, 'max_depth': 3, 'n_estimators': 935}. Best is trial 22 with value: 4.16206846602308.[0m


Mean Absolute Error :  4.66906052827371
R2_score :  0.752124758861517


  reg_lambda = trial.suggest_loguniform("reg_lambda", 1e-8, 100.0)
  reg_alpha = trial.suggest_loguniform("reg_alpha", 1e-8, 100.0)
[32m[I 2023-04-29 15:32:09,933][0m Trial 31 finished with value: 4.555563514582846 and parameters: {'learning_rate': 0.027658043703990165, 'reg_lambda': 0.00010758856671957315, 'reg_alpha': 22.275398701418908, 'subsample': 0.6946228704740551, 'colsample_bytree': 0.3940359931873895, 'max_depth': 6, 'n_estimators': 722}. Best is trial 22 with value: 4.16206846602308.[0m


Mean Absolute Error :  4.555563514582846
R2_score :  0.754623380046116


  reg_lambda = trial.suggest_loguniform("reg_lambda", 1e-8, 100.0)
  reg_alpha = trial.suggest_loguniform("reg_alpha", 1e-8, 100.0)
[32m[I 2023-04-29 15:33:31,777][0m Trial 32 finished with value: 4.34299317114551 and parameters: {'learning_rate': 0.026797594821316894, 'reg_lambda': 0.0002564060457360141, 'reg_alpha': 76.6949820877262, 'subsample': 0.6145906762669788, 'colsample_bytree': 0.47278256406827796, 'max_depth': 6, 'n_estimators': 837}. Best is trial 22 with value: 4.16206846602308.[0m


Mean Absolute Error :  4.34299317114551
R2_score :  0.750086223066412


  reg_lambda = trial.suggest_loguniform("reg_lambda", 1e-8, 100.0)
  reg_alpha = trial.suggest_loguniform("reg_alpha", 1e-8, 100.0)
[32m[I 2023-04-29 15:33:51,294][0m Trial 33 finished with value: 4.4858745930537225 and parameters: {'learning_rate': 0.0227581571982003, 'reg_lambda': 0.0018599694533991693, 'reg_alpha': 89.06895052512989, 'subsample': 0.6133987814833736, 'colsample_bytree': 0.4886165366130381, 'max_depth': 6, 'n_estimators': 202}. Best is trial 22 with value: 4.16206846602308.[0m


Mean Absolute Error :  4.4858745930537225
R2_score :  0.7551726813447495


  reg_lambda = trial.suggest_loguniform("reg_lambda", 1e-8, 100.0)
  reg_alpha = trial.suggest_loguniform("reg_alpha", 1e-8, 100.0)
[32m[I 2023-04-29 15:35:01,726][0m Trial 34 finished with value: 5.299714149304696 and parameters: {'learning_rate': 0.036987963179830476, 'reg_lambda': 0.00046469390861853264, 'reg_alpha': 18.42646399212349, 'subsample': 0.571630535200394, 'colsample_bytree': 0.1661875440456722, 'max_depth': 6, 'n_estimators': 1028}. Best is trial 22 with value: 4.16206846602308.[0m


Mean Absolute Error :  5.299714149304696
R2_score :  0.7599823692970771


  reg_lambda = trial.suggest_loguniform("reg_lambda", 1e-8, 100.0)
  reg_alpha = trial.suggest_loguniform("reg_alpha", 1e-8, 100.0)
[32m[I 2023-04-29 15:36:13,044][0m Trial 35 finished with value: 4.914944675185423 and parameters: {'learning_rate': 0.030790242788511314, 'reg_lambda': 1.748077456336655e-05, 'reg_alpha': 94.1946482090506, 'subsample': 0.7920290059179739, 'colsample_bytree': 0.32019024682464825, 'max_depth': 3, 'n_estimators': 1362}. Best is trial 22 with value: 4.16206846602308.[0m


Mean Absolute Error :  4.914944675185423
R2_score :  0.7623985806248748


  reg_lambda = trial.suggest_loguniform("reg_lambda", 1e-8, 100.0)
  reg_alpha = trial.suggest_loguniform("reg_alpha", 1e-8, 100.0)
[32m[I 2023-04-29 15:37:09,942][0m Trial 36 finished with value: 4.80031801964555 and parameters: {'learning_rate': 0.07292816644426295, 'reg_lambda': 8.170996853619417e-05, 'reg_alpha': 6.3863989063216104, 'subsample': 0.6715318454096648, 'colsample_bytree': 0.601366615561174, 'max_depth': 6, 'n_estimators': 529}. Best is trial 22 with value: 4.16206846602308.[0m


Mean Absolute Error :  4.80031801964555
R2_score :  0.6866329071328003


  reg_lambda = trial.suggest_loguniform("reg_lambda", 1e-8, 100.0)
  reg_alpha = trial.suggest_loguniform("reg_alpha", 1e-8, 100.0)
[32m[I 2023-04-29 15:38:21,024][0m Trial 37 finished with value: 4.996640815685249 and parameters: {'learning_rate': 0.09203017753952292, 'reg_lambda': 1.7136449551526287e-06, 'reg_alpha': 0.9388516345293361, 'subsample': 0.5404965329688156, 'colsample_bytree': 0.5224150983303405, 'max_depth': 5, 'n_estimators': 806}. Best is trial 22 with value: 4.16206846602308.[0m


Mean Absolute Error :  4.996640815685249
R2_score :  0.686964514892371


  reg_lambda = trial.suggest_loguniform("reg_lambda", 1e-8, 100.0)
  reg_alpha = trial.suggest_loguniform("reg_alpha", 1e-8, 100.0)
[32m[I 2023-04-29 15:38:46,757][0m Trial 38 finished with value: 4.3541906510846005 and parameters: {'learning_rate': 0.0862157449837759, 'reg_lambda': 3.081623986935537e-05, 'reg_alpha': 12.427626587452988, 'subsample': 0.717426360651229, 'colsample_bytree': 0.4522452025645343, 'max_depth': 6, 'n_estimators': 246}. Best is trial 22 with value: 4.16206846602308.[0m


Mean Absolute Error :  4.3541906510846005
R2_score :  0.7445050394158561


  reg_lambda = trial.suggest_loguniform("reg_lambda", 1e-8, 100.0)
  reg_alpha = trial.suggest_loguniform("reg_alpha", 1e-8, 100.0)
[32m[I 2023-04-29 15:39:20,848][0m Trial 39 finished with value: 5.137115658577089 and parameters: {'learning_rate': 0.08491702354015329, 'reg_lambda': 1.4690069071035517e-05, 'reg_alpha': 11.920251518394666, 'subsample': 0.7043986189044606, 'colsample_bytree': 0.27533252184161994, 'max_depth': 6, 'n_estimators': 386}. Best is trial 22 with value: 4.16206846602308.[0m


Mean Absolute Error :  5.137115658577089
R2_score :  0.7568061878488215


  reg_lambda = trial.suggest_loguniform("reg_lambda", 1e-8, 100.0)
  reg_alpha = trial.suggest_loguniform("reg_alpha", 1e-8, 100.0)
[32m[I 2023-04-29 15:40:27,587][0m Trial 40 finished with value: 4.430424568482242 and parameters: {'learning_rate': 0.03326878085752285, 'reg_lambda': 3.081192224336148e-05, 'reg_alpha': 1.7356309032739714, 'subsample': 0.7847519525624512, 'colsample_bytree': 0.45673273018484306, 'max_depth': 6, 'n_estimators': 674}. Best is trial 22 with value: 4.16206846602308.[0m


Mean Absolute Error :  4.430424568482242
R2_score :  0.7424285649713435


  reg_lambda = trial.suggest_loguniform("reg_lambda", 1e-8, 100.0)
  reg_alpha = trial.suggest_loguniform("reg_alpha", 1e-8, 100.0)
[32m[I 2023-04-29 15:40:46,092][0m Trial 41 finished with value: 4.265181703833172 and parameters: {'learning_rate': 0.09006283209368009, 'reg_lambda': 9.321571484808277e-05, 'reg_alpha': 31.371323061753866, 'subsample': 0.6184949306956964, 'colsample_bytree': 0.5159777594625803, 'max_depth': 6, 'n_estimators': 195}. Best is trial 22 with value: 4.16206846602308.[0m


Mean Absolute Error :  4.265181703833172
R2_score :  0.7335369705901376


  reg_lambda = trial.suggest_loguniform("reg_lambda", 1e-8, 100.0)
  reg_alpha = trial.suggest_loguniform("reg_alpha", 1e-8, 100.0)
[32m[I 2023-04-29 15:40:58,061][0m Trial 42 finished with value: 4.354079127982873 and parameters: {'learning_rate': 0.0865147497506379, 'reg_lambda': 0.00010394892406704635, 'reg_alpha': 39.650541964801434, 'subsample': 0.5932648985888622, 'colsample_bytree': 0.4426836840816764, 'max_depth': 6, 'n_estimators': 137}. Best is trial 22 with value: 4.16206846602308.[0m


Mean Absolute Error :  4.354079127982873
R2_score :  0.7511735657051752


  reg_lambda = trial.suggest_loguniform("reg_lambda", 1e-8, 100.0)
  reg_alpha = trial.suggest_loguniform("reg_alpha", 1e-8, 100.0)
[32m[I 2023-04-29 15:41:08,756][0m Trial 43 finished with value: 4.3083706432649675 and parameters: {'learning_rate': 0.08771586432379344, 'reg_lambda': 7.052150522155923e-05, 'reg_alpha': 29.570415043362704, 'subsample': 0.5744740784166902, 'colsample_bytree': 0.51244913218334, 'max_depth': 6, 'n_estimators': 117}. Best is trial 22 with value: 4.16206846602308.[0m


Mean Absolute Error :  4.3083706432649675
R2_score :  0.738745116602817


  reg_lambda = trial.suggest_loguniform("reg_lambda", 1e-8, 100.0)
  reg_alpha = trial.suggest_loguniform("reg_alpha", 1e-8, 100.0)
[32m[I 2023-04-29 15:41:21,380][0m Trial 44 finished with value: 4.274497647648274 and parameters: {'learning_rate': 0.08168787891264208, 'reg_lambda': 0.00012961646073970544, 'reg_alpha': 36.810909382562116, 'subsample': 0.5908565765915298, 'colsample_bytree': 0.548903554846571, 'max_depth': 6, 'n_estimators': 138}. Best is trial 22 with value: 4.16206846602308.[0m


Mean Absolute Error :  4.274497647648274
R2_score :  0.7378594044683048


  reg_lambda = trial.suggest_loguniform("reg_lambda", 1e-8, 100.0)
  reg_alpha = trial.suggest_loguniform("reg_alpha", 1e-8, 100.0)
[32m[I 2023-04-29 15:41:23,883][0m Trial 45 finished with value: 5.816274106853026 and parameters: {'learning_rate': 0.08065838432965464, 'reg_lambda': 3.142692032505424e-06, 'reg_alpha': 91.18162004176678, 'subsample': 0.5122890221061458, 'colsample_bytree': 0.578828205851809, 'max_depth': 6, 'n_estimators': 22}. Best is trial 22 with value: 4.16206846602308.[0m


Mean Absolute Error :  5.816274106853026
R2_score :  0.698224167874806


  reg_lambda = trial.suggest_loguniform("reg_lambda", 1e-8, 100.0)
  reg_alpha = trial.suggest_loguniform("reg_alpha", 1e-8, 100.0)
[32m[I 2023-04-29 15:41:54,161][0m Trial 46 finished with value: 4.40616087264129 and parameters: {'learning_rate': 0.08961856567055707, 'reg_lambda': 1.3442497528121747e-05, 'reg_alpha': 39.16010131426144, 'subsample': 0.5710589047911794, 'colsample_bytree': 0.5407462311860841, 'max_depth': 5, 'n_estimators': 399}. Best is trial 22 with value: 4.16206846602308.[0m


Mean Absolute Error :  4.40616087264129
R2_score :  0.712649442023616


  reg_lambda = trial.suggest_loguniform("reg_lambda", 1e-8, 100.0)
  reg_alpha = trial.suggest_loguniform("reg_alpha", 1e-8, 100.0)
[32m[I 2023-04-29 15:42:48,454][0m Trial 47 finished with value: 4.808260285803529 and parameters: {'learning_rate': 0.09419455924601666, 'reg_lambda': 0.0004282210733403822, 'reg_alpha': 2.4358525883842916, 'subsample': 0.6402855736737838, 'colsample_bytree': 0.5140981034807655, 'max_depth': 6, 'n_estimators': 521}. Best is trial 22 with value: 4.16206846602308.[0m


Mean Absolute Error :  4.808260285803529
R2_score :  0.7060323865364072


  reg_lambda = trial.suggest_loguniform("reg_lambda", 1e-8, 100.0)
  reg_alpha = trial.suggest_loguniform("reg_alpha", 1e-8, 100.0)
[32m[I 2023-04-29 15:43:02,188][0m Trial 48 finished with value: 4.574980276997106 and parameters: {'learning_rate': 0.09856168657460923, 'reg_lambda': 6.554396515004641e-05, 'reg_alpha': 0.8901672790915536, 'subsample': 0.5862663826543902, 'colsample_bytree': 0.6365253899645124, 'max_depth': 5, 'n_estimators': 158}. Best is trial 22 with value: 4.16206846602308.[0m


Mean Absolute Error :  4.574980276997106
R2_score :  0.6795893738667056


  reg_lambda = trial.suggest_loguniform("reg_lambda", 1e-8, 100.0)
  reg_alpha = trial.suggest_loguniform("reg_alpha", 1e-8, 100.0)
[32m[I 2023-04-29 15:43:36,151][0m Trial 49 finished with value: 4.608285965194395 and parameters: {'learning_rate': 0.08080628170643483, 'reg_lambda': 4.879713547842803e-07, 'reg_alpha': 0.33116907261196377, 'subsample': 0.5076610399444756, 'colsample_bytree': 0.5801344109458466, 'max_depth': 6, 'n_estimators': 337}. Best is trial 22 with value: 4.16206846602308.[0m


Mean Absolute Error :  4.608285965194395
R2_score :  0.7028053839908068


  reg_lambda = trial.suggest_loguniform("reg_lambda", 1e-8, 100.0)
  reg_alpha = trial.suggest_loguniform("reg_alpha", 1e-8, 100.0)
[32m[I 2023-04-29 15:43:51,209][0m Trial 50 finished with value: 4.382142151832527 and parameters: {'learning_rate': 0.08953951989549665, 'reg_lambda': 3.0679493383562366e-06, 'reg_alpha': 5.598254553193769, 'subsample': 0.5454781566305231, 'colsample_bytree': 0.5558117398564005, 'max_depth': 6, 'n_estimators': 138}. Best is trial 22 with value: 4.16206846602308.[0m


Mean Absolute Error :  4.382142151832527
R2_score :  0.7293401201117835


  reg_lambda = trial.suggest_loguniform("reg_lambda", 1e-8, 100.0)
  reg_alpha = trial.suggest_loguniform("reg_alpha", 1e-8, 100.0)
[32m[I 2023-04-29 15:44:07,551][0m Trial 51 finished with value: 4.320723346542186 and parameters: {'learning_rate': 0.07217876635566452, 'reg_lambda': 9.938145268331484e-05, 'reg_alpha': 27.32063502793319, 'subsample': 0.6125282834707884, 'colsample_bytree': 0.42853210182187357, 'max_depth': 6, 'n_estimators': 147}. Best is trial 22 with value: 4.16206846602308.[0m


Mean Absolute Error :  4.320723346542186
R2_score :  0.7677741584150275


  reg_lambda = trial.suggest_loguniform("reg_lambda", 1e-8, 100.0)
  reg_alpha = trial.suggest_loguniform("reg_alpha", 1e-8, 100.0)
[32m[I 2023-04-29 15:44:44,802][0m Trial 52 finished with value: 4.322049629897842 and parameters: {'learning_rate': 0.07206493348975067, 'reg_lambda': 0.00016549610518195129, 'reg_alpha': 36.30551298196829, 'subsample': 0.637520161424352, 'colsample_bytree': 0.5031895503289158, 'max_depth': 6, 'n_estimators': 319}. Best is trial 22 with value: 4.16206846602308.[0m


Mean Absolute Error :  4.322049629897842
R2_score :  0.7322908670175112


  reg_lambda = trial.suggest_loguniform("reg_lambda", 1e-8, 100.0)
  reg_alpha = trial.suggest_loguniform("reg_alpha", 1e-8, 100.0)
[32m[I 2023-04-29 15:45:10,019][0m Trial 53 finished with value: 4.298606277181556 and parameters: {'learning_rate': 0.07364154180621085, 'reg_lambda': 9.951333906093892e-05, 'reg_alpha': 32.391712534429786, 'subsample': 0.6525482827897798, 'colsample_bytree': 0.5038705042596235, 'max_depth': 6, 'n_estimators': 267}. Best is trial 22 with value: 4.16206846602308.[0m


Mean Absolute Error :  4.298606277181556
R2_score :  0.7353061547116665


  reg_lambda = trial.suggest_loguniform("reg_lambda", 1e-8, 100.0)
  reg_alpha = trial.suggest_loguniform("reg_alpha", 1e-8, 100.0)
[32m[I 2023-04-29 15:45:19,035][0m Trial 54 finished with value: 4.4538692815233825 and parameters: {'learning_rate': 0.07713435428620194, 'reg_lambda': 2.9215850748934527e-05, 'reg_alpha': 12.361320695254099, 'subsample': 0.6668167588121499, 'colsample_bytree': 0.416951691937145, 'max_depth': 6, 'n_estimators': 105}. Best is trial 22 with value: 4.16206846602308.[0m


Mean Absolute Error :  4.4538692815233825
R2_score :  0.771260576853233


  reg_lambda = trial.suggest_loguniform("reg_lambda", 1e-8, 100.0)
  reg_alpha = trial.suggest_loguniform("reg_alpha", 1e-8, 100.0)
[32m[I 2023-04-29 15:45:40,354][0m Trial 55 finished with value: 4.448015731890351 and parameters: {'learning_rate': 0.08182598128917774, 'reg_lambda': 6.687611975465178e-05, 'reg_alpha': 5.350714066492331, 'subsample': 0.5871622606316635, 'colsample_bytree': 0.5205567513749965, 'max_depth': 6, 'n_estimators': 237}. Best is trial 22 with value: 4.16206846602308.[0m


Mean Absolute Error :  4.448015731890351
R2_score :  0.7301259268573115


  reg_lambda = trial.suggest_loguniform("reg_lambda", 1e-8, 100.0)
  reg_alpha = trial.suggest_loguniform("reg_alpha", 1e-8, 100.0)
[32m[I 2023-04-29 15:45:42,483][0m Trial 56 finished with value: 6.622156958928079 and parameters: {'learning_rate': 0.06946070738815664, 'reg_lambda': 8.735964381424693e-06, 'reg_alpha': 28.4274477414945, 'subsample': 0.5495753407608353, 'colsample_bytree': 0.6193310565916752, 'max_depth': 5, 'n_estimators': 20}. Best is trial 22 with value: 4.16206846602308.[0m


Mean Absolute Error :  6.622156958928079
R2_score :  0.6391540693090769


  reg_lambda = trial.suggest_loguniform("reg_lambda", 1e-8, 100.0)
  reg_alpha = trial.suggest_loguniform("reg_alpha", 1e-8, 100.0)
[32m[I 2023-04-29 15:46:38,399][0m Trial 57 finished with value: 4.443443428703982 and parameters: {'learning_rate': 0.06636236857798948, 'reg_lambda': 0.0006815589776174062, 'reg_alpha': 0.5536176939304908, 'subsample': 0.6589886772655184, 'colsample_bytree': 0.5730345182255399, 'max_depth': 6, 'n_estimators': 462}. Best is trial 22 with value: 4.16206846602308.[0m


Mean Absolute Error :  4.443443428703982
R2_score :  0.7179695101266745


  reg_lambda = trial.suggest_loguniform("reg_lambda", 1e-8, 100.0)
  reg_alpha = trial.suggest_loguniform("reg_alpha", 1e-8, 100.0)
[32m[I 2023-04-29 15:47:30,159][0m Trial 58 finished with value: 4.635923459468388 and parameters: {'learning_rate': 0.07759044545599864, 'reg_lambda': 0.0035001045818649807, 'reg_alpha': 0.0915880119794646, 'subsample': 0.6065205704884374, 'colsample_bytree': 0.46516654652772643, 'max_depth': 5, 'n_estimators': 585}. Best is trial 22 with value: 4.16206846602308.[0m


Mean Absolute Error :  4.635923459468388
R2_score :  0.7230579096122622


  reg_lambda = trial.suggest_loguniform("reg_lambda", 1e-8, 100.0)
  reg_alpha = trial.suggest_loguniform("reg_alpha", 1e-8, 100.0)
[32m[I 2023-04-29 15:47:56,273][0m Trial 59 finished with value: 4.643469812510305 and parameters: {'learning_rate': 0.08264363152171751, 'reg_lambda': 0.0007966036792914367, 'reg_alpha': 2.4060397534826166, 'subsample': 0.5195695991434268, 'colsample_bytree': 0.4270486615478562, 'max_depth': 6, 'n_estimators': 261}. Best is trial 22 with value: 4.16206846602308.[0m


Mean Absolute Error :  4.643469812510305
R2_score :  0.7497860095660704


  reg_lambda = trial.suggest_loguniform("reg_lambda", 1e-8, 100.0)
  reg_alpha = trial.suggest_loguniform("reg_alpha", 1e-8, 100.0)
[32m[I 2023-04-29 15:51:10,783][0m Trial 60 finished with value: 4.497823732625809 and parameters: {'learning_rate': 0.062402195410235915, 'reg_lambda': 7.151970795658751e-05, 'reg_alpha': 41.372774816790105, 'subsample': 0.6772116561208983, 'colsample_bytree': 0.6732227646731582, 'max_depth': 6, 'n_estimators': 1782}. Best is trial 22 with value: 4.16206846602308.[0m


Mean Absolute Error :  4.497823732625809
R2_score :  0.6931391611806134


  reg_lambda = trial.suggest_loguniform("reg_lambda", 1e-8, 100.0)
  reg_alpha = trial.suggest_loguniform("reg_alpha", 1e-8, 100.0)
[32m[I 2023-04-29 15:51:37,794][0m Trial 61 finished with value: 4.273382612810257 and parameters: {'learning_rate': 0.07348002617117197, 'reg_lambda': 0.00015649057066246857, 'reg_alpha': 32.34496171947777, 'subsample': 0.6299800502212085, 'colsample_bytree': 0.5004074588295053, 'max_depth': 6, 'n_estimators': 298}. Best is trial 22 with value: 4.16206846602308.[0m


Mean Absolute Error :  4.273382612810257
R2_score :  0.7130715305702116


  reg_lambda = trial.suggest_loguniform("reg_lambda", 1e-8, 100.0)
  reg_alpha = trial.suggest_loguniform("reg_alpha", 1e-8, 100.0)
[32m[I 2023-04-29 15:51:49,354][0m Trial 62 finished with value: 4.2632241002943445 and parameters: {'learning_rate': 0.07523083382832856, 'reg_lambda': 0.0002509195927318387, 'reg_alpha': 8.078226277390751, 'subsample': 0.6288097577757339, 'colsample_bytree': 0.5253791695278762, 'max_depth': 6, 'n_estimators': 124}. Best is trial 22 with value: 4.16206846602308.[0m


Mean Absolute Error :  4.2632241002943445
R2_score :  0.7326839584494931


  reg_lambda = trial.suggest_loguniform("reg_lambda", 1e-8, 100.0)
  reg_alpha = trial.suggest_loguniform("reg_alpha", 1e-8, 100.0)
[32m[I 2023-04-29 15:52:39,184][0m Trial 63 finished with value: 4.547632667369512 and parameters: {'learning_rate': 0.0762258231355543, 'reg_lambda': 0.000340741999037839, 'reg_alpha': 7.121707433651155, 'subsample': 0.6456641151812221, 'colsample_bytree': 0.524512656644265, 'max_depth': 6, 'n_estimators': 421}. Best is trial 22 with value: 4.16206846602308.[0m


Mean Absolute Error :  4.547632667369512
R2_score :  0.7238210165339498


  reg_lambda = trial.suggest_loguniform("reg_lambda", 1e-8, 100.0)
  reg_alpha = trial.suggest_loguniform("reg_alpha", 1e-8, 100.0)
[32m[I 2023-04-29 15:53:15,057][0m Trial 64 finished with value: 4.448519823047109 and parameters: {'learning_rate': 0.08027689241948374, 'reg_lambda': 0.0017809449955525874, 'reg_alpha': 9.512855552006748, 'subsample': 0.5639793771436592, 'colsample_bytree': 0.49158663571069644, 'max_depth': 6, 'n_estimators': 303}. Best is trial 22 with value: 4.16206846602308.[0m


Mean Absolute Error :  4.448519823047109
R2_score :  0.7328386718224194


  reg_lambda = trial.suggest_loguniform("reg_lambda", 1e-8, 100.0)
  reg_alpha = trial.suggest_loguniform("reg_alpha", 1e-8, 100.0)
[32m[I 2023-04-29 15:53:49,922][0m Trial 65 finished with value: 4.331125732778508 and parameters: {'learning_rate': 0.08438462575444312, 'reg_lambda': 0.00017816963665836125, 'reg_alpha': 4.02396355583523, 'subsample': 0.7381117761903162, 'colsample_bytree': 0.5552328563762057, 'max_depth': 4, 'n_estimators': 559}. Best is trial 22 with value: 4.16206846602308.[0m


Mean Absolute Error :  4.331125732778508
R2_score :  0.7022662882998613


  reg_lambda = trial.suggest_loguniform("reg_lambda", 1e-8, 100.0)
  reg_alpha = trial.suggest_loguniform("reg_alpha", 1e-8, 100.0)
[32m[I 2023-04-29 15:54:00,700][0m Trial 66 finished with value: 4.254392855450327 and parameters: {'learning_rate': 0.07555500224775527, 'reg_lambda': 2.309574248681351e-05, 'reg_alpha': 99.91990480265024, 'subsample': 0.6903191881378571, 'colsample_bytree': 0.594115371295488, 'max_depth': 6, 'n_estimators': 104}. Best is trial 22 with value: 4.16206846602308.[0m


Mean Absolute Error :  4.254392855450327
R2_score :  0.7323209321883501


  reg_lambda = trial.suggest_loguniform("reg_lambda", 1e-8, 100.0)
  reg_alpha = trial.suggest_loguniform("reg_alpha", 1e-8, 100.0)
[32m[I 2023-04-29 15:54:28,843][0m Trial 67 finished with value: 4.579813424204215 and parameters: {'learning_rate': 0.07390438155253846, 'reg_lambda': 2.52701259438616e-05, 'reg_alpha': 56.59172936958021, 'subsample': 0.6866183726025779, 'colsample_bytree': 0.5987200755472759, 'max_depth': 6, 'n_estimators': 286}. Best is trial 22 with value: 4.16206846602308.[0m


Mean Absolute Error :  4.579813424204215
R2_score :  0.7197361484540286


  reg_lambda = trial.suggest_loguniform("reg_lambda", 1e-8, 100.0)
  reg_alpha = trial.suggest_loguniform("reg_alpha", 1e-8, 100.0)
[32m[I 2023-04-29 15:54:48,711][0m Trial 68 finished with value: 4.196502887233966 and parameters: {'learning_rate': 0.07001527123734705, 'reg_lambda': 0.00038791894306115473, 'reg_alpha': 98.95371012330142, 'subsample': 0.628179169509355, 'colsample_bytree': 0.5462755630113153, 'max_depth': 6, 'n_estimators': 215}. Best is trial 22 with value: 4.16206846602308.[0m


Mean Absolute Error :  4.196502887233966
R2_score :  0.7258173367482088


  reg_lambda = trial.suggest_loguniform("reg_lambda", 1e-8, 100.0)
  reg_alpha = trial.suggest_loguniform("reg_alpha", 1e-8, 100.0)
[32m[I 2023-04-29 15:55:29,190][0m Trial 69 finished with value: 4.334381403229536 and parameters: {'learning_rate': 0.06818252156129706, 'reg_lambda': 0.0004143393280760942, 'reg_alpha': 87.98017018768107, 'subsample': 0.6322079294312514, 'colsample_bytree': 0.6430902014655004, 'max_depth': 5, 'n_estimators': 438}. Best is trial 22 with value: 4.16206846602308.[0m


Mean Absolute Error :  4.334381403229536
R2_score :  0.7179745159099504


  reg_lambda = trial.suggest_loguniform("reg_lambda", 1e-8, 100.0)
  reg_alpha = trial.suggest_loguniform("reg_alpha", 1e-8, 100.0)
[32m[I 2023-04-29 15:55:54,830][0m Trial 70 finished with value: 4.144307145683879 and parameters: {'learning_rate': 0.060874267198799376, 'reg_lambda': 0.0008196424699002651, 'reg_alpha': 14.698503071138058, 'subsample': 0.7005947696689363, 'colsample_bytree': 0.5442856655981725, 'max_depth': 6, 'n_estimators': 210}. Best is trial 70 with value: 4.144307145683879.[0m


Mean Absolute Error :  4.144307145683879
R2_score :  0.7411635677250943


  reg_lambda = trial.suggest_loguniform("reg_lambda", 1e-8, 100.0)
  reg_alpha = trial.suggest_loguniform("reg_alpha", 1e-8, 100.0)
[32m[I 2023-04-29 15:56:05,345][0m Trial 71 finished with value: 4.216353911262755 and parameters: {'learning_rate': 0.06209686627923121, 'reg_lambda': 0.00108033870884223, 'reg_alpha': 14.425368269472887, 'subsample': 0.7047570169460033, 'colsample_bytree': 0.5430695173707097, 'max_depth': 6, 'n_estimators': 83}. Best is trial 70 with value: 4.144307145683879.[0m


Mean Absolute Error :  4.216353911262755
R2_score :  0.7447629844832471


  reg_lambda = trial.suggest_loguniform("reg_lambda", 1e-8, 100.0)
  reg_alpha = trial.suggest_loguniform("reg_alpha", 1e-8, 100.0)
[32m[I 2023-04-29 15:56:14,010][0m Trial 72 finished with value: 4.442927967867167 and parameters: {'learning_rate': 0.0620082815017137, 'reg_lambda': 0.0012478463538253914, 'reg_alpha': 13.813313670717967, 'subsample': 0.6866226413767275, 'colsample_bytree': 0.5820897713055916, 'max_depth': 6, 'n_estimators': 63}. Best is trial 70 with value: 4.144307145683879.[0m


Mean Absolute Error :  4.442927967867167
R2_score :  0.7496117907131414


  reg_lambda = trial.suggest_loguniform("reg_lambda", 1e-8, 100.0)
  reg_alpha = trial.suggest_loguniform("reg_alpha", 1e-8, 100.0)
[32m[I 2023-04-29 15:56:36,924][0m Trial 73 finished with value: 4.351794957377905 and parameters: {'learning_rate': 0.06142274043378958, 'reg_lambda': 0.0035207884187649723, 'reg_alpha': 17.913302035804644, 'subsample': 0.7196935931580135, 'colsample_bytree': 0.4655435459146039, 'max_depth': 6, 'n_estimators': 202}. Best is trial 70 with value: 4.144307145683879.[0m


Mean Absolute Error :  4.351794957377905
R2_score :  0.7381589943289952


  reg_lambda = trial.suggest_loguniform("reg_lambda", 1e-8, 100.0)
  reg_alpha = trial.suggest_loguniform("reg_alpha", 1e-8, 100.0)
[32m[I 2023-04-29 15:57:18,136][0m Trial 74 finished with value: 4.3710426334684955 and parameters: {'learning_rate': 0.06866456407534142, 'reg_lambda': 0.0008568117395720759, 'reg_alpha': 7.755173050362583, 'subsample': 0.7497106171625239, 'colsample_bytree': 0.534105029776069, 'max_depth': 6, 'n_estimators': 347}. Best is trial 70 with value: 4.144307145683879.[0m


Mean Absolute Error :  4.3710426334684955
R2_score :  0.7095465847139789


  reg_lambda = trial.suggest_loguniform("reg_lambda", 1e-8, 100.0)
  reg_alpha = trial.suggest_loguniform("reg_alpha", 1e-8, 100.0)
[32m[I 2023-04-29 16:02:32,661][0m Trial 75 finished with value: 5.194821686721342 and parameters: {'learning_rate': 0.058996378860857235, 'reg_lambda': 0.00028986400881725116, 'reg_alpha': 3.6244787525150906, 'subsample': 0.7173318377412843, 'colsample_bytree': 0.6126226603703888, 'max_depth': 6, 'n_estimators': 2960}. Best is trial 70 with value: 4.144307145683879.[0m


Mean Absolute Error :  5.194821686721342
R2_score :  0.6597540545776153


  reg_lambda = trial.suggest_loguniform("reg_lambda", 1e-8, 100.0)
  reg_alpha = trial.suggest_loguniform("reg_alpha", 1e-8, 100.0)
[32m[I 2023-04-29 16:03:36,643][0m Trial 76 finished with value: 4.379539755331345 and parameters: {'learning_rate': 0.06517790066148384, 'reg_lambda': 0.0005686459283779009, 'reg_alpha': 64.84556852625104, 'subsample': 0.6956691489135968, 'colsample_bytree': 0.47933883777638336, 'max_depth': 6, 'n_estimators': 620}. Best is trial 70 with value: 4.144307145683879.[0m


Mean Absolute Error :  4.379539755331345
R2_score :  0.7394218008012313


  reg_lambda = trial.suggest_loguniform("reg_lambda", 1e-8, 100.0)
  reg_alpha = trial.suggest_loguniform("reg_alpha", 1e-8, 100.0)
[32m[I 2023-04-29 16:03:53,513][0m Trial 77 finished with value: 4.32725933895831 and parameters: {'learning_rate': 0.05443444243009719, 'reg_lambda': 3.931702928280631e-05, 'reg_alpha': 20.462047635324122, 'subsample': 0.7647378430226827, 'colsample_bytree': 0.5596644869359695, 'max_depth': 6, 'n_estimators': 190}. Best is trial 70 with value: 4.144307145683879.[0m


Mean Absolute Error :  4.32725933895831
R2_score :  0.7385981763015244


  reg_lambda = trial.suggest_loguniform("reg_lambda", 1e-8, 100.0)
  reg_alpha = trial.suggest_loguniform("reg_alpha", 1e-8, 100.0)
[32m[I 2023-04-29 16:03:58,736][0m Trial 78 finished with value: 4.415439169207173 and parameters: {'learning_rate': 0.06659804350711579, 'reg_lambda': 0.011105218071647164, 'reg_alpha': 93.73460808896282, 'subsample': 0.6640237201920987, 'colsample_bytree': 0.541755651027377, 'max_depth': 4, 'n_estimators': 77}. Best is trial 70 with value: 4.144307145683879.[0m


Mean Absolute Error :  4.415439169207173
R2_score :  0.7454738318076641


  reg_lambda = trial.suggest_loguniform("reg_lambda", 1e-8, 100.0)
  reg_alpha = trial.suggest_loguniform("reg_alpha", 1e-8, 100.0)
[32m[I 2023-04-29 16:07:34,211][0m Trial 79 finished with value: 4.813531152463998 and parameters: {'learning_rate': 0.07035064337196545, 'reg_lambda': 0.0002147660371873115, 'reg_alpha': 1.614675960285272, 'subsample': 0.627188199496903, 'colsample_bytree': 0.4919196737316733, 'max_depth': 6, 'n_estimators': 2451}. Best is trial 70 with value: 4.144307145683879.[0m


Mean Absolute Error :  4.813531152463998
R2_score :  0.7154486232380166


  reg_lambda = trial.suggest_loguniform("reg_lambda", 1e-8, 100.0)
  reg_alpha = trial.suggest_loguniform("reg_alpha", 1e-8, 100.0)


In [1]:
"""Uncomment for tuning the model"""
import optuna
def run(trial):
    learning_rate = trial.suggest_float("learning_rate", 1e-3, 0.1)
    reg_lambda = trial.suggest_loguniform("reg_lambda", 1e-8, 100.0)
    reg_alpha = trial.suggest_loguniform("reg_alpha", 1e-8, 100.0)
    subsample = trial.suggest_float("subsample", 0.1, 1.0)
    colsample_bytree = trial.suggest_float("colsample_bytree", 0.1, 1.0)
    max_depth = trial.suggest_int("max_depth", 3, 6)
    n_estimators = trial.suggest_int("n_estimators", 20, 3000)


    model = xgb.XGBRegressor(
            random_state=42,
            objective='reg:squarederror',
            n_estimators=n_estimators,
            learning_rate=learning_rate,
            reg_lambda=reg_lambda,
            reg_alpha=reg_alpha,
            subsample=subsample,
            colsample_bytree=colsample_bytree,
            max_depth=max_depth,
            eval_metric='r2_score'
        )
    model.fit(X_train, y_train)
    preds_valid = model.predict(X_test)
    mae_ = mean_absolute_error(y_test, preds_valid)
    r2_sc = r2_score(y_test, preds_valid)
    print("Mean Absolute Error : ", mae_)
    print("R2_score : ", r2_sc)
    return r2_sc

study = optuna.create_study(direction="minimize")
study.optimize(run, n_trials=100)

# study.best_params

  from .autonotebook import tqdm as notebook_tqdm
[32m[I 2023-04-30 20:29:28,143][0m A new study created in memory with name: no-name-479de14e-c7bb-45e0-8584-c0aa19e972fd[0m
  reg_lambda = trial.suggest_loguniform("reg_lambda", 1e-8, 100.0)
  reg_alpha = trial.suggest_loguniform("reg_alpha", 1e-8, 100.0)
[33m[W 2023-04-30 20:29:28,150][0m Trial 0 failed with parameters: {'learning_rate': 0.07791084133262398, 'reg_lambda': 0.0007188067232693896, 'reg_alpha': 1.823825918142417e-08, 'subsample': 0.6392031163550392, 'colsample_bytree': 0.8021660748450357, 'max_depth': 5, 'n_estimators': 1277} because of the following error: NameError("name 'xgb' is not defined").[0m
Traceback (most recent call last):
  File "c:\Users\nshre\anaconda3\envs\auto_gpt\lib\site-packages\optuna\study\_optimize.py", line 200, in _run_trial
    value_or_values = func(trial)
  File "C:\Users\nshre\AppData\Local\Temp\ipykernel_15036\4217039969.py", line 13, in run
    model = xgb.XGBRegressor(
NameError: name '

NameError: name 'xgb' is not defined