In [1]:
import numpy as np
import pandas as pd
import catboost as cb
import xgboost as xgb
from sklearn.svm import SVR
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_absolute_error
from sklearn.ensemble import RandomForestRegressor, StackingRegressor, AdaBoostRegressor, BaggingRegressor

In [2]:
# Read the prepared CSV into a DataFrame and preview its first rows.
csv_path = 'data_no_mising_value_label_encoder_col.csv'
data = pd.read_csv(csv_path)
data.head()

Unnamed: 0,brand,3G,4G,5G,Announced,Weight,Length,Width,Diameter,Display Type,...,3.5mm jack,Chipset,RAM,Storage,Micro-SIM,Mini-SIM,Nano-SIM,eSIM,ratio_1,ratio_2
0,0,True,True,False,2022,172.0,146.7,71.9,10.0,7,...,True,4,2.0,32.0,False,False,True,False,18.0,9.0
1,0,True,True,False,2021,190.0,156.4,74.8,9.7,1,...,True,7,2.0,32.0,False,False,True,False,5.0,9.0
2,0,True,True,False,2021,134.0,137.6,65.7,9.8,7,...,True,4,1.0,8.0,False,False,True,False,18.0,9.0
3,0,True,True,False,2021,194.0,165.6,75.6,8.7,1,...,True,4,4.0,64.0,False,False,True,False,20.0,9.0
4,0,True,True,False,2021,190.0,165.6,75.6,8.8,1,...,True,4,3.0,32.0,False,False,True,False,20.0,9.0


In [3]:
# Remove the columns that are not used as model inputs, then preview the result.
unused_columns = [
    'ratio_1', 'ratio_2', 'Loudspeaker', '3.5mm jack',
    'Length', 'Width', 'Diameter',
]
data = data.drop(columns=unused_columns)
data.head()

Unnamed: 0,brand,3G,4G,5G,Announced,Weight,Display Type,Display Size,ppi,body ratio,...,Price,CPU,pixel,Chipset,RAM,Storage,Micro-SIM,Mini-SIM,Nano-SIM,eSIM
0,0,True,True,False,2022,172.0,7,5.5,293.0,74.0,...,100.0,4.0,1036800.0,4,2.0,32.0,False,False,True,False
1,0,True,True,False,2021,190.0,1,6.1,282.0,78.1,...,110.0,8.0,1123200.0,7,2.0,32.0,False,False,True,False
2,0,True,True,False,2021,134.0,7,5.0,215.0,71.4,...,60.0,4.0,460800.0,4,1.0,8.0,False,False,True,False
3,0,True,True,False,2021,194.0,1,6.52,269.0,82.0,...,330.0,8.0,1152000.0,4,4.0,64.0,False,False,True,False
4,0,True,True,False,2021,190.0,1,6.52,269.0,82.0,...,130.0,8.0,1152000.0,4,3.0,32.0,False,False,True,False


In [4]:
# Show the column names currently present in `data` (predictors plus the 'Price' target).
data.columns

Index(['brand', '3G', '4G', '5G', 'Announced', 'Weight', 'Display Type',
       'Display Size', 'ppi', 'body ratio', 'OS', 'battery_capacity', 'Price',
       'CPU', 'pixel', 'Chipset', 'RAM', 'Storage', 'Micro-SIM', 'Mini-SIM',
       'Nano-SIM', 'eSIM'],
      dtype='object')

In [5]:
# Separate the regression target ('Price') from the predictor columns.
y = data['Price']
X = data.drop(columns='Price')

# Hold out 15% of the rows for testing, then 15% of the remainder for validation.
# Both splits share the same fraction and seed.
split_options = dict(test_size=0.15, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, **split_options)

In [6]:
# One MinMaxScaler per numeric column. The scalers keep their own module-level
# names because `preprocessing` looks them up by name.
Announced_scaler = MinMaxScaler()
Weight_scaler = MinMaxScaler()
Size_scaler = MinMaxScaler()
ppi_scaler = MinMaxScaler()
ratio_scaler = MinMaxScaler()
battery_scaler = MinMaxScaler()
CPU_scaler = MinMaxScaler()
pixel_scaler = MinMaxScaler()
Storage_scaler = MinMaxScaler()
RAM_scaler = MinMaxScaler()

column_scalers = {
    'Announced': Announced_scaler,
    'Weight': Weight_scaler,
    'Display Size': Size_scaler,
    'ppi': ppi_scaler,
    'body ratio': ratio_scaler,
    'battery_capacity': battery_scaler,
    'CPU': CPU_scaler,
    'pixel': pixel_scaler,
    'Storage': Storage_scaler,
    'RAM': RAM_scaler,
}

# Fit each scaler on the training split only, then apply the fitted scaler to
# the validation and test splits so no statistics leak from held-out data.
for column, scaler in column_scalers.items():
    X_train[column] = scaler.fit_transform(X_train[[column]])
    for held_out in (X_val, X_test):
        held_out[column] = scaler.transform(held_out[[column]])

# Model the natural log of the price; predictions are mapped back with np.exp.
y_train, y_val, y_test = (np.log(target) for target in (y_train, y_val, y_test))

In [7]:
def preprocessing(X, y):
    """Apply the already-fitted feature scalers and the log target transform.

    Uses the notebook-level MinMaxScaler objects (``Announced_scaler``,
    ``Weight_scaler``, ``Size_scaler``, ``ppi_scaler``, ``ratio_scaler``,
    ``battery_scaler``, ``CPU_scaler``, ``pixel_scaler``, ``Storage_scaler``,
    ``RAM_scaler``); they must be fitted before this function is called.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature frame containing at least the ten scaled columns listed in
        the function body. It is not modified; a scaled copy is returned.
    y : array-like
        Raw target values. The natural logarithm is applied.

    Returns
    -------
    tuple
        ``(X_scaled, y_log)``.
    """
    # Resolve the scalers at call time so the function always uses the
    # currently fitted notebook-level objects.
    column_scalers = (
        ('Announced', Announced_scaler),
        ('Weight', Weight_scaler),
        ('Display Size', Size_scaler),
        ('ppi', ppi_scaler),
        ('body ratio', ratio_scaler),
        ('battery_capacity', battery_scaler),
        ('CPU', CPU_scaler),
        ('pixel', pixel_scaler),
        ('Storage', Storage_scaler),
        ('RAM', RAM_scaler),
    )

    # Work on a copy: scaling the caller's frame in place would double-scale
    # it if the function were called again on the same data.
    X = X.copy()
    for column, scaler in column_scalers:
        X[column] = scaler.transform(X[[column]])

    y = np.log(y)
    # The original returned the undefined name `Y`, which raised NameError.
    return (X, y)

In [8]:
# Inspect the training feature frame after the scaling step.
X_train

Unnamed: 0,brand,3G,4G,5G,Announced,Weight,Display Type,Display Size,ppi,body ratio,...,battery_capacity,CPU,pixel,Chipset,RAM,Storage,Micro-SIM,Mini-SIM,Nano-SIM,eSIM
1568,13,True,True,False,0.428571,0.047897,1,0.213115,0.252187,0.614925,...,0.212858,0.250000,0.102804,4,0.117459,0.030991,False,False,True,False
1035,8,True,True,False,0.571429,0.072430,1,0.303279,0.644315,0.832836,...,0.256299,0.750000,0.537383,6,0.243536,0.062250,False,False,True,False
1498,13,True,True,False,0.714286,0.077103,7,0.250000,0.253644,0.691045,...,0.226759,0.250000,0.116822,4,0.117459,0.062250,False,False,True,False
1224,9,True,True,False,0.785714,0.129322,1,0.336066,0.217201,0.808955,...,0.425717,0.750000,0.130841,7,0.180497,0.062250,False,False,True,False
830,7,True,True,False,0.857143,0.427570,1,0.631148,0.150146,0.756716,...,0.438749,0.750000,0.271028,7,0.180497,0.062250,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
533,5,True,True,False,1.000000,0.132009,7,0.356557,0.202624,0.850746,...,0.516942,0.750000,0.130841,3,0.495691,0.249800,False,False,True,False
868,7,True,True,False,0.571429,0.082944,1,0.311475,0.409621,0.843284,...,0.282363,0.750000,0.285835,6,0.369613,0.124766,False,False,True,False
982,7,True,False,False,0.214286,0.357477,7,0.377049,0.071429,0.438806,...,0.304083,0.480976,0.065421,4,0.054420,0.030991,False,True,False,False
1256,9,True,True,False,0.642857,0.051869,1,0.250000,0.110787,0.716418,...,0.212858,0.250000,0.046729,4,0.054420,0.015362,False,False,True,False


In [9]:
# Positions of the columns CatBoost should treat as categorical. Going by the
# column listing of `data` with 'Price' removed, these are 'brand',
# 'Display Type', 'OS' and 'Chipset'.
categorical_positions = [0, 6, 10, 14]

cat_r = cb.CatBoostRegressor(eval_metric='MAE', depth=6, learning_rate=0.05)
cat_r.fit(
    X_train,
    y_train,
    cat_features=categorical_positions,
    eval_set=(X_val, y_val),
    plot=True,
)

# Validation metrics are in log-price space; the target std is printed for scale.
y_val_pred = cat_r.predict(X_val)
for label, metric in (('r2_score:', r2_score), ('mean_absolute_error:', mean_absolute_error)):
    print(label, metric(y_val, y_val_pred))
print(y_val.std())

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	learn: 0.4849545	test: 0.4658029	best: 0.4658029 (0)	total: 219ms	remaining: 3m 38s
1:	learn: 0.4723138	test: 0.4538514	best: 0.4538514 (1)	total: 300ms	remaining: 2m 29s
2:	learn: 0.4621208	test: 0.4443130	best: 0.4443130 (2)	total: 370ms	remaining: 2m 2s
3:	learn: 0.4497645	test: 0.4340441	best: 0.4340441 (3)	total: 451ms	remaining: 1m 52s
4:	learn: 0.4387538	test: 0.4240383	best: 0.4240383 (4)	total: 537ms	remaining: 1m 46s
5:	learn: 0.4297993	test: 0.4145576	best: 0.4145576 (5)	total: 611ms	remaining: 1m 41s
6:	learn: 0.4202294	test: 0.4060111	best: 0.4060111 (6)	total: 677ms	remaining: 1m 36s
7:	learn: 0.4115972	test: 0.3973158	best: 0.3973158 (7)	total: 740ms	remaining: 1m 31s
8:	learn: 0.4042135	test: 0.3901968	best: 0.3901968 (8)	total: 809ms	remaining: 1m 29s
9:	learn: 0.3966700	test: 0.3834293	best: 0.3834293 (9)	total: 878ms	remaining: 1m 26s
10:	learn: 0.3897961	test: 0.3775047	best: 0.3775047 (10)	total: 947ms	remaining: 1m 25s
11:	learn: 0.3824406	test: 0.3716636	best:

In [10]:
# Gradient-boosted trees baseline (XGBoost, 1000 boosting rounds).
xgb_r = xgb.XGBRegressor(n_estimators=1000)
xgb_r.fit(X_train, y_train)

# Validation metrics in log-price space, followed by the target std for scale.
y_val_pred = xgb_r.predict(X_val)
validation_metrics = {
    'r2_score:': r2_score(y_val, y_val_pred),
    'mean_absolute_error:': mean_absolute_error(y_val, y_val_pred),
}
for label, value in validation_metrics.items():
    print(label, value)
print(y_val.std())

r2_score: 0.6999965512512991
mean_absolute_error: 0.24943778994393037
0.5968479260266182


In [11]:
# k-nearest-neighbours baseline with k = 5.
knn_reg = KNeighborsRegressor(n_neighbors=5)
knn_reg.fit(X_train, y_train)

# Validation metrics in log-price space, followed by the target std for scale.
y_val_pred = knn_reg.predict(X_val)
for label, metric in (('r2_score:', r2_score), ('mean_absolute_error:', mean_absolute_error)):
    print(label, metric(y_val, y_val_pred))
print(y_val.std())

r2_score: 0.47495901579852307
mean_absolute_error: 0.31976831211991374
0.5968479260266182


In [12]:
# Support-vector regression baseline with library-default hyperparameters.
svr_reg = SVR()
svr_reg.fit(X_train, y_train)

# Validation metrics in log-price space, followed by the target std for scale.
y_val_pred = svr_reg.predict(X_val)
val_r2 = r2_score(y_val, y_val_pred)
val_mae = mean_absolute_error(y_val, y_val_pred)
print('r2_score:', val_r2)
print('mean_absolute_error:', val_mae)
print(y_val.std())

r2_score: 0.5478786245376944
mean_absolute_error: 0.3126197164350887
0.5968479260266182


In [13]:
# Random-forest baseline. The seed is pinned (same value as the data split) so
# the fitted forest and the metrics below are reproducible after a kernel
# restart; without it every run prints slightly different scores.
rf_reg = RandomForestRegressor(random_state=42)
rf_reg.fit(X_train, y_train)

# Validation metrics in log-price space, followed by the target std for scale.
y_val_pred = rf_reg.predict(X_val)
print('r2_score:', r2_score(y_val, y_val_pred))
print('mean_absolute_error:', mean_absolute_error(y_val, y_val_pred))
print(y_val.std())

r2_score: 0.7002365973874167
mean_absolute_error: 0.2478801813527851
0.5968479260266182


In [14]:
# AdaBoost baseline. The seed is pinned (same value as the data split) so the
# boosting resamples, and therefore the printed metrics, are reproducible.
ab_reg = AdaBoostRegressor(random_state=42)
ab_reg.fit(X_train, y_train)

# Validation metrics in log-price space, followed by the target std for scale.
y_val_pred = ab_reg.predict(X_val)
print('r2_score:', r2_score(y_val, y_val_pred))
print('mean_absolute_error:', mean_absolute_error(y_val, y_val_pred))
print(y_val.std())

r2_score: 0.6122489858878206
mean_absolute_error: 0.2962193968828135
0.5968479260266182


In [15]:
# Base learners for the stacking ensemble; each entry needs a unique name.
models = [
    ('xgb_reg1', xgb_r),
    ('rf_reg1', rf_reg),
    ('rf_reg2', rf_reg),
    ('xgb_reg2', xgb_r)
]

# Stack the base learners, with the CatBoost regressor as the final estimator.
sr = StackingRegressor(estimators=models, final_estimator=cat_r)
sr.fit(X_train, y_train)

# Score the stacking ensemble itself. The cell used to call ab_reg.predict
# here, which reported AdaBoost's metrics instead of the stack's.
y_val_pred = sr.predict(X_val)
print('r2_score:', r2_score(y_val, y_val_pred))
print('mean_absolute_error:', mean_absolute_error(y_val, y_val_pred))
print(y_val.std())

0:	learn: 0.4785152	total: 9.69ms	remaining: 9.68s
1:	learn: 0.4615754	total: 19ms	remaining: 9.49s
2:	learn: 0.4459377	total: 26.1ms	remaining: 8.67s
3:	learn: 0.4312691	total: 32.5ms	remaining: 8.08s
4:	learn: 0.4172244	total: 36ms	remaining: 7.16s
5:	learn: 0.4043284	total: 39.5ms	remaining: 6.54s
6:	learn: 0.3922432	total: 44ms	remaining: 6.24s
7:	learn: 0.3806992	total: 47.2ms	remaining: 5.85s
8:	learn: 0.3700006	total: 51.8ms	remaining: 5.71s
9:	learn: 0.3596462	total: 55.2ms	remaining: 5.46s
10:	learn: 0.3500317	total: 60.2ms	remaining: 5.42s
11:	learn: 0.3409991	total: 64ms	remaining: 5.26s
12:	learn: 0.3326785	total: 68.2ms	remaining: 5.18s
13:	learn: 0.3249088	total: 71.6ms	remaining: 5.04s
14:	learn: 0.3176887	total: 76.6ms	remaining: 5.03s
15:	learn: 0.3109561	total: 80.4ms	remaining: 4.94s
16:	learn: 0.3048781	total: 84.4ms	remaining: 4.88s
17:	learn: 0.2993311	total: 87.6ms	remaining: 4.78s
18:	learn: 0.2940087	total: 92.3ms	remaining: 4.77s
19:	learn: 0.2889571	total: 96

In [16]:
# Bagging ensemble built on the CatBoost regressor.
br = BaggingRegressor(cat_r)
br.fit(X_train, y_train)

# Score the bagging ensemble itself. The cell used to call sr.predict here,
# which reported the stacking model's metrics instead of the bagged model's.
y_val_pred = br.predict(X_val)
print('r2_score:', r2_score(y_val, y_val_pred))
print('mean_absolute_error:', mean_absolute_error(y_val, y_val_pred))
print(y_val.std())

0:	learn: 0.4808471	total: 6.19ms	remaining: 6.18s


1:	learn: 0.4699525	total: 11.1ms	remaining: 5.52s
2:	learn: 0.4589149	total: 19.9ms	remaining: 6.6s
3:	learn: 0.4487471	total: 27.9ms	remaining: 6.94s
4:	learn: 0.4398876	total: 36.2ms	remaining: 7.2s
5:	learn: 0.4305966	total: 41ms	remaining: 6.79s
6:	learn: 0.4222599	total: 46.4ms	remaining: 6.59s
7:	learn: 0.4141963	total: 51.2ms	remaining: 6.35s
8:	learn: 0.4059736	total: 56.7ms	remaining: 6.24s
9:	learn: 0.3980943	total: 63.7ms	remaining: 6.31s
10:	learn: 0.3912026	total: 69.4ms	remaining: 6.24s
11:	learn: 0.3838245	total: 73.9ms	remaining: 6.08s
12:	learn: 0.3784386	total: 79ms	remaining: 6s
13:	learn: 0.3714761	total: 84.7ms	remaining: 5.97s
14:	learn: 0.3673118	total: 90.1ms	remaining: 5.92s
15:	learn: 0.3620175	total: 94.8ms	remaining: 5.83s
16:	learn: 0.3570927	total: 103ms	remaining: 5.97s
17:	learn: 0.3529695	total: 110ms	remaining: 6.03s
18:	learn: 0.3497367	total: 115ms	remaining: 5.95s
19:	learn: 0.3458113	total: 120ms	remaining: 5.88s
20:	learn: 0.3407832	total: 124ms	

In [17]:
# Score the current CatBoost model on the held-out test split.
# First pair of metrics: log-price space, the space the model was trained in.
y_test_pred = cat_r.predict(X_test)
print('r2_score:', r2_score(y_test, y_test_pred))
print('mean_absolute_error:', mean_absolute_error(y_test, y_test_pred))

# Undo the log transform for truth and prediction in one vectorized step
# (replaces a per-row Python loop that appended tuples to a list).
pred = pd.DataFrame({
    'y_test': np.exp(y_test.values),
    'y_test_pred': np.exp(y_test_pred),
})

# Second pair of metrics: original price units, with the target std for scale.
print('r2_score:', r2_score(pred.y_test, pred.y_test_pred))
print('mean_absolute_error:', mean_absolute_error(pred.y_test, pred.y_test_pred))
print('y_test std:', pred.y_test.std())
pred

r2_score: 0.7409832713410833
mean_absolute_error: 0.22291729145267272
r2_score: 0.7467880620404357
mean_absolute_error: 53.159273941128134
y_test std: 164.26058565961893


Unnamed: 0,y_test,y_test_pred
0,180.0,173.752167
1,600.0,710.623224
2,263.0,338.318934
3,140.0,97.368839
4,130.0,144.153966
...,...,...
348,160.0,140.528191
349,280.0,194.692784
350,280.0,378.278142
351,160.0,109.402424


In [18]:
# Rank the features by CatBoost importance, largest first.
importance_table = cat_r.get_feature_importance(prettified=True)
importance_table.sort_values(by='Importances', ascending=False)

Unnamed: 0,Feature Id,Importances
0,pixel,19.300223
1,Announced,12.878484
2,RAM,10.69394
3,Chipset,8.491167
4,Storage,6.874504
5,Display Type,6.756463
6,brand,6.519497
7,Weight,6.375972
8,ppi,4.578493
9,battery_capacity,3.93586


In [19]:
# Fresh CatBoost regressor for the hyperparameter search; this rebinds `cat_r`.
# NOTE(review): unlike the earlier CatBoost fit, no cat_features are passed
# here — confirm that is intentional.
cat_r = cb.CatBoostRegressor(eval_metric='MAE')

# Candidate values explored by the grid search (2 x 3 x 5 combinations).
grid = dict(
    learning_rate=[0.03, 0.1],
    depth=[4, 6, 10],
    l2_leaf_reg=[1, 3, 5, 7, 9],
)

grid_search_result = cat_r.grid_search(grid, X=X_train, y=y_train, plot=True)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	learn: 5.1837176	test: 5.1745258	best: 5.1745258 (0)	total: 7.16ms	remaining: 7.15s
1:	learn: 5.0295123	test: 5.0202162	best: 5.0202162 (1)	total: 9.51ms	remaining: 4.74s
2:	learn: 4.8794804	test: 4.8698754	best: 4.8698754 (2)	total: 11.7ms	remaining: 3.89s
3:	learn: 4.7341746	test: 4.7238150	best: 4.7238150 (3)	total: 13.6ms	remaining: 3.37s
4:	learn: 4.5931281	test: 4.5822294	best: 4.5822294 (4)	total: 15.5ms	remaining: 3.08s
5:	learn: 4.4567207	test: 4.4459112	best: 4.4459112 (5)	total: 17.3ms	remaining: 2.86s
6:	learn: 4.3241201	test: 4.3134885	best: 4.3134885 (6)	total: 21.2ms	remaining: 3.01s
7:	learn: 4.1953799	test: 4.1845034	best: 4.1845034 (7)	total: 23.5ms	remaining: 2.91s
8:	learn: 4.0698990	test: 4.0588937	best: 4.0588937 (8)	total: 24.8ms	remaining: 2.73s
9:	learn: 3.9485917	test: 3.9377578	best: 3.9377578 (9)	total: 27.2ms	remaining: 2.69s
10:	learn: 3.8310975	test: 3.8199390	best: 3.8199390 (10)	total: 29.6ms	remaining: 2.66s
11:	learn: 3.7173572	test: 3.7063039	best

In [20]:
# Score the current `cat_r` (rebound by the grid-search cell) on the test split.
# First pair of metrics: log-price space, the space the model was trained in.
y_test_pred = cat_r.predict(X_test)
print('r2_score:', r2_score(y_test, y_test_pred))
print('mean_absolute_error:', mean_absolute_error(y_test, y_test_pred))

# Undo the log transform for truth and prediction in one vectorized step
# (replaces a per-row Python loop that appended tuples to a list).
pred = pd.DataFrame({
    'y_test': np.exp(y_test.values),
    'y_test_pred': np.exp(y_test_pred),
})

# Second pair of metrics: original price units, with the target std for scale.
print('r2_score:', r2_score(pred.y_test, pred.y_test_pred))
print('mean_absolute_error:', mean_absolute_error(pred.y_test, pred.y_test_pred))
print('y_test std:', pred.y_test.std())
pred

r2_score: 0.7548594244989502
mean_absolute_error: 0.2147429852012036
r2_score: 0.7474678794132836
mean_absolute_error: 51.66680648725416
y_test std: 164.26058565961893


Unnamed: 0,y_test,y_test_pred
0,180.0,163.926913
1,600.0,609.947452
2,263.0,327.552715
3,140.0,112.164511
4,130.0,126.968804
...,...,...
348,160.0,139.290327
349,280.0,191.384475
350,280.0,322.638420
351,160.0,116.425196


In [21]:
# Feature importances of the current `cat_r`, sorted from most to least important.
tuned_importance = cat_r.get_feature_importance(prettified=True)
tuned_importance.sort_values(by='Importances', ascending=False)

Unnamed: 0,Feature Id,Importances
0,pixel,19.501367
1,Announced,14.228505
2,RAM,12.450853
3,Weight,6.852773
4,brand,6.421965
5,ppi,6.409272
6,Storage,6.321784
7,body ratio,5.503526
8,battery_capacity,5.092503
9,Chipset,4.495258
