# Setup

In [1]:
%%capture
!pip install --upgrade optuna_integration

In [2]:
import gc
gc.enable()

import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)
pd.set_option('display.precision', 4)

import optuna.integration.lightgbm as lgb
from lightgbm import LGBMRegressor
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error
from sklearn.base import clone

SEED = 2024

In [3]:
DATA_DIR = '/kaggle/input/autoam-car-price-prediction'

# Read the three competition files in one statement: labelled training rows,
# unlabelled test rows and the submission template.
train, test, sample_sub = (
    pd.read_csv(f'{DATA_DIR}/train.csv'),
    pd.read_csv(f'{DATA_DIR}/test.csv'),
    pd.read_csv(f'{DATA_DIR}/sample_submission.csv'),
)

# Data overview

In [4]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1642 entries, 0 to 1641
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   model         1642 non-null   object 
 1   year          1642 non-null   int64  
 2   motor_type    1642 non-null   object 
 3   running       1642 non-null   object 
 4   wheel         1642 non-null   object 
 5   color         1642 non-null   object 
 6   type          1642 non-null   object 
 7   status        1642 non-null   object 
 8   motor_volume  1642 non-null   float64
 9   price         1642 non-null   int64  
dtypes: float64(1), int64(2), object(7)
memory usage: 128.4+ KB


In [5]:
train.shape, test.shape, sample_sub.shape

((1642, 10), (411, 10), (411, 2))

In [6]:
train.sample()

Unnamed: 0,model,year,motor_type,running,wheel,color,type,status,motor_volume,price
1019,nissan,2020,petrol,18000 km,left,gray,suv,excellent,2.0,20700


In [7]:
test.sample()

Unnamed: 0,Id,model,year,motor_type,running,wheel,color,type,status,motor_volume
265,265,toyota,2020,petrol,63000 miles,left,black,sedan,excellent,2.0


In [8]:
test.Id.equals(sample_sub.Id)

True

In [9]:
# The Id column equals sample_sub.Id (checked in the previous cell), so it is not a feature.
test = test.drop(columns='Id')

In [10]:
# Name of the label column in `train`; later cells refer to it through this constant.
TARGET = 'price'
train[TARGET].describe()

count     1642.0000
mean     15982.6334
std       7176.0846
min        462.0000
25%      12000.0000
50%      15750.0000
75%      18500.0000
max      87000.0000
Name: price, dtype: float64

# Data exploration & preprocessing

### price (TARGET)

In [11]:
train[TARGET].min(), train[TARGET].max()

(462, 87000)

In [12]:
train.loc[train[TARGET] < 3000]

Unnamed: 0,model,year,motor_type,running,wheel,color,type,status,motor_volume,price
78,mercedes-benz,1996,petrol,250000 km,left,black,sedan,normal,2.0,2100
196,mercedes-benz,1997,petrol,205000 km,left,blue,sedan,normal,2.0,2900
309,kia,2021,petrol,14900 km,left,gray,sedan,excellent,2.0,462


In [13]:
# Keep only rows priced above 2000. This removes the single 462 record, which
# looks like a value error; the 2100 and 2900 listings are kept.
train = train.loc[train[TARGET].gt(2000)]

In [14]:
train.loc[train[TARGET] > 40000]

Unnamed: 0,model,year,motor_type,running,wheel,color,type,status,motor_volume,price
87,mercedes-benz,2022,petrol,36500 km,left,gray,sedan,excellent,2.0,48000
137,hyundai,2023,petrol,6300 km,left,gray,sedan,excellent,2.0,40700
300,mercedes-benz,2020,petrol,15000 miles,left,white,Coupe,excellent,2.0,51000
456,mercedes-benz,2019,petrol,27000 miles,left,black,sedan,excellent,4.0,87000
558,mercedes-benz,2021,petrol,8800 km,left,black,sedan,new,2.0,78000
1257,mercedes-benz,2020,petrol,27000 miles,left,white,sedan,excellent,2.0,42000


High outliers do not look like erroneous records based on the feature values.

### model

In [15]:
f = 'model'

In [16]:
train[f].value_counts(normalize=True)

model
mercedes-benz    0.2450
hyundai          0.2139
nissan           0.2133
kia              0.1731
toyota           0.1548
Name: proportion, dtype: float64

In [17]:
test[f].value_counts(normalize=True)

model
mercedes-benz    0.2263
kia              0.2165
hyundai          0.2141
nissan           0.2068
toyota           0.1363
Name: proportion, dtype: float64

In [18]:
train.groupby(f)[TARGET].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
hyundai,351.0,14829.9573,4792.7294,7000.0,11650.0,14000.0,16450.0,40700.0
kia,284.0,14907.9718,3973.5794,5500.0,12275.0,15200.0,17200.0,27900.0
mercedes-benz,402.0,14728.8085,11519.031,2100.0,6000.0,11000.0,23875.0,87000.0
nissan,350.0,18284.8229,4255.0327,4375.0,15993.25,17788.5,19500.0,40000.0
toyota,254.0,17650.2913,5707.6232,3800.0,14225.0,16500.0,20975.0,37800.0


In [19]:
# Integer codes follow the ascending mean price per brand shown in the summary above.
model_mapping = dict(zip(
    ['mercedes-benz', 'hyundai', 'kia', 'toyota', 'nissan'],
    range(5),
))

train['model'] = train['model'].replace(model_mapping).astype('int')
test['model'] = test['model'].replace(model_mapping).astype('int')

In [20]:
train[f].corr(train[TARGET])

0.19838481345489223

### year

In [21]:
f = 'year'

In [22]:
train[f].describe()

count    1641.0000
mean     2014.8020
std         6.5878
min      1987.0000
25%      2013.0000
50%      2017.0000
75%      2019.0000
max      2023.0000
Name: year, dtype: float64

In [23]:
test[f].describe()

count     411.0000
mean     2014.9635
std         6.5389
min      1993.0000
25%      2014.0000
50%      2017.0000
75%      2019.0000
max      2023.0000
Name: year, dtype: float64

In [24]:
train[f].corr(train[TARGET])

0.6410261366103661

Higher value of year -> newer car -> higher price

### motor_type

In [25]:
f = 'motor_type'

In [26]:
train[f].value_counts(normalize=True)

motor_type
petrol            0.8647
gas               0.1005
petrol and gas    0.0329
diesel            0.0012
hybrid            0.0006
Name: proportion, dtype: float64

In [27]:
test[f].value_counts(normalize=True)

motor_type
petrol            0.8662
gas               0.0876
petrol and gas    0.0462
Name: proportion, dtype: float64

In [28]:
# Keep only the fuel types that also occur in the test set. The column is named
# explicitly so the filter does not depend on the current value of the scratch
# variable `f` (an out-of-order re-run would otherwise filter the wrong column).
train = train[train['motor_type'].isin(['petrol', 'gas', 'petrol and gas'])]

In [29]:
train.groupby(f)[TARGET].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
motor_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
gas,165.0,9741.1576,4493.3678,3300.0,5600.0,9000.0,13500.0,21000.0
petrol,1419.0,16895.9521,7070.8348,2100.0,13000.0,16000.0,19100.0,87000.0
petrol and gas,54.0,11309.7222,4722.7481,3000.0,7650.0,11250.0,15737.5,22000.0


In [30]:
# Codes run from the fuel type with the highest mean price to the lowest
# (see the grouped summary above).
motor_mapping = dict(zip(['petrol', 'petrol and gas', 'gas'], range(3)))

train['motor_type'] = train['motor_type'].replace(motor_mapping).astype('int')
test['motor_type'] = test['motor_type'].replace(motor_mapping).astype('int')

In [31]:
train[f].corr(train[TARGET])

-0.3194579138516617

### running

In [32]:
f = 'running'

In [33]:
train[f].sample(5, random_state=SEED)

453     39000  miles
975     58000  miles
134       300000  km
1187    60000  miles
1283      102000  km
Name: running, dtype: object

In [34]:
def convert_miles_to_km(distance):
    """Convert an odometer string such as '63000 miles' or '18000 km' to kilometres.

    Parameters
    ----------
    distance : str
        A whole number, a space, then the unit. Leading/trailing whitespace and
        the letter case of the unit are ignored.

    Returns
    -------
    int or float
        The reading in kilometres: a float when the input is in miles, otherwise
        the integer value unchanged.

    Raises
    ------
    ValueError
        If the text before the first space is not an integer.
    """
    km_per_mile = 1.609344
    # Strip first: a trailing space would otherwise defeat the 'miles' suffix check
    # (value silently treated as km) and a leading space would make the number empty.
    text = distance.strip()
    reading = int(text.split(' ')[0])
    if text.lower().endswith('miles'):
        return reading * km_per_mile
    return reading
    
# Normalise every odometer reading to kilometres in both splits.
train['running'] = train['running'].map(convert_miles_to_km)
test['running'] = test['running'].map(convert_miles_to_km)

In [35]:
train[f].sample(5, random_state=SEED)

453      62764.416
975      93341.952
134     300000.000
1187     96560.640
1283    102000.000
Name: running, dtype: float64

In [36]:
train[f].corr(train[TARGET])

-0.5151750538213654

In [37]:
train[f].describe()

count    1.6380e+03
mean     1.1930e+05
std      9.6659e+04
min      1.0000e+01
25%      5.6812e+04
50%      9.9000e+04
75%      1.6091e+05
max      1.2517e+06
Name: running, dtype: float64

### wheel

In [38]:
f = 'wheel'

In [39]:
train[f].value_counts(normalize=True)

wheel
left    1.0
Name: proportion, dtype: float64

The column has only one unique value in the training set, so it has zero variance and can be dropped as a feature.

In [40]:
# `wheel` holds the single value 'left' in train, so it carries no information.
train = train.drop(columns='wheel')
test = test.drop(columns='wheel')

### color

In [41]:
f = 'color'

In [42]:
train[f].unique(), test[f].unique()

(array(['skyblue', 'black', 'other', 'golden', 'blue', 'gray', 'silver',
        'white', 'clove', 'orange', 'red', 'green', 'cherry', 'brown',
        'beige', 'purple', 'pink'], dtype=object),
 array(['black', 'white', 'silver', 'blue', 'gray', 'other', 'brown',
        'red', 'green', 'orange', 'cherry', 'skyblue', 'clove', 'beige'],
       dtype=object))

In [43]:
train[f].value_counts()

color
black      540
white      406
silver     223
gray       187
blue       141
red         47
other       30
cherry      20
green       10
brown       10
golden       6
orange       5
beige        5
clove        3
purple       2
skyblue      2
pink         1
Name: count, dtype: int64

In [44]:
train.groupby(f)[TARGET].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
color,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
beige,5.0,11800.0,7515.6503,3500.0,5300.0,13800.0,14400.0,22000.0
black,540.0,17190.0352,8016.398,2100.0,12500.0,16100.0,19825.0,87000.0
blue,141.0,13430.3191,6640.4045,2900.0,7200.0,14500.0,16900.0,33500.0
brown,10.0,14685.0,3624.3045,11700.0,12700.0,13800.0,14800.0,24000.0
cherry,20.0,13086.2,7983.1418,3200.0,6375.0,12750.0,14924.75,34500.0
clove,3.0,8500.0,1228.8206,7600.0,7800.0,8000.0,8950.0,9900.0
golden,6.0,10241.6667,3915.663,5500.0,7475.0,10000.0,12675.0,15750.0
gray,187.0,17280.7487,6611.2627,3500.0,14000.0,16500.0,19000.0,48000.0
green,10.0,9014.1,6138.8978,3900.0,4475.0,5600.0,14187.5,20500.0
orange,5.0,18040.0,2794.2799,13500.0,17500.0,18700.0,20000.0,20500.0


In [45]:
def map_colors(color):
    """Bucket a colour name into a popularity tier.

    Returns 0 for black/white/gray, 1 for silver/blue and 2 for any other value.
    """
    tiers = (
        ('black', 'white', 'gray'),  # most popular
        ('silver', 'blue'),          # semi-popular
    )
    for code, names in enumerate(tiers):
        if color in names:
            return code
    return 2  # rare
    
# Replace the colour names with their popularity tier (0/1/2) in both splits.
train['color'] = train['color'].map(map_colors).astype('int')
test['color'] = test['color'].map(map_colors).astype('int')

In [46]:
train[f].corr(train[TARGET])

-0.2072553783093032

### type

In [47]:
f = 'type'

In [48]:
train[f].value_counts(normalize=True)

type
sedan                0.7772
suv                  0.1838
Universal            0.0226
hatchback            0.0092
Coupe                0.0067
minivan / minibus    0.0006
Name: proportion, dtype: float64

In [49]:
test[f].value_counts(normalize=True)

type
sedan        0.7786
suv          0.1630
Universal    0.0462
Coupe        0.0097
hatchback    0.0024
Name: proportion, dtype: float64

In [50]:
# 'minivan / minibus' does not occur in the test set, so drop it from train. The
# column is named explicitly so the filter does not depend on the current value
# of the scratch variable `f`.
train = train[train['type'] != 'minivan / minibus']

In [51]:
train.groupby(f)[TARGET].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Coupe,11.0,19181.8182,16051.0323,3800.0,7600.0,11000.0,31300.0,51000.0
Universal,37.0,18043.7297,3211.1894,8800.0,15991.0,17700.0,20500.0,25300.0
hatchback,15.0,14033.3333,5495.1494,3800.0,11900.0,14900.0,16750.0,25000.0
sedan,1273.0,15334.1194,7515.6911,2100.0,10800.0,14700.0,17500.0,87000.0
suv,301.0,18534.6179,4478.1443,4375.0,16000.0,17900.0,19700.0,40000.0


In [52]:
# Binary body-type flag: 0 for sedan/hatchback, 1 for suv/Coupe/Universal.
type_mapping = {
    **dict.fromkeys(['sedan', 'hatchback'], 0),
    **dict.fromkeys(['suv', 'Coupe', 'Universal'], 1),
}

train['type'] = train['type'].replace(type_mapping).astype('int')
test['type'] = test['type'].replace(type_mapping).astype('int')

In [53]:
train.groupby(f)[TARGET].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,1288.0,15318.9705,7495.0145,2100.0,10800.0,14800.0,17500.0,87000.0
1,349.0,18502.9742,5078.9417,3800.0,16000.0,17800.0,19800.0,51000.0


In [54]:
train[f].corr(train[TARGET])

0.1819825470449954

### status

In [55]:
f = 'status'

In [56]:
train[f].value_counts(normalize=True)

status
excellent    0.7300
good         0.2040
normal       0.0354
crashed      0.0177
new          0.0128
Name: proportion, dtype: float64

In [57]:
train.groupby(f)[TARGET].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
status,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
crashed,29.0,13862.069,4513.9003,3100.0,12500.0,13100.0,15700.0,24500.0
excellent,1195.0,17426.1858,6753.5342,3800.0,13800.0,16500.0,19500.0,87000.0
good,334.0,11913.8623,5642.4556,3000.0,7600.0,11450.0,15500.0,37500.0
new,21.0,25080.9524,13318.9571,12100.0,19900.0,23000.0,25500.0,78000.0
normal,58.0,7864.6552,4529.0196,2100.0,4350.0,6000.0,10000.0,17000.0


In [58]:
# Condition labels are numbered in the order listed here, from 'new' (0) to 'crashed' (4).
status_mapping = {
    label: code
    for code, label in enumerate(['new', 'excellent', 'good', 'normal', 'crashed'])
}

train['status'] = train['status'].replace(status_mapping).astype('int')
test['status'] = test['status'].replace(status_mapping).astype('int')

In [59]:
train[f].corr(train[TARGET])

-0.35362262889834556

### motor_volume

In [60]:
f = 'motor_volume'

In [61]:
train[f].unique()

array([2. , 3.2, 1.8, 2.5, 1.6, 3. , 1.5, 0.2, 1.4, 2.6, 2.4, 2.8, 4. ,
       0.3, 2.2, 3.5, 2.3, 1.3, 1.2])

In [62]:
train[f].value_counts()

motor_volume
2.0    1284
2.5     121
1.8     120
1.6      47
3.0      23
2.4       9
3.5       6
2.8       5
3.2       4
2.6       4
1.5       4
0.2       2
2.2       2
1.4       1
4.0       1
0.3       1
2.3       1
1.3       1
1.2       1
Name: count, dtype: int64

In [63]:
train[f].corr(train[TARGET])

0.001777466370221828

# Hyperparameter tuning

In [64]:
# Every remaining test column is a model input; the integer-encoded columns
# listed in `cat_features` are passed to LightGBM as categorical.
features = test.columns.tolist()
cat_features = ['model', 'motor_type', 'color', 'type', 'status']

In [65]:
# LightGBM dataset built from the preprocessed training frame; used by the tuner below.
dtrain = lgb.Dataset(
    data=train[features],
    label=train[TARGET],
    categorical_feature=cat_features,
    feature_name=features,
)

In [66]:
# Fixed LightGBM settings; the tuner searches the remaining hyperparameters on top of these.
base_params = dict(
    objective='regression',
    metric='mae',
    learning_rate=0.01,
    boosting_type='gbdt',
    force_row_wise=True,
    verbosity=-1,
    n_jobs=-1,
    deterministic=True,
    random_state=SEED,
)

In [67]:
# Shared early-stopping callback: 100 rounds of patience on the first metric,
# with a minimum improvement threshold of 1e-4 and no per-round logging.
early_stopping = lgb.early_stopping(
    stopping_rounds=100,
    min_delta=1e-4,
    first_metric_only=True,
    verbose=False,
)

In [68]:
NUM_FOLDS = 7
BUDGET = 2 * 60 * 60  # tuning time budget (two hours expressed in seconds)

# Cross-validated stepwise tuner: shuffled K-fold splits, shared early stopping,
# and the same seed for the folds, LightGBM and Optuna.
tuner = lgb.LightGBMTunerCV(
    params=base_params,
    train_set=dtrain,
    num_boost_round=10000,
    folds=KFold(n_splits=NUM_FOLDS, shuffle=True, random_state=SEED),
    feature_name=features,
    categorical_feature=cat_features,
    callbacks=[early_stopping],
    seed=SEED,
    optuna_seed=SEED,
    time_budget=BUDGET,
)

[I 2024-04-28 04:33:07,057] A new study created in memory with name: no-name-5cd00353-12de-4946-816f-cd2e87341856


In [69]:
%%time
tuner.run()

feature_fraction, val_score: 2115.519510:  14%|#4        | 1/7 [01:19<07:55, 79.24s/it][I 2024-04-28 04:34:26,382] Trial 0 finished with value: 2115.519510167192 and parameters: {'feature_fraction': 0.5}. Best is trial 0 with value: 2115.519510167192.
feature_fraction, val_score: 2063.054101:  29%|##8       | 2/7 [01:52<04:20, 52.11s/it][I 2024-04-28 04:34:59,505] Trial 1 finished with value: 2063.054101271612 and parameters: {'feature_fraction': 0.8999999999999999}. Best is trial 1 with value: 2063.054101271612.
feature_fraction, val_score: 2063.054101:  43%|####2     | 3/7 [02:39<03:19, 49.83s/it][I 2024-04-28 04:35:46,616] Trial 2 finished with value: 2080.7707374610254 and parameters: {'feature_fraction': 0.7}. Best is trial 1 with value: 2063.054101271612.
feature_fraction, val_score: 2063.054101:  57%|#####7    | 4/7 [03:24<02:24, 48.08s/it][I 2024-04-28 04:36:32,025] Trial 3 finished with value: 2080.7707374610254 and parameters: {'feature_fraction': 0.8}. Best is trial 1 with v

CPU times: user 20min 3s, sys: 20min 17s, total: 40min 21s
Wall time: 31min 8s





In [70]:
# Report the best cross-validated score, then one "name - value" line per parameter.
print(f'Best score: {tuner.best_score:.5f}')
print(f'Best hyperparameters:')
for name, value in tuner.best_params.items():
    print(f'{name:20} - {value}')

Best score: 1985.12300
Best hyperparameters:
objective            - regression
metric               - l1
learning_rate        - 0.01
boosting_type        - gbdt
force_row_wise       - True
verbosity            - -1
n_jobs               - -1
deterministic        - True
random_state         - 2024
feature_pre_filter   - False
lambda_l1            - 0.006729839084572564
lambda_l2            - 0.0031520042724419817
num_leaves           - 6
feature_fraction     - 0.8999999999999999
bagging_fraction     - 0.8985724258081224
bagging_freq         - 7
min_child_samples    - 50


# Cross-validation

In [71]:
def comp_metric(y_true, y_pred):
    """Score predictions with mean absolute error (lower is better)."""
    mae = mean_absolute_error(y_true, y_pred)
    return mae

def custom_cv(estimator, seed=SEED, verbose=True):
    """Cross-validate an estimator and collect out-of-fold and test predictions.

    Parameters
    ----------
    estimator : estimator object
        Unfitted model; a fresh clone is trained on every fold with the
        notebook-level ``early_stopping`` callback and the fold's validation
        split as ``eval_set``.
    seed : int, default SEED
        Random state of the shuffled ``KFold`` split.
    verbose : bool, default True
        When true, print the validation score and best iteration of each fold.
        The average and OOF scores are printed regardless.

    Returns
    -------
    oof_preds : pd.Series
        Out-of-fold prediction for every training row, indexed like ``train``.
    test_preds : pd.DataFrame
        One ``fold{i}`` column of test predictions per fold plus their
        row-wise ``mean``.

    Notes
    -----
    Reads the notebook-level ``train``, ``test``, ``features``, ``TARGET``,
    ``NUM_FOLDS`` and ``early_stopping`` objects.
    """
    X_test = test[features]

    # Filled by position; every training row is in exactly one validation fold.
    oof_values = np.full(len(train), np.nan)
    test_preds = {}
    scores = []

    cv = KFold(n_splits=NUM_FOLDS, shuffle=True, random_state=seed)
    for fold, (train_ids, val_ids) in enumerate(cv.split(train)):
        X_train, y_train = train[features].iloc[train_ids], train[TARGET].iloc[train_ids]
        X_val, y_val = train[features].iloc[val_ids], train[TARGET].iloc[val_ids]

        model = clone(estimator)
        model.fit(
            X_train, y_train,
            eval_set=[(X_val, y_val)],
            callbacks=[early_stopping])

        val_preds = model.predict(X_val)
        oof_values[val_ids] = val_preds
        test_preds[f'fold{fold}'] = model.predict(X_test)

        score = comp_metric(y_val, val_preds)
        scores.append(score)
        if verbose:
            print(f'Fold #{fold:>2}: {score:.5f} ({model.best_iteration_:>4} rounds)')
        _ = gc.collect()

    test_preds = pd.DataFrame.from_dict(test_preds)
    test_preds['mean'] = test_preds.mean(axis=1) # mean of fold-wise predictions

    # Rows were filtered out of `train` earlier without resetting its index, so
    # the labels are not 0..n-1. Label the OOF predictions with train's own index
    # so that any label-aligned use of the result matches the right rows.
    oof_preds = pd.Series(oof_values, index=train.index)
    print(f'\nAvg score: {np.mean(scores):.5f} +/- {np.std(scores):.5f}')
    print(f'OOF score: {comp_metric(train[TARGET], oof_preds):.5f}\n')

    return oof_preds, test_preds

In [72]:
%%time
# Build the regressor from the tuned hyperparameters. n_estimators acts as an upper
# bound here because custom_cv fits every fold with the early-stopping callback.
model = LGBMRegressor(**tuner.best_params, n_estimators=10000)
# op: out-of-fold predictions on train; tp: per-fold and mean predictions on test.
op, tp = custom_cv(model)

Fold # 0: 1897.67032 (1092 rounds)
Fold # 1: 1810.57678 ( 672 rounds)
Fold # 2: 1901.07885 (1337 rounds)
Fold # 3: 2263.82534 ( 980 rounds)
Fold # 4: 1909.46613 ( 679 rounds)
Fold # 5: 1845.78131 ( 602 rounds)
Fold # 6: 2297.57880 ( 679 rounds)

Avg score: 1989.42536 +/- 187.27058
OOF score: 1989.23712

CPU times: user 3.55 s, sys: 0 ns, total: 3.55 s
Wall time: 3.55 s


In [73]:
def create_submission_files(preds, notebook='00'):
    """Write one submission CSV per column of ``preds``.

    Parameters
    ----------
    preds : pd.DataFrame
        Prediction columns; rows are assumed to be in the same order as
        ``sample_sub`` and must have the same length.
    notebook : str, default '00'
        Tag used in the output file names, ``nb{notebook}_{column}.csv``.

    Notes
    -----
    Each file is a copy of the notebook-level ``sample_sub`` with its ``TARGET``
    column replaced, written to the current working directory without the index.
    """
    for col in preds.columns:
        sub = sample_sub.copy()
        # Assign by position: raw values avoid silent NaNs from index-label
        # alignment when `preds` does not carry the same index as `sample_sub`.
        sub[TARGET] = preds[col].to_numpy()
        sub.to_csv(f'nb{notebook}_{col}.csv', index=False)

In [74]:
create_submission_files(tp)

In [75]:
!head nb00_mean.csv

Id,price
0,17542.875702776153
1,16818.511345517465
2,23533.398880564917
3,14226.752161175793
4,5648.17432276279
5,25219.07107164767
6,6037.605066101942
7,7097.757281052524
8,17331.913713058406
