In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
%pip install jcopml
from jcopml.pipeline import num_pipe, cat_pipe
from jcopml.utils import save_model, load_model
from jcopml.plot import plot_missing_value
from jcopml.feature_importance import mean_score_decrease

Dataset berasal dari:
https://www.kaggle.com/doaaalsenani/usa-cers-dataset

In [None]:
# Read the raw listings, then discard the columns that the rest of the notebook never uses.
raw_listings = pd.read_csv('../input/usa-cers-dataset/USA_cars_datasets.csv')
df = raw_listings.drop(columns=['vin', 'lot', 'country', 'Unnamed: 0'])
df.head()

# Cek Missing Value

In [None]:
# jcopml helper imported at the top of the notebook; presumably draws a per-column
# map of missing values for `df` (confirm against the jcopml docs).
plot_missing_value(df)

# Feature engineering

### 1. Feature Extraction (Brand, Color, dan State)

Semua kategori (pada fitur brand, color, dan state) yang jumlah kemunculannya (value count) di bawah 10 akan dikonversi menjadi 'other', dengan tujuan mengurangi noise.

In [None]:
# Print how often each label occurs in brand, color and state.
bcs = df.loc[:, ['brand', 'color', 'state']]

for feature in bcs.columns:
    print(bcs.loc[:, feature].value_counts())

In [None]:
# Most frequent labels per feature. The cut-offs (12 colors, 32 states, 15 brands)
# are hard-coded counts of how many labels to keep.
color = df['color'].value_counts().index[:12]
state = df['state'].value_counts().index[:32]
brand = df['brand'].value_counts().index[:15]

# Single Index holding every kept label: colors first, then brands, then states.
bcs = color.append([brand, state])
bcs

In [None]:
def modif(x, allowed=None):
    """Return ``x`` when it is a label to keep, otherwise the string "other".

    Parameters
    ----------
    x : object
        Category label to check.
    allowed : collection, optional
        Labels that should be kept. When omitted, the notebook-level ``bcs``
        index is used, so existing calls such as ``df.color.apply(modif)``
        behave exactly as before.

    Returns
    -------
    object
        ``x`` itself, or ``"other"`` if ``x`` is not among the kept labels.
    """
    # Resolve the global lazily so the function can be used (and tested) with an
    # explicit label list even when `bcs` has not been defined.
    keep = bcs if allowed is None else allowed
    return x if x in keep else "other"
    
# Collapse rare labels to "other", checking every column against its OWN list of
# frequent labels. Testing against one combined color+brand+state list would let a
# label that is frequent in one feature survive in a different feature.
df['color'] = df['color'].where(df['color'].isin(color), "other")
df['state'] = df['state'].where(df['state'].isin(state), "other")
df['brand'] = df['brand'].where(df['brand'].isin(brand), "other")

### 2. Times Extraction

Konversi sisa waktu pada kolom `condition` (dalam hari, jam, atau menit) menjadi menit.

In [None]:
# Tokenise `condition` on spaces once: the first token becomes `value`,
# the second token becomes `times`.
condition_parts = df['condition'].str.split(' ')
df['value'] = condition_parts.str[0]
df['times'] = condition_parts.str[1]
df.head(3)

In [None]:
def days_to_min(time):
    """Convert a number of days (int or integer-like string) to minutes."""
    minutes_per_day = 1440  # 24 hours * 60 minutes
    return minutes_per_day * int(time)

def hours_to_min(time):
    """Convert a number of hours (int or integer-like string) to minutes."""
    minutes_per_hour = 60
    return minutes_per_hour * int(time)

In [None]:
# Convert every row whose unit is days / hours / minutes to minutes and stack the
# three pieces into one Series named `minutes_left` (rows with any other unit are
# simply absent from the result).
# `rename` is used WITHOUT inplace=True: the in-place form is documented to return
# None, so assigning its result only worked by accident of the implementation.
extracted_times = pd.concat([
    df.loc[df['times'] == 'days', 'value'].apply(days_to_min),
    df.loc[df['times'] == 'hours', 'value'].apply(hours_to_min),
    df.loc[df['times'] == 'minutes', 'value'].astype(int),
]).rename('minutes_left')

In [None]:
# Display the extracted series as the cell output.
extracted_times

In [None]:
# Attach `minutes_left` as a new column (aligned on the row index).
df = pd.concat([df, extracted_times], axis=1)

# Rows without a converted value get the sentinel -200. The result is assigned back
# instead of calling fillna(inplace=True) on `df['minutes_left']`: that chained
# in-place form triggers a warning on pandas 2.x and does not modify `df` under
# copy-on-write.
df['minutes_left'] = df['minutes_left'].fillna(-200)

# The helper columns are no longer needed once `minutes_left` exists.
df = df.drop(columns=['condition', 'value', 'times'])

df.head()

### 3. Year Extraction

Ubah tahun produksi mobil menjadi usia mobil (2021 − tahun).

In [None]:
def modif_year(x, reference_year=2021):
    """Convert a model year into an age relative to ``reference_year``.

    Parameters
    ----------
    x : int or array-like
        Model year(s); anything that supports subtraction from an int.
    reference_year : int, optional
        Year the age is measured against. Defaults to 2021, the value that was
        previously hard-coded.

    Returns
    -------
    Same type as ``x``
        ``reference_year - x``.
    """
    return reference_year - x

In [None]:
# Overwrite `year` with the car's age. Because the column is replaced in place,
# running this cell a second time would turn the ages back into years.
df['year'] = modif_year(df['year'])

In [None]:
# Preview the frame after `year` has been converted to an age.
df.head()

### 4. Mileage Extraction

Buat fitur baru: mileage per tahun (mileage dibagi usia mobil).

In [None]:
# Average mileage per year of age. An age of 0 is mapped to NaN before dividing so
# the ratio can never become +/-inf; NaN is a regular missing value that the
# imputers used later in the notebook can fill.
df['miles/year'] = df['mileage'] / df['year'].replace(0, np.nan)
df.head()

### 5. Splitting Numerical dan Categorical data

agar memudahkan proses analisis dan modeling

In [None]:
# Copy of the categorical feature columns, used for analysis and as the list of
# categorical feature names.
categorical = df.loc[:, ['brand', 'model', 'title_status', 'color', 'state']]
categorical.head()

In [None]:
# Copy of the numerical feature columns (the target `price` is not included).
numerical = df.loc[:, ['year', 'mileage', 'minutes_left', 'miles/year']]
numerical.head()

# EDA

### 1. Univariat Analysis (Target)

In [None]:
# `sns.distplot` is deprecated in seaborn. A histogram on a density scale with a
# KDE overlay is its documented replacement and draws the equivalent figure.
sns.histplot(df['price'], kde=True, stat='density');

target skewed ke kanan

In [None]:
# Show the listing(s) with the highest price. The maximum is computed from the data
# rather than typed in by hand; previously the result of `df.price.max()` was
# discarded (only a cell's last expression is displayed) and its value was
# hard-coded in the filter.
df[df['price'] == df['price'].max()]

### 2. Correlation Matrix

In [None]:
# Pearson correlation is only defined for numeric columns. Selecting them explicitly
# keeps the cell working on pandas >= 2.0, where `DataFrame.corr` no longer drops
# string columns silently and raises instead.
corrmat = df.select_dtypes(include='number').corr(method='pearson')
fig, ax = plt.subplots(figsize=(7, 7))
sns.heatmap(corrmat, square=True, cmap='RdYlBu', annot=True, ax=ax);

semua fitur aman dari Multicollinearity, terdapat korelasi yang negatif... kita akan cari tahu selanjutnya

### 3. Bivariat Analysis (Target vs Year)

In [None]:
# Scatter plot with marginal distributions: car age (`year`) against `price`.
sns.jointplot(x=df['year'], y=df['price']);

In [None]:
# Same pair of variables as a hexbin plot.
sns.jointplot(x=df['year'], y=df['price'], kind='hex');

Dari visualisasi di atas dapat disimpulkan bahwa semakin tua usia mobil, semakin murah harganya. Namun, terdapat anomali (outlier) berupa mobil tua yang harganya melambung tinggi (mobil antik). Mobil apakah itu?

In [None]:
# Rows whose age is exactly 48. `year` holds the age at this point; the value 48 is
# hard-coded (presumably read off the plots above).
df.loc[df['year'] == 48]

### 4. Multivariat Analysis

In [None]:
# Pairwise plots of the numerical feature columns; the trailing `;` suppresses the
# object repr in the cell output.
sns.pairplot(numerical);

# Model

### Dataset Splitting

In [None]:
# Target is `price`; every other column is a feature.
y = df['price']
X = df.drop(columns=["price"])

# 80/20 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

### Auto ML

In [None]:
from jcopml.automl import AutoRegressor

In [None]:
# Build jcopml's AutoRegressor, passing the column labels of the `numerical` and
# `categorical` frames as the numeric / categorical feature names.
model = AutoRegressor(num_feature=numerical.columns, cat_feature=categorical.columns)

In [None]:
# Fit on the training split only. `cv=4` and `poly=True` are jcopml options,
# presumably 4-fold cross-validation and polynomial features (confirm in jcopml docs).
model.fit(X=X_train, y=y_train, cv=4, poly=True )

In [None]:
# jcopml helper; presumably plots the fitted AutoRegressor's results (confirm in jcopml docs).
model.plot_results()

### Simple Model

In [None]:
from sklearn.preprocessing import RobustScaler, OneHotEncoder
from sklearn.impute import SimpleImputer

In [None]:
# Numeric columns: median imputation (plus missing-value indicator columns),
# followed by robust scaling.
numerical_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy='median', add_indicator=True)),
    ("scaler", RobustScaler()),
])

# Categorical columns: fill gaps with the most frequent label, then one-hot encode.
# Labels unseen during fit are ignored at transform time instead of raising.
categorical_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown='ignore')),
])

In [None]:
# Rebind the two helper frames to the TRAINING split; only their column labels are
# used when the ColumnTransformer is built.
categorical = X_train.loc[:, ['brand', 'model', 'title_status', 'color', 'state']]
numerical = X_train.loc[:, ['year', 'mileage', 'minutes_left', 'miles/year']]

In [None]:
# Route each group of columns through its own preprocessing pipeline.
preprocessor = ColumnTransformer(transformers=[
    ("numeric", numerical_pipeline, numerical.columns),
    ("categoric", categorical_pipeline, categorical.columns),
])

In [None]:
from xgboost import XGBRegressor

In [None]:
# Full model: preprocessing followed by an XGBoost regressor (all cores, fixed seed).
xgb_regressor = XGBRegressor(n_jobs=-1, random_state=42)
pipeline = Pipeline(steps=[
    ('prep', preprocessor),
    ('algo', xgb_regressor),
])

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
# Search grid for the 'algo' step. Every hyper-parameter has exactly one candidate
# value, so the grid contains a single combination.
parameter = {
    'algo__colsample_bytree': [0.6738017242196919],
    'algo__gamma': [3],
    'algo__learning_rate': [0.08798929749689022],
    'algo__max_depth': [5],
    'algo__n_estimators': [146],
    'algo__reg_alpha': [0.7128188058401365],
    'algo__reg_lambda': [1.1044350847124704],
    'algo__subsample': [0.5806385987847482],
}

In [None]:
# 4-fold grid search over `parameter`, fitted on the training split.
# Note: this rebinds `model`, replacing the AutoRegressor created earlier.
model = GridSearchCV(
    estimator=pipeline,
    param_grid=parameter,
    cv=4,
    n_jobs=-1,
    verbose=1,
)
model.fit(X_train, y_train)

print(model.best_params_)
# Scores in order: training split, best cross-validation score, test split.
print(
    model.score(X_train, y_train),
    model.best_score_,
    model.score(X_test, y_test),
)