<h1>Pembagian 2 Model | Random Forest & K-Means</h1>

In [None]:
import pandas as pd
import numpy as np
import pickle
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Purpose of this cell: load the car-price data, clean the brand column, and
# train + evaluate a Random Forest regressor that predicts `price`.

# Load the dataset (path is relative to the notebook's working directory).
df = pd.read_csv("CarPrice_Assignment.csv")

# CarName is "<brand> <model>": split on the first space only, so anything after
# the brand stays together in CarModel.
df[['CarBrand', 'CarModel']] = df['CarName'].str.split(' ', n=1, expand=True)

# Known misspellings / aliases of brand names -> canonical lowercase brand.
# The capitalised 'Nissan' key can no longer match once brands are lowercased
# first (see below); it is kept so the dictionary content itself is unchanged.
brand_corrections = {
    'toyouta': 'toyota',
    'Nissan': 'nissan',
    'maxda': 'mazda',
    'vw': 'volkswagen',
    'vokswagen': 'volkswagen',
    'porcshce': 'porsche',
}
# Lowercase BEFORE applying the corrections so the lookup is case-insensitive
# (a value such as 'VW' or 'Toyouta' would otherwise slip past the map and only
# be lowercased, leaving the typo in place).
df['CarBrand'] = df['CarBrand'].str.lower().replace(brand_corrections)

# Drop the columns that are not used as features. Assigning the result (instead
# of inplace=True) keeps the step a plain expression on the frame.
df = df.drop(columns=['car_ID', 'CarName', 'CarModel'])

# Features (everything except the target) and target.
X = df.drop(columns=['price'])
y = df['price']

# Columns to one-hot encode: every non-numeric column. Selecting by
# "not a number" does not depend on whether pandas stores text as `object`
# or as a dedicated string dtype.
categorical_cols = X.select_dtypes(exclude='number').columns.tolist()

# Preprocessing: one-hot encode the categorical columns and pass the remaining
# columns through untouched. handle_unknown='ignore' lets prediction proceed when
# a category was not present in the training data.
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols),
    ],
    remainder='passthrough',
)

# Preprocessing + regressor in one pipeline, so the encoder is fitted only on
# the data passed to fit().
model_rf = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(random_state=42)),
])

# Hold out 20% of the rows for evaluation; fixed seed for a reproducible split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Train the model.
model_rf.fit(X_train, y_train)

# Evaluate on the held-out rows.
y_pred = model_rf.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)
print("MAE:", mae)
print("RMSE:", rmse)
print("R²:", r2)

MAE: 1280.3440975609753
RMSE: 1818.1088013623785
R²: 0.958128309443089


In [None]:
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
import json

# Purpose of this cell: group the cars into price tiers with KMeans on a few
# standardised numeric features, then name each cluster by its mean price.

# Numeric features used for clustering.
fitur_cluster = ['price', 'horsepower', 'enginesize', 'curbweight', 'citympg', 'highwaympg']
# Rows with a missing value in any of these features are left out of the fit.
df_cluster = df[fitur_cluster].dropna()

# Tier names ordered from cheapest to most expensive. The number of names is
# also the number of clusters, so the two can never drift apart.
price_labels = ['murah sekali', 'murah', 'sedang', 'mahal', 'sangat mahal']

# Standardise so every feature contributes on the same scale.
scaler_kmeans = StandardScaler()
data_scaled = scaler_kmeans.fit_transform(df_cluster)

# KMeans clustering (one cluster per price tier); fixed seed for reproducibility.
model_kmeans = KMeans(n_clusters=len(price_labels), random_state=42, n_init=10)
cluster_labels = model_kmeans.fit_predict(data_scaled)

# Attach the labels by row index rather than by position: if dropna() removed
# rows, a positional assignment of the raw array would fail with a length
# mismatch. Rows that were not clustered end up with a missing value.
cluster_series = pd.Series(cluster_labels, index=df_cluster.index, name='Cluster_Price')
df['Cluster_Price'] = cluster_series

# Rank the clusters by mean price (ascending) and pair them with the tier names.
cluster_means = df_cluster['price'].groupby(cluster_series).mean().sort_values()
ordered_clusters = cluster_means.index.tolist()
price_label_mapping = dict(zip(ordered_clusters, price_labels))

# Map from the integer label series (not the possibly NaN-padded frame column)
# so the dictionary lookup always sees integer keys.
df['Price_Label'] = cluster_series.map(price_label_mapping)