In [None]:
import pandas as pd

# Change this path if the CSV is not in the notebook's working directory
file_path = 'King County House Price.csv'

# Read the CSV file into a DataFrame
df = pd.read_csv(file_path)

# Display the first 5 rows
df.head()

In [None]:
# Display the last rows of the loaded frame
df.tail()

In [None]:
print(df.shape)        # check the number of rows and columns in the raw data

In [None]:
# List every column name currently in df (no column has been dropped at this point)
df.columns.tolist()

In [None]:
# Summarize dtypes and non-null counts of the loaded frame
df.info()

In [None]:
import pandas as pd

# Reload the CSV so the quality report below runs against the file as stored on disk
df = pd.read_csv("King County House Price.csv")


def _report_rows(label, rows, preview_cols=None, lead=""):
    """Print how many rows matched `label`; optionally preview the given columns.

    `lead` is prepended to the count line (used to insert a blank line before it).
    The preview is printed only when `preview_cols` is given and `rows` is non-empty.
    """
    print(f"{lead}Jumlah baris dengan {label}: {rows.shape[0]}")
    if preview_cols is not None and not rows.empty:
        print(rows[preview_cols].head())


# 1. Missing values per column
print("=== Cek Data Hilang ===")
missing_per_column = df.isnull().sum()
print(missing_per_column)
print("\n")

# 2. Fully duplicated rows
print("=== Cek Data Duplikat ===")
duplicate_rows = df.loc[df.duplicated()]
print(f"Jumlah data duplikat: {len(duplicate_rows)}")
if len(duplicate_rows) > 0:
    print(duplicate_rows.head())
print("\n")

# 3. Inconsistent values
print("=== Cek Data Tidak Konsisten ===")

# Non-positive bedroom counts
bedroom_issues = df.loc[df['bedrooms'].le(0)]
_report_rows("bedrooms <= 0", bedroom_issues, preview_cols=['id', 'bedrooms'])

# Non-positive bathroom counts
bathroom_issues = df.loc[df['bathrooms'].le(0)]
_report_rows("bathrooms <= 0", bathroom_issues, preview_cols=['id', 'bathrooms'], lead="\n")

# Non-positive living area
sqft_living_issues = df.loc[df['sqft_living'].le(0)]
_report_rows("sqft_living <= 0", sqft_living_issues, lead="\n")

# Non-positive lot area
sqft_lot_issues = df.loc[df['sqft_lot'].le(0)]
_report_rows("sqft_lot <= 0", sqft_lot_issues)

# Build / renovation years later than 2016
future_built = df.loc[df['yr_built'].gt(2016)]
_report_rows("yr_built > 2016", future_built)

future_renovated = df.loc[df['yr_renovated'].gt(2016)]
_report_rows("yr_renovated > 2016", future_renovated)

In [None]:
# Columns removed before the correlation analysis and modeling below
columns_to_drop = ['date', 'id', 'zipcode']
# errors='ignore' keeps the cell re-runnable: on a second run the columns are
# already gone and a plain drop would raise KeyError.
df = df.drop(columns=columns_to_drop, errors='ignore')

In [None]:
# Descriptive statistics of the columns that remain after the drop
df.describe()

In [None]:
# Correlation heatmap
import matplotlib.pyplot as plt
import seaborn as sns

# Default plotting style
sns.set(style="whitegrid")
# Default size (in inches) for any figure created without an explicit figsize
plt.rcParams["figure.figsize"] = (10, 6)

# Pairwise correlation between features; numeric_only guards against any
# non-numeric column still present in df
plt.figure(figsize=(14, 12))
correlation = df.corr(numeric_only=True)
sns.heatmap(correlation, annot=True, cmap="coolwarm", fmt=".2f")
plt.title("Matriks Korelasi Fitur")
plt.show()

In [None]:
# Feature engineering
REFERENCE_YEAR = 2015  # year against which the renovation age is measured

# Rows whose yr_renovated is positive are treated as renovated
renovated_mask = df['yr_renovated'] > 0

# 1 if the house was ever renovated, else 0
df['was_renovated'] = renovated_mask.astype(int)

# Years since renovation; 0 for houses that were never renovated.
# Vectorized: Series.where keeps the computed age where the mask is True and
# substitutes 0 elsewhere, avoiding a Python-level pass over every row.
df['renovation_age'] = (REFERENCE_YEAR - df['yr_renovated']).where(renovated_mask, 0)

# Bedrooms and bathrooms combined
df['total_rooms'] = df['bedrooms'] + df['bathrooms']

# 1 if the basement area is positive, else 0
df['has_basement'] = (df['sqft_basement'] > 0).astype(int)

# Drop yr_built and yr_renovated
columns_to_drop = ['yr_built', 'yr_renovated']
df = df.drop(columns=columns_to_drop)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Price together with the four engineered features
engineered_view = ["price", "was_renovated", "renovation_age", "total_rooms", "has_basement"]
subset_df = df.loc[:, engineered_view]

# Pairwise correlation within that subset
correlation = subset_df.corr()

# Annotated heatmap of the correlation matrix
fig, ax = plt.subplots(figsize=(6, 5))
sns.heatmap(correlation, annot=True, cmap="coolwarm", fmt=".2f", ax=ax)
ax.set_title("Matriks Korelasi antara Harga dengan Feature Engineering")
plt.show()

In [None]:
# New column layout: each engineered column sits next to the raw columns it
# was derived from (total_rooms after bathrooms, has_basement after sqft_above).
new_column_order = (
    ['price']
    + ['bedrooms', 'bathrooms', 'total_rooms']
    + ['sqft_living', 'sqft_lot', 'floors']
    + ['waterfront', 'view', 'condition', 'grade']
    + ['sqft_above', 'has_basement', 'sqft_basement']
    + ['lat', 'long']
    + ['sqft_living15', 'sqft_lot15']
    + ['was_renovated', 'renovation_age']
)

# Apply the new order to the DataFrame
df = df.loc[:, new_column_order]

In [None]:
# Preview the frame after feature engineering and column reordering
df.head()

In [None]:
# Dtypes and non-null counts after feature engineering
df.info()

In [None]:
# Descriptive statistics after feature engineering
df.describe()

In [None]:
# Correlation heatmap after feature engineering
import matplotlib.pyplot as plt
import seaborn as sns

# Default plotting style
sns.set(style="whitegrid")
# Default size (in inches) for any figure created without an explicit figsize
plt.rcParams["figure.figsize"] = (10, 6)

# Pairwise correlation between features; numeric_only guards against any
# non-numeric column still present in df
plt.figure(figsize=(14, 12))
correlation = df.corr(numeric_only=True)
sns.heatmap(correlation, annot=True, cmap="coolwarm", fmt=".2f")
plt.title("Matriks Korelasi Fitur")
plt.show()

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Correlation between every pair of numeric features
correlation_matrix = df.corr(numeric_only=True)

# Keep the 'price' column without its self-correlation, largest value first
price_column = correlation_matrix.loc[:, 'price']
correlation_with_price = price_column.drop(labels='price').sort_values(ascending=False)

# Horizontal bar chart: one bar per feature
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(
    x=correlation_with_price.values,
    y=correlation_with_price.index,
    color='steelblue',
    ax=ax,
)
ax.set_title("Fitur dengan Korelasi Tertinggi Terhadap Harga", fontsize=14)
ax.set_xlabel("Korelasi")
ax.set_ylabel("")
fig.tight_layout()
plt.show()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Select only the numeric columns
numeric_cols = df.select_dtypes(include=np.number).columns

# Grid size: 4 panels per row, as many rows as needed
n_cols = 4
n_rows = int(np.ceil(len(numeric_cols) / n_cols))

fig = plt.figure(figsize=(n_cols * 5, n_rows * 4))

# One histogram (with KDE overlay) per numeric column
for i, col in enumerate(numeric_cols, 1):
    ax = fig.add_subplot(n_rows, n_cols, i)
    sns.histplot(df[col], kde=True, bins=30, color='blue', ax=ax)
    ax.set_title(f'Distribusi {col}')

# Lay the grid out once, after all panels exist, instead of on every iteration
fig.tight_layout()
plt.show()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Select the numeric columns
numeric_cols = df.select_dtypes(include=np.number).columns

# Grid size: 4 panels per row, as many rows as needed
n_cols = 4
n_rows = int(np.ceil(len(numeric_cols) / n_cols))

fig = plt.figure(figsize=(n_cols * 5, n_rows * 4))

# One vertical boxplot per numeric column
for i, col in enumerate(numeric_cols, 1):
    ax = fig.add_subplot(n_rows, n_cols, i)
    sns.boxplot(y=df[col], color='green', ax=ax)
    ax.set_title(f'Boxplot {col}')

# Lay the grid out once, after all panels exist, instead of on every iteration
fig.tight_layout()
plt.show()


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

# Take every numeric column
numeric_cols = df.select_dtypes(include=np.number).columns

# A pairplot draws one panel per pair of columns, so plotting every row is very
# slow on a large frame. Cap the number of rows with a seeded random sample so
# the figure stays reproducible.
PAIRPLOT_MAX_ROWS = 2000
pairplot_data = df[numeric_cols]
if len(pairplot_data) > PAIRPLOT_MAX_ROWS:
    pairplot_data = pairplot_data.sample(n=PAIRPLOT_MAX_ROWS, random_state=42)

sns.pairplot(pairplot_data, diag_kind='kde', plot_kws={'alpha': 0.5, 's': 10})
plt.suptitle("Pairplot untuk Hubungan Semua Fitur", y=1.02, fontsize=16)
plt.show()

In [None]:
# Outlier filter: a row is kept only if it satisfies every bound below.
# Columns with an upper limit (value must be <= limit)
upper_bounds = {
    'bedrooms': 10,
    'bathrooms': 6,
    'sqft_living': 8000,
    'sqft_basement': 2400,
    'total_rooms': 13.5,
    'price': 2750000,
    'sqft_above': 5500,
    'sqft_living15': 5000,
    'floors': 3,
    'view': 3,
    'sqft_lot': 100000,
    'renovation_age': 60,
}
# Columns with a lower limit (value must be >= limit)
lower_bounds = {
    'grade': 4,
    'lat': 47.2,
}

# Combine all conditions into a single boolean mask over the rows of df
keep = pd.Series(True, index=df.index)
for column, limit in upper_bounds.items():
    keep &= df[column] <= limit
for column, limit in lower_bounds.items():
    keep &= df[column] >= limit

df_clean = df.loc[keep].copy()

print("Ukuran data setelah filter lengkap:", df_clean.shape)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Take every numeric column
numeric_cols = df.select_dtypes(include=np.number).columns

# One row per feature, two panels per row: before | after.
# n_cols must be 2 so that panel indices 2*i+1 and 2*i+2 land on row i.
n_cols = 2
n_rows = len(numeric_cols)

fig = plt.figure(figsize=(12, n_rows * 3))

for i, col in enumerate(numeric_cols):
    # Boxplot before outlier removal (left panel)
    ax_before = fig.add_subplot(n_rows, n_cols, 2 * i + 1)
    sns.boxplot(y=df[col], color='salmon', ax=ax_before)
    ax_before.set_title(f'{col} - Sebelum')

    # Boxplot after outlier removal (right panel)
    ax_after = fig.add_subplot(n_rows, n_cols, 2 * i + 2)
    sns.boxplot(y=df_clean[col], color='lightgreen', ax=ax_after)
    ax_after.set_title(f'{col} - Sesudah')

fig.tight_layout()
fig.suptitle('Perbandingan Boxplot Sebelum vs Sesudah Outlier Dihapus', y=1.02, fontsize=16)
plt.show()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

# Take every numeric column of df_clean
numeric_cols = df_clean.select_dtypes(include=np.number).columns

# A pairplot draws one panel per pair of columns, so plotting every row is very
# slow on a large frame. Cap the number of rows with a seeded random sample so
# the figure stays reproducible.
PAIRPLOT_MAX_ROWS = 2000
pairplot_data = df_clean[numeric_cols]
if len(pairplot_data) > PAIRPLOT_MAX_ROWS:
    pairplot_data = pairplot_data.sample(n=PAIRPLOT_MAX_ROWS, random_state=42)

# Build the pairplot
sns.pairplot(pairplot_data, diag_kind='kde', plot_kws={'alpha': 0.5, 's': 10})
plt.suptitle("Pairplot Fitur Setelah Outlier Dihapus", y=1.02, fontsize=16)
plt.show()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Every numeric column of the cleaned frame except the target 'price'
numeric_cols = df_clean.select_dtypes(include=np.number).columns.drop('price')

# Grid layout: 4 panels per row
n_cols = 4
n_rows = int(np.ceil(len(numeric_cols) / n_cols))

fig = plt.figure(figsize=(n_cols * 5, n_rows * 4))

# One scatter panel per feature, price on the y axis
for i, col in enumerate(numeric_cols, 1):
    ax = fig.add_subplot(n_rows, n_cols, i)
    sns.scatterplot(x=df_clean[col], y=df_clean['price'], alpha=0.5, s=10, color='blue', ax=ax)
    ax.set_title(f'{col} vs Price')
    ax.set_xlabel(col)
    ax.set_ylabel('Price')

fig.tight_layout()
fig.suptitle("Scatter Plot Semua Fitur terhadap Harga", y=1.02, fontsize=16)
plt.show()

In [None]:
# Modeling: feature selection, train/test split, model comparison, and CatBoost tuning

In [None]:
# Name of the target column
target_col = 'price'

# Features: every column of df whose name differs from the target
is_feature = df.columns != target_col
feature_cols = df.columns[is_feature].tolist()

# Show which features will be used
print("Fitur yang digunakan:", feature_cols)

In [None]:
import math

import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

# Build the feature matrix, target, and train/test split here so this cell runs
# on a fresh kernel. The arguments (test_size=0.2, random_state=42) match the
# split made in the model-comparison cell, so both cells see the same partition.
X = df_clean[feature_cols]
y = df_clean['price']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# All feature names in X
all_features = X.columns.tolist()
n_features = len(all_features)

# Grid size: 4 panels per row
cols = 4
rows = math.ceil(n_features / cols)

# Figure size scales with the grid
plt.figure(figsize=(cols * 6, rows * 4))

# Plot each feature against price, train and test points overlaid
for idx, feature in enumerate(all_features):
    plt.subplot(rows, cols, idx + 1)
    plt.scatter(X_train[feature], y_train, color='blue', alpha=0.8, label='Train')
    plt.scatter(X_test[feature], y_test, color='orange', alpha=0.2, label='Test')
    plt.xlabel(feature)
    plt.ylabel('Price')
    plt.title(f'{feature} vs Price')
    plt.legend()

plt.suptitle("Visualisasi Setiap Fitur terhadap Harga (Training vs Testing)", fontsize=16, y=1.02)
# Lay the grid out once, after every panel has been drawn
plt.tight_layout()
plt.show()

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

# Target vector and feature matrix, both taken from the outlier-filtered frame
y = df_clean['price']
X = df_clean[feature_cols]

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
split = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split

# Evaluation helper
def evaluate_model(y_true, y_pred):
    """Compute regression metrics for a set of predictions.

    Parameters
    ----------
    y_true : array-like
        Observed target values.
    y_pred : array-like
        Predicted target values, in the same order as `y_true`.

    Returns
    -------
    tuple
        (mae, mse, rmse, r2, mape), where mape is expressed in percent.
    """
    # Flatten both inputs to plain 1-D float arrays. This makes the MAPE
    # arithmetic positional: a pandas Series would otherwise be aligned by
    # index label, and an (n, 1) prediction array would broadcast to (n, n).
    y_true = np.asarray(y_true, dtype=float).ravel()
    y_pred = np.asarray(y_pred, dtype=float).ravel()

    mae = mean_absolute_error(y_true, y_pred)
    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_true, y_pred)
    # Mean absolute percentage error; divides by y_true, so zero targets are not handled
    mape = np.mean(np.abs((y_true - y_pred) / y_true)) * 100
    return mae, mse, rmse, r2, mape

# Candidate regressors, keyed by the label shown in the results table
models = {
    'LinearRegression': LinearRegression(),
    'DecisionTree': DecisionTreeRegressor(random_state=42),
    'RandomForest': RandomForestRegressor(random_state=42),
    'GradientBoost': GradientBoostingRegressor(random_state=42),
    'CatBoost': CatBoostRegressor(verbose=0, random_state=42),
    'XGBoost': XGBRegressor(random_state=42),
    'LightGBM': LGBMRegressor(random_state=42)
}

# Column labels, in the order evaluate_model returns its values
metric_names = ['MAE', 'MSE', 'RMSE', 'R2', 'MAPE (%)']

# One dict per model: fit on the training split, score on the test split
results = []
for name, model in models.items():
    model.fit(X_train, y_train)
    pred = model.predict(X_test)
    scores = evaluate_model(y_test, pred)
    row = {'Model': name}
    row.update(zip(metric_names, scores))
    results.append(row)

# Build the results table; best R2 first, ties broken by lower MAE
df_results = pd.DataFrame(results)
df_sorted = (
    df_results
    .sort_values(by=['R2', 'MAE'], ascending=[False, True])
    .reset_index(drop=True)
)

# Per-column number formatting for the printed table
two_decimals_grouped = '{:,.2f}'.format
column_formatters = {
    'MAE': two_decimals_grouped,
    'MSE': two_decimals_grouped,
    'RMSE': two_decimals_grouped,
    'R2': '{:.4f}'.format,
    'MAPE (%)': '{:.2f}'.format,
}

# Print the final table
print("=== Evaluasi Model - Urut berdasarkan R² (↓ MAE jika R² sama) ===")
print(df_sorted.to_string(index=False, formatters=column_formatters))

In [None]:
from catboost import CatBoostRegressor, Pool
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split

# Early stopping needs held-out data to monitor, and it must NOT be the test set:
# with use_best_model=True the iteration count is chosen on the eval set, so
# reporting metrics on that same set would be optimistically biased.
# Carve a validation split out of the training data instead.
X_fit, X_valid, y_fit, y_valid = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42
)
train_pool = Pool(X_fit, y_fit)
valid_pool = Pool(X_valid, y_valid)

# Chosen hyperparameters (a single configuration, not a search grid)
param_grid = {
    'depth': 8,
    'learning_rate': 0.03,
    'iterations': 3000,  # upper bound; early stopping normally ends training sooner
    'l2_leaf_reg': 3,
    'random_strength': 1,
    'bagging_temperature': 0.5,
    'loss_function': 'RMSE',
    'eval_metric': 'RMSE',
    'early_stopping_rounds': 100,
    'random_seed': 42,  # same seed as the baseline models
    'verbose': 100
}

# Train with early stopping monitored on the validation pool
model = CatBoostRegressor(**param_grid)
model.fit(train_pool, eval_set=valid_pool, use_best_model=True)

# Evaluate on the untouched test set, using the evaluate_model helper defined
# in the model-comparison cell
y_pred = model.predict(X_test)
mae, mse, rmse, r2, mape = evaluate_model(y_test, y_pred)

# Results
print("\n=== CatBoost Tuned (Early Stopping) ===")
print(f"MAE : {mae:,.2f}")
print(f"MSE : {mse:,.2f}")
print(f"RMSE: {rmse:,.2f}")
print(f"R²  : {r2:.4f}")
print(f"MAPE: {mape:.2f} %")

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

# Metrics for the predictions currently held in y_pred
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
mape = np.mean(np.abs((y_test - y_pred) / y_test)) * 100

# Scatter of actual vs predicted prices
fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(x=y_test, y=y_pred, alpha=0.5, s=20, color='blue', ax=ax)

# Reference diagonal: points on this line are predicted exactly
max_val = max(y_test.max(), y_pred.max())
ax.plot([0, max_val], [0, max_val], color='red', linestyle='--', label='Ideal')

# Title and axis labels
ax.set_xlabel('Harga Aktual')
ax.set_ylabel('Harga Prediksi')
ax.set_title('Harga Aktual vs Prediksi\n')

# Metric summary shown in a text box on the figure
metric_lines = [
    f'R²   = {r2:.4f}',
    f'MAE  = {mae:,.0f}',
    f'MSE  = {mse:,.0f}',
    f'MAPE = {mape:.2f} %',
]
textstr = '\n'.join(metric_lines)
box_style = dict(boxstyle="round,pad=0.3", facecolor='white', edgecolor='gray')
fig.text(0.65, 0.15, textstr, fontsize=10, bbox=box_style)

ax.legend()
ax.grid(True)
fig.tight_layout()
plt.show()