<a href="https://colab.research.google.com/github/siska-nadila/IlmuData-Semester6/blob/main/tugas_ilmu_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Nama : Siska Nadila
# Npm  : 232103001

In [None]:
import warnings

import numpy as np
import pandas as pd
from sklearn.datasets import fetch_california_housing
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR
# Suppress all Python warnings for the rest of the notebook session.
warnings.filterwarnings('ignore')

# Fetch the California Housing data as a single pandas DataFrame
# (feature columns and the target column together).
california = fetch_california_housing(as_frame=True)
df_california = california.frame

# Quick look at the data: first rows, (rows, columns) shape, and column names.
print("California Housing Dataset:")
for summary in (
    df_california.head(),
    df_california.shape,
    df_california.columns.tolist(),
):
    print(summary)

# 'MedHouseVal' is the regression target; every other column is a feature.
target_column = 'MedHouseVal'
y_ca = df_california[target_column]
X_ca = df_california.drop(columns=[target_column])

California Housing Dataset:
   MedInc  HouseAge  AveRooms  AveBedrms  Population  AveOccup  Latitude  \
0  8.3252      41.0  6.984127   1.023810       322.0  2.555556     37.88   
1  8.3014      21.0  6.238137   0.971880      2401.0  2.109842     37.86   
2  7.2574      52.0  8.288136   1.073446       496.0  2.802260     37.85   
3  5.6431      52.0  5.817352   1.073059       558.0  2.547945     37.85   
4  3.8462      52.0  6.281853   1.081081       565.0  2.181467     37.85   

   Longitude  MedHouseVal  
0    -122.23        4.526  
1    -122.22        3.585  
2    -122.24        3.521  
3    -122.25        3.413  
4    -122.25        3.422  
(20640, 9)
['MedInc', 'HouseAge', 'AveRooms', 'AveBedrms', 'Population', 'AveOccup', 'Latitude', 'Longitude', 'MedHouseVal']


In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train_ca, X_test_ca, y_train_ca, y_test_ca = train_test_split(
    X_ca,
    y_ca,
    test_size=0.2,
    random_state=42,
)

# Linear Regression baseline: fit on the training split, predict the held-out rows.
lr_model = LinearRegression()
lr_model.fit(X_train_ca, y_train_ca)
lr_pred = lr_model.predict(X_test_ca)

# Evaluate the baseline on the test split (R^2 and mean squared error).
lr_r2 = r2_score(y_test_ca, lr_pred)
lr_mse = mean_squared_error(y_test_ca, lr_pred)
print("Linear Regression R2 Score (California):", lr_r2)
print("MSE:", lr_mse)


Linear Regression R2 Score (California): 0.5757877060324508
MSE: 0.5558915986952444


In [None]:
# Random Forest Regressor: 100 trees, fixed seed so results are reproducible.
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train_ca, y_train_ca)
rf_pred = rf_model.predict(X_test_ca)
print("Random Forest R2 Score:", r2_score(y_test_ca, rf_pred))
print("RF MSE:", mean_squared_error(y_test_ca, rf_pred))

# SVM Regressor (RBF kernel).
# The RBF kernel is distance-based, so features on very different scales
# (e.g. Population in the hundreds/thousands vs. AveBedrms near 1) let the
# largest-magnitude columns dominate the kernel. Standardizing inside a
# pipeline fits the scaler on the training split only and reapplies the same
# transform at predict time, so the test split never influences the scaling.
svm_model = make_pipeline(
    StandardScaler(),
    SVR(kernel='rbf', C=100, gamma=0.1),
)
svm_model.fit(X_train_ca, y_train_ca)
svm_pred = svm_model.predict(X_test_ca)
print("SVM R2 Score:", r2_score(y_test_ca, svm_pred))
print("SVM MSE:", mean_squared_error(y_test_ca, svm_pred))

Random Forest R2 Score: 0.8051230593157366
RF MSE: 0.2553684927247781
SVM R2 Score: 0.15979791560973444
SVM MSE: 1.101008354921722


In [None]:
# Cross-validation on the California dataset.
# SVR is wrapped in a pipeline with StandardScaler: the RBF kernel is
# distance-based and needs comparably scaled features, and keeping the scaler
# inside the pipeline means it is re-fitted on each fold's training part only
# (no statistics leak from the validation fold).
models = {
    'LinearRegression': LinearRegression(),
    'RandomForest': RandomForestRegressor(n_estimators=50, random_state=42),
    'SVR': make_pipeline(StandardScaler(), SVR(kernel='rbf', C=10)),
}

# 5 shuffled folds with a fixed seed, so every model is scored on the same splits.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

for name, model in models.items():
    cv_scores = cross_val_score(model, X_ca, y_ca, cv=kf, scoring='r2')
    print(f"{name} CV R2 Scores: {cv_scores}")
    # Report the mean with a +/- band of two standard deviations across folds.
    print(f"{name} CV R2 Mean: {cv_scores.mean():.4f} (+/- {cv_scores.std() * 2:.4f})")

LinearRegression CV R2 Scores: [0.57578771 0.61374822 0.60856043 0.62126494 0.5875292 ]
LinearRegression CV R2 Mean: 0.6014 (+/- 0.0340)
RandomForest CV R2 Scores: [0.80363858 0.81051251 0.80311062 0.81879224 0.80147766]
RandomForest CV R2 Mean: 0.8075 (+/- 0.0129)
SVR CV R2 Scores: [0.15347383 0.14177179 0.14312012 0.15741278 0.14979609]
SVR CV R2 Mean: 0.1491 (+/- 0.0119)
