In [2]:
import pandas as pd
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
import joblib

In [4]:
housing = fetch_california_housing()

data = pd.DataFrame(housing.data, columns=housing.feature_names)

data['Price'] = housing.target

print(data.head())

   MedInc  HouseAge  AveRooms  AveBedrms  ...  AveOccup  Latitude  Longitude  Price
0  8.3252      41.0  6.984127   1.023810  ...  2.555556     37.88    -122.23  4.526
1  8.3014      21.0  6.238137   0.971880  ...  2.109842     37.86    -122.22  3.585
2  7.2574      52.0  8.288136   1.073446  ...  2.802260     37.85    -122.24  3.521
3  5.6431      52.0  5.817352   1.073059  ...  2.547945     37.85    -122.25  3.413
4  3.8462      52.0  6.281853   1.081081  ...  2.181467     37.85    -122.25  3.422

[5 rows x 9 columns]


In [5]:
y = data['Price']

x = data.drop('Price', axis=1)

print("X dan Y sudah dipisahkan")
print("Bentuk X:", x.shape)
print("Bentuk y:", y.shape)

X dan Y sudah dipisahkan
Bentuk X: (20640, 8)
Bentuk y: (20640,)


In [6]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

print(f"Data latih (train): {x_train.shape[0]} baris")
print(f"Data tes (test): {x_test.shape[0]} baris")

Data latih (train): 16512 baris
Data tes (test): 4128 baris


In [7]:
model = LinearRegression()

model.fit(x_train, y_train)

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [8]:
score = model.score(x_test, y_test)

print(f"Skor (R-squared) model pada data tes: {score:.4f}")

Skor (R-squared) model pada data tes: 0.5758


In [9]:
joblib.dump(model, 'california_housing_model.joblib')

['california_housing_model.joblib']

In [10]:
from sklearn.ensemble import RandomForestRegressor
import joblib

In [11]:
rf_model = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)

In [12]:
rf_model.fit(x_train, y_train)

0,1,2
,n_estimators,100
,criterion,'squared_error'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [13]:
score_rf = rf_model.score(x_test, y_test)

print(f"Skor (R-Squared) model Random Forest: {score_rf:.4f}")

Skor (R-Squared) model Random Forest: 0.8049


In [14]:
joblib.dump(rf_model, 'california_housing_model_rf.joblib')

['california_housing_model_rf.joblib']

In [15]:
from xgboost import XGBRegressor
import joblib

xgb_model = XGBRegressor(n_estimators=100, random_state=42, n_jobs=-1)

xgb_model.fit(x_train, y_train)

0,1,2
,objective,'reg:squarederror'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,
,device,
,early_stopping_rounds,
,enable_categorical,False


In [16]:
score_xgb = xgb_model.score(x_test, y_test)

score_xgb

0.8301370561019205

In [17]:
joblib.dump(xgb_model, 'california_housing_model_xgb.joblib')

['california_housing_model_xgb.joblib']

In [22]:
from sklearn.model_selection import GridSearchCV
from lightgbm import LGBMRegressor
import joblib

param_grid = {
    'learning_rate': [0.1, 0.05],
    'n_estimators': [100, 200],
    'max_depth': [5, 10]
}

lgbm = LGBMRegressor(random_state=42, n_jobs=-1)

grid_search = GridSearchCV(estimator=lgbm,
                           param_grid=param_grid,
                           cv=3,
                           scoring='r2',
                           n_jobs=-1,
                           verbose=2)

grid_search.fit(x_train, y_train)

Fitting 3 folds for each of 8 candidates, totalling 24 fits
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002318 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.004852 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1838[LightGBM] [Info] Total Bins 1837

[LightGBM] [Info] Number of data points in the train set: 11008, number of used features: 8
[LightGBM] [Info] Number of data points in the train set: 11008, number of used features: 8
[LightGBM] [Info] Start training from score 2.078156
[LightGBM] [Info] Start training from score 2.073292
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000671 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 183

0,1,2
,estimator,LGBMRegressor...ndom_state=42)
,param_grid,"{'learning_rate': [0.1, 0.05], 'max_depth': [5, 10], 'n_estimators': [100, 200]}"
,scoring,'r2'
,n_jobs,-1
,refit,True
,cv,3
,verbose,2
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,boosting_type,'gbdt'
,num_leaves,31
,max_depth,10
,learning_rate,0.1
,n_estimators,200
,subsample_for_bin,200000
,objective,
,class_weight,
,min_split_gain,0.0
,min_child_weight,0.001


In [23]:
print(grid_search.best_params_)

print(f"{grid_search.best_score_:.4f}")

{'learning_rate': 0.1, 'max_depth': 10, 'n_estimators': 200}
0.8370


In [24]:
best_lgbm_model = grid_search.best_estimator_

final_score = best_lgbm_model.score(x_test, y_test)
print(f"{final_score:.4f}")

joblib.dump(best_lgbm_model, 'california_housing_model_best_lgbm.joblib')

0.8466


['california_housing_model_best_lgbm.joblib']