In [251]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb

In [252]:
directory = os.path.join('sample_data', 'california_housing_train.csv')

# if os.path.isfile(directory):
#   print("File di temukan")

In [253]:
dataset = pd.read_csv(directory)

In [254]:
from sklearn import preprocessing
from sklearn.metrics import r2_score, mean_squared_error
scaler = preprocessing.StandardScaler() 

dataset.columns

Index(['longitude', 'latitude', 'housing_median_age', 'total_rooms',
       'total_bedrooms', 'population', 'households', 'median_income',
       'median_house_value'],
      dtype='object')

In [255]:
correlation_matrix = dataset.corr()
corr = correlation_matrix['longitude'].sort_values(ascending=False)
correlation_dataframe = pd.DataFrame({'Column': corr.index,
                 'Correlation with longitude': corr.values})

correlation_dataframe

Unnamed: 0,Column,Correlation with longitude
0,longitude,1.0
1,population,0.101674
2,total_bedrooms,0.071802
3,households,0.059628
4,total_rooms,0.04701
5,median_income,-0.015485
6,median_house_value,-0.044982
7,housing_median_age,-0.11425
8,latitude,-0.925208


In [256]:
feature = dataset.drop(['median_house_value'], axis=1)
label = dataset['median_house_value'] 

feature_scaled = scaler.fit_transform(feature.values) 
label_scaled = scaler.fit_transform(label.values.reshape(-1,1)).flatten()

In [257]:
from sklearn.model_selection import train_test_split
feature_train, feature_test,label_train, label_test = train_test_split(feature_scaled, label_scaled, test_size = 0.01 ,random_state = 10)

**Random Forest Regressor**

In [258]:
from sklearn.ensemble import RandomForestRegressor

forest_reg = RandomForestRegressor()

forest_reg.fit(feature_train, label_train)
r2_score(forest_reg.predict(feature_train), label_train)

0.972125730898032

In [259]:
r2_score(forest_reg.predict(feature_test), label_test)

0.8591105183267098

**Linear Regression**

In [260]:
from sklearn import linear_model

linear_reg = linear_model.LinearRegression()

linear_reg.fit(feature_train, label_train)
r2_score(linear_reg.predict(feature_train), label_train)

0.44053662839579744

In [261]:
r2_score(linear_reg.predict(feature_test), label_test)

0.4837062583039232

**XGBoost**

In [262]:
import xgboost
from xgboost import plot_importance
best_xgb_model = xgboost.XGBRegressor(colsample_bytree=0.4,
                 gamma=0,                 
                 learning_rate=0.07,
                 max_depth=3,
                 min_child_weight=1.5,
                 n_estimators=10000,                                                                    
                 reg_alpha=0.75,
                 reg_lambda=0.45,
                 subsample=0.6,
                 seed=42)
best_xgb_model.fit(feature_train,label_train)
r2_score(best_xgb_model.predict(feature_train), label_train)



0.9477796229385983

In [263]:
r2_score(best_xgb_model.predict(feature_test), label_test)

0.8441248967550362

**Decission Tree Regressor**

In [264]:
from sklearn.tree import DecisionTreeRegressor
tree_reg = DecisionTreeRegressor()
tree_reg.fit(feature_train,label_train)

DecisionTreeRegressor(ccp_alpha=0.0, criterion='mse', max_depth=None,
                      max_features=None, max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, presort='deprecated',
                      random_state=None, splitter='best')

In [265]:
tree_reg.fit(feature_train, label_train)
r2_score(tree_reg.predict(feature_train), label_train)

0.9999999999999655

In [266]:
r2_score(linear_reg.predict(feature_test), label_test)

0.4837062583039232

**SVR**

In [267]:
from sklearn.svm import SVR

svr_reg = SVR()
svr_reg.fit(feature_train,label_train)
r2_score(svr_reg.predict(feature_train), label_train)

0.6864002939176577

In [268]:
r2_score(svr_reg.predict(feature_test), label_test)

0.720047303660692

**Transformed Target Regressor**

In [269]:
from sklearn.compose import TransformedTargetRegressor
from sklearn.linear_model import RidgeCV
from sklearn.preprocessing import QuantileTransformer

regr_trans = TransformedTargetRegressor(regressor=RidgeCV(), transformer=QuantileTransformer(n_quantiles=300, output_distribution='normal'))

regr_trans.fit(feature_train, label_train)
r2_score(regr_trans.predict(feature_train), label_train)

0.634701446460525

In [270]:
r2_score(regr_trans.predict(feature_test), label_test)

0.6602097465952543

Dari uji model regresi diatas, dapat di simpulkan bahwa **Random Forest Regressor merupakan model yang paling akurat** dibandingkan Linear Regressor, XGBoost, Decision Tree Regressor, SVR dan Transformed Target Regressor.

Tetapi dengan catatan **test_size adalah 0.01 (1%)**. Semakin kecil test_size yang di berikan, maka semakin akurat pula r2 yang di hasilkan.