In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

from jcopml.pipeline import num_pipe, cat_pipe
from jcopml.utils import save_model, load_model
from jcopml.plot import plot_missing_value, plot_residual
from jcopml.feature_importance import mean_score_decrease

import matplotlib.pyplot as plt
import seaborn as sns

# Import Datasets

In [2]:
# Load the per-city listings.
bangalore = pd.read_csv("datasets/Bangalore.csv")
chennai = pd.read_csv("datasets/Chennai.csv")

# Stack both cities into a single frame named df. ignore_index=True gives the
# result a fresh 0..n-1 index, so no leftover "index" column has to be dropped.
df = pd.concat([bangalore, chennai], ignore_index=True)

# Show the first 5 rows.
df.head()

Unnamed: 0,Price,Area,Location,No. of Bedrooms,Resale,MaintenanceStaff,Gymnasium,SwimmingPool,LandscapedGardens,JoggingTrack,...,LiftAvailable,BED,VaastuCompliant,Microwave,GolfCourse,TV,DiningTable,Sofa,Wardrobe,Refrigerator
0,30000000,3340,JP Nagar Phase 1,4,0,1,1,1,1,1,...,1,0,0,0,0,0,0,0,0,0
1,7888000,1045,Dasarahalli on Tumkur Road,2,0,0,1,1,1,1,...,1,0,1,0,0,0,0,0,0,0
2,4866000,1179,Kannur on Thanisandra Main Road,2,0,0,1,1,1,1,...,1,0,0,0,0,0,0,0,0,0
3,8358000,1675,Doddanekundi,3,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
4,6845000,1670,Kengeri,3,0,1,1,1,1,1,...,1,0,0,0,0,0,0,0,0,0


# Cek Data Bolong

In [None]:
# Number of missing values in each column.
df.isnull().sum()

# Visualize
### Data Target

In [None]:
# Distribution of the target column (Price), 100 bins.
fig, ax = plt.subplots(figsize=(10, 4))
sns.distplot(df["Price"], bins=100, ax=ax)
plt.show()

### Data kolom Area

In [None]:
# Distribution of the "Area" feature, 100 bins, drawn in red.
plt.figure(figsize=(10, 4))
ax = sns.distplot(df["Area"], bins=100, color='r')
plt.show()

# Datasets Splitting

In [3]:
# Target is Price; every remaining column is a candidate feature.
y = df["Price"]
X = df.drop(columns="Price")

# Hold out 20% of the rows for testing (fixed seed for reproducibility).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((8976, 39), (2245, 39), (8976,), (2245,))

# Think Simple
### Preprocessor

In [4]:
from xgboost import XGBRegressor

In [None]:
# Baseline preprocessing: the two numeric columns go through num_pipe() with
# its default settings; Location and every amenity flag are one-hot encoded.
preprocessor = ColumnTransformer(transformers=[
    ('numeric', num_pipe(), ["Area", "No. of Bedrooms"]),
    (
        'categoric',
        cat_pipe(encoder='onehot'),
        [
            "Location", 'Resale', 'MaintenanceStaff', 'Gymnasium',
            'SwimmingPool', 'LandscapedGardens', 'JoggingTrack',
            'RainWaterHarvesting', 'IndoorGames', 'ShoppingMall', 'Intercom',
            'SportsFacility', 'ATM', 'ClubHouse', 'School', '24X7Security',
            'PowerBackup', 'CarParking', 'StaffQuarter', 'Cafeteria',
            'MultipurposeRoom', 'Hospital', 'WashingMachine', 'Gasconnection',
            'AC', 'Wifi', "Children'splayarea", 'LiftAvailable', 'BED',
            'VaastuCompliant', 'Microwave', 'GolfCourse', 'TV', 'DiningTable',
            'Sofa', 'Wardrobe', 'Refrigerator',
        ],
    ),
])

# Preprocessing followed by an XGBoost regressor.
pipeline = Pipeline(steps=[
    ('prep', preprocessor),
    ('algo', XGBRegressor(n_jobs=-1, random_state=42)),
])

### Training

In [5]:
from sklearn.model_selection import RandomizedSearchCV
from jcopml.tuning import random_search_params as rsp

In [None]:
# Randomized search over the rsp.xgb_params space: 20 candidates, 3-fold CV.
model = RandomizedSearchCV(
    estimator=pipeline,
    param_distributions=rsp.xgb_params,
    n_iter=20,
    cv=3,
    n_jobs=-1,
    verbose=1,
    random_state=42,
)
model.fit(X_train, y_train)

print(model.best_params_)
# Printed in order: train score, best mean CV score, test score.
print(
    model.score(X_train, y_train),
    model.best_score_,
    model.score(X_test, y_test),
)

# Feature Importance

In [None]:
# Feature importance of the fitted search object, measured on the training data
# with jcopml's mean_score_decrease; plot=True also draws the result.
# NOTE(review): topk=10 presumably limits the output to the ten most important
# features — confirm against the jcopml documentation.
df_imp = mean_score_decrease(X_train, y_train, model, plot=True, topk=10)

# Reduce columns
### Preprocessor - Training

In [None]:
# Reduced feature set: only five categorical columns are kept next to the two
# numeric ones.
preprocessor = ColumnTransformer(transformers=[
    ('numeric', num_pipe(), ["Area", "No. of Bedrooms"]),
    (
        'categoric',
        cat_pipe(encoder='onehot'),
        ["Location", 'Resale', 'MaintenanceStaff', 'ATM', 'Cafeteria'],
    ),
])

pipeline = Pipeline(steps=[
    ('prep', preprocessor),
    ('algo', XGBRegressor(n_jobs=-1, random_state=42)),
])

# Randomized search over the rsp.xgb_params space: 30 candidates, 3-fold CV.
model = RandomizedSearchCV(
    estimator=pipeline,
    param_distributions=rsp.xgb_params,
    n_iter=30,
    cv=3,
    n_jobs=-1,
    verbose=1,
    random_state=42,
)
model.fit(X_train, y_train)

print(model.best_params_)
# Printed in order: train score, best mean CV score, test score.
print(
    model.score(X_train, y_train),
    model.best_score_,
    model.score(X_test, y_test),
)

# Tuning
### Preprocessor - Training

In [6]:
from jcopml.tuning.space import Integer, Real

In [None]:
# Hand-written search space for the pipeline (keys follow the
# <step>__<sub-step>__<param> naming of the pipeline's parameters).
# NOTE(review): with prior="log-uniform" the low/high bounds look like base-10
# exponents rather than raw values — confirm against the jcopml documentation.
parameter_tune = {
    # polynomial features on the numeric branch
    "prep__numeric__poly__degree": Integer(low=1, high=3),
    "prep__numeric__poly__interaction_only": [True, False],
    # XGBoost hyper-parameters
    "algo__max_depth": Integer(low=1, high=10),
    "algo__learning_rate": Real(low=-2, high=0, prior="log-uniform"),
    "algo__n_estimators": Integer(low=100, high=200),
    "algo__subsample": Real(low=0.3, high=0.8, prior="uniform"),
    "algo__gamma": Integer(low=1, high=10),
    "algo__colsample_bytree": Real(low=0.1, high=1, prior="uniform"),
    "algo__reg_alpha": Real(low=-3, high=1, prior="log-uniform"),
    "algo__reg_lambda": Real(low=-3, high=1, prior="log-uniform"),
}

In [None]:
# Numeric branch now built with poly=2 so the poly__* search keys exist.
preprocessor = ColumnTransformer(transformers=[
    ('numeric', num_pipe(poly=2), ["Area", "No. of Bedrooms"]),
    (
        'categoric',
        cat_pipe(encoder='onehot'),
        ["Location", 'Resale', 'MaintenanceStaff', 'ATM', 'Cafeteria'],
    ),
])

pipeline = Pipeline(steps=[
    ('prep', preprocessor),
    ('algo', XGBRegressor(n_jobs=-1, random_state=42)),
])

# Randomized search over parameter_tune: 40 candidates, 3-fold CV.
model = RandomizedSearchCV(
    estimator=pipeline,
    param_distributions=parameter_tune,
    n_iter=40,
    cv=3,
    n_jobs=-1,
    verbose=1,
    random_state=42,
)
model.fit(X_train, y_train)

print(model.best_params_)
# Printed in order: train score, best mean CV score, test score.
print(
    model.score(X_train, y_train),
    model.best_score_,
    model.score(X_test, y_test),
)

# Tuning part 2

In [None]:
# Numeric branch configured with poly=2, min-max scaling and a Yeo-Johnson
# transform.
preprocessor = ColumnTransformer(transformers=[
    (
        'numeric',
        num_pipe(poly=2, scaling='minmax', transform='yeo-johnson'),
        ["Area", "No. of Bedrooms"],
    ),
    (
        'categoric',
        cat_pipe(encoder='onehot'),
        ["Location", 'Resale', 'MaintenanceStaff', 'ATM', 'Cafeteria'],
    ),
])

pipeline = Pipeline(steps=[
    ('prep', preprocessor),
    ('algo', XGBRegressor(n_jobs=-1, random_state=42)),
])

# Search space for this round.
parameter_tune = {
    # polynomial features on the numeric branch
    "prep__numeric__poly__degree": Integer(low=1, high=3),
    "prep__numeric__poly__interaction_only": [True, False],
    # XGBoost hyper-parameters
    "algo__max_depth": Integer(low=1, high=10),
    "algo__learning_rate": Real(low=-2, high=0, prior="log-uniform"),
    "algo__n_estimators": Integer(low=100, high=200),
    "algo__subsample": Real(low=0.3, high=0.8, prior="uniform"),
    "algo__gamma": Integer(low=1, high=10),
    "algo__colsample_bytree": Real(low=0.1, high=1, prior="uniform"),
    "algo__reg_alpha": Real(low=-3, high=1, prior="log-uniform"),
    "algo__reg_lambda": Real(low=-3, high=1, prior="log-uniform"),
}

# Randomized search: 40 candidates, 3-fold CV.
model = RandomizedSearchCV(
    estimator=pipeline,
    param_distributions=parameter_tune,
    n_iter=40,
    cv=3,
    n_jobs=-1,
    verbose=1,
    random_state=42,
)
model.fit(X_train, y_train)

print(model.best_params_)
# Printed in order: train score, best mean CV score, test score.
print(
    model.score(X_train, y_train),
    model.best_score_,
    model.score(X_test, y_test),
)

# Tuning part 3
### Preprocessor - Training

In [None]:
# Numeric branch configured with poly=2, standard scaling and a Yeo-Johnson
# transform.
preprocessor = ColumnTransformer(transformers=[
    (
        'numeric',
        num_pipe(poly=2, scaling='standard', transform='yeo-johnson'),
        ["Area", "No. of Bedrooms"],
    ),
    (
        'categoric',
        cat_pipe(encoder='onehot'),
        ["Location", 'Resale', 'MaintenanceStaff', 'ATM', 'Cafeteria'],
    ),
])

pipeline = Pipeline(steps=[
    ('prep', preprocessor),
    ('algo', XGBRegressor(n_jobs=-1, random_state=42)),
])

# Narrower search space for this round.
parameter_tune = {
    # polynomial features on the numeric branch
    "prep__numeric__poly__degree": Integer(low=2, high=3),
    "prep__numeric__poly__interaction_only": [True, False],
    # XGBoost hyper-parameters
    "algo__max_depth": Integer(low=8, high=15),
    "algo__learning_rate": Real(low=-1, high=0, prior="log-uniform"),
    "algo__n_estimators": Integer(low=50, high=55),
    "algo__subsample": Real(low=0.6, high=0.95, prior="uniform"),
    "algo__gamma": Integer(low=1, high=2),
    "algo__colsample_bytree": Real(low=0.6, high=0.95, prior="uniform"),
    "algo__reg_alpha": Real(low=-1, high=1, prior="log-uniform"),
    "algo__reg_lambda": Real(low=-1, high=1, prior="log-uniform"),
}

# Randomized search: 80 candidates, 3-fold CV.
model = RandomizedSearchCV(
    estimator=pipeline,
    param_distributions=parameter_tune,
    n_iter=80,
    cv=3,
    n_jobs=-1,
    verbose=1,
    random_state=42,
)
model.fit(X_train, y_train)

print(model.best_params_)
# Printed in order: train score, best mean CV score, test score.
print(
    model.score(X_train, y_train),
    model.best_score_,
    model.score(X_test, y_test),
)

# Tuning Part 4

In [None]:
# Numeric branch configured with poly=2, robust scaling and a Yeo-Johnson
# transform.
preprocessor = ColumnTransformer(transformers=[
    (
        'numeric',
        num_pipe(poly=2, scaling='robust', transform='yeo-johnson'),
        ["Area", "No. of Bedrooms"],
    ),
    (
        'categoric',
        cat_pipe(encoder='onehot'),
        ["Location", 'Resale', 'MaintenanceStaff', 'ATM', 'Cafeteria'],
    ),
])

pipeline = Pipeline(steps=[
    ('prep', preprocessor),
    ('algo', XGBRegressor(n_jobs=-1, random_state=42)),
])

# Search space for this round (gamma restricted to 3..4).
parameter_tune = {
    # polynomial features on the numeric branch
    "prep__numeric__poly__degree": Integer(low=2, high=3),
    "prep__numeric__poly__interaction_only": [True, False],
    # XGBoost hyper-parameters
    "algo__max_depth": Integer(low=8, high=15),
    "algo__learning_rate": Real(low=-1, high=0, prior="log-uniform"),
    "algo__n_estimators": Integer(low=50, high=55),
    "algo__subsample": Real(low=0.6, high=0.95, prior="uniform"),
    "algo__gamma": Integer(low=3, high=4),
    "algo__colsample_bytree": Real(low=0.6, high=0.95, prior="uniform"),
    "algo__reg_alpha": Real(low=-1, high=1, prior="log-uniform"),
    "algo__reg_lambda": Real(low=-1, high=1, prior="log-uniform"),
}

# Randomized search: 100 candidates, 3-fold CV.
model = RandomizedSearchCV(
    estimator=pipeline,
    param_distributions=parameter_tune,
    n_iter=100,
    cv=3,
    n_jobs=-1,
    verbose=1,
    random_state=42,
)
model.fit(X_train, y_train)

print(model.best_params_)
# Printed in order: train score, best mean CV score, test score.
print(
    model.score(X_train, y_train),
    model.best_score_,
    model.score(X_test, y_test),
)

# Tuning Part 5

In [29]:
# Back to the full categorical column list; numeric branch configured with
# poly=2 and robust scaling.
preprocessor = ColumnTransformer(transformers=[
    ('numeric', num_pipe(poly=2, scaling='robust'), ["Area", "No. of Bedrooms"]),
    (
        'categoric',
        cat_pipe(encoder='onehot'),
        [
            "Location", 'Resale', 'MaintenanceStaff', 'Gymnasium',
            'SwimmingPool', 'LandscapedGardens', 'JoggingTrack',
            'RainWaterHarvesting', 'IndoorGames', 'ShoppingMall', 'Intercom',
            'SportsFacility', 'ATM', 'ClubHouse', 'School', '24X7Security',
            'PowerBackup', 'CarParking', 'StaffQuarter', 'Cafeteria',
            'MultipurposeRoom', 'Hospital', 'WashingMachine', 'Gasconnection',
            'AC', 'Wifi', "Children'splayarea", 'LiftAvailable', 'BED',
            'VaastuCompliant', 'Microwave', 'GolfCourse', 'TV', 'DiningTable',
            'Sofa', 'Wardrobe', 'Refrigerator',
        ],
    ),
])

# Preprocessing followed by an XGBoost regressor.
pipeline = Pipeline(steps=[
    ('prep', preprocessor),
    ('algo', XGBRegressor(n_jobs=-1, random_state=42)),
])

In [30]:
# Show every parameter of the pipeline and its nested steps; the nested names
# are what the search-space keys (e.g. 'prep__numeric__poly__degree') refer to.
pipeline.get_params()

{'memory': None,
 'steps': [('prep',
   ColumnTransformer(n_jobs=None, remainder='drop', sparse_threshold=0.3,
                     transformer_weights=None,
                     transformers=[('numeric',
                                    Pipeline(memory=None,
                                             steps=[('imputer',
                                                     SimpleImputer(add_indicator=False,
                                                                   copy=True,
                                                                   fill_value=None,
                                                                   missing_values=nan,
                                                                   strategy='median',
                                                                   verbose=0)),
                                                    ('poly',
                                                     PolynomialFeatures(degree=2,
                           

In [33]:
# Search space for round 5: the booster type is now searched too and the
# learning rate is pinned to a single value.
# NOTE(review): polynomial degree 8..10 on two numeric columns is unusually
# high — confirm this range is intended.
parameter_tune = {
    # polynomial features on the numeric branch
    "prep__numeric__poly__degree": Integer(low=8, high=10),
    "prep__numeric__poly__interaction_only": [True, False],
    # XGBoost hyper-parameters
    "algo__booster": ["gbtree", "gblinear", "dart"],
    "algo__max_depth": Integer(low=8, high=9),
    "algo__learning_rate": [0.05047786565710607],
    "algo__n_estimators": Integer(low=90, high=100),
    "algo__subsample": Real(low=0.6, high=0.95, prior="uniform"),
    "algo__gamma": Integer(low=0, high=2),
    "algo__colsample_bytree": Real(low=0.6, high=0.95, prior="uniform"),
    "algo__reg_lambda": Real(low=-4, high=-1, prior="log-uniform"),
}

# Randomized search: 7 candidates, 5-fold CV.
model = RandomizedSearchCV(
    estimator=pipeline,
    param_distributions=parameter_tune,
    n_iter=7,
    cv=5,
    n_jobs=-1,
    verbose=1,
    random_state=42,
)
model.fit(X_train, y_train)

print(model.best_params_)
# Printed in order: train score, best mean CV score, test score.
print(
    model.score(X_train, y_train),
    model.best_score_,
    model.score(X_test, y_test),
)

Fitting 5 folds for each of 7 candidates, totalling 35 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  35 out of  35 | elapsed:   37.1s finished


{'algo__booster': 'dart', 'algo__colsample_bytree': 0.8787900454010815, 'algo__gamma': 2, 'algo__learning_rate': 0.05047786565710607, 'algo__max_depth': 8, 'algo__n_estimators': 97, 'algo__reg_lambda': 0.006251373574521747, 'algo__subsample': 0.6546065241548528, 'prep__numeric__poly__degree': 10, 'prep__numeric__poly__interaction_only': True}
0.6072059023465419 0.3041821640132006 0.3598784597973561


# Evaluasi

In [None]:
from sklearn.metrics import mean_squared_error, mean_squared_log_error, r2_score
from luwiji.metrics import illustration

In [None]:
# NOTE(review): presumably renders luwiji's reference illustration of the
# sklearn scoring options as the cell output — confirm against luwiji docs.
illustration.sklearn_scoring

In [None]:
# Residual plot from jcopml for the fitted search object, given both the
# training and the test split.
plot_residual(X_train, y_train, X_test, y_test, model)

In [None]:
# Mean squared error on the training split.
mean_squared_error(y_true=y_train, y_pred=model.predict(X_train))

In [None]:
# Mean squared logarithmic error on the test split.
mean_squared_log_error(y_true=y_test, y_pred=model.predict(X_test))

In [None]:
# R^2 on the test split.
r2_score(y_true=y_test, y_pred=model.predict(X_test))

In [None]:
# Inspect the actual training targets (Price).
y_train

In [None]:
# Inspect the model's predictions for the training rows.
model.predict(X_train)

In [None]:
# Final score of the best estimator on the held-out test split.
# score() needs the data to evaluate on; calling it with no arguments raises
# a TypeError.
model.score(X_test, y_test)