In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

from jcopml.pipeline import num_pipe, cat_pipe
from jcopml.utils import save_model, load_model
from jcopml.plot import plot_missing_value
from jcopml.feature_importance import mean_score_decrease

import matplotlib.pyplot as plt
import seaborn as sns

# Import File

In [2]:
# Load the per-city listings.
bangalore = pd.read_csv("datasets/Bangalore.csv")
chennai = pd.read_csv("datasets/Chennai.csv")

# Stack both cities into a single frame named df. ignore_index=True gives the
# result a fresh 0..n-1 index directly, so no reset_index() / drop of the old
# "index" column (and no in-place mutation) is needed afterwards.
df = pd.concat([bangalore, chennai], ignore_index=True)

# Show the first 5 rows.
df.head()

Unnamed: 0,Price,Area,Location,No. of Bedrooms,Resale,MaintenanceStaff,Gymnasium,SwimmingPool,LandscapedGardens,JoggingTrack,...,LiftAvailable,BED,VaastuCompliant,Microwave,GolfCourse,TV,DiningTable,Sofa,Wardrobe,Refrigerator
0,30000000,3340,JP Nagar Phase 1,4,0,1,1,1,1,1,...,1,0,0,0,0,0,0,0,0,0
1,7888000,1045,Dasarahalli on Tumkur Road,2,0,0,1,1,1,1,...,1,0,1,0,0,0,0,0,0,0
2,4866000,1179,Kannur on Thanisandra Main Road,2,0,0,1,1,1,1,...,1,0,0,0,0,0,0,0,0,0
3,8358000,1675,Doddanekundi,3,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
4,6845000,1670,Kengeri,3,0,1,1,1,1,1,...,1,0,0,0,0,0,0,0,0,0


In [3]:
# Columns to remove from the frame (despite the name, these are the columns
# that are dropped, not the ones used for modelling).
features = ['MaintenanceStaff','Gymnasium','SwimmingPool','LandscapedGardens','JoggingTrack','RainWaterHarvesting',
            'IndoorGames','ShoppingMall','Intercom','SportsFacility','ATM','ClubHouse','School','24X7Security',
            'PowerBackup','CarParking','StaffQuarter','Cafeteria','MultipurposeRoom','Hospital','WashingMachine',
            'AC','Wifi','BED','VaastuCompliant','Microwave','GolfCourse','TV','DiningTable',
            'Sofa','Refrigerator']

# Pass the column labels themselves. The previous form handed drop() a
# DataFrame (df[features]) and only worked because iterating a DataFrame
# yields its column names.
# Note: df is rebound here, so re-running this cell without re-running the
# load cell raises KeyError (the columns are already gone).
df = df.drop(columns=features)
df

Unnamed: 0,Price,Area,Location,No. of Bedrooms,Resale,Gasconnection,Children'splayarea,LiftAvailable,Wardrobe
0,30000000,3340,JP Nagar Phase 1,4,0,0,1,1,0
1,7888000,1045,Dasarahalli on Tumkur Road,2,0,0,1,1,0
2,4866000,1179,Kannur on Thanisandra Main Road,2,0,0,1,1,0
3,8358000,1675,Doddanekundi,3,0,0,0,1,0
4,6845000,1670,Kengeri,3,0,0,1,1,0
...,...,...,...,...,...,...,...,...,...
11216,7834999,1599,Korattur,3,0,9,9,9,9
11217,2408000,740,Ambattur,2,0,9,9,9,9
11218,5500000,1700,Pallikaranai VGP Shanti Nagar,3,0,9,9,9,9
11219,3400000,1599,Korattur,3,0,9,9,9,9


# Dataset Splitting

In [4]:
# Separate the target ("Price") from the predictors.
y = df["Price"]
X = df.drop(columns="Price")

# Hold out 20% of the rows for testing; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Cell output: the shape of each of the four splits.
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

((8976, 8), (2245, 8), (8976,), (2245,))

### Preprocessor

In [5]:
from sklearn.ensemble import RandomForestRegressor

In [6]:
# Preprocessing: power-transform the single numeric feature and one-hot encode
# the remaining columns.
#
# `scaling='robust'` was removed from num_pipe: when it is combined with
# `transform`, jcopml warns "Transformer has default standardization, so the
# scaling argument is neglected", i.e. the argument had no effect.
#
# NOTE(review): "No. of Bedrooms" is one-hot encoded as a category rather than
# used as a number — confirm this is intended.
# NOTE(review): the displayed tail rows of df show the value 9 in the amenity
# flag columns; one-hot encoding gives it its own category. Presumably a
# placeholder for "unknown" — confirm against the data source.
preprocessor = ColumnTransformer([
    ('numeric', num_pipe(transform='yeo-johnson'), ["Area"]),
    ('categoric', cat_pipe(encoder='onehot'), ["Location", "No. of Bedrooms", "Resale", "Gasconnection", "Children'splayarea",
                                              "LiftAvailable", "Wardrobe"]),
])

# Full model: preprocessing followed by a seeded random forest regressor.
pipeline = Pipeline([
    ('prep', preprocessor),
    ('algo', RandomForestRegressor(n_jobs=-1, random_state=42))
])

  warn("Transformer has default standardization, so the scaling argument is neglected")


In [7]:
from sklearn.model_selection import RandomizedSearchCV
from jcopml.tuning import random_search_params as rsp

# Randomized hyperparameter search over jcopml's random-forest search space:
# 10 sampled candidates, each evaluated with 5-fold cross-validation.
model = RandomizedSearchCV(
    pipeline,
    rsp.rf_params,
    cv=5,
    n_iter=10,
    n_jobs=-1,
    verbose=1,
    random_state=42,
)
model.fit(X_train, y_train)

print(model.best_params_)

# Scores from model.score on train, the best cross-validation score, and
# model.score on test, printed in that order.
train_score = model.score(X_train, y_train)
test_score = model.score(X_test, y_test)
print(train_score, model.best_score_, test_score)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   41.9s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:  1.0min finished


{'algo__max_depth': 22, 'algo__max_features': 0.1185260448662222, 'algo__min_samples_leaf': 2, 'algo__n_estimators': 187}
0.5013999665566935 0.3449231012653214 0.3265642181939256


In [8]:
# Display the search space that was passed to RandomizedSearchCV above.
rsp.rf_params

{'algo__n_estimators': Integer(low=100, high=200),
 'algo__max_depth': Integer(low=20, high=80),
 'algo__max_features': Real(low=0.1, high=1, prior='uniform'),
 'algo__min_samples_leaf': Integer(low=1, high=20)}

In [14]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [17]:
# Predict on the test set once and reuse the result for every metric; the
# previous version called model.predict(X_test) three times.
y_test_pred = model.predict(X_test)

print(mean_absolute_error(y_test, y_test_pred))          # MAE
print(np.sqrt(mean_squared_error(y_test, y_test_pred)))  # RMSE
print(r2_score(y_test, y_test_pred))                     # R^2

4137087.925936966
8933318.812836144
0.32656421819392567


In [13]:
# Mean absolute error on the training set, for comparison with the test-set
# MAE printed in the previous cell.
y_train_pred = model.predict(X_train)
mean_absolute_error(y_train, y_train_pred)

3964004.0303264293