<a href="https://colab.research.google.com/github/rezatan/RumahJaksel/blob/main/RumahJaksel.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
from tensorflow import keras
from tensorflow.keras import layers
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer
from sklearn.metrics import mean_absolute_error
from xgboost import XGBRegressor

In [2]:
# Load the listings CSV into a DataFrame.
# The path is relative, so it is resolved against the kernel's working directory.
DATA_PATH = 'HargaRumahJaksel.csv'
df = pd.read_csv(DATA_PATH)

In [3]:
# Preview the first five rows to sanity-check the columns that were loaded
df.head(n=5)

Unnamed: 0,HARGA,LT,LB,JKT,JKM,GRS,KOTA
0,28000000000,1100,700,5,6,ADA,JAKSEL
1,19000000000,824,800,4,4,ADA,JAKSEL
2,4700000000,500,400,4,3,ADA,JAKSEL
3,4900000000,251,300,5,4,ADA,JAKSEL
4,28000000000,1340,575,4,5,ADA,JAKSEL


In [4]:
# Keep only rows that have a target value. `df` itself is never mutated, so this
# cell can be re-run at any time and always starts from the same loaded data.
labelled = df.dropna(axis=0, subset=['HARGA'])

# Target: the HARGA column.
y = labelled['HARGA']

# Predictors: every column except the target.
# 'Unnamed: 0' is removed as well: in the preview it simply counts rows (0, 1, 2, ...),
# which looks like a row index saved into the CSV rather than a property of a house.
# Being int64 it would otherwise be picked up as a numeric feature downstream.
# NOTE(review): confirm 'Unnamed: 0' carries no real information before relying on this.
# errors='ignore' keeps the cell working if the CSV is re-exported without that column.
X = labelled.drop(columns=['HARGA', 'Unnamed: 0'], errors='ignore')

# Break off validation set from training data (80% train / 20% validation);
# random_state is fixed so the split is reproducible.
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, train_size=0.8, test_size=0.2, random_state=0
)



In [5]:
# "Cardinality" means the number of unique values in a column
# Select categorical columns with relatively low cardinality (convenient but arbitrary)
# NOTE: this cell overwrites X_train / X_valid with their encoded versions.
# Re-run the train/validation split cell before re-running this one, otherwise the
# column selection below is applied to already-encoded frames.
low_cardinality_cols = [
    cname for cname in X_train.columns
    if X_train[cname].dtype == "object" and X_train[cname].nunique() < 10
]

# Select numeric columns
numeric_cols = [cname for cname in X_train.columns if X_train[cname].dtype in ['int64', 'float64']]

# Keep selected columns only
my_cols = low_cardinality_cols + numeric_cols
X_train = X_train[my_cols].copy()
X_valid = X_valid[my_cols].copy()

# One-hot encode the data (to shorten the code, we use pandas)
X_train = pd.get_dummies(X_train)

# Encode the validation frame separately, then force it onto the training columns:
#   - dummy columns that exist only in validation are dropped (the model never saw them);
#   - dummy columns that exist only in training are added and filled with 0, because the
#     category is absent from those rows. Without fill_value they would be NaN, i.e.
#     "value missing", which is a different statement from "category not present".
X_valid = pd.get_dummies(X_valid).reindex(columns=X_train.columns, fill_value=0)

In [11]:
def get_mae(leaf_size, X_train, X_valid, y_train, y_valid, learning_rate=0.001):
    """Fit an XGBRegressor and return its mean absolute error on the validation data.

    Parameters
    ----------
    leaf_size : int
        Passed to ``XGBRegressor`` as ``n_estimators``. The parameter name is kept
        unchanged so existing callers keep working, even though it does not
        control a leaf count.
    X_train, y_train
        Features and target the model is fitted on.
    X_valid, y_valid
        Features and target the fitted model is scored on.
    learning_rate : float, optional
        Passed to ``XGBRegressor`` as ``learning_rate``. Defaults to 0.001, the
        value that was previously hard-coded.

    Returns
    -------
    The value returned by ``mean_absolute_error(y_valid, predictions)``.
    """
    # Define the model
    model = XGBRegressor(n_estimators=leaf_size, learning_rate=learning_rate)

    # Fit on the training split only
    model.fit(X_train, y_train)

    # Predict once and score once (the previous version did both twice and
    # discarded the first result).
    predictions = model.predict(X_valid)
    return mean_absolute_error(y_valid, predictions)

# Candidate values to try. Despite the variable name, get_mae passes each value to
# XGBRegressor as n_estimators, so these are numbers of boosting rounds.
candidate_max_leaf_nodes = [1000, 1500, 2000, 3000, 4000, 5000]

# Validation MAE for every candidate, keyed by the candidate value
scores = {
    n_rounds: get_mae(n_rounds, X_train, X_valid, y_train, y_valid)
    for n_rounds in candidate_max_leaf_nodes
}

# The best candidate is the one with the lowest validation MAE
# (it will be one of the values in candidate_max_leaf_nodes)
best_tree_size = min(scores, key=lambda n_rounds: scores[n_rounds])
print(scores)
print(best_tree_size)

{1000: 7115260140.577114, 1500: 5832562156.577114, 2000: 5379984840.915422, 3000: 5196018187.144279, 4000: 5215209121.432836, 5000: 5214171771.223881}
3000
