In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the per-location records; the grid-generation cell below unpacks each
# row as (location, price_per_m2, ne_lat, ne_lng, sw_lat, sw_lng).
traindata_bounds = pd.read_csv('files/csv/traindata_bounds.csv')

# Start from an empty frame that already carries the four output columns.
traindata = pd.DataFrame(
    {column: [] for column in ('location', 'lat', 'lng', 'price_per_m2')}
)

In [3]:
GRID_POINTS_PER_BOX = 100  # approximate number of grid cells laid over each bounding box


def _box_grid(sw_lat, sw_lng, ne_lat, ne_lng, target_points=GRID_POINTS_PER_BOX):
    """Return the (lats, lngs) axes of a square-celled grid covering one bounding box.

    The stride is chosen so that (long_side / stride) * (short_side / stride)
    equals ``target_points``; the same stride is used on both axes.

    Raises:
        ValueError: if either side of the box is not strictly positive (or is
            NaN). Without the check these inputs turn into NaN/inf strides
            whose handling by ``np.arange`` is not guaranteed.
    """
    width = ne_lng - sw_lng
    height = ne_lat - sw_lat
    long_side, short_side = max(width, height), min(width, height)
    if not short_side > 0:  # also catches NaN
        raise ValueError(
            f"degenerate bounding box: sw=({sw_lat}, {sw_lng}), ne=({ne_lat}, {ne_lng})"
        )
    ratio = long_side / short_side
    points_on_short_side = np.sqrt(target_points / ratio)
    stride = short_side / points_on_short_side
    # np.arange excludes its stop value, so extend the stop by one stride to
    # keep the far edge of the box from being dropped.
    lats = np.arange(sw_lat, ne_lat + stride, stride)
    lngs = np.arange(sw_lng, ne_lng + stride, stride)
    return lats, lngs


# Collect every grid point in a plain list and build the frame once at the end:
# appending with `traindata.loc[len(traindata)] = ...` re-allocates the frame on
# every point, and re-running the cell would keep appending duplicates.
grid_rows = []
for location, price_per_m2, ne_lat, ne_lng, sw_lat, sw_lng in traindata_bounds.itertuples(
    index=False, name=None
):
    lats, lngs = _box_grid(sw_lat, sw_lng, ne_lat, ne_lng)
    # Every point of a box inherits that box's location label and price.
    grid_rows.extend(
        (location, lat, lng, price_per_m2) for lat in lats for lng in lngs
    )

traindata = pd.DataFrame(grid_rows, columns=['location', 'lat', 'lng', 'price_per_m2'])

In [None]:
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.model_selection import train_test_split

# X and Y were never defined in the notebook, so this cell failed with a
# NameError on a fresh kernel. Features are the grid coordinates and the target
# is the price attached to each point.
# NOTE(review): the two-feature choice is inferred from the later
# `reshape(1, 15)` (15 = number of degree-4 polynomial terms of 2 inputs) —
# confirm no other columns were intended.
X = traindata[['lat', 'lng']].astype(float)
Y = traindata['price_per_m2'].astype(float)

# Hold out 20% of the points for evaluation; fixed seed for reproducibility.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=2)

In [None]:
# Drop the shuffled index labels left by the split so that every partition is
# indexed 0..n-1 again.
X_train, X_test, Y_train, Y_test = (
    part.reset_index(drop=True)
    for part in (X_train, X_test, Y_train, Y_test)
)

In [None]:
# Fit the standardiser on the training features only, then apply the same
# transform to both partitions.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled, X_test_scaled = (
    scaler.transform(part) for part in (X_train, X_test)
)

# The target is rescaled by a fixed factor of 1e4; predictions are multiplied
# back by 1e4 further down.
Y_train_scaled, Y_test_scaled = (target / 1e4 for target in (Y_train, Y_test))

In [None]:
# Degree-4 polynomial expansion of the standardised features.
# The inputs are recomputed from `scaler` rather than read from X_*_scaled:
# this cell overwrites X_*_scaled, so reading them back would stack a second
# expansion on top of the first whenever the cell is re-run.
poly = PolynomialFeatures(degree=4)
poly.fit(scaler.transform(X_train))
X_train_scaled = poly.transform(scaler.transform(X_train))
X_test_scaled = poly.transform(scaler.transform(X_test))

In [None]:
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV

# Base regressor; the grid search below clones it for every parameter combination.
model = XGBRegressor(
    objective='reg:squarederror',  # squared-error (MSE) loss
    device="cuda",                 # train on the GPU
    # `random_state` is the scikit-learn wrapper's documented seed parameter;
    # `seed` is not a declared argument and only reaches XGBoost via **kwargs.
    random_state=42,
)

# 8 depths x 4 tree counts x 3 learning rates = 96 candidates.
parameters = {
    'max_depth': range(2, 10, 1),           # Maximum depth of each tree
    'n_estimators': range(60, 220, 40),     # Number of boosting rounds (trees)
    'learning_rate': [0.1, 0.01, 0.05]      # Learning rate (step size for updates)
}

# 10-fold CV per candidate (960 fits), ranked by negative MSE (higher is better).
# NOTE(review): n_jobs=-1 starts one worker per CPU core while every fit targets
# the same CUDA device; if GPU memory errors show up, lower n_jobs.
model_cv = GridSearchCV(
    estimator=model,
    param_grid=parameters,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
    cv=10,
    verbose=3
)

# Targets are the 1e4-rescaled prices, so reported errors are in those units.
model_cv.fit(X_train_scaled, Y_train_scaled)


In [None]:
# best_score_ is the mean cross-validated value of the search's scoring metric
# ('neg_mean_squared_error'), not an accuracy, and it is measured on the
# 1e4-rescaled target.
print("tuned hyperparameters (best parameters):", model_cv.best_params_)
print("best CV score (negative MSE, price/1e4 units):", model_cv.best_score_)

In [None]:
# Score the refit best estimator on the held-out split. The search was built
# with scoring='neg_mean_squared_error', so this is a negative MSE on the
# 1e4-rescaled target.
test_score = model_cv.score(X_test_scaled, Y_test_scaled)
test_score

In [None]:
# Switch the fitted model to CPU for inference. Assigning `model_cv.device`
# only created an unused attribute on the GridSearchCV object; the device has
# to be set on the refit XGBoost estimator itself.
model_cv.best_estimator_.set_params(device="cpu")

# Undo the 1e4 target rescaling so errors are in the original price units.
Y_pred = model_cv.predict(X_test_scaled) * 1e4
val_error = Y_pred - Y_test

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Kernel density estimate of the prediction errors on the held-out split.
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(12.5, 3))

sorted_errors = val_error.reset_index(drop=True).sort_values()
sns.kdeplot(data=sorted_errors, ax=ax)

ax.set_xlabel("price_per_m2 error")
ax.set_ylabel("Density")
ax.grid(True, alpha=0.6, linestyle="--")
plt.show()

In [None]:
# Spot-check one held-out row: predicted vs. actual price, both formatted with
# '.' as the thousands separator.
sample_idx = 300  # row of the test split to inspect

# reshape(1, -1) keeps the row 2-D without hard-coding the feature count, so the
# cell keeps working if the polynomial degree or the input features change.
# NOTE: this reuses the name Y_pred, replacing the full test-set predictions.
Y_pred = model_cv.predict(X_test_scaled[sample_idx].reshape(1, -1)) * 1e4

tuple(
    "{:,}".format(int(value)).replace(",", ".")
    for value in (Y_pred[0], Y_test[sample_idx])
)