<a href="https://colab.research.google.com/github/inspire007/KaggleCompetitions/blob/main/Prediction_interval_competition_II_House_price/Prediction_interval_competition_II_House_price.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [10]:
import numpy as np
import pandas as pd
from sklearn.compose import make_column_selector
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from xgboost import XGBRegressor
from sklearn.model_selection import RandomizedSearchCV

In [11]:
def winkler_score(y_true, lower, upper, alpha=0.1):
    y_true = np.asarray(y_true)
    lower = np.asarray(lower)
    upper = np.asarray(upper)

    score = np.zeros_like(y_true, dtype=float)
    interval_width = upper - lower

    below = y_true < lower
    above = y_true > upper
    inside = ~below & ~above

    # Case 1: Inside the interval
    score[inside] = interval_width[inside]

    # Case 2: Below the interval
    score[below] = interval_width[below] + (2 / alpha) * (lower[below] - y_true[below])

    # Case 3: Above the interval
    score[above] = interval_width[above] + (2 / alpha) * (y_true[above] - upper[above])

    return score.mean()


In [12]:
data = pd.read_csv('dataset.csv')
y = data.iloc[:, 2].values
X = data.iloc[:, :]
X = X.drop(['id', 'sale_price', 'latitude', 'longitude'], axis=1)
X['sale_date'] = pd.to_datetime(X['sale_date'])
X['sale_date'] = X['sale_date'].astype('int64') // 10**9
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

In [13]:
num_cols = make_column_selector(dtype_include='number')
cat_cols = make_column_selector(dtype_include='object')

num_pipe = Pipeline([
    ('impute', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

cat_pipe = Pipeline([
    ('impute', SimpleImputer(strategy='most_frequent')),
    ('encoder', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1))
])

ctx = ColumnTransformer([
    ('num', num_pipe, num_cols),
    ('cat', cat_pipe, cat_cols)
], remainder='passthrough')

X_train = ctx.fit_transform(X_train)
X_test = ctx.transform(X_test)

In [None]:
gbr = XGBRegressor(n_estimators=500, learning_rate=0.1)
gbr.fit(X_train, y_train)
y_pred = gbr.predict(X_test)

In [None]:
print(y_pred, y_test)

r2 = r2_score(y_test, y_pred)
print(f'r2 score for XGB: {r2}\n')

mse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f'mse score for XGB: {mse}\n')

[ 310015.75  409381.84 1163057.2  ...  392817.4   905017.4   374788.44] [ 254920  400000 1125000 ...  395000  800000  447500]
r2 score for XGB: 0.9355276823043823

mse score for XGB: 105989.53608729495



In [14]:
X = ctx.fit_transform(X)

In [16]:
param_dist = {
    'n_estimators': [100, 200, 300, 500, 700, 1000],
    'learning_rate': [0.01, 0.03, 0.05, 0.1, 0.2, 0.3],
    'max_depth': [3, 4, 5, 6, 7, 8, 10, 12],
    'min_child_weight': [1, 3, 5, 7, 10],
    'gamma': [0, 0.1, 0.3, 0.5, 1.0, 1.5],
    'subsample': [0.6, 0.7, 0.8, 0.9, 1.0],
    'colsample_bytree': [0.6, 0.7, 0.8, 0.9, 1.0],
    'reg_alpha': [0, 0.001, 0.01, 0.1, 1, 10],
    'reg_lambda': [0, 0.01, 0.1, 1, 10, 100],
}

xgb = XGBRegressor(objective='reg:quantileerror', quantile_alpha=0.025)
rGrid = RandomizedSearchCV(
    estimator=xgb, param_distributions=param_dist, n_iter=20, verbose=1, cv=2, random_state=42, n_jobs=-1
)
rGrid.fit(X,y)
print(f'Best params: {rGrid.best_params_}')

Fitting 2 folds for each of 20 candidates, totalling 40 fits


Parameters: { "scale_pos_weight" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Best params: {'subsample': 0.6, 'scale_pos_weight': 5, 'reg_lambda': 1, 'reg_alpha': 0, 'n_estimators': 1000, 'min_child_weight': 7, 'max_depth': 8, 'learning_rate': 0.2, 'gamma': 0.1, 'colsample_bytree': 1.0}


In [18]:
gbr_low = XGBRegressor(objective='reg:quantileerror', quantile_alpha=0.025, subsample= 0.6, reg_lambda= 1, reg_alpha= 0, n_estimators= 1000, min_child_weight= 7, max_depth= 8, learning_rate= 0.2, gamma= 0.1, colsample_bytree= 1.0)
gbr_low.fit(X, y)

In [19]:
gbr_up = XGBRegressor(objective='reg:quantileerror', quantile_alpha=0.975, subsample= 0.6, reg_lambda= 1, reg_alpha= 0, n_estimators= 1000, min_child_weight= 7, max_depth= 8, learning_rate= 0.2, gamma= 0.1, colsample_bytree= 1.0)
gbr_up.fit(X, y)

In [None]:
#print(winkler_score(y_test, y_pred_lower, y_pred_upper))

376000.99179140624


In [20]:
test_data = pd.read_csv('test.csv')
X_t = test_data
X_t = X_t.drop(['id', 'latitude', 'longitude'], axis=1)
X_t['sale_date'] = pd.to_datetime(X_t['sale_date'])
X_t['sale_date'] = X_t['sale_date'].astype('int64') // 10**9
X_t = ctx.transform(X_t)

y_t_low = gbr_low.predict(X_t)
y_t_up = gbr_up.predict(X_t)

print(y_t_low, y_t_up)

[830149.8  527810.3  430041.56 ... 380596.06 464350.4  499473.53] [1115674.8   748730.44  854816.8  ...  541379.25  571207.56  662126.4 ]


In [21]:
np.savetxt('output.csv', np.column_stack((test_data['id'], y_t_low, y_t_up)), header='id,pi_lower,pi_upper', comments='', fmt=['%d', '%.2f', '%.2f'], delimiter=',')