In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily collect the full path of every file below the Kaggle input directory,
# then print them one per line in walk order.
input_files = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_files:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Imports
---

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import r2_score, mean_squared_error

In [None]:
# Read the housing-prices CSV into `df`; the bare name on the last line
# renders the frame as the cell output.
df = pd.read_csv('/kaggle/input/housing-prices-dataset/Housing.csv')
df

# Data Walkthrough and EDA

In [None]:
df.info()

In [None]:
df.describe()

In [None]:
# Split the columns by dtype family instead of comparing against the literal
# dtype names 'object' / 'int64': numeric columns of any width (int32, float64,
# ...) count as numerical, and every non-numeric column counts as categorical.
# Column order is preserved in both lists.
num_cols = df.select_dtypes(include='number').columns.tolist()
cat_cols = df.select_dtypes(exclude='number').columns.tolist()

In [None]:
# For each categorical feature, print its name between two rulers followed by
# the frequency of every category.
heavy_rule, light_rule = '=' * 50, '-' * 50
for feature in cat_cols:
    print(heavy_rule, feature, light_rule, df[feature].value_counts(), sep='\n')

In [None]:
cat_cols, num_cols

In [None]:
# One histogram per column on a 5x3 grid.
fig, axes = plt.subplots(nrows=5, ncols=3, figsize=(20, 20))
panels = axes.ravel()

# Blank out every panel first; only panels that receive a column are switched
# back on, so leftover grid cells stay empty.
for panel in panels:
    panel.set_axis_off()

for column, panel in zip(df.columns, panels):
    panel.set_axis_on()
    sns.histplot(data=df, x=column, ax=panel)

In [None]:
# 5x3 grid: count plots for categorical columns, scatter plots against price
# for everything else, all colored by furnishing status.
fig, axes = plt.subplots(nrows=5, ncols=3, figsize=(20, 20))
panels = axes.ravel()

# Hide all panels up front; used panels are re-enabled in the loop below.
for panel in panels:
    panel.set_axis_off()

shared_style = dict(data=df, hue='furnishingstatus', palette='bright')
for column, panel in zip(df.columns, panels):
    panel.set_axis_on()
    if column not in cat_cols:
        sns.scatterplot(x=column, y='price', ax=panel, **shared_style)
    else:
        sns.countplot(x=column, ax=panel, **shared_style)

In [None]:
# Pairwise relationship plots for the frame, colored by furnishing status.
sns.pairplot(data=df, hue='furnishingstatus', palette='bright')

In [None]:
# Box plots drawn from the raw frame on a single shared axis.
# NOTE(review): all columns share one y-scale, so large-valued columns will
# presumably dominate the view - confirm this is the intended comparison.
sns.boxplot(data=df)

In [None]:
# Distribution of `area`: density-normalized histogram with a KDE overlay.
# `sns.distplot` is deprecated; `histplot(kde=True, stat='density')` is the
# replacement and is already the histogram API used earlier in this notebook.
sns.histplot(df['area'], kde=True, stat='density')

# Preprocessing

## 1. LabelEncoding

In [None]:
cat_cols

In [None]:
from sklearn.preprocessing import LabelEncoder

# Encode on a copy so the raw frame stays untouched for the later sections.
df_encoded = df.copy()
le = LabelEncoder()

star_rule, heavy_rule, light_rule = '*' * 50, '=' * 50, '-' * 50
for feature in cat_cols:
    # The single encoder is refit on each column, so after the loop it only
    # remembers the classes of the last column processed.
    df_encoded[feature] = le.fit_transform(df_encoded[feature])

    # Before/after report of the category frequencies for this feature.
    report = (
        star_rule,
        f'FEATURE : {feature}',
        heavy_rule,
        'BEFORE',
        light_rule,
        df[feature].value_counts(),
        light_rule,
        'AFTER',
        light_rule,
        df_encoded[feature].value_counts(),
    )
    print(*report, sep='\n')

In [None]:
# Box plots of the label-encoded frame. The original call passed no data and
# therefore had nothing to draw.
# NOTE(review): `df_encoded` is assumed to be the intended input since this
# cell directly follows the encoding step - confirm.
sns.boxplot(data=df_encoded)

# Baseline Modeling

## 1. Without Scaling

In [None]:
from sklearn.model_selection import train_test_split, KFold
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor

# Target is `price`; every other label-encoded column is a predictor.
y = df_encoded['price']
X = df_encoded.drop(columns='price')

# Hold out 25% of the rows for evaluation; the fixed seed keeps the split stable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=0
)

# Linear-kernel SVR on the unscaled features.
svr = SVR(kernel='linear')
svr.fit(X_train, y_train)
y_preds = svr.predict(X_test)

print('METRICS')
for label, metric in (('R2_SCORE :', r2_score), ('MSE :', mean_squared_error)):
    print(label, metric(y_test, y_preds))

In [None]:
# Default-kernel SVR on the unscaled features, scored on the hold-out split.
svr = SVR()
svr.fit(X_train, y_train)
y_preds = svr.predict(X_test)

r2 = r2_score(y_test, y_preds)
mse = mean_squared_error(y_test, y_preds)

print('METRICS')
print('R2_SCORE :', r2)
print('MSE :', mse)
# If R2 comes out negative, the model does worse than always predicting the
# mean of y_test, i.e. it is not useful.

In [None]:
# Random forest on the unscaled features; seeded for repeatable results.
rf = RandomForestRegressor(random_state=0)
rf.fit(X_train, y_train)
y_preds = rf.predict(X_test)

print('METRICS')
for label, metric in (('R2_SCORE :', r2_score), ('MSE :', mean_squared_error)):
    print(label, metric(y_test, y_preds))

In [None]:
# Put actual and predicted prices side by side; the frame inherits the index
# of `y_test`, and `y_preds` is attached positionally.
preds = pd.DataFrame({'y_test': y_test}).assign(y_pred=y_preds)

sns.lineplot(data=preds)

In [None]:
# Plot predictions (dotted) and actual prices against their position in the
# test split rather than against the original row index.
positions = np.arange(len(y_preds))
plt.plot(positions, y_preds, linestyle='dotted', label='Prediction')
plt.plot(positions, y_test, label='Actual')

plt.legend()
plt.show()

In [None]:
# Residual = actual - predicted; the red line at zero marks a perfect prediction.
preds['residual'] = preds['y_test'].sub(preds['y_pred'])

n_points = len(preds['y_pred'])
sns.scatterplot(data=preds, y='residual', x=np.arange(n_points))
plt.hlines(y=0, xmin=0, xmax=n_points, color='red')

## 2. Min-Max Scaling

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Learn the per-feature min/max from the training split only, then apply the
# same transformation to both splits so no test information leaks in.
sca = MinMaxScaler().fit(X_train)
X_train_scaled = sca.transform(X_train)
X_test_scaled = sca.transform(X_test)

In [None]:
# Linear-kernel SVR on the min-max scaled features.
svr = SVR(kernel='linear')
svr.fit(X_train_scaled, y_train)
y_preds = svr.predict(X_test_scaled)

r2 = r2_score(y_test, y_preds)
mse = mean_squared_error(y_test, y_preds)

print('METRICS')
print('R2_SCORE :', r2)
print('MSE :', mse)

In [None]:
# Default-kernel SVR on the min-max scaled features.
svr = SVR()
svr.fit(X_train_scaled, y_train)
y_preds = svr.predict(X_test_scaled)

print('METRICS')
for label, metric in (('R2_SCORE :', r2_score), ('MSE :', mean_squared_error)):
    print(label, metric(y_test, y_preds))

In [None]:
# Random forest on the min-max scaled features; same seed as the unscaled run
# so the two results are directly comparable.
rf = RandomForestRegressor(random_state=0)
rf.fit(X_train_scaled, y_train)
y_preds = rf.predict(X_test_scaled)

r2 = r2_score(y_test, y_preds)
mse = mean_squared_error(y_test, y_preds)

print('METRICS')
print('R2_SCORE :', r2)
print('MSE :', mse)

## 3. One-Hot Encoding

In [None]:
df

In [None]:
# Expand every categorical column into indicator columns. `get_dummies`
# returns a new frame, so the raw `df` is left unchanged.
df_onehot = pd.get_dummies(df, columns=cat_cols)
df_onehot

In [None]:
# Split the one-hot encoded data (default test size, fixed seed).
onehot_features = df_onehot.drop(columns='price')
onehot_target = df_onehot['price']
X_train_one_hot, X_test_one_hot, y_train_one_hot, y_test_one_hot = train_test_split(
    onehot_features, onehot_target, random_state=0
)

# Scale using statistics from the training split only; the split names are
# rebound to the scaled arrays from here on.
sca = MinMaxScaler()
X_train_one_hot = sca.fit_transform(X_train_one_hot)
X_test_one_hot = sca.transform(X_test_one_hot)

# Linear-kernel SVR on the scaled one-hot features.
svr = SVR(kernel='linear')
svr.fit(X_train_one_hot, y_train_one_hot)
y_preds = svr.predict(X_test_one_hot)

print('METRICS')
for label, metric in (('R2_SCORE :', r2_score), ('MSE :', mean_squared_error)):
    print(label, metric(y_test_one_hot, y_preds))

In [None]:
# Default-kernel SVR on the scaled one-hot features.
svr = SVR()
svr.fit(X_train_one_hot, y_train_one_hot)
y_preds = svr.predict(X_test_one_hot)

r2 = r2_score(y_test_one_hot, y_preds)
mse = mean_squared_error(y_test_one_hot, y_preds)

print('METRICS')
print('R2_SCORE :', r2)
print('MSE :', mse)

In [None]:
# Random forest on the scaled one-hot features.
rf = RandomForestRegressor(random_state=0)
rf.fit(X_train_one_hot, y_train_one_hot)
y_preds = rf.predict(X_test_one_hot)

print('METRICS')
for label, metric in (('R2_SCORE :', r2_score), ('MSE :', mean_squared_error)):
    print(label, metric(y_test_one_hot, y_preds))

In [None]:
from catboost import CatBoostRegressor

# CatBoost with default hyperparameters on the scaled one-hot features;
# training logs are silenced.
cb = CatBoostRegressor(random_state=0, verbose=False)
cb.fit(X_train_one_hot, y_train_one_hot)
y_preds = cb.predict(X_test_one_hot)

r2 = r2_score(y_test_one_hot, y_preds)
mse = mean_squared_error(y_test_one_hot, y_preds)

print('METRICS')
print('R2_SCORE :', r2)
print('MSE :', mse)

### Tuning CatBoost

In [None]:
import optuna
import numpy as np
from sklearn.datasets import make_regression
from catboost import CatBoostRegressor
from sklearn.model_selection import cross_val_score

def objective(trial):
    """Optuna objective: mean cross-validated MSE of a CatBoostRegressor.

    The model is evaluated with 3-fold cross-validation on the housing
    training split (`X_train_one_hot`, `y_train_one_hot`) so the tuned
    hyperparameters are fitted to the data they are later applied to. The
    held-out test split is never used here.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample `n_estimators`, `learning_rate`, `depth` and
        `l2_leaf_reg`.

    Returns
    -------
    float
        Mean MSE across the 3 folds (lower is better).
    """
    # Hyperparameter search space; the sampling order is kept stable so a
    # seeded sampler reproduces the same sequence of trials.
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 10, 100),
        "learning_rate": trial.suggest_float("learning_rate", 0.001, 1.0, log=True),
        "depth": trial.suggest_int("depth", 4, 10),
        "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 1e-9, 100, log=True),
    }

    model = CatBoostRegressor(random_state=42, verbose=0, **params)

    # scikit-learn reports the negated MSE for this scorer; flip the sign so
    # the study can minimize a positive error.
    mse_scores = -cross_val_score(
        model,
        X_train_one_hot,
        y_train_one_hot,
        cv=3,
        scoring='neg_mean_squared_error',
    )

    return np.mean(mse_scores)

# Seed the sampler so the hyperparameter search proposes the same trials on
# every Restart & Run All.
study = optuna.create_study(
    direction="minimize",
    sampler=optuna.samplers.TPESampler(seed=0),
)
study.optimize(objective, n_trials=10)

print("Number of finished trials: ", len(study.trials))
print("Best trial:")
# `trial` is read by a later cell to extract the best hyperparameters.
trial = study.best_trial

print("Value: ", trial.value)
print("Params: ")
for key, value in trial.params.items():
    print(f"    {key}: {value}")

In [None]:
# Keep the best trial's hyperparameters so the next cell can refit CatBoost with them.
best_params = trial.params

In [None]:
# Refit CatBoost with the tuned hyperparameters and score it on the same
# hold-out split as the untuned baseline.
cb = CatBoostRegressor(random_state=0, verbose=False, **best_params)
cb.fit(X_train_one_hot, y_train_one_hot)
y_preds = cb.predict(X_test_one_hot)

print('METRICS')
for label, metric in (('R2_SCORE :', r2_score), ('MSE :', mean_squared_error)):
    print(label, metric(y_test_one_hot, y_preds))

In [None]:
df_onehot.shape

In [None]:
# Annotated correlation matrix of the one-hot encoded frame (target included).
corr_matrix = df_onehot.corr()

plt.figure(figsize=(20, 20))
sns.heatmap(corr_matrix, annot=True, cmap='Blues')