In [1]:
import os

os.makedirs('./data', exist_ok=True)

!curl -L -o ./data/housing-prices-dataset.zip \
    https://www.kaggle.com/api/v1/datasets/download/yasserh/housing-prices-dataset
    
if os.path.exists('./data/housing-prices-dataset.zip'):
    !unzip -o ./data/housing-prices-dataset.zip -d ./data
    os.remove('./data/housing-prices-dataset.zip')
else:
    print("File download failed. Please check the URL or your internet connection.")

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
100  4740  100  4740    0     0   6857      0 --:--:-- --:--:-- --:--:--  6857
Archive:  ./data/housing-prices-dataset.zip
  inflating: ./data/Housing.csv      


In [2]:
import pandas as pd

df = pd.read_csv('./data/Housing.csv')

print(df.head())
print(df.info())


      price  area  bedrooms  bathrooms  stories mainroad guestroom basement  \
0  13300000  7420         4          2        3      yes        no       no   
1  12250000  8960         4          4        4      yes        no       no   
2  12250000  9960         3          2        2      yes        no      yes   
3  12215000  7500         4          2        2      yes        no      yes   
4  11410000  7420         4          1        2      yes       yes      yes   

  hotwaterheating airconditioning  parking prefarea furnishingstatus  
0              no             yes        2      yes        furnished  
1              no             yes        3       no        furnished  
2              no              no        2      yes   semi-furnished  
3              no             yes        3      yes        furnished  
4              no             yes        2       no        furnished  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 545 entries, 0 to 544
Data columns (total 13 colu

In [3]:
df.columns

Index(['price', 'area', 'bedrooms', 'bathrooms', 'stories', 'mainroad',
       'guestroom', 'basement', 'hotwaterheating', 'airconditioning',
       'parking', 'prefarea', 'furnishingstatus'],
      dtype='object')

In [4]:
df.head()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,13300000,7420,4,2,3,yes,no,no,no,yes,2,yes,furnished
1,12250000,8960,4,4,4,yes,no,no,no,yes,3,no,furnished
2,12250000,9960,3,2,2,yes,no,yes,no,no,2,yes,semi-furnished
3,12215000,7500,4,2,2,yes,no,yes,no,yes,3,yes,furnished
4,11410000,7420,4,1,2,yes,yes,yes,no,yes,2,no,furnished


**Numerical Features:**
- Price
- Area

**Categorical Features:**
- **Ordinal:**
  - Bedrooms
  - Bathrooms
  - Stories
  - Parking
- **One-Hot:**
  - Mainroad
  - Guestroom
  - Basement
  - Hotwaterheating
  - Airconditioning
  - Parking
  - Prefarea
  - Furnishingstatus


In [5]:
df.isnull().sum()

price               0
area                0
bedrooms            0
bathrooms           0
stories             0
mainroad            0
guestroom           0
basement            0
hotwaterheating     0
airconditioning     0
parking             0
prefarea            0
furnishingstatus    0
dtype: int64

In [6]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder, RobustScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

# features
numerical_features = ['area']
categorical_features_ordinal = ['bedrooms', 'bathrooms', 'stories', 'parking']
categorical_features_onehot = ['mainroad', 'guestroom', 'basement', 'hotwaterheating', 'airconditioning', 'prefarea', 'furnishingstatus']

In [31]:
# Transform

numerical_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="mean")),
        ("rob_scaler", RobustScaler()),
        ("st_scaler", StandardScaler()),
    ]
)

categorial_transformer_ordinal = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        (
            "ordinal",
            OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1),
        ),
    ]
)

categorical_transformer_onehot = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", OneHotEncoder(handle_unknown="ignore")),
    ]
)

In [32]:
# Apply Transformations using Column Transformer

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat_ordinal', categorial_transformer_ordinal, categorical_features_ordinal),
        ('cat_onehot', categorical_transformer_onehot, categorical_features_onehot)
    ]
)

In [33]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression

models = {
    'RandomForest': RandomForestRegressor(),
    'SVR': SVR(),
    'LinearRegression': LinearRegression()
}

In [34]:
from sklearn.model_selection import train_test_split, GridSearchCV

X_train, X_test, y_train, y_test = train_test_split(
    df.drop(["price"], axis=1), df[["price"]], test_size=0.2, random_state=42
)

y_train = y_train.values.ravel()
y_test = y_test.values.ravel()


results = {}

for model_name, model in models.items():
    pipeline = Pipeline(steps=[("preprocessor", preprocessor), ("model", model)])

    # Define parameter grid for hyperparameter tuning
    if model_name == "RandomForest":
        param_grid = {
            "model__n_estimators": [10, 50, 100],
            "model__max_depth": [None, 10, 20],  # RandomForest-specific params
        }
    elif model_name == "SVR":
        param_grid = {
            "model__C": [0.1, 1, 10],
            "model__kernel": ["linear", "rbf"],  # SVR-specific params
        }
    elif model_name == "LinearRegression":
        param_grid = {"model__fit_intercept": [True, False]}  # For LinearRegression

    grid_search = GridSearchCV(
        pipeline, param_grid, cv=5, scoring="neg_mean_squared_error"
    )

    grid_search.fit(X_train, y_train)

    results[model_name] = {
        "best_score": grid_search.best_score_,
        "best_params": grid_search.best_params_,
    }

for model_name, result in results.items():
    print(
        f"{model_name}: Best Score = {result['best_score']}, Best Params = {result['best_params']}"
    )

RandomForest: Best Score = -1256867837402.8118, Best Params = {'model__max_depth': 20, 'model__n_estimators': 100}
SVR: Best Score = -3252999917722.45, Best Params = {'model__C': 10, 'model__kernel': 'linear'}
LinearRegression: Best Score = -1140572613700.7793, Best Params = {'model__fit_intercept': True}


In [35]:
for model_name, result in results.items():
    print(
        f"{model_name}: Best Score = {result['best_score']}, Best Params = {result['best_params']}"
    )

RandomForest: Best Score = -1256867837402.8118, Best Params = {'model__max_depth': 20, 'model__n_estimators': 100}
SVR: Best Score = -3252999917722.45, Best Params = {'model__C': 10, 'model__kernel': 'linear'}
LinearRegression: Best Score = -1140572613700.7793, Best Params = {'model__fit_intercept': True}


In [None]:
numerical_transformer.fit_transform(df[numerical_features])

In [None]:
# X_train_transformed = preprocessor.fit_transform(X_train)
X_train

In [115]:
numerical_features_df = pd.DataFrame(
    data= numerical_transformer.fit_transform(df[numerical_features]),
    columns =  numerical_features
)

categorical_features_ordinal_df = pd.DataFrame(
    data= categorial_transformer_ordinal.fit_transform(df[categorical_features_ordinal])
)

categorical_features_onehot_df = pd.DataFrame(
    data= categorical_transformer_onehot.fit_transform(df[categorical_features_onehot])
)

In [None]:
categorical_features_ordinal_df

In [None]:
display(categorical_features_onehot_df)

In [None]:
pipeline.fit(X_train, y_train)

In [None]:
pipeline.score(X_test, y_test)

In [None]:
from scipy.stats import f_oneway

df_transformed = preprocessor.fit_transform(df)

columns = numerical_features + \
          categorical_features_ordinal + \
          list(preprocessor.transformers_[2][1].named_steps['onehot'].get_feature_names_out(categorical_features_onehot))
df_transformed = pd.DataFrame(df_transformed, columns=columns)

In [None]:
df_transformed

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

correlation_matrix = df_transformed.corr()
plt.figure(figsize=(12, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f')
plt.title('Correlation Matrix')
plt.show()

In [None]:
from scipy import stats

# Group data by 'bedrooms' and collect 'price' values
grouped = [group['price'].values for name, group in df.groupby('bedrooms')]

# Perform one-way ANOVA
f_statistic, p_value = stats.f_oneway(*grouped)

# Display results
print(f"F-statistic: {f_statistic}")
print(f"P-value: {p_value}")


In [37]:
pipeline = Pipeline(steps=[("preprocessor", preprocessor), ("model", LinearRegression())])

pipeline.fit(X_train, y_train)

pipeline.score(X_test, y_test)

0.6529242642153172

In [38]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 545 entries, 0 to 544
Data columns (total 13 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   price             545 non-null    int64 
 1   area              545 non-null    int64 
 2   bedrooms          545 non-null    int64 
 3   bathrooms         545 non-null    int64 
 4   stories           545 non-null    int64 
 5   mainroad          545 non-null    object
 6   guestroom         545 non-null    object
 7   basement          545 non-null    object
 8   hotwaterheating   545 non-null    object
 9   airconditioning   545 non-null    object
 10  parking           545 non-null    int64 
 11  prefarea          545 non-null    object
 12  furnishingstatus  545 non-null    object
dtypes: int64(6), object(7)
memory usage: 55.5+ KB


In [59]:
import numpy as np
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# List of numerical and categorical columns
numerical_features = ["area", "bedrooms", "bathrooms", "stories", "parking"]
categorical_features_ordinal = [
    "mainroad",
    "guestroom",
    "basement",
    "hotwaterheating",
    "airconditioning",
]
categorical_features_onehot = ["prefarea", "furnishingstatus"]

# Define the transformers for each column type
numerical_transformer = Pipeline(
    steps=[("imputer", SimpleImputer(strategy="mean")), ("scaler", StandardScaler())]
)

categorical_transformer_ordinal = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        (
            "ordinal",
            OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=np.nan),
        ),
    ]
)


categorical_transformer_onehot = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", OneHotEncoder(handle_unknown="ignore")),
    ]
)

# ColumnTransformer to apply different transformations to different columns
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numerical_transformer, numerical_features),
        ("cat_ordinal", categorical_transformer_ordinal, categorical_features_ordinal),
        ("cat_onehot", categorical_transformer_onehot, categorical_features_onehot),
    ]
)

# Example of fitting a pipeline with this preprocessor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

# Split your dataset
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(["price"], axis=1), df["price"], test_size=0.2, random_state=42
)

# Define the model
model = Pipeline(
    steps=[("preprocessor", preprocessor), ("regressor", LinearRegression())]
)

# Fit the model
model.fit(X_train, y_train)
model.score(X_test, y_test)

-0.101563919104108

In [60]:
import xgboost as xgb
from sklearn.model_selection import GridSearchCV

# Define the model
model = Pipeline(steps=[('preprocessor', preprocessor), ('regressor', xgb.XGBRegressor())])

# Hyperparameter tuning
param_grid = {
    'regressor__n_estimators': [50, 100, 200],
    'regressor__max_depth': [3, 5, 10],
    'regressor__learning_rate': [0.01, 0.1, 0.2],
    'regressor__subsample': [0.8, 1.0]
}

grid_search = GridSearchCV(model, param_grid, cv=5, scoring="neg_mean_squared_error", n_jobs=-1)
grid_search.fit(X_train, y_train)

# Get results
best_model = grid_search.best_estimator_
print(f"Best Params: {grid_search.best_params_}")
print(f"Best Score: {grid_search.best_score_}")


Traceback (most recent call last):
  File "/home/oak/coding/practice/machine-learning/housing-regression/.venv/lib/python3.12/site-packages/sklearn/model_selection/_validation.py", line 949, in _score
    scores = scorer(estimator, X_test, y_test, **score_params)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/oak/coding/practice/machine-learning/housing-regression/.venv/lib/python3.12/site-packages/sklearn/metrics/_scorer.py", line 288, in __call__
    return self._score(partial(_cached_call, None), estimator, X, y_true, **_kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/oak/coding/practice/machine-learning/housing-regression/.venv/lib/python3.12/site-packages/sklearn/metrics/_scorer.py", line 380, in _score
    y_pred = method_caller(
             ^^^^^^^^^^^^^^
  File "/home/oak/coding/practice/machine-learning/housing-regression/.venv/lib/python3.12/site-packages/sklearn/metrics/_scorer.py", lin

Best Params: {'regressor__learning_rate': 0.01, 'regressor__max_depth': 3, 'regressor__n_estimators': 50, 'regressor__subsample': 0.8}
Best Score: nan
