In [1]:
# Checking Dependencies

import sys
import subprocess
import importlib.util

print("Checking for necessary packages...")

# A dictionary of (package_name_on_pip, module_name_to_import)
packages = {
    'pandas': 'pandas',
    'numpy': 'numpy',
    'matplotlib': 'matplotlib',
    'seaborn': 'seaborn',
    'scikit-learn': 'sklearn',  # Note: pip name is scikit-learn, import name is sklearn
    'optuna': 'optuna'
}

missing_packages = []
for install_name, import_name in packages.items():
    spec = importlib.util.find_spec(import_name)
    if spec is None:
        print(f"'{install_name}' not found. Adding to installation list.")
        missing_packages.append(install_name)
    else:
        print(f"'{install_name}' is already installed.")

if not missing_packages:
    print("\nAll required packages are already installed!")
else:
    print(f"\nInstalling missing packages: {', '.join(missing_packages)}")
    try:
        # Use sys.executable to ensure pip is called from the correct Python interpreter
        subprocess.check_call([sys.executable, "-m", "pip", "install", *missing_packages])
        print("All missing packages installed successfully.")
    except Exception as e:
        print(f"Error during installation: {e}")
        print("Please try installing the packages manually.")

print("-" * 50)
print("Setup complete. You can now run the rest of your notebook.")

Checking for necessary packages...
'pandas' is already installed.
'numpy' is already installed.
'matplotlib' is already installed.
'seaborn' is already installed.
'torch' is already installed.
'scikit-learn' is already installed.
'optuna' is already installed.

All required packages are already installed! ðŸŽ‰
--------------------------------------------------
Setup complete. You can now run the rest of your notebook.


In [2]:
# Importing necessary libraries and setting seed

# Importing core libraries for data manipulation, visualization, and random operations.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import random
import copy

# Importing scikit-learn for machine learning utilities such as metrics, model selection, and preprocessing.
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import TargetEncoder, OrdinalEncoder, StandardScaler, power_transform
from sklearn.pipeline import Pipeline

# Importing Optuna for hyperparameter optimization and visualization of studies.
import optuna
from optuna.visualization import plot_optimization_history, plot_parallel_coordinate, plot_param_importances

# Setting up necessary seeds for reproducibility across runs.
seed = 42
random.seed(seed)
np.random.seed(seed)

In [3]:
# Load the dataset from 'data.csv' into a pandas DataFrame.
df = pd.read_csv('train.csv')

In [4]:
# Inspecting the shape of the DataFrame (number of rows, number of columns).
df.shape

(1460, 81)

In [5]:
# inspecting data
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

In [6]:
# drop unnecessary features
df.drop(columns='Id',inplace=True)

In [7]:
# Separating features (X) and target (y), and applying a log transformation to the SalePrice (y).
X = df.iloc[:,:-1]
y = np.log(df.iloc[:,-1:])

In [8]:
X.shape, y.shape

((1460, 79), (1460, 1))

In [9]:
# Transforming the 'GarageYrBlt' feature by calculating 1 divided by (2025 - YearBuilt) to represent age inversely.
X.loc[:,'GarageYrBlt'] = 1/(2025-X.GarageYrBlt)

In [10]:
# Splitting the data into training, testing, and development sets.
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.4,random_state=seed)
X_test, X_dev, y_test, y_dev = train_test_split(X_test,y_test,test_size=0.5,random_state=seed)

In [11]:
# Displaying the number of samples in the training, testing, and development sets.
X_train.shape[0],X_test.shape[0],X_dev.shape[0]

(876, 292, 292)

In [12]:
binary_imp = ['CentralAir'] # Features to be imputed with a constant 'N' (binary)
mode_imp = ['Electrical'] # Features to be imputed with the most frequent value
# Features to be imputed with a constant 'NA' (object type columns, excluding mode_imp and binary_imp)
na_imp = X_train.select_dtypes('object').columns.drop([*mode_imp,*binary_imp])
zero_imp = X_train.select_dtypes('number').columns # Numerical features to be imputed with 0

# Defining a ColumnTransformer for imputation strategies based on feature types.
impute = ColumnTransformer([
    ('na_imp',SimpleImputer(strategy='constant',fill_value='NA'), na_imp),
    ('zero_imp',SimpleImputer(strategy='constant',fill_value=0), zero_imp),
    ('mode_imp',SimpleImputer(strategy='most_frequent'), mode_imp),
    ('binary_imp',SimpleImputer(strategy='constant',fill_value='N'),binary_imp)
    ], remainder='passthrough', verbose_feature_names_out=False).set_output(transform='pandas')

In [13]:
# Defining lists of features based on their type and encoding strategy.
ord_feat = ['LotShape','Utilities','LandSlope','BldgType','ExterQual','ExterCond',
            'BsmtQual','BsmtCond','BsmtExposure','BsmtFinType1','BsmtFinType2',
            'HeatingQC','Electrical','KitchenQual','FireplaceQu','GarageFinish',
            'GarageQual','GarageCond','PavedDrive','PoolQC','Fence'] # Ordinal features
num_feat = X_train.select_dtypes('number').columns # Numerical features
bi_feat=['CentralAir'] # Binary features
# Categorical features (excluding ordinal, numerical, and binary)
cat_feat = X_train.columns.drop([*ord_feat,*num_feat,*bi_feat])

# Dictionary defining the order of categories for ordinal encoding.
ord_dic = dict(
    LotShape = ['NA','Reg', 'IR1', 'IR2', 'IR3'],
    Utilities =['NA','AllPub', 'NoSeWr','NoSeWa','ELO'],
    LandSlope =['NA','Gtl', 'Mod', 'Sev'],
    BldgType =['NA','1Fam', '2fmCon', 'Duplex', 'TwnhsE', 'Twnhs'],
    BsmtExposure = ['NA','No', 'Gd','Av','Mn'],
    BsmtFinType1 = ['NA','GLQ', 'ALQ', 'BLQ',  'Rec', 'LwQ','Unf',],
    BsmtFinType2 = ['NA','GLQ', 'ALQ', 'BLQ',  'Rec', 'LwQ','Unf',],
    GarageFinish = ['NA','Fin','RFn','Unf'],
    PavedDrive = ['NA','Y', 'N', 'P'],
    Fence = ['NA','MnPrv', 'GdWo', 'GdPrv', 'MnWw'],
    Electrical = ['NA','SBrkr','FuseA','FuseF','FuseP','Mix'],
    rest =['NA','Ex','Gd','TA', 'Fa','Po'] # Default order for other ordinal features
)

# Creating a list of category orders for the OrdinalEncoder based on ord_dic.
categories = [ord_dic[col] if col in ord_dic.keys() else ord_dic['rest'] for col in ord_feat]

# Defining a ColumnTransformer for encoding different types of features.
encode = ColumnTransformer([
    ('oe', OrdinalEncoder(categories = categories,handle_unknown='use_encoded_value',unknown_value=-1),ord_feat), # Ordinal encoding
    ('te', TargetEncoder(cv=10,shuffle=True,random_state=seed),cat_feat), # Target encoding for other categorical features
    ('oe_bi', OrdinalEncoder(handle_unknown='use_encoded_value',unknown_value=-1),bi_feat) # Ordinal encoding for binary features
],
    remainder='passthrough',
    verbose_feature_names_out=False
                          )

In [14]:
# Preprocessing pipeline
preprocess = Pipeline([
    ('impute',impute),
    ('encoded',encode),
    ('scaling',StandardScaler().set_output(transform='pandas'))
]).set_output(transform='pandas')

# Applying the preprocessing pipeline to the training, development, and test datasets.
X_train = preprocess.fit_transform(X_train,y_train)
X_dev = preprocess.transform(X_dev)
X_test = preprocess.transform(X_test)

  y = column_or_1d(y, warn=True)


In [15]:
# export training, and test dataset to csv
X_train.to_csv('X_train.csv',index=False)
y_train.to_csv('y_train.csv',index=False)

X_dev.to_csv('X_dev.csv',index=False)
y_dev.to_csv('y_dev.csv',index=False)

X_test.to_csv('X_test.csv',index=False)
y_test.to_csv('y_test.csv',index=False)