In [1]:
import os
import sys

# Locate the project root so that project-local packages can be imported:
# when the working directory ends with 'notebooks', the root is its parent,
# otherwise the working directory itself is used.
current_dir = os.getcwd()
project_root = (
    os.path.abspath(os.path.join(current_dir, '..'))
    if current_dir.endswith('notebooks')
    else current_dir
)

# Only touch sys.path (and report it) when the root is not registered yet,
# so re-running the cell does not add duplicate entries.
root_already_importable = project_root in sys.path
if not root_already_importable:
    sys.path.append(project_root)
    print(f"Added {project_root} to sys.path for module import.")

Added c:\Users\asus\OneDrive\Desktop\projects\house_price_prediction to sys.path for module import.


In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder,OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import re

In [3]:
from src.data_loader import load_and_initial_clean

In [4]:
# Anchor the dataset path to `project_root` (resolved in the first cell) rather
# than to the current working directory: the first cell supports launching the
# notebook from either the project root or the `notebooks` folder, and a
# '../data/...' path only resolves correctly in the latter case.
DATA_PATH = os.path.join(project_root, 'data', 'AmesHousing.csv')

In [5]:
# Load the raw CSV through the project's loader; the loader signals failure by
# returning None.
df = load_and_initial_clean(DATA_PATH)
if df is None:
    # Fail loudly with a readable message. Calling exit() inside a notebook
    # stops the kernel instead of showing the reader what went wrong.
    raise RuntimeError(
        f"load_and_initial_clean returned None for {DATA_PATH!r}; cannot continue."
    )
print("data loaded and cleaned initially using data_loader.py")

DataFrame columns standardized using a robust method in data_loader.py.
Dropped 'order' column.
Dropped 'pid' column.
Data loaded and initially cleaned. Shape: (2930, 80)
data loaded and cleaned initially using data_loader.py


In [6]:
# Separate the feature matrix from the prediction target ('saleprice').
X = df.drop(columns=['saleprice'])
y = df['saleprice']

In [7]:
# Model log(1 + price) instead of the raw price; predictions on this scale
# must be mapped back with np.expm1.
y_log = y.pipe(np.log1p)
print("target variable 'saleprice' log-transformed to 'y_log'")

target variable 'saleprice' log-transformed to 'y_log'


In [8]:
# Work on an independent (deep) copy so the feature engineering below never
# modifies `X` itself.
X_processed = X.copy(deep=True)

In [9]:
# Replace the construction-year column with an age feature. The age is
# measured against the latest sale year in the data (one common reference
# point for every row), not against each row's own sale year.
current_year = X_processed['yr_sold'].max()

# The construction year may be named either way; the first match wins.
for built_col in ('year_built', 'yr_built'):
    if built_col not in X_processed.columns:
        continue
    # pop() removes the source column in place and returns its values.
    X_processed['house_age'] = current_year - X_processed.pop(built_col)
    print(f"Created 'house_age' and dropped '{built_col}'.")
    break
else:
    # Neither spelling was present: nothing is created or dropped.
    print("Warning: 'year_built' or 'yr_built' not found for 'house_age' creation.")


Created 'house_age' and dropped 'year_built'.


In [10]:
# Interaction feature: overall quality multiplied by above-ground living area.
# The log-transformed area column is preferred when it already exists, which
# is only the case if the skew-transform cell further down has been run before
# this one (e.g. when the cell is re-executed).
gr_liv_area_col = 'gr_liv_area'
if 'gr_liv_area_log' in X_processed.columns:
    gr_liv_area_col = 'gr_liv_area_log'

interaction_inputs_present = all(
    name in X_processed.columns for name in ('overall_qual', gr_liv_area_col)
)
if not interaction_inputs_present:
    print("Warning: Could not create interaction term (missing 'overall_qual' or 'gr_liv_area').")
else:
    X_processed['overall_qual_gr_liv_area_inter'] = (
        X_processed['overall_qual'].mul(X_processed[gr_liv_area_col])
    )
    print("Created interaction term: 'overall_qual_gr_liv_area_inter'.")


Created interaction term: 'overall_qual_gr_liv_area_inter'.


In [11]:
# Combined area feature: basement area plus first-floor area. For each input
# the '_log' variant is used when it is already present in the frame.
total_bsmt_sf_col = 'total_bsmt_sf'
if 'total_bsmt_sf_log' in X_processed.columns:
    total_bsmt_sf_col = 'total_bsmt_sf_log'

first_flr_sf_col = '1st_flr_sf'
if '1st_flr_sf_log' in X_processed.columns:
    first_flr_sf_col = '1st_flr_sf_log'

area_inputs_present = (
    total_bsmt_sf_col in X_processed.columns
    and first_flr_sf_col in X_processed.columns
)
if not area_inputs_present:
    print("Warning: Could not create 'total_flr_sf_combined' (missing basement or 1st floor SF).")
else:
    # A missing value in either input propagates as NaN into the sum.
    X_processed['total_flr_sf_combined'] = (
        X_processed[total_bsmt_sf_col].add(X_processed[first_flr_sf_col])
    )
    print("Created combined floor area: 'total_flr_sf_combined'.")


Created combined floor area: 'total_flr_sf_combined'.


In [12]:
# Snapshot of the numeric column names before any skew transformation; the
# engineered features created above are included.
numerical_features_initial = list(X_processed.select_dtypes(include=np.number).columns)


In [14]:
# Numeric columns that are never log-transformed.
# Add more non-skewed numericals to this exclusion set if needed.
skew_exclusions = {
    'id', 'overall_qual', 'overall_cond', 'mo_sold', 'yr_sold',
    'ms_subclass', 'pool_area', 'misc_val',
}
skewed_candidates = [col for col in numerical_features_initial if col not in skew_exclusions]

# Replace every right-skewed (skewness > 0.75), non-negative candidate with its
# log1p-transformed version, stored under '<name>_log'.
for feature in skewed_candidates:
    if feature not in X_processed.columns:
        continue
    values = X_processed[feature]
    if not values.skew() > 0.75:
        continue
    # Non-negativity is checked on the observed values only. Series.min()
    # skips NaN, whereas an element-wise `(values >= 0).all()` evaluates to
    # False as soon as one value is missing and would silently exclude every
    # column that still contains NaNs. log1p keeps NaN as NaN, so missing
    # entries remain available for imputation later on.
    if values.min() >= 0:
        X_processed[f'{feature}_log'] = np.log1p(values)
        X_processed.drop(feature, axis=1, inplace=True)

# Column lists by dtype after the transformation above.
numerical_features_for_pipeline = X_processed.select_dtypes(include=np.number).columns.tolist()
categorical_features_for_pipeline = X_processed.select_dtypes(include='object').columns.tolist()


In [15]:
# Categorical columns whose missing values are replaced by an explicit level
# before the preprocessing pipeline runs.
none_cols_impute_before_pipeline = [
    'alley', 'bsmt_qual', 'bsmt_cond', 'bsmt_exposure', 'bsmtfin_type_1',
    'bsmtfin_type_2', 'fireplace_qu', 'garage_type', 'garage_finish',
    'garage_qual', 'garage_cond', 'pool_qc', 'fence', 'misc_feature', 'mas_vnr_type'
]

# The fill label is lowercase 'none' so that it matches the 'none' level
# declared in `ordinal_categories`; a capitalised 'None' is not one of the
# declared levels and would be encoded as an unknown value (-1).
# NOTE(review): assumes the categorical values delivered by the loader are
# lowercase, as the level lists in `ordinal_categories` imply - confirm.
for col in none_cols_impute_before_pipeline:
    if col in X_processed.columns:
        X_processed[col] = X_processed[col].fillna('none')

In [16]:
# Ordered levels for every feature treated as ordinal. OrdinalEncoder encodes
# a value as its position in the list, so each list must be written from the
# lowest level to the highest, and every level must be spelled exactly as it
# appears in the data.
ordinal_categories = {
    'lot_shape': ['ir3', 'ir2', 'ir1', 'reg'],
    # NOTE(review): the Ames data dictionary lists Utilities as
    # ELO / NoSeWa / NoSewr / AllPub; 'sev', 'no_sewr' and 'no_pu' may never
    # match a value in the data - verify against the loader output.
    'utilities': ['sev', 'no_sewr', 'no_pu', 'allpub'],
    'land_slope': ['sev', 'mod', 'gtl'],
    'exter_qual': ['po', 'fa', 'ta', 'gd', 'ex'],
    'exter_cond': ['po', 'fa', 'ta', 'gd', 'ex'],
    'bsmt_qual': ['none', 'po', 'fa', 'ta', 'gd', 'ex'],
    'bsmt_cond': ['none', 'po', 'fa', 'ta', 'gd', 'ex'],
    'bsmt_exposure': ['none', 'no', 'mn', 'av', 'gd'],
    'bsmtfin_type_1': ['none', 'unf', 'lwq', 'rec', 'blq', 'alq', 'glq'],
    'bsmtfin_type_2': ['none', 'unf', 'lwq', 'rec', 'blq', 'alq', 'glq'],
    'heating_qc': ['po', 'fa', 'ta', 'gd', 'ex'],
    'kitchen_qual': ['po', 'fa', 'ta', 'gd', 'ex'],
    'functional': ['sal', 'sev', 'maj2', 'maj1', 'mod', 'min2', 'min1', 'typ'],
    'fireplace_qu': ['none', 'po', 'fa', 'ta', 'gd', 'ex'],
    'garage_finish': ['none', 'unf', 'rfn', 'fin'],
    'garage_qual': ['none', 'po', 'fa', 'ta', 'gd', 'ex'],
    'garage_cond': ['none', 'po', 'fa', 'ta', 'gd', 'ex'],
    'paved_drive': ['n', 'p', 'y'],
    'pool_qc': ['none', 'fa', 'ta', 'gd', 'ex'],
    # Fence levels per the Ames data dictionary, lowest to highest: no fence,
    # minimum wood/wire, good wood, minimum privacy, good privacy.
    'fence': ['none', 'mnww', 'gdwo', 'mnprv', 'gdprv'],
    # NOTE(review): zoning is not obviously ordered, and the raw values contain
    # spaces (e.g. 'C (all)', 'A (agr)'); confirm these spellings match what
    # the loader produces.
    'ms_zoning': ['rh', 'rm', 'c(all)', 'fv', 'rl', 'a_agr', 'i(all)'],
    'street': ['grvl', 'pave'],
    'central_air': ['n', 'y']
}

# Column lists by dtype, used as the inputs of the preprocessing pipelines.
numerical_cols_for_pipeline_input = X_processed.select_dtypes(include=np.number).columns.tolist()
categorical_cols_for_pipeline_input = X_processed.select_dtypes(include='object').columns.tolist()




In [17]:
# NOTE(review): this list is not referenced by any of the code below. It names
# 'ms_subclass' as nominal, yet that column is selected through the numeric
# column list and therefore scaled rather than one-hot encoded - confirm intent.
standard_nominal_features = [
    'ms_subclass', 'bldg_type', 'house_style', 'roof_style', 'roof_matl',
    'exterior_1st', 'exterior_2nd', 'foundation', 'electrical', 'sale_type',
    'sale_condition'
]

# Split the categorical columns into two disjoint groups: those with a declared
# level order (ordinal) and everything else (nominal).
final_ordinal_features = [
    name for name in ordinal_categories
    if name in categorical_cols_for_pipeline_input
]
final_nominal_features = [
    name for name in categorical_cols_for_pipeline_input
    if name not in final_ordinal_features
]

# Level lists in the same order as `final_ordinal_features`, which is the
# layout OrdinalEncoder expects for its `categories` argument.
ordinal_encoder_categories_list = list(
    map(ordinal_categories.__getitem__, final_ordinal_features)
)

# Numeric columns: fill remaining NaNs with the median, then standardise.
numerical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

In [None]:
# Ordinal columns: fill any remaining NaN with the explicit 'none' level, then
# encode each value as its position in the declared level list.
ordinal_pipeline = Pipeline(steps=[
    # The fill value is lowercase so that it matches the 'none' level declared
    # in `ordinal_categories`; a capitalised 'None' is not a declared level and
    # would be encoded as unknown (-1).
    ('imputer', SimpleImputer(strategy='constant', fill_value='none')),
    # Values absent from the declared levels are encoded as -1 instead of
    # raising an error.
    ('ordinal', OrdinalEncoder(categories=ordinal_encoder_categories_list, handle_unknown='use_encoded_value', unknown_value=-1))
])

# Nominal columns: fill NaNs with the most frequent level, then one-hot encode.
# Levels that were not seen during fit are ignored at transform time rather
# than raising an error.
nominal_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
]
nominal_pipeline = Pipeline(steps=nominal_steps)

# Route each group of columns to its own pipeline.
column_routes = [
    ('num', numerical_pipeline, numerical_cols_for_pipeline_input),
    ('ord', ordinal_pipeline, final_ordinal_features),
    ('nom', nominal_pipeline, final_nominal_features),
]

# remainder='passthrough' forwards any column not listed in a route unchanged
# (e.g. 'id' if it was not dropped) instead of discarding it.
preprocessor = ColumnTransformer(transformers=column_routes, remainder='passthrough')
print("\nPreprocessor (ColumnTransformer) built with numerical, ordinal, and nominal pipelines.")



Preprocessor (ColumnTransformer) built with numerical, ordinal, and nominal pipelines.


In [19]:
# Hold out 20% of the rows for testing; the fixed random_state makes the split
# reproducible. The target passed in is the log-transformed sale price.
X_train, X_test, y_train, y_test = train_test_split(
    X_processed,
    y_log,
    test_size=0.2,
    random_state=42,
)

print("\nData split into training and testing sets.")
for split_name, split_data in (
    ("X_train", X_train),
    ("y_train", y_train),
    ("X_test", X_test),
    ("y_test", y_test),
):
    print(f"{split_name} shape: {split_data.shape}")



Data split into training and testing sets.
X_train shape: (2344, 81)
y_train shape: (2344,)
X_test shape: (586, 81)
y_test shape: (586,)
