In [None]:
# imports
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split

In [None]:
# Read the Ames housing CSV; the path is relative to the working directory.
data = pd.read_csv(filepath_or_buffer="AmesHousing.csv")

# Preprocessing dataset

In [None]:
# Preview the first five rows of the raw frame.
data.head(n=5)

Unnamed: 0,Order,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,...,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type,Sale Condition,SalePrice
0,1,526301100,20,RL,141.0,31770,Pave,,IR1,Lvl,...,0,,,,0,5,2010,WD,Normal,215000
1,2,526350040,20,RH,80.0,11622,Pave,,Reg,Lvl,...,0,,MnPrv,,0,6,2010,WD,Normal,105000
2,3,526351010,20,RL,81.0,14267,Pave,,IR1,Lvl,...,0,,,Gar2,12500,6,2010,WD,Normal,172000
3,4,526353030,20,RL,93.0,11160,Pave,,Reg,Lvl,...,0,,,,0,4,2010,WD,Normal,244000
4,5,527105010,60,RL,74.0,13830,Pave,,IR1,Lvl,...,0,,MnPrv,,0,3,2010,WD,Normal,189900


In [None]:
# Show the column labels, then the number of columns and rows of the raw frame.
# f-strings are used here to match the formatting style of the other cells.
print(data.columns)
print(f'There are {len(data.columns)} columns in dataset')
print(f'There are {len(data)} houses in dataset')

Index(['Order', 'PID', 'MS SubClass', 'MS Zoning', 'Lot Frontage', 'Lot Area',
       'Street', 'Alley', 'Lot Shape', 'Land Contour', 'Utilities',
       'Lot Config', 'Land Slope', 'Neighborhood', 'Condition 1',
       'Condition 2', 'Bldg Type', 'House Style', 'Overall Qual',
       'Overall Cond', 'Year Built', 'Year Remod/Add', 'Roof Style',
       'Roof Matl', 'Exterior 1st', 'Exterior 2nd', 'Mas Vnr Type',
       'Mas Vnr Area', 'Exter Qual', 'Exter Cond', 'Foundation', 'Bsmt Qual',
       'Bsmt Cond', 'Bsmt Exposure', 'BsmtFin Type 1', 'BsmtFin SF 1',
       'BsmtFin Type 2', 'BsmtFin SF 2', 'Bsmt Unf SF', 'Total Bsmt SF',
       'Heating', 'Heating QC', 'Central Air', 'Electrical', '1st Flr SF',
       '2nd Flr SF', 'Low Qual Fin SF', 'Gr Liv Area', 'Bsmt Full Bath',
       'Bsmt Half Bath', 'Full Bath', 'Half Bath', 'Bedroom AbvGr',
       'Kitchen AbvGr', 'Kitchen Qual', 'TotRms AbvGrd', 'Functional',
       'Fireplaces', 'Fireplace Qu', 'Garage Type', 'Garage Yr Blt',
      

In [None]:
# SalePrice is the prediction target.
# Count the rows that have a missing entry in at least one column.
rows_with_nulls = data.isna().any(axis="columns").sum()
print(f"\nRows with at least one missing value: {rows_with_nulls}")

# Size of the whole dataset, for comparison
print(f"Total rows: {len(data)}")

# Share of rows that contain at least one gap
print(f"Percentage of rows with missing values: {rows_with_nulls / len(data) * 100:.2f}%")



Rows with at least one missing value: 2930
Total rows: 2930
Percentage of rows with missing values: 100.00%


In [None]:
# Every row has at least one missing value, so the gaps have to be filled rather than the rows dropped.
# Before that, remove outliers in the target variable so that extreme sale prices do not hurt prediction accuracy.
import pandas as pd

def drop_outliers_iqr(df, numeric_cols, iqr_multiplier=1.5):
    """
    Remove rows flagged as outliers by the IQR rule.

    For every column in ``numeric_cols`` a row is an outlier when its value is
    below ``Q1 - iqr_multiplier * IQR`` or above ``Q3 + iqr_multiplier * IQR``.
    A row is removed when it is an outlier in at least one of the columns.
    Missing values compare as False against both fences, so they are never
    flagged.

    Parameters:
        df (pd.DataFrame): input DataFrame; it is not modified
        numeric_cols (list): list of numeric column names to check for outliers
        iqr_multiplier (float): fence width in IQR units (default 1.5)

    Returns:
        cleaned_df (pd.DataFrame): new DataFrame without the outlier rows,
            re-indexed from 0
        outlier_indexes (set): index labels of the rows that were removed

    Raises:
        ValueError: if ``iqr_multiplier`` is negative
        KeyError: if a name in ``numeric_cols`` is not a column of ``df``
    """
    if iqr_multiplier < 0:
        raise ValueError("iqr_multiplier must be non-negative")

    # Track outliers by row position instead of by index label: dropping by
    # label would also remove non-outlier rows that share a label with an
    # outlier when the index contains duplicates.
    row_mask = pd.Series(False, index=df.index).to_numpy()

    for col in numeric_cols:
        values = df[col]
        q1 = values.quantile(0.25)
        q3 = values.quantile(0.75)
        spread = iqr_multiplier * (q3 - q1)

        flagged = (values < q1 - spread) | (values > q3 + spread)
        # na_value=False keeps "unknown" comparisons (nullable dtypes) unflagged.
        # A new array is built on each pass; the mask is never edited in place.
        row_mask = row_mask | flagged.to_numpy(dtype=bool, na_value=False)

    # Read the labels straight from the index; no per-column frame copy needed.
    outlier_indexes = set(df.index[row_mask])
    cleaned_df = df.loc[~row_mask].reset_index(drop=True)

    return cleaned_df, outlier_indexes

# Drop the rows whose SalePrice falls outside the IQR fences.
# NOTE: this rebinds `data`, so running the cell a second time filters the
# already-filtered frame against freshly computed quartiles. Reload the CSV
# before re-running this cell.
data, outlier_idx = drop_outliers_iqr(data, ['SalePrice'])
print(f"Number of outlier rows dropped: {len(outlier_idx)}")
print("Indexes of dropped rows:", outlier_idx)
print("Shape after dropping outliers:", data.shape)


137
Number of outlier rows dropped: 137
Indexes of dropped rows: {513, 1537, 521, 15, 17, 1559, 1050, 1051, 1052, 1053, 1055, 1056, 1057, 1058, 1059, 36, 1060, 38, 1063, 1064, 1067, 44, 1068, 46, 1070, 2096, 2097, 1074, 1587, 2099, 60, 65, 1102, 1105, 1106, 91, 1635, 1636, 1637, 1126, 1640, 1641, 1642, 2666, 1158, 1684, 1177, 1689, 1690, 1691, 1693, 1695, 1696, 1697, 1699, 1700, 1701, 2214, 1706, 1707, 1708, 1709, 2737, 2245, 1760, 1761, 1763, 1764, 2275, 1767, 1772, 1780, 1805, 2329, 2330, 2331, 2332, 2333, 2334, 2335, 2336, 2341, 321, 2883, 1860, 2379, 2380, 2382, 2384, 2901, 2902, 343, 2391, 2392, 347, 2395, 349, 2397, 2398, 2399, 2400, 366, 367, 2442, 2445, 2446, 1425, 1426, 2449, 2450, 2456, 2461, 421, 422, 423, 427, 429, 430, 431, 432, 433, 434, 436, 442, 956, 447, 448, 959, 456, 457, 968, 2522, 495, 497, 1010, 1012, 504}
Shape after dropping outliers: (2793, 82)


In [None]:
# Now with this new dataset, we can fill missing values.
# NOTE(review): both imputers are fitted on every row, i.e. before the
# train/test split further down, so test rows contribute to the medians.
# The target column is excluded from imputation: a missing label has to be
# surfaced, not replaced by a fabricated median price.
numeric_cols = [
    col
    for col in data.select_dtypes(include=["int64", "float64"]).columns
    if col != "SalePrice"
]
categorical_cols = data.select_dtypes(include=["object"]).columns.tolist()

# filling numerical values with the median
num_imputer = SimpleImputer(strategy="median")
data[numeric_cols] = num_imputer.fit_transform(data[numeric_cols])

# filling categorical values with an explicit "Missing" level, so the model can
# learn whether the absence of a value has predictive power
cat_imputer = SimpleImputer(strategy="constant", fill_value="Missing")
data[categorical_cols] = cat_imputer.fit_transform(data[categorical_cols])

print("Missing values per column after imputation:")
print(data.isnull().sum().sum())  # grand total over all columns; should be 0
# Fail loudly if anything (including the untouched target) is still missing.
assert data.isnull().sum().sum() == 0, "missing values remain after imputation"

Missing values per column after imputation:
0


In [None]:
# Separate the feature matrix from the target vector.
X = data.drop(columns=["SalePrice"])
y = data["SalePrice"]

In [None]:
# Every object-typed feature is treated as categorical.
categorical_cols = list(X.select_dtypes(include=["object"]).columns)

# Expand each categorical feature into one indicator column per level.
# NOTE(review): this cell rebinds X at the end, so a second run would find no
# object columns left to encode - re-run from the X/y cell instead.
ohe = OneHotEncoder(sparse_output=False, handle_unknown="ignore")
X_ohe = pd.DataFrame(
    data=ohe.fit_transform(X[categorical_cols]),
    index=X.index,
    columns=ohe.get_feature_names_out(categorical_cols),
)

# Swap the original categorical columns for their indicator columns.
X_numeric = X.drop(columns=categorical_cols)
X_final = pd.concat([X_numeric, X_ohe], axis="columns")

X = X_final
print("Shape after one-hot encoding all categorical features:", X.shape)


Shape after one-hot encoding all categorical features: (2793, 321)


In [None]:
# Columns that are standardised; every other column is left as it is.
numeric_to_scale = [
    'Lot Frontage', 'Lot Area', 'Overall Qual', 'Mas Vnr Area',
    'BsmtFin SF 1', 'BsmtFin SF 2', 'Bsmt Unf SF', 'Total Bsmt SF',
    '1st Flr SF', '2nd Flr SF', 'Low Qual Fin SF', 'Gr Liv Area',
    'Bsmt Full Bath', 'Bsmt Half Bath', 'Full Bath', 'Half Bath',
    'Bedroom AbvGr', 'Kitchen AbvGr', 'TotRms AbvGrd', 'Fireplaces',
    'Garage Cars', 'Garage Area', 'Wood Deck SF', 'Open Porch SF',
    'Enclosed Porch', '3Ssn Porch', 'Screen Porch', 'Pool Area',
    'Misc Val',
]

# NOTE(review): the scaler is fitted on all rows, before the train/test split
# further down, so test rows take part in the mean/std estimates.
scaler = StandardScaler()
X[numeric_to_scale] = scaler.fit_transform(X[numeric_to_scale])

In [None]:
# Display the column labels of the feature matrix as the cell output.
X.columns

Index(['Order', 'PID', 'MS SubClass', 'Lot Frontage', 'Lot Area',
       'Overall Qual', 'Overall Cond', 'Year Built', 'Year Remod/Add',
       'Mas Vnr Area',
       ...
       'Sale Type_New', 'Sale Type_Oth', 'Sale Type_VWD', 'Sale Type_WD ',
       'Sale Condition_Abnorml', 'Sale Condition_AdjLand',
       'Sale Condition_Alloca', 'Sale Condition_Family',
       'Sale Condition_Normal', 'Sale Condition_Partial'],
      dtype='object', length=321)

In [None]:
# Drop the columns that are considered to carry no predictive power.
# NOTE(review): X is rebound here; a second run will presumably raise a
# KeyError because the columns are already gone.
columns_to_drop = ['Order', 'PID']  # add more if needed
X = X.drop(columns_to_drop, axis=1)
print("Shape after dropping columns:", X.shape)

Shape after dropping columns: (2793, 319)


In [None]:
# Look for constant features, i.e. columns whose variance is exactly zero.
variance = X.var()
zero_variance_cols = variance.loc[variance.eq(0)].index.tolist()

print("Columns with zero variance:", zero_variance_cols)

Columns with zero variance: []


In [None]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

print("Training set shape:", X_train.shape)
print("Test set shape:", X_test.shape)

Training set shape: (2234, 319)
Test set shape: (559, 319)


# Training Linear Regression

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

# Create the linear regression model and fit it on the training split
# (fit() returns the estimator itself, so construction and fitting are chained).
lr_model = LinearRegression().fit(X_train, y_train)

# Predictions for the held-out rows and for the rows the model was fitted on
y_test_pred = lr_model.predict(X_test)
y_train_pred = lr_model.predict(X_train)

def evaluate(y_true, y_pred, dataset_name="Dataset"):
    """
    Print MAE, RMSE and R² for one set of predictions.

    Parameters:
        y_true: observed target values
        y_pred: predicted target values
        dataset_name (str): label used in the report header

    Returns:
        None; the report is written to standard output.
    """
    r2 = r2_score(y_true, y_pred)
    # RMSE is the square root of the mean squared error.
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    mae = mean_absolute_error(y_true, y_pred)

    report_lines = (
        f"{dataset_name} Evaluation:",
        f"  MAE: {mae:.2f}",
        f"  RMSE: {rmse:.2f}",
        f"  R²: {r2:.4f}",
        "----------------------------",
    )
    for line in report_lines:
        print(line)

# Report on the rows the model was fitted on
evaluate(y_train, y_train_pred, dataset_name="Training")

# Report on the held-out rows
evaluate(y_test, y_test_pred, dataset_name="Test")


Training Evaluation:
  MAE: 10946.99
  RMSE: 15707.15
  R²: 0.9278
----------------------------
Test Evaluation:
  MAE: 13829.68
  RMSE: 26345.10
  R²: 0.8134
----------------------------
