In [2]:
from google.colab import files
# Opens Colab's upload dialog; the return value is kept in `uploaded` but is not
# referenced again in the cells shown here.
# NOTE(review): the recorded output of this cell shows test.csv being saved as
# "test (1).csv", while the later cells read "test.csv" - confirm that the file
# loaded afterwards is the one that was just uploaded.
uploaded = files.upload()


Saving test.csv to test (1).csv
Saving train.csv to train.csv


In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# First read of the raw CSVs; the LOAD DATA cell further down reads both files
# again before it adds the bookkeeping columns.
train, test = (pd.read_csv(csv_path) for csv_path in ("train.csv", "test.csv"))


In [5]:
# Final Clean Data Pipeline (Best of Both Approaches)

import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Set seed
# Seeds NumPy's legacy global random state. The train/validation split further
# down passes its own random_state=42, so it does not rely on this call.
np.random.seed(42)

In [6]:
# --- FUNCTIONS ---
def handle_missing_values(df):
    """Impute missing values and return the imputed frame.

    The input frame is left untouched; all work happens on a copy.

    Imputation rules, applied in this order:

    1. Every numeric column except the protected ones ('SalePrice', 'Id',
       'is_train') has its NaNs replaced by that column's median. This covers
       'LotFrontage', 'MasVnrArea' and 'GarageYrBlt' as before, and also any
       other numeric feature that happens to contain gaps, so no NaN is
       carried into the derived features, the scaler or the exported files.
    2. Columns listed in ``categorical_none`` get the literal string "None".
    3. Every remaining object-dtype column is filled with its most frequent
       value.

    Medians and modes are computed over all rows of the frame that is passed
    in. Columns named in the lists but absent from ``df`` are skipped.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to impute.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same columns and row labels as ``df``.
    """
    categorical_none = ['Alley', 'MasVnrType', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1',
                        'BsmtFinType2', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual',
                        'GarageCond', 'PoolQC', 'Fence', 'MiscFeature']
    # Never imputed: the target is deliberately NaN on rows that have no label,
    # and the other two are bookkeeping columns rather than features.
    protected = {'SalePrice', 'Id', 'is_train'}

    df = df.copy()

    # 1. Numeric features -> median of the column.
    for feature in df.select_dtypes(include='number').columns:
        if feature in protected:
            continue
        if df[feature].isna().any():
            df[feature] = df[feature].fillna(df[feature].median())

    # 2. Categoricals where a missing value is encoded as the category "None".
    for feature in categorical_none:
        if feature in df.columns:
            df[feature] = df[feature].fillna("None")

    # 3. Remaining object columns -> most frequent value.
    for feature in df.select_dtypes(include=['object']).columns:
        if feature in categorical_none or feature in protected:
            continue
        modes = df[feature].mode()
        # mode() is empty when the whole column is NaN; nothing sensible to fill with.
        if not modes.empty:
            df[feature] = df[feature].fillna(modes.iloc[0])

    return df

In [7]:
def log_transform_skewed_features(df, threshold=0.75, exclude=('Id', 'is_train')):
    """Apply ``log1p`` to strongly skewed numeric columns and return the result.

    The input frame is left untouched; all work happens on a copy.

    A column is transformed when all of the following hold:

    * its dtype is int64 or float64,
    * it is not named in ``exclude``,
    * the absolute value of its sample skewness is greater than ``threshold``,
    * its minimum is greater than -1, i.e. ``log1p`` is finite for every value.
      Columns that fail this check are left as they are instead of being
      turned into NaN / -inf.

    Note that 'SalePrice' is not part of the default ``exclude``: it is
    transformed whenever its skewness passes the threshold, exactly as before.
    Pass it in ``exclude`` to keep the target on its original scale.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose numeric columns are inspected.
    threshold : float, default 0.75
        Absolute skewness above which a column is transformed.
    exclude : iterable of str, default ('Id', 'is_train')
        Column names that are never transformed. The defaults protect the row
        identifier and the train/test flag, which later cells compare against
        literal values.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same columns and row labels as ``df``.
    """
    skipped = set(exclude)
    df = df.copy()

    numeric_feats = [
        col
        for col in df.select_dtypes(include=['int64', 'float64']).columns
        if col not in skipped
    ]
    skewness = df[numeric_feats].skew()
    skewed = skewness[skewness.abs() > threshold]

    for feat in skewed.index:
        # log1p is only finite for values strictly greater than -1.
        if df[feat].min() <= -1:
            continue
        df[feat] = np.log1p(df[feat])
    return df

In [8]:
def engineer_features(df):
    """Add ten derived columns to ``df`` in place and return the same frame.

    New columns, in the order they are appended:
    TotalSF, TotalBathrooms, Age, IsRemodeled, RemodAge, TotalPorchSF,
    HasPool, HasGarage, HasFireplace, QualityScore.
    """
    sold_year = df['YrSold']
    built_year = df['YearBuilt']
    remodel_year = df['YearRemodAdd']

    # Basement plus both floors.
    df['TotalSF'] = df['TotalBsmtSF'] + df['1stFlrSF'] + df['2ndFlrSF']

    # Half baths count as half a bathroom, above ground and in the basement.
    bathrooms = df['FullBath'] + df['HalfBath'] * 0.5
    bathrooms = bathrooms + df['BsmtFullBath']
    bathrooms = bathrooms + df['BsmtHalfBath'] * 0.5
    df['TotalBathrooms'] = bathrooms

    # Ages are measured at the time of sale.
    df['Age'] = sold_year - built_year
    df['IsRemodeled'] = remodel_year.ne(built_year).astype(int)
    df['RemodAge'] = sold_year - remodel_year

    porch_area = df['OpenPorchSF']
    for porch_col in ('EnclosedPorch', '3SsnPorch', 'ScreenPorch'):
        porch_area = porch_area + df[porch_col]
    df['TotalPorchSF'] = porch_area

    # 0/1 presence flags derived from a size or count column.
    presence_flags = (
        ('HasPool', 'PoolArea'),
        ('HasGarage', 'GarageArea'),
        ('HasFireplace', 'Fireplaces'),
    )
    for flag_name, source_col in presence_flags:
        df[flag_name] = df[source_col].gt(0).astype(int)

    df['QualityScore'] = df['OverallQual'] * df['OverallCond']
    return df

In [9]:
def encode_categoricals(df):
    """Return ``df`` with its categorical columns replaced by indicator columns.

    Delegates to ``pd.get_dummies`` with ``drop_first=True``, so the first
    level of every encoded column gets no indicator of its own.
    """
    encoded = pd.get_dummies(data=df, drop_first=True)
    return encoded


In [10]:
def scale_numerical_strict(df):
    """Standardise the numeric feature columns and return the scaled frame.

    The input frame is left untouched; all work happens on a copy.

    Columns of dtype int64 / float64 are scaled with ``StandardScaler``,
    except 'is_train', 'SalePrice' and 'Id', which are passed through as is.

    When the frame carries an 'is_train' column with at least one row equal
    to 1, the scaler's statistics are learned from those rows only and then
    applied to every row. This keeps information from the unlabeled rows out
    of the training features. Without such a column (or without any flagged
    row) the scaler is fit on all rows, as before.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to scale.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same columns and row labels as ``df``.
    """
    exclude = ['is_train', 'SalePrice', 'Id']
    df = df.copy()

    numeric_cols = [
        col
        for col in df.select_dtypes(include=['int64', 'float64']).columns
        if col not in exclude
    ]
    # Nothing to scale: StandardScaler rejects empty input, so return early.
    if not numeric_cols or df.empty:
        return df

    fit_rows = df[numeric_cols]
    if 'is_train' in df.columns:
        train_mask = df['is_train'] == 1
        if train_mask.any():
            fit_rows = df.loc[train_mask, numeric_cols]

    scaler = StandardScaler()
    scaler.fit(fit_rows)
    df[numeric_cols] = scaler.transform(df[numeric_cols])
    return df

In [11]:
# --- LOAD DATA ---
# Tag each frame with its origin so the rows can be told apart after stacking;
# the test frame also gets an all-NaN SalePrice column.
train = pd.read_csv("train.csv").assign(is_train=1)
test = pd.read_csv("test.csv").assign(is_train=0, SalePrice=np.nan)

In [12]:
# --- PROCESSING PIPELINE ---
# Stack train and test so every step sees the same columns and categories.
# ignore_index=True gives the stacked frame unique row labels; without it both
# halves keep their own labels and every label from the overlap appears twice.
combined = pd.concat([train, test], sort=False, ignore_index=True)
combined = handle_missing_values(combined)
combined = engineer_features(combined)
combined = log_transform_skewed_features(combined)
combined = encode_categoricals(combined)
combined = scale_numerical_strict(combined)


In [13]:
# --- SPLIT BACK ---
# Separate the stacked frame again using the is_train flag, then drop the flag
# (and, for the test rows, the SalePrice column as well).
train_rows = combined['is_train'] == 1
test_rows = combined['is_train'] == 0
train_processed = combined.loc[train_rows].drop(columns='is_train')
test_processed = combined.loc[test_rows].drop(columns=['is_train', 'SalePrice'])


In [14]:
# --- FINAL SPLIT ---
# Target is the SalePrice column; features are every other column except Id.
y = train_processed["SalePrice"]
X = train_processed.drop(columns=["SalePrice", "Id"])
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.15,
    random_state=42,
)

In [15]:
# --- SAVE ID Columns (Optional) ---
# Keep the Id column of each processed frame under its own name.
train_ids, test_ids = (
    processed["Id"] for processed in (train_processed, test_processed)
)


In [16]:
# --- Export Datasets (Optional) ---
# Write each split to its own CSV without the row index.
for csv_name, dataset in (
    ("X_train.csv", X_train),
    ("X_val.csv", X_val),
    ("y_train.csv", y_train),
    ("y_val.csv", y_val),
):
    dataset.to_csv(csv_name, index=False)
# The test features are written without their Id column.
test_processed.drop(columns="Id").to_csv("X_test.csv", index=False)


In [17]:
from google.colab import files

# Download processed datasets
for csv_name in ("X_train.csv", "X_val.csv", "y_train.csv", "y_val.csv", "X_test.csv"):
    files.download(csv_name)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>