In [6]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

# Load the dataset
data = pd.read_csv('/Housing_Price_Data.csv', encoding="latin1")

# Display the first few rows of the dataset
print(data.head())

# 1. Data Cleaning
# Assuming 'price' and 'area' might need cleaning based on the previous attempt
for col in ['price', 'area']:
    if col in data.columns:
        data[col] = data[col].replace(',', '', regex=True).astype(float)

# Using the actual column names from the dataset for preprocessing
num_features = ['area', 'bedrooms', 'bathrooms', 'stories', 'price']
num_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

# Assuming 'mainroad', 'guestroom', 'basement', 'hotwaterheating', 'airconditioning', 'prefarea', 'furnishingstatus', 'parking' are categorical or can be treated as such
cat_features = ['mainroad', 'guestroom', 'basement', 'hotwaterheating', 'airconditioning', 'prefarea', 'furnishingstatus']
# Parking might be numerical or categorical depending on its values, treating as numerical for now
# cat_features = ['mainroad', 'guestroom', 'basement', 'hotwaterheating', 'airconditioning', 'prefarea', 'furnishingstatus']

cat_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

# Combine preprocessing steps of Input Data
# Exclude 'price' from the input features as it is the target variable
preprocessor = ColumnTransformer(
    transformers=[
        ("num", num_transformer, ['area', 'bedrooms', 'bathrooms', 'stories']),
        ("cat", cat_transformer, cat_features)
    ])
preprocessor.set_output(transform="pandas")

# Apply the transformations to the Input data
data_preprocessed = preprocessor.fit_transform(data)
print(data_preprocessed.head())


# Combine preprocessing steps of Output Data (the target variable 'price')
preprocessor_Out = ColumnTransformer(
    transformers=[
        ("num", num_transformer, ['price'])
    ], remainder='passthrough' # Keep other columns if any, though none expected for target
)
preprocessor_Out.set_output(transform="pandas")

# Apply the transformations to the Output data
data_preprocessed_Out = preprocessor_Out.fit_transform(data)
print(data_preprocessed_Out.head())


# 2. Feature Engineering
# Recalculating Price_per_sqft using the correct column names
data_preprocessed["Price_per_sqft"] = data['price'] / data['area']


# 3. Data Splitting
X = data_preprocessed
y = data_preprocessed_Out
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Display the first few rows of the processed training data
print(X_train.head())
print(y_train.head())

      price  area  bedrooms  bathrooms  stories mainroad guestroom basement  \
0  13300000  7420         4          2        3      yes        no       no   
1  12250000  8960         4          4        4      yes        no       no   
2  12250000  9960         3          2        2      yes        no      yes   
3  12215000  7500         4          2        2      yes        no      yes   
4  11410000  7420         4          1        2      yes       yes      yes   

  hotwaterheating airconditioning  parking prefarea furnishingstatus  
0              no             yes        2      yes        furnished  
1              no             yes        3       no        furnished  
2              no              no        2      yes   semi-furnished  
3              no             yes        3      yes        furnished  
4              no             yes        2       no        furnished  
   num__area  num__bedrooms  num__bathrooms  num__stories  cat__mainroad_no  \
0   1.046726       1