<a href="https://colab.research.google.com/github/rishellevarghees/MLDA/blob/main/DATA_PREPROCESSING.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

# ---------------------------------------------------------------------------
# House-price preprocessing: load the CSV, normalise column names and dtypes,
# build feature/target transformers, and produce train/test sets.
#
# The transformers are fitted on the training rows only, and the
# price-derived column is kept out of the model inputs, so that no
# information about the held-out rows or the target leaks into X.
# ---------------------------------------------------------------------------

# Load the raw records and show a preview.
data = pd.read_csv('House Price India.csv', encoding="latin1")
print(data.head())


# Rename the raw columns to the labels used in the rest of the script.
# NOTE(review): 'Built Year' is only relabelled as 'Age (years)' here; no
# year-to-age conversion is performed -- confirm whether a real age
# (reference year minus built year) is intended.
data = data.rename(columns={
    'living area': 'Size (sqft)',
    'number of bedrooms': 'Rooms',
    'Built Year': 'Age (years)',
    'Price': 'Price (INR)',
    'Postal Code': 'Location'
})

# Remove commas from string-valued cells (if any) and force a float dtype.
for col in ['Size (sqft)', 'Age (years)', 'Price (INR)']:
    data[col] = data[col].replace(',', '', regex=True).astype(float)

target_col = 'Price (INR)'
num_features = ['Size (sqft)', 'Rooms', 'Age (years)', 'Price (INR)']
# Model inputs are every numeric column except the target itself.
input_num_features = [col for col in num_features if col != target_col]

# Numeric columns: fill gaps with the column mean, then standardise.
num_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

# Categorical columns: fill gaps with the mode, then one-hot encode.
# handle_unknown='ignore' lets categories that appear only in the held-out
# rows be transformed without raising.
cat_features = ['Location']
cat_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

# Feature-side transformer.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", num_transformer, input_num_features),
        ("cat", cat_transformer, cat_features)
    ]
)
preprocessor.set_output(transform="pandas")

# Target-side transformer (imputes and scales the price column).
preprocessor_Out = ColumnTransformer(
    transformers=[
        ("num", num_transformer, [target_col])
    ]
)
preprocessor_Out.set_output(transform="pandas")

# Choose the train/test rows BEFORE fitting anything, so the imputers,
# scalers and encoder learn their statistics from the training rows only.
# Same test_size and random_state as the split of the transformed frames.
row_positions = np.arange(len(data))
train_rows, test_rows = train_test_split(
    row_positions, test_size=0.2, random_state=42
)
train_data = data.iloc[train_rows]

# Fit on the training rows, then transform every row with those statistics.
preprocessor.fit(train_data)
data_preprocessed = preprocessor.transform(data)
print(data_preprocessed.head())

preprocessor_Out.fit(train_data)
data_preprocessed_Out = preprocessor_Out.transform(data)
print(data_preprocessed_Out.head())

# Descriptive ratio only: it is computed from the target, so it must not be
# fed to a model. A zero area becomes NaN so the division cannot yield inf.
data_preprocessed["Price_per_sqft"] = (
    data[target_col] / data['Size (sqft)'].replace(0, np.nan)
)

# X excludes the target-derived column; y is the transformed price.
X = data_preprocessed.drop(columns=["Price_per_sqft"])
y = data_preprocessed_Out
X_train, X_test = X.iloc[train_rows], X.iloc[test_rows]
y_train, y_test = y.iloc[train_rows], y.iloc[test_rows]

print(X_train.head())
print(y_train.head())


           id   Date  number of bedrooms  number of bathrooms  living area  \
0  6762810635  42491                   4                 2.50         2920   
1  6762810998  42491                   5                 2.75         2910   
2  6762812605  42491                   4                 2.50         3310   
3  6762812919  42491                   3                 2.00         2710   
4  6762813105  42491                   3                 2.50         2600   

   lot area  number of floors  waterfront present  number of views  \
0      4000               1.5                   0                0   
1      9480               1.5                   0                0   
2     42998               2.0                   0                0   
3      4500               1.5                   0                0   
4      4750               1.0                   0                0   

   condition of the house  ...  Built Year  Renovation Year  Postal Code  \
0                       5  ...    