In [4]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from scipy import stats

# Load the dataset
# NOTE(review): this is an absolute, machine-specific path; the cell only runs
# where the file exists at exactly this location.
df = pd.read_csv('//home/shubh-k-pc/Documents/AIML/house-price-prediction-model-main/data/House Price India.csv')

# Quick inspection of the raw data. A bare expression is only rendered when it
# is the last statement of a notebook cell (and never in a plain script), so
# each summary is printed explicitly instead of being silently discarded.
print(df.head())
print(df.tail())
print(df.describe())
print(df.shape)
# Columns removed from the working copy of the data.
cols_to_drop = [
    'id',
    'Date',
    'Postal Code',
    'living_area_renov',
    'lot_area_renov',
]
df_cleaned = df.drop(cols_to_drop, axis=1)

# Replace the target (Price) with log(1 + Price) to reduce skewness.
df_cleaned['Price'] = np.log1p(df_cleaned['Price'])

# Every numeric column except the target, kept in original column order.
numeric_cols = df_cleaned.select_dtypes(include=[np.number]).columns
feature_cols = [col for col in numeric_cols if col != 'Price']

# Remove outliers using Z-score method: a row is dropped when any numeric
# feature lies 3 or more standard deviations from that feature's mean.
z_scores = np.abs(stats.zscore(df_cleaned[feature_cols]))
# Flag outliers explicitly rather than requiring every |z| < 3: a zero-variance
# or NaN-containing column produces NaN z-scores, and `NaN < 3` is False, which
# would discard every row. With no NaN present the result is identical.
is_outlier = (z_scores >= 3).any(axis=1)
# .copy() detaches the filtered frame from its parent so the column assignment
# below cannot raise pandas' SettingWithCopyWarning.
df_cleaned = df_cleaned[~is_outlier].copy()
# NOTE(review): the z-score filter is applied to every numeric column, which
# may include binary flags or ordinal codes; for such columns the rare value
# can be removed wholesale as an "outlier" -- confirm this is intended.

# Standardize features using StandardScaler (the Price target is not scaled).
# NOTE(review): the scaler is fit on all remaining rows; if a train/test split
# happens later, consider fitting on the training portion only.
scaler = StandardScaler()
df_cleaned[feature_cols] = scaler.fit_transform(df_cleaned[feature_cols])

# Persist the cleaned data as CSV in the working directory, without the index.
output_csv = 'house_price_india_cleaned.csv'
df_cleaned.to_csv(output_csv, index=False)

# Print the first five rows of the final cleaned DataFrame.
preview = df_cleaned.head(5)
print(preview)

   number of bedrooms  number of bathrooms  living area  lot area  \
0            0.786585             0.631846     1.228013 -0.497591   
1            1.965260             0.992094     1.214920 -0.059280   
2            0.786585             0.631846     1.738648  2.621614   
3           -0.392091            -0.088649     0.953055 -0.457599   
4           -0.392091             0.631846     0.809030 -0.437603   

   number of floors  waterfront present  number of views  \
0           0.01303                 0.0        -0.226341   
1           0.01303                 0.0        -0.226341   
2           0.93612                 0.0        -0.226341   
3           0.01303                 0.0        -0.226341   
4          -0.91006                 0.0        -0.226341   

   condition of the house  grade of the house  \
0                2.346761            0.396071   
1               -0.669658            0.396071   
2               -0.669658            1.343847   
3                0.838551   