In [None]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline


In [None]:

# Read the raw housing CSV into df1 and preview its first five rows
df1 = pd.read_csv(filepath_or_buffer="bengaluru_house_prices.csv")
df1.head(n=5)


In [None]:

# First look at the raw frame: its dimensions, its column names, and a
# per-column profile (dtype, missing-value count, distinct-value count)
# ordered so the columns with the most missing values come first.
print("Shape:", df1.shape)
print("Columns:", list(df1.columns))
summary = pd.concat(
    [df1.dtypes.astype(str), df1.isna().sum(), df1.nunique()],
    axis=1,
    keys=['dtype', 'missing', 'unique'],
)
summary = summary.sort_values('missing', ascending=False)
summary


In [None]:

# Remove the columns this analysis does not use, then discard every row
# that still contains a missing value in any remaining column.
df2 = (
    df1
    .drop(columns=['area_type', 'society', 'balcony', 'availability'])
    .dropna()
)
df2.shape


In [None]:

# Derive `bhk` as the integer found before the first space of `size`,
# then list the distinct values as a sanity check.
df2['bhk'] = df2['size'].map(lambda size_text: int(size_text.split(' ')[0]))
df2['bhk'].unique()


In [None]:

# Clean total_sqft
def convert_sqft_to_num(x):
    """Convert one raw ``total_sqft`` entry to a float, or None if unusable.

    Parameters
    ----------
    x : str or number
        A plain number ("1200"), a two-sided range ("1000 - 2000"), a value
        carrying a non-square-foot unit ("34.46Sq. Meter"), or a value that
        is already numeric.

    Returns
    -------
    float or None
        * plain number          -> that number as a float
        * "low - high" range    -> the midpoint of the two bounds
        * already-numeric input -> ``float(x)``
        * anything else (listed units, malformed text, more than one dash,
          None)                 -> None, so the caller can drop the row
    """
    # Units that are not square feet; rows using them are dropped rather
    # than converted (simplification: they are rare).
    unsupported_units = ('Sq. Meter', 'Sq. Yards', 'Acres', 'Guntha',
                         'Grounds', 'Cents', 'Perch')

    if not isinstance(x, str):
        # Already numeric, e.g. when the cell is re-run on a converted
        # column. Pass it through instead of mapping every row to None.
        try:
            return float(x)
        except (TypeError, ValueError):
            return None

    if any(unit in x for unit in unsupported_units):
        return None

    tokens = x.split('-')
    try:
        if len(tokens) == 1:
            return float(x)
        if len(tokens) == 2:
            return (float(tokens[0]) + float(tokens[1])) / 2
    except ValueError:
        # Non-numeric text on either side; treat as unusable.
        return None

    # More than one dash: not a well-formed "low - high" range.
    return None

# Replace the raw total_sqft text with its numeric value and drop the rows
# for which convert_sqft_to_num could not produce a number.
df2 = (
    df2
    .assign(total_sqft=df2['total_sqft'].map(convert_sqft_to_num))
    .dropna(subset=['total_sqft'])
)


In [None]:

# Create price_per_sqft
# `price` is scaled by 100000 before dividing by the area -- presumably the
# dataset quotes prices in lakhs; confirm against the data source.
# .assign() builds a new frame instead of writing into df2, which is the
# result of an earlier dropna() and may be flagged as a derived frame.
df2 = df2.assign(price_per_sqft=df2['price'] * 100000 / df2['total_sqft'])

# Keep only rows strictly below the 95th percentile of price_per_sqft.
# .copy() makes df2 an independent frame: without it, the column
# assignments in later cells act on a boolean-mask selection and can emit
# SettingWithCopyWarning on pandas versions without copy-on-write.
df2 = df2[df2['price_per_sqft'] < df2['price_per_sqft'].quantile(0.95)].copy()


In [None]:

# Shrink the number of distinct locations: trim surrounding whitespace,
# count rows per location, and relabel every location that has 10 or fewer
# rows as 'other'.
df2['location'] = df2['location'].str.strip()
location_stats = df2['location'].value_counts()
location_less_than_10 = location_stats[location_stats.le(10)]
df2['location'] = df2['location'].where(
    ~df2['location'].isin(location_less_than_10.index),
    'other',
)


In [None]:

# Build the modelling frame: drop the helper columns, one-hot encode
# `location`, and put the indicator columns next to the remaining features.
df3 = df2.drop(columns=['size', 'price_per_sqft'])
dummies = pd.get_dummies(df3['location'])
df4 = pd.concat(
    [df3.drop(columns=['location']), dummies],
    axis='columns',
)


In [None]:

# Separate the target column (`price`) from the predictor columns
y = df4['price']
X = df4.drop(columns=['price'])


## Model Selection using GridSearchCV on Tree-Based and Ensemble Methods

In [None]:

from sklearn.model_selection import GridSearchCV, ShuffleSplit
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor, ExtraTreesRegressor
from xgboost import XGBRegressor

def find_best_ensemble_model(X, y, random_state=0):
    """Grid-search several tree-based regressors and report the best of each.

    Every candidate is tuned with ``GridSearchCV`` over its own parameter
    grid, using the same 5-split ``ShuffleSplit`` (20% held out per split).
    No ``scoring`` is passed, so each estimator's default ``score`` method
    is used.

    Parameters
    ----------
    X : feature matrix passed to ``GridSearchCV.fit``.
    y : target values passed to ``GridSearchCV.fit``.
    random_state : int, default 0
        Seed forwarded to every estimator and to the CV splitter. Without
        a seed on the estimators, the reported scores change from run to
        run even though the splits are fixed.

    Returns
    -------
    pandas.DataFrame
        One row per algorithm, in the order they are defined below, with
        columns ``model``, ``best_score`` and ``best_params``.
    """
    algos = {
        'decision_tree': {
            'model': DecisionTreeRegressor(random_state=random_state),
            'params': {
                'criterion': ['squared_error', 'friedman_mse'],
                'splitter': ['best', 'random']
            }
        },
        'random_forest': {
            'model': RandomForestRegressor(random_state=random_state),
            'params': {
                'n_estimators': [50, 100],
                'max_depth': [None, 10, 20],
                'min_samples_split': [2, 5]
            }
        },
        'gradient_boosting': {
            'model': GradientBoostingRegressor(random_state=random_state),
            'params': {
                'n_estimators': [100, 200],
                'learning_rate': [0.05, 0.1],
                'max_depth': [3, 5]
            }
        },
        'extra_trees': {
            'model': ExtraTreesRegressor(random_state=random_state),
            'params': {
                'n_estimators': [100],
                'max_depth': [None, 10]
            }
        },
        'ada_boost': {
            'model': AdaBoostRegressor(random_state=random_state),
            'params': {
                'n_estimators': [50, 100],
                'learning_rate': [0.5, 1.0]
            }
        },
        'xgboost': {
            'model': XGBRegressor(random_state=random_state),
            'params': {
                'n_estimators': [100, 200],
                'learning_rate': [0.05, 0.1],
                'max_depth': [3, 5]
            }
        }
    }

    # One splitter shared by all searches so every algorithm is scored on
    # exactly the same train/validation partitions.
    cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=random_state)

    scores = []
    for algo_name, config in algos.items():
        gs = GridSearchCV(config['model'], config['params'], cv=cv, return_train_score=False)
        gs.fit(X, y)
        scores.append({
            'model': algo_name,
            'best_score': gs.best_score_,
            'best_params': gs.best_params_
        })

    return pd.DataFrame(scores, columns=['model', 'best_score', 'best_params'])

# Run the comparison on the prepared features and target.
result_df = find_best_ensemble_model(X, y)
# to_string() applies no column-width limit, so the best_params dicts are
# printed in full; a plain print(result_df) cuts long cells off with "...".
print(result_df.to_string())
