In [17]:
import pandas as pd
import numpy as np

In [18]:
# Load the Hyderabad CSV into a DataFrame. The path is relative, so the file
# must sit in the kernel's current working directory.
data = pd.read_csv('Hyderabad.csv')

In [19]:
# Positional split: the first CSV column is used as the target, every
# remaining column as a feature.
# .copy() makes X an independent frame. Without it X is a slice of `data`, and
# any column assignment on X is a chained assignment that pandas flags with
# SettingWithCopyWarning (and that may or may not write through to `data`).
X = data.iloc[:, 1:].copy()
y = data.iloc[:, 0]

In [20]:
# Partition the feature columns by dtype: object-typed columns are treated as
# categorical, everything else as numerical.
object_dtype = ['object']
categorical_columns = X.select_dtypes(include=object_dtype).columns
numerical_columns = X.select_dtypes(exclude=object_dtype).columns

In [21]:
# Missing values are intentionally NOT filled here. Filling them with the
# mean/mode of the full dataset before the train/test split would let the
# held-out rows influence the fill values (data leakage), and it would only
# duplicate the SimpleImputer steps of the preprocessing pipelines, which learn
# their fill values from the data they are fitted on.
# This cell just reports which feature columns contain missing values.
missing_per_column = X.isna().sum()
missing_per_column[missing_per_column > 0]

In [22]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
# Standalone scaler instance. None of the cells shown in this notebook use it;
# scaling is performed by the StandardScaler step inside the numeric pipeline.
scaler = StandardScaler()

In [23]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

In [24]:
# Numeric branch: fill missing values with the column mean, then standardise.
numeric_steps = [
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
]
numerical_transformer = Pipeline(steps=numeric_steps)

In [25]:
# Categorical branch: fill missing values with the most frequent category,
# then one-hot encode. handle_unknown='ignore' means categories that were not
# seen during fit do not raise an error at transform time.
categorical_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
]
categorical_transformer = Pipeline(steps=categorical_steps)

In [26]:
# Route each column group through its own sub-pipeline and concatenate the
# results. No `remainder` is given, so ColumnTransformer's default applies to
# any column that is in neither group.
preprocessor = ColumnTransformer(transformers=[
    ('num', numerical_transformer, numerical_columns),
    ('cat', categorical_transformer, categorical_columns),
])

In [27]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

In [28]:
# Candidate regressors keyed by display name. Insertion order is the order in
# which they are later iterated. The tree ensembles get a fixed seed so their
# results are repeatable.
models = {}
models['Linear Regression'] = LinearRegression()
models['Random Forest'] = RandomForestRegressor(random_state=42)
models['Gradient Boosting'] = GradientBoostingRegressor(random_state=42)

In [29]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# reproducible across runs.
split = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split

In [30]:
from sklearn.metrics import mean_absolute_error,mean_absolute_percentage_error,mean_squared_error,r2_score

In [31]:
# Fit every candidate on the training split, report its hold-out metrics, and
# keep each fitted pipeline so the best one can be selected afterwards.
from sklearn.base import clone

fitted_pipelines = {}
r2_scores = {}
for name, model in models.items():
    model_pipeline = Pipeline(steps=[
        # clone() gives each pipeline its own unfitted preprocessor; otherwise
        # every pipeline would share (and refit) the same ColumnTransformer.
        ('preprocessor', clone(preprocessor)),
        ('model', model)
    ])
    model_pipeline.fit(X_train, y_train)
    predictions = model_pipeline.predict(X_test)
    mae = mean_absolute_error(y_test, predictions)
    mse = mean_squared_error(y_test, predictions)
    r2 = r2_score(y_test, predictions)
    print(f'{name} - Mean Absolute Error: {mae}')
    print(f'{name} - Mean Squared Error: {mse}')
    print(f'{name} - R2 Score: {r2}')
    print()
    fitted_pipelines[name] = model_pipeline
    r2_scores[name] = r2

# Left alone, `model_pipeline` would point at whichever model happens to be
# last in `models`, and that is what the save cell would persist. Rebind it to
# the pipeline with the highest hold-out R2 so the saved model is a deliberate
# choice (ties resolve to the earliest entry in `models`).
best_model_name = max(r2_scores, key=r2_scores.get)
model_pipeline = fitted_pipelines[best_model_name]

Linear Regression - Mean Absolute Error: 2085271.7370895604
Linear Regression - Mean Squared Error: 14800582244049.262
Linear Regression - R2 Score: 0.7652489922522872

Random Forest - Mean Absolute Error: 1496067.0386240922
Random Forest - Mean Squared Error: 19891896469873.062
Random Forest - R2 Score: 0.6844960106759749

Gradient Boosting - Mean Absolute Error: 1867128.6349255892
Gradient Boosting - Mean Squared Error: 16529199835335.38
Gradient Boosting - R2 Score: 0.7378315086105206



In [32]:
import joblib
# Serialise the object currently bound to `model_pipeline` (assigned in the
# model-comparison cell) to disk, then confirm on stdout. The file is written
# relative to the kernel's working directory.
model_path = 'house_price_model.pkl'
joblib.dump(model_pipeline, model_path)
print("Model saved as 'house_price_model.pkl'")

Model saved as 'house_price_model.pkl'
