In [50]:
import pickle

import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [51]:
# Load the dataset from a CSV file; the path is relative, so the file is
# looked up in the notebook's current working directory.
df = pd.read_csv('bangalore_house_data.csv')

In [52]:
# Drop rows with missing values. `df` is rebound to the cleaned frame, so the
# raw frame is no longer reachable under this name after this cell runs.
df = df.dropna()

In [53]:
# Preview the cleaned DataFrame (the rendered output elides the middle rows)
df

Unnamed: 0,Location,Area (sqft),Bedrooms,Bathrooms,Parking Spaces,Age of Property (years),Floor,Total Floors,Furnishing,Property Type,Nearby Metro (Y/N),Price (Lakhs)
0,Hebbal,2286,3,2,3,12,1,4,Semi-Furnished,Independent House,N,97.97
1,Indiranagar,3029,2,2,0,29,3,10,Unfurnished,Apartment,N,232.44
2,Malleswaram,3335,3,2,3,1,2,6,Semi-Furnished,Apartment,N,184.52
3,HSR Layout,2424,3,2,2,9,1,3,Fully-Furnished,Independent House,Y,174.87
4,Whitefield,3632,2,2,2,15,1,3,Unfurnished,Apartment,Y,230.02
...,...,...,...,...,...,...,...,...,...,...,...,...
6995,Kengeri,4005,3,1,1,8,2,2,Semi-Furnished,Apartment,N,306.46
6996,Sarjapur Road,4590,1,4,0,29,3,8,Fully-Furnished,Independent House,N,266.65
6997,Marathahalli,2352,4,3,3,7,2,6,Fully-Furnished,Independent House,Y,165.68
6998,Marathahalli,854,2,5,0,5,1,8,Unfurnished,Apartment,Y,63.40


In [54]:
# Separate the prediction target from the input features:
# y is the price column, X is every remaining column.
y = df['Price (Lakhs)']
X = df.drop(columns='Price (Lakhs)')

In [55]:
# Inspect the feature DataFrame (all columns except 'Price (Lakhs)')
X

Unnamed: 0,Location,Area (sqft),Bedrooms,Bathrooms,Parking Spaces,Age of Property (years),Floor,Total Floors,Furnishing,Property Type,Nearby Metro (Y/N)
0,Hebbal,2286,3,2,3,12,1,4,Semi-Furnished,Independent House,N
1,Indiranagar,3029,2,2,0,29,3,10,Unfurnished,Apartment,N
2,Malleswaram,3335,3,2,3,1,2,6,Semi-Furnished,Apartment,N
3,HSR Layout,2424,3,2,2,9,1,3,Fully-Furnished,Independent House,Y
4,Whitefield,3632,2,2,2,15,1,3,Unfurnished,Apartment,Y
...,...,...,...,...,...,...,...,...,...,...,...
6995,Kengeri,4005,3,1,1,8,2,2,Semi-Furnished,Apartment,N
6996,Sarjapur Road,4590,1,4,0,29,3,8,Fully-Furnished,Independent House,N
6997,Marathahalli,2352,4,3,3,7,2,6,Fully-Furnished,Independent House,Y
6998,Marathahalli,854,2,5,0,5,1,8,Unfurnished,Apartment,Y


In [56]:
# Inspect the target Series ('Price (Lakhs)')
y

0        97.97
1       232.44
2       184.52
3       174.87
4       230.02
         ...  
6995    306.46
6996    266.65
6997    165.68
6998     63.40
6999     43.11
Name: Price (Lakhs), Length: 7000, dtype: float64

In [57]:
# Partition the feature columns by dtype:
# int64/float64 columns are treated as numerical, object columns as categorical.
numerical_cols = list(X.select_dtypes(include=['int64', 'float64']).columns)
categorical_cols = list(X.select_dtypes(include=['object']).columns)

In [58]:
# Show the column names selected as numerical features
numerical_cols

['Area (sqft)',
 'Bedrooms',
 'Bathrooms',
 'Parking Spaces',
 'Age of Property (years)',
 'Floor',
 'Total Floors']

In [59]:
# Show the column names selected as categorical features
categorical_cols

['Location', 'Furnishing', 'Property Type', 'Nearby Metro (Y/N)']

In [60]:
# Preprocessing step: standardise the numerical columns and one-hot encode the
# categorical columns. handle_unknown='ignore' keeps transform() from failing
# on categories that were not present when the encoder was fitted.
preprocessor = ColumnTransformer([
    ('num', StandardScaler(), numerical_cols),
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols),
])

In [61]:
# Chain the preprocessing step and a LinearRegression estimator so that both
# are fitted and applied together through a single object.
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression()),
])

In [62]:
# Hold out 20% of the rows as a test set; the fixed random_state makes the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [63]:
# Inspect the training features produced by the split
X_train

Unnamed: 0,Location,Area (sqft),Bedrooms,Bathrooms,Parking Spaces,Age of Property (years),Floor,Total Floors,Furnishing,Property Type,Nearby Metro (Y/N)
1032,Bellandur,1300,1,3,0,13,4,5,Semi-Furnished,Apartment,Y
6339,Bannerghatta Road,1505,1,4,3,21,1,1,Fully-Furnished,Apartment,N
3886,JP Nagar,4531,5,1,2,11,3,3,Semi-Furnished,Apartment,Y
2653,Marathahalli,2442,4,3,3,3,1,5,Fully-Furnished,Independent House,N
6914,Yelahanka,1364,3,3,3,21,2,2,Unfurnished,Apartment,N
...,...,...,...,...,...,...,...,...,...,...,...
3772,BTM Layout,2035,4,5,1,11,4,4,Semi-Furnished,Independent House,Y
5191,Marathahalli,2386,4,3,2,21,5,5,Fully-Furnished,Independent House,N
5226,Jayanagar,3610,4,2,1,27,4,4,Fully-Furnished,Apartment,Y
5390,HSR Layout,3828,3,4,1,17,8,9,Unfurnished,Independent House,Y


In [64]:
# Inspect the training target values produced by the split
y_train

1032     59.88
6339    109.30
3886    181.74
2653    154.13
6914     73.86
         ...  
3772    101.92
5191    122.13
5226    286.21
5390    176.81
860     209.50
Name: Price (Lakhs), Length: 5600, dtype: float64

In [65]:
# Fit the preprocessing + LinearRegression pipeline on the training split
pipeline.fit(X_train, y_train)

In [66]:
# Predict prices for the held-out test rows with the fitted pipeline
y_pred = pipeline.predict(X_test)

# Evaluate the model on the test split.
# mean_absolute_error, mean_squared_error and r2_score come from
# sklearn.metrics (imported in the first cell); without that import this
# cell fails with NameError on a fresh kernel.
# RMSE is the square root of the mean squared error.
print("Mean Absolute Error (MAE):", mean_absolute_error(y_test, y_pred))
print("Root Mean Squared Error (RMSE):", np.sqrt(mean_squared_error(y_test, y_pred)))
print("R2 Score:", r2_score(y_test, y_pred))

Mean Absolute Error (MAE): 30.02404051339286
Root Mean Squared Error (RMSE): 38.39919480598671
R2 Score: 0.8274676974499362


In [67]:
# Serialise the linear-regression pipeline to disk with pickle
pickle_file_path = 'LR_prediction_pipeline.pkl'
with open(pickle_file_path, mode='wb') as fh:
    pickle.dump(pipeline, fh)

# Echo the output path as the cell result
pickle_file_path

'LR_prediction_pipeline.pkl'

In [46]:
from sklearn.ensemble import RandomForestRegressor

# Same preprocessing as the baseline, followed by a 100-tree random forest
pipeline_rf = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(n_estimators=100, random_state=42)),
])

# Recreate the 80/20 split (same seed, so the same rows as the earlier split)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# fit() returns the pipeline itself, so training and prediction can be chained
y_pred = pipeline_rf.fit(X_train, y_train).predict(X_test)

# Report the test-set metrics
print("Random Forest:")
for label, score in (
    ("Mean Absolute Error (MAE):", mean_absolute_error(y_test, y_pred)),
    ("Root Mean Squared Error (RMSE):", np.sqrt(mean_squared_error(y_test, y_pred))),
    ("R2 Score:", r2_score(y_test, y_pred)),
):
    print(label, score)


Random Forest:
Mean Absolute Error (MAE): 29.32630928571429
Root Mean Squared Error (RMSE): 39.12206504037515
R2 Score: 0.8209106647862141


In [70]:
from sklearn.ensemble import GradientBoostingRegressor

# Same preprocessing as the baseline, followed by gradient boosting
# with n_estimators=100
pipeline_gb = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', GradientBoostingRegressor(n_estimators=100, random_state=42)),
])

# Recreate the 80/20 split (same seed, so the same rows as the earlier split)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# fit() returns the pipeline itself, so training and prediction can be chained
y_pred = pipeline_gb.fit(X_train, y_train).predict(X_test)

# Report the test-set metrics
print("Gradient Boosting:")
for label, score in (
    ("Mean Absolute Error (MAE):", mean_absolute_error(y_test, y_pred)),
    ("Root Mean Squared Error (RMSE):", np.sqrt(mean_squared_error(y_test, y_pred))),
    ("R2 Score:", r2_score(y_test, y_pred)),
):
    print(label, score)


Gradient Boosting:
Mean Absolute Error (MAE): 29.376360328352664
Root Mean Squared Error (RMSE): 38.22442795649546
R2 Score: 0.8290346214108497


In [48]:
from sklearn.svm import SVR

# Same preprocessing as the baseline, followed by an RBF-kernel support
# vector regressor (C=100, epsilon=0.1)
pipeline_svr = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', SVR(kernel='rbf', C=100, epsilon=0.1)),
])

# Recreate the 80/20 split (same seed, so the same rows as the earlier split)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# fit() returns the pipeline itself, so training and prediction can be chained
y_pred = pipeline_svr.fit(X_train, y_train).predict(X_test)

# Report the test-set metrics
print("Support Vector Regressor:")
for label, score in (
    ("Mean Absolute Error (MAE):", mean_absolute_error(y_test, y_pred)),
    ("Root Mean Squared Error (RMSE):", np.sqrt(mean_squared_error(y_test, y_pred))),
    ("R2 Score:", r2_score(y_test, y_pred)),
):
    print(label, score)


Support Vector Regressor:
Mean Absolute Error (MAE): 31.54227869038585
Root Mean Squared Error (RMSE): 41.180138918400544
R2 Score: 0.8015725288714188


In [71]:
from sklearn.tree import DecisionTreeRegressor

# Same preprocessing as the baseline, followed by a single decision tree
# with a fixed random_state
pipeline_dt = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', DecisionTreeRegressor(random_state=42)),
])

# Recreate the 80/20 split (same seed, so the same rows as the earlier split)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# fit() returns the pipeline itself, so training and prediction can be chained
y_pred = pipeline_dt.fit(X_train, y_train).predict(X_test)

# Report the test-set metrics
print("Decision Tree Regressor:")
for label, score in (
    ("Mean Absolute Error (MAE):", mean_absolute_error(y_test, y_pred)),
    ("Root Mean Squared Error (RMSE):", np.sqrt(mean_squared_error(y_test, y_pred))),
    ("R2 Score:", r2_score(y_test, y_pred)),
):
    print(label, score)


Decision Tree Regressor:
Mean Absolute Error (MAE): 39.405207142857144
Root Mean Squared Error (RMSE): 53.92063212259293
R2 Score: 0.6597985717240609


In [72]:
from sklearn.neighbors import KNeighborsRegressor

# Same preprocessing as the baseline, followed by a k-nearest-neighbours
# regressor using 5 neighbours
pipeline_knn = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', KNeighborsRegressor(n_neighbors=5)),
])

# Recreate the 80/20 split (same seed, so the same rows as the earlier split)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# fit() returns the pipeline itself, so training and prediction can be chained
y_pred = pipeline_knn.fit(X_train, y_train).predict(X_test)

# Report the test-set metrics
print("K-Neighbors Regressor:")
for label, score in (
    ("Mean Absolute Error (MAE):", mean_absolute_error(y_test, y_pred)),
    ("Root Mean Squared Error (RMSE):", np.sqrt(mean_squared_error(y_test, y_pred))),
    ("R2 Score:", r2_score(y_test, y_pred)),
):
    print(label, score)


K-Neighbors Regressor:
Mean Absolute Error (MAE): 39.43073000000001
Root Mean Squared Error (RMSE): 51.2142234763798
R2 Score: 0.6930925949071449


In [73]:
from sklearn.linear_model import Ridge

# Same preprocessing as the baseline, followed by ridge regression (alpha=1.0)
pipeline_ridge = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', Ridge(alpha=1.0)),
])

# Recreate the 80/20 split (same seed, so the same rows as the earlier split)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# fit() returns the pipeline itself, so training and prediction can be chained
y_pred = pipeline_ridge.fit(X_train, y_train).predict(X_test)

# Report the test-set metrics
print("Ridge Regression:")
for label, score in (
    ("Mean Absolute Error (MAE):", mean_absolute_error(y_test, y_pred)),
    ("Root Mean Squared Error (RMSE):", np.sqrt(mean_squared_error(y_test, y_pred))),
    ("R2 Score:", r2_score(y_test, y_pred)),
):
    print(label, score)


Ridge Regression:
Mean Absolute Error (MAE): 30.01614033576909
Root Mean Squared Error (RMSE): 38.39816537150095
R2 Score: 0.8274769480773148


In [74]:
from sklearn.linear_model import Lasso

# Same preprocessing as the baseline, followed by lasso regression (alpha=0.1)
pipeline_lasso = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', Lasso(alpha=0.1)),
])

# Recreate the 80/20 split (same seed, so the same rows as the earlier split)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# fit() returns the pipeline itself, so training and prediction can be chained
y_pred = pipeline_lasso.fit(X_train, y_train).predict(X_test)

# Report the test-set metrics
print("Lasso Regression:")
for label, score in (
    ("Mean Absolute Error (MAE):", mean_absolute_error(y_test, y_pred)),
    ("Root Mean Squared Error (RMSE):", np.sqrt(mean_squared_error(y_test, y_pred))),
    ("R2 Score:", r2_score(y_test, y_pred)),
):
    print(label, score)


Lasso Regression:
Mean Absolute Error (MAE): 29.895349580871564
Root Mean Squared Error (RMSE): 38.36865778873578
R2 Score: 0.827742001480851


In [75]:
# Save the Lasso pipeline as a pickle file.
# The object written must be `pipeline_lasso`: the name `pipeline` refers to
# the LinearRegression pipeline, so dumping it here would store the wrong
# model under the Lasso file name.
pickle_file_path = 'Lasso_prediction_pipeline.pkl'
with open(pickle_file_path, 'wb') as file:
    pickle.dump(pipeline_lasso, file)

# Echo the output path as the cell result
pickle_file_path

'Lasso_prediction_pipeline.pkl'