In [1]:
import pandas as pd

In [2]:
# Read data.csv into a DataFrame and preview its first five rows.
df = pd.read_csv(filepath_or_buffer="data.csv")
df.head(n=5)

Unnamed: 0,location,total_sqft,bath,balcony,price,BHK
0,Electronic City Phase II,1056.0,2.0,1.0,39.07,2
1,Uttarahalli,1440.0,2.0,3.0,62.0,3
2,Lingadheeranahalli,1521.0,3.0,1.0,95.0,3
3,Kothanur,1200.0,2.0,1.0,51.0,2
4,Whitefield,1170.0,2.0,1.0,38.0,2


In [3]:
# Per-column flag: True when the column contains at least one missing value.
df.isnull().any(axis=0)

location      False
total_sqft    False
bath          False
balcony       False
price         False
BHK           False
dtype: bool

In [4]:
# Target is the price column; features are every remaining column.
# `drop` returns a new frame, so `df` itself is left unmodified.
y = df["price"]
X = df.drop(columns=["price"])

In [5]:
# Confirm the price column is gone from the feature frame.
X.head(n=5)

Unnamed: 0,location,total_sqft,bath,balcony,BHK
0,Electronic City Phase II,1056.0,2.0,1.0,2
1,Uttarahalli,1440.0,2.0,3.0,3
2,Lingadheeranahalli,1521.0,3.0,1.0,3
3,Kothanur,1200.0,2.0,1.0,2
4,Whitefield,1170.0,2.0,1.0,2


In [6]:
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.tree import DecisionTreeRegressor

In [7]:
# Columns that hold categories and must be encoded before modelling.
categorical_features = ['location']

# One-hot encode the categorical columns and pass every other column through
# unchanged. handle_unknown='ignore' encodes a category that was not seen
# during fit as an all-zero block instead of raising, so a location that is
# missing from the training split no longer makes predict/score fail.
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ],
    remainder='passthrough'
)

In [8]:
from sklearn.linear_model import LinearRegression
# Chain the column preprocessing and the linear model so that fit/predict
# always run both stages together.
clf = Pipeline(
    [
        ("preprocessor", preprocessor),
        ("model", LinearRegression()),
    ]
)

In [9]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=10)


In [10]:
# Fit the encoder and the regressor on the training split only.
clf.fit(X=X_train, y=y_train)

In [11]:
# Score on the held-out split (for a scikit-learn regressor this is R^2).
clf.score(X=X_test, y=y_test)

0.7829849447733207

In [12]:
def predict_price(location,total_sqft,bathroom,balcony,bhk):
    """Print the fitted pipeline's prediction for a single input row.

    A one-row DataFrame is built with the same columns as ``X_test`` and
    passed to the notebook-level ``clf`` pipeline. The arguments are matched
    to those columns by position, in the order they appear in the signature.

    Parameters
    ----------
    location : value for the first feature column (the location name).
    total_sqft : value for the second feature column.
    bathroom : value for the third feature column.
    balcony : value for the fourth feature column.
    bhk : value for the fifth feature column.

    Returns
    -------
    None. The array returned by ``clf.predict`` is printed.
    """
    # The values go straight into the row; no intermediate Series is needed.
    row = [location, total_sqft, bathroom, balcony, bhk]
    input_df = pd.DataFrame([row], columns=X_test.columns)

    print(clf.predict(input_df))

In [13]:
predict_price(location="Indira Nagar", total_sqft=1000.0, bathroom=2.0, balcony=1.0, bhk=2)

[94.72223315]


In [14]:
predict_price(location="Indira Nagar", total_sqft=1000.0, bathroom=3.0, balcony=1.0, bhk=3)

[94.17296213]


In [15]:
predict_price(location="Indira Nagar", total_sqft=1000.0, bathroom=3.0, balcony=1.0, bhk=2)

[98.27745863]


In [16]:
# Predictions for the held-out rows, used for the error metric below.
y_pred = clf.predict(X=X_test)

In [17]:
from sklearn.metrics import mean_absolute_error
# Mean absolute difference between the held-out prices and the predictions.
mae = mean_absolute_error(y_true=y_test, y_pred=y_pred)
print("MAE:", mae)

MAE: 16.386227577892008


In [18]:
import pickle
# Serialise the fitted pipeline to disk so it can be reloaded without refitting.
with open("bengaluru_house_price_model.pickle", mode="wb") as model_file:
    pickle.dump(clf, model_file)

In [19]:
import json
# Save the distinct location names, in order of first appearance.
# Iterating over X.location directly would emit one entry per row, so every
# repeated location would be written again.
columns = {
    "locations": X["location"].unique().tolist()
}
with open("columns.json","w") as f:
    json.dump(columns, f)

In [20]:

import json
# Read the saved column metadata back from disk.
with open("columns.json") as columns_file:
    data = json.loads(columns_file.read())
    

In [21]:
# Collect the dict's values into a list; index 0 is the "locations" entry.
l_data = [*data.values()]

In [22]:
# Show the first five stored location names.
for position in range(5):
    print(l_data[0][position])

electronic city phase ii
uttarahalli
lingadheeranahalli
kothanur
whitefield
