In [21]:
# libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [22]:
# data read
df = pd.read_csv('/content/Bengaluru_House_Data.csv',encoding="utf-8")
df.head()

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,Coomee,1056,2.0,1.0,39.07
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,Theanmp,2600,5.0,3.0,120.0
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,,1440,2.0,3.0,62.0
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,Soiewre,1521,3.0,1.0,95.0
4,Super built-up Area,Ready To Move,Kothanur,2 BHK,,1200,2.0,1.0,51.0


## 1. Data Loading and Preprocessing.

In these steps, we will load the Bengaluru House Data dataset using pandas and perform an initial exploration to understand its structure and contents.

- **Data Import:** The dataset is loaded into a pandas DataFrame named `data`.
- **Shape:** The dataset contains 12,530 rows and 7 columns after initial cleaning.
- **Columns:**  
    - `location`: Area or locality of the property  
    - `size`: Number of bedrooms (e.g., "2 BHK", "4 Bedroom")  
    - `total_sqft`: Total area in square feet  
    - `bath`: Number of bathrooms  
    - `price`: Price of the property (in lakhs)  
    - `bhk`: Extracted number of bedrooms as integer  
    - `price_per_sqft`: Price per square foot

We will also check for missing values, data types, and unique values in key columns to guide further cleaning and preprocessing steps. This foundational understanding helps in identifying potential issues such as outliers, inconsistent data, and the need for encoding categorical variables.

In [23]:
df.shape

(13320, 9)

In [24]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13320 entries, 0 to 13319
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   area_type     13320 non-null  object 
 1   availability  13320 non-null  object 
 2   location      13319 non-null  object 
 3   size          13304 non-null  object 
 4   society       7818 non-null   object 
 5   total_sqft    13320 non-null  object 
 6   bath          13247 non-null  float64
 7   balcony       12711 non-null  float64
 8   price         13320 non-null  float64
dtypes: float64(3), object(6)
memory usage: 936.7+ KB


In [25]:
# check null
df.isnull().sum()

Unnamed: 0,0
area_type,0
availability,0
location,1
size,16
society,5502
total_sqft,0
bath,73
balcony,609
price,0


In [26]:
# 1.check DISTINCT VALUES  in each columns --> categorical values
df.nunique()
df["area_type"].dtype=="object"
df.columns

Index(['area_type', 'availability', 'location', 'size', 'society',
       'total_sqft', 'bath', 'balcony', 'price'],
      dtype='object')

In [27]:
for i in df.columns:
  if df[i].dtype == 'object':
    print(df[i].nunique())
    print(df[i].value_counts())

    print('*'*20)

# The above loop prints the value counts for each column in the DataFrame,
# helping to understand the distribution and frequency of unique values in every column.

4
area_type
Super built-up  Area    8790
Built-up  Area          2418
Plot  Area              2025
Carpet  Area              87
Name: count, dtype: int64
********************
81
availability
Ready To Move    10581
18-Dec             307
18-May             295
18-Apr             271
18-Aug             200
                 ...  
16-Oct               1
17-Jan               1
16-Nov               1
16-Jan               1
14-Jul               1
Name: count, Length: 81, dtype: int64
********************
1305
location
Whitefield                         540
Sarjapur  Road                     399
Electronic City                    302
Kanakpura Road                     273
Thanisandra                        234
                                  ... 
3rd Stage Raja Rajeshwari Nagar      1
Chuchangatta Colony                  1
Electronic City Phase 1,             1
Chikbasavanapura                     1
Abshot Layout                        1
Name: count, Length: 1305, dtype: int64
**************

In [28]:
df["total_sqft"].head(35)

Unnamed: 0,total_sqft
0,1056
1,2600
2,1440
3,1521
4,1200
5,1170
6,2732
7,3300
8,1310
9,1020


In [29]:
# check null value of each columns
df.isnull().sum()

Unnamed: 0,0
area_type,0
availability,0
location,1
size,16
society,5502
total_sqft,0
bath,73
balcony,609
price,0


In [30]:
# drop
df.drop(columns=['availability', 'society', 'balcony'],
                          inplace=True)

In [31]:
df.isnull().sum()

Unnamed: 0,0
area_type,0
location,1
size,16
total_sqft,0
bath,73
price,0


In [32]:
df["location"].mode()[0]

'Whitefield'

In [33]:
# df["location"].value_counts().index[0]
df["location"].fillna(df["location"].mode()[0],inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["location"].fillna(df["location"].mode()[0],inplace=True)


In [34]:
df.isnull().sum()

Unnamed: 0,0
area_type,0
location,0
size,16
total_sqft,0
bath,73
price,0


In [35]:
# null value replace
df["bath"].value_counts()
df["bath"].fillna(df["bath"].value_counts().index[0],inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["bath"].fillna(df["bath"].value_counts().index[0],inplace=True)


In [36]:
df.isnull().sum()

Unnamed: 0,0
area_type,0
location,0
size,16
total_sqft,0
bath,0
price,0


In [37]:
df["size"].mode()

Unnamed: 0,size
0,2 BHK


In [38]:
df["size"].mode()[0]

'2 BHK'

In [39]:
df["size"].mode()[0]
df["size"].fillna(df["size"].mode()[0],inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["size"].fillna(df["size"].mode()[0],inplace=True)


In [40]:
df.isnull().sum()

Unnamed: 0,0
area_type,0
location,0
size,0
total_sqft,0
bath,0
price,0


In [41]:
df.head()

Unnamed: 0,area_type,location,size,total_sqft,bath,price
0,Super built-up Area,Electronic City Phase II,2 BHK,1056,2.0,39.07
1,Plot Area,Chikka Tirupathi,4 Bedroom,2600,5.0,120.0
2,Built-up Area,Uttarahalli,3 BHK,1440,2.0,62.0
3,Super built-up Area,Lingadheeranahalli,3 BHK,1521,3.0,95.0
4,Super built-up Area,Kothanur,2 BHK,1200,2.0,51.0


In [42]:
# Duplicate value check and remove
df.duplicated().sum()

# remove duplicated
df.drop_duplicates(inplace=True)

In [43]:
df.duplicated().sum()

np.int64(0)

In [44]:
# ====
df.to_csv("clean_data.csv",index=False)

## Preprocessing endcoding part

In [45]:
# file
df = pd.read_csv('/content/clean_data.csv')
df.head()

Unnamed: 0,area_type,location,size,total_sqft,bath,price
0,Super built-up Area,Electronic City Phase II,2 BHK,1056,2.0,39.07
1,Plot Area,Chikka Tirupathi,4 Bedroom,2600,5.0,120.0
2,Built-up Area,Uttarahalli,3 BHK,1440,2.0,62.0
3,Super built-up Area,Lingadheeranahalli,3 BHK,1521,3.0,95.0
4,Super built-up Area,Kothanur,2 BHK,1200,2.0,51.0


In [46]:
# size columns cleaning
df["size"].str.split(" ")[0][0]

'2'

In [47]:
def clean_size(x):
  return x.split(" ")[0]

clean_size("2 BHK")

'2'

In [48]:
df["bhk"] = df["size"].apply(clean_size)

In [49]:
d = df["total_sqft"][30].split("-")
for i in d:

  print(i)
  print(i.strip())
  print(type(i.strip()))
  i = float(i)



2100 
2100
<class 'str'>
 2850
2850
<class 'str'>


In [52]:
# count =0
# for i in df["total_sqft"]:
#   try:
#     return float(i)
#   except:
#     try:
#       print(i)
#       d =i.split("-")


#      return ((float(d[0].strip())+float(d[1].strip()))/2 )
#     except:
#       print(i)
#       count+=1
#       return np.nan

# print(count)


In [53]:
def convertRange(x):
    temp = x.split('-')
    if len(temp) == 2:
        return (float(temp[0]) + float(temp[1]))/2
    try:
        return float(x)
    except:
        return None

In [54]:
df["total_sqft"]=df["total_sqft"].apply(convertRange)

In [55]:
df.isnull().sum()

Unnamed: 0,0
area_type,0
location,0
size,0
total_sqft,46
bath,0
price,0
bhk,0


In [56]:
df["total_sqft"].fillna(df["total_sqft"].mean(),inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["total_sqft"].fillna(df["total_sqft"].mean(),inplace=True)


In [57]:
df.head()

Unnamed: 0,area_type,location,size,total_sqft,bath,price,bhk
0,Super built-up Area,Electronic City Phase II,2 BHK,1056.0,2.0,39.07,2
1,Plot Area,Chikka Tirupathi,4 Bedroom,2600.0,5.0,120.0,4
2,Built-up Area,Uttarahalli,3 BHK,1440.0,2.0,62.0,3
3,Super built-up Area,Lingadheeranahalli,3 BHK,1521.0,3.0,95.0,3
4,Super built-up Area,Kothanur,2 BHK,1200.0,2.0,51.0,2


In [58]:
# df.drop(columns=["Unnamed: 0","size"],inplace=True)
# df.head()

## encoding
* types
  1. nominal encoding
  2. ordinal encodnig
  3. one-hot encoding/ df_dummies
  

In [59]:
# sk-learn
# preprocessing
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, OrdinalEncoder

label= LabelEncoder()
l = label.fit_transform(df["area_type"])
print(l[:50])

[3 2 0 3 3 3 3 3 3 2 3 2 3 0 2 3 3 3 3 3 0 3 2 3 3 3 3 0 3 3 3 3 3 3 0 3 3
 3 3 0 0 3 3 2 3 2 3 3 2 0]


In [60]:
ord = OrdinalEncoder(categories=[["Super built-up  Area","Built-up  Area","Plot  Area","Carpet  Area"]])
o = ord.fit_transform(df["area_type"].values.reshape(-1,1))
print(o[:50])

[[0.]
 [2.]
 [1.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [2.]
 [0.]
 [2.]
 [0.]
 [1.]
 [2.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [1.]
 [0.]
 [2.]
 [0.]
 [0.]
 [0.]
 [0.]
 [1.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [1.]
 [0.]
 [0.]
 [0.]
 [0.]
 [1.]
 [1.]
 [0.]
 [0.]
 [2.]
 [0.]
 [2.]
 [0.]
 [0.]
 [2.]
 [1.]]


In [61]:
for i in zip(df["area_type"],o):
  print(i)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
('Super built-up  Area', array([0.]))
('Super built-up  Area', array([0.]))
('Built-up  Area', array([1.]))
('Super built-up  Area', array([0.]))
('Super built-up  Area', array([0.]))
('Plot  Area', array([2.]))
('Super built-up  Area', array([0.]))
('Super built-up  Area', array([0.]))
('Super built-up  Area', array([0.]))
('Built-up  Area', array([1.]))
('Super built-up  Area', array([0.]))
('Super built-up  Area', array([0.]))
('Super built-up  Area', array([0.]))
('Built-up  Area', array([1.]))
('Plot  Area', array([2.]))
('Built-up  Area', array([1.]))
('Built-up  Area', array([1.]))
('Super built-up  Area', array([0.]))
('Plot  Area', array([2.]))
('Super built-up  Area', array([0.]))
('Super built-up  Area', array([0.]))
('Super built-up  Area', array([0.]))
('Super built-up  Area', array([0.]))
('Built-up  Area', array([1.]))
('Built-up  Area', array([1.]))
('Super built-up  Area', array([0.]))
('Super built-up  A

In [62]:
ohe = OneHotEncoder()
o =ohe.fit_transform(df[["area_type"]]).toarray()

In [63]:
ohe.get_feature_names_out()

array(['area_type_Built-up  Area', 'area_type_Carpet  Area',
       'area_type_Plot  Area', 'area_type_Super built-up  Area'],
      dtype=object)

In [64]:
types = pd.DataFrame(o,columns=ohe.get_feature_names_out())

In [65]:


df1 = pd.concat([df,types],axis=1)
df1

Unnamed: 0,area_type,location,size,total_sqft,bath,price,bhk,area_type_Built-up Area,area_type_Carpet Area,area_type_Plot Area,area_type_Super built-up Area
0,Super built-up Area,Electronic City Phase II,2 BHK,1056.0,2.0,39.07,2,0.0,0.0,0.0,1.0
1,Plot Area,Chikka Tirupathi,4 Bedroom,2600.0,5.0,120.00,4,0.0,0.0,1.0,0.0
2,Built-up Area,Uttarahalli,3 BHK,1440.0,2.0,62.00,3,1.0,0.0,0.0,0.0
3,Super built-up Area,Lingadheeranahalli,3 BHK,1521.0,3.0,95.00,3,0.0,0.0,0.0,1.0
4,Super built-up Area,Kothanur,2 BHK,1200.0,2.0,51.00,2,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...
12504,Super built-up Area,Green Glen Layout,3 BHK,1715.0,3.0,112.00,3,0.0,0.0,0.0,1.0
12505,Built-up Area,Whitefield,5 Bedroom,3453.0,4.0,231.00,5,1.0,0.0,0.0,0.0
12506,Super built-up Area,Richards Town,4 BHK,3600.0,5.0,400.00,4,0.0,0.0,0.0,1.0
12507,Built-up Area,Raja Rajeshwari Nagar,2 BHK,1141.0,2.0,60.00,2,1.0,0.0,0.0,0.0


In [66]:
ohe = OneHotEncoder()
l =ohe.fit_transform(df[["location"]]).toarray()
l

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [68]:
location = pd.DataFrame(l,columns=ohe.get_feature_names_out())
location
df1 = pd.concat([df1,location],axis=1)
df1.drop(columns=["area_type","location", "size"],inplace=True)

KeyError: "['area_type' 'location' 'size'] not found in axis"

In [None]:
df.head()

In [69]:
df1

Unnamed: 0,total_sqft,bath,price,bhk,area_type_Built-up Area,area_type_Carpet Area,area_type_Plot Area,area_type_Super built-up Area,location_ Anekal,location_ Banaswadi,...,location_rr nagar,location_sankeswari,location_sapthagiri Layout,location_sarjapura main road,location_singapura paradise,location_t.c palya,location_tc.palya,location_vinayakanagar,"location_white field,kadugodi",location_whitefiled
0,1056.0,2.0,39.07,2,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2600.0,5.0,120.00,4,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1440.0,2.0,62.00,3,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1521.0,3.0,95.00,3,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1200.0,2.0,51.00,2,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12504,1715.0,3.0,112.00,3,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12505,3453.0,4.0,231.00,5,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12506,3600.0,5.0,400.00,4,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12507,1141.0,2.0,60.00,2,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [70]:
df["location"].unique()

array(['Electronic City Phase II', 'Chikka Tirupathi', 'Uttarahalli', ...,
       '12th cross srinivas nagar banshankari 3rd stage',
       'Havanur extension', 'Abshot Layout'], dtype=object)

In [71]:
df1.head()


Unnamed: 0,total_sqft,bath,price,bhk,area_type_Built-up Area,area_type_Carpet Area,area_type_Plot Area,area_type_Super built-up Area,location_ Anekal,location_ Banaswadi,...,location_rr nagar,location_sankeswari,location_sapthagiri Layout,location_sarjapura main road,location_singapura paradise,location_t.c palya,location_tc.palya,location_vinayakanagar,"location_white field,kadugodi",location_whitefiled
0,1056.0,2.0,39.07,2,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2600.0,5.0,120.0,4,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1440.0,2.0,62.0,3,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1521.0,3.0,95.0,3,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1200.0,2.0,51.0,2,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [72]:
# scaling
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
min_max = MinMaxScaler()
min_max.fit_transform(df1[["total_sqft"]])

# train test split
from sklearn.model_selection import train_test_split

X = df1.drop(columns=["price"])

y = df1["price"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# train
from sklearn.linear_model import LinearRegression
lr = LinearRegression()

lr.fit(X_train,y_train)

In [73]:
df.head()

Unnamed: 0,area_type,location,size,total_sqft,bath,price,bhk
0,Super built-up Area,Electronic City Phase II,2 BHK,1056.0,2.0,39.07,2
1,Plot Area,Chikka Tirupathi,4 Bedroom,2600.0,5.0,120.0,4
2,Built-up Area,Uttarahalli,3 BHK,1440.0,2.0,62.0,3
3,Super built-up Area,Lingadheeranahalli,3 BHK,1521.0,3.0,95.0,3
4,Super built-up Area,Kothanur,2 BHK,1200.0,2.0,51.0,2


In [74]:
# 2nd way

In [75]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
# columns transform
columns_trans = ColumnTransformer(
    [('onehot_location', OneHotEncoder(sparse_output=False, handle_unknown='ignore'), ['location']),
     ('onehot_area_type', OneHotEncoder(sparse_output=False, handle_unknown='ignore'), ["area_type"]),
     ('scaler', StandardScaler(), ["total_sqft", "bath"]),

     ],
    remainder='passthrough')

# model
lr = LinearRegression()

In [88]:
df.drop(columns=["size"],inplace=True)

In [89]:
df.head()

Unnamed: 0,area_type,location,total_sqft,bath,price,bhk
0,Super built-up Area,Electronic City Phase II,1056.0,2.0,39.07,2
1,Plot Area,Chikka Tirupathi,2600.0,5.0,120.0,4
2,Built-up Area,Uttarahalli,1440.0,2.0,62.0,3
3,Super built-up Area,Lingadheeranahalli,1521.0,3.0,95.0,3
4,Super built-up Area,Kothanur,1200.0,2.0,51.0,2


In [90]:
from sklearn.pipeline import make_pipeline
#pipeline

pipe = make_pipeline(columns_trans,lr)

In [91]:
pipe

The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).



In [92]:
# train test split
from sklearn.model_selection import train_test_split

X = df.drop(columns=["price"])

y = df["price"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [93]:
pipe.fit(X_train,y_train)

The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).



In [94]:
y_pred = pipe.predict(X_test)
y_pred

array([ 60.25485444, 112.34476305, 146.1885312 , ...,  65.47356622,
        41.87442415, 217.47125761])

In [95]:
y_test

Unnamed: 0,price
811,67.30
11196,72.00
5030,48.00
4974,46.00
10942,65.00
...,...
12019,50.00
12307,49.86
3447,60.00
8391,45.00


In [96]:
from sklearn.metrics import r2_score
r2_score(y_test,y_pred)

0.550995059985317

In [97]:
import pickle
pickle.dump(pipe,open("model.pkl","wb"))

In [102]:
locations = df["location"].unique()
locations
pickle.dump(locations, open("locations.pkl","wb"))

In [103]:
# carpet area
carpet_area = df["area_type"].unique()
carpet_area
pickle.dump(carpet_area, open("area_type.pkl","wb"))

In [98]:
# load the model
pipe = pickle.load(open("model.pkl","rb"))

In [99]:
# load the model
pipe = pickle.load(open("model.pkl","rb"))

# take input from user
location = input("Enter location: ")
area_type = input("Enter area type: ")
total_sqft = float(input("Enter total square feet: "))
bath = float(input("Enter number of bathrooms: "))
bhk = int(input("Enter number of bedrooms (BHK): "))

# Create a DataFrame from user input
user_input = pd.DataFrame([[location, area_type, total_sqft, bath, bhk]],
                          columns=['location', 'area_type', 'total_sqft', 'bath', 'bhk'])

# Predict the price
predicted_price = pipe.predict(user_input)

print(f"The predicted price is: {predicted_price[0]:.2f} Lakhs")

Enter location: delhi
Enter area type: carpet
Enter total square feet: 1000
Enter number of bathrooms: 2
Enter number of bedrooms (BHK): 4
The predicted price is: 68.48 Lakhs


In [None]:
import streamlit as st
import pandas as pd
import pickle

# ----------------------
# Load Model & Dropdown Data
# ----------------------
pipe = pickle.load(open("model.pkl", "rb"))
locations = pickle.load(open("locations.pkl", "rb"))
area_types = pickle.load(open("area_type.pkl", "rb"))

# ----------------------
# Streamlit App
# ----------------------
st.set_page_config(page_title="House Price Prediction", layout="centered")
st.title("🏠 House Price Prediction App")
st.write("Enter property details below to get an estimated price (in Lakhs).")

# User inputs
location = st.selectbox("Select Location", locations)
area_type = st.selectbox("Select Area Type", area_types)
total_sqft = st.number_input("Enter Total Square Feet", min_value=100.0, step=10.0)
bath = st.number_input("Enter Number of Bathrooms", min_value=1.0, step=1.0)
bhk = st.number_input("Enter Number of Bedrooms (BHK)", min_value=1, step=1)

# Prediction button
if st.button("Predict Price"):
    # Create DataFrame from inputs
    user_input = pd.DataFrame([[location, area_type, total_sqft, bath, bhk]],
                              columns=['location', 'area_type', 'total_sqft', 'bath', 'bhk'])

    # Make prediction
    predicted_price = pipe.predict(user_input)[0]

    st.success(f"💰 Estimated Price: **{predicted_price:.2f} Lakhs**")


In [106]:
import sklearn
sklearn.__version__

'1.6.1'

In [None]:
# steps
# 1. take all pkl files in one folder after training
# 2. create aap.py --> insert all code
# 3. create requirements.txt --> write all library which is needed to run project



In [None]:
import streamlit as st
import pandas as pd
import pickle

# ----------------------
# Load Model & Dropdown Data
# ----------------------
pipe = pickle.load(open("model.pkl", "rb"))
locations = pickle.load(open("locations.pkl", "rb"))
area_types = pickle.load(open("area_type.pkl", "rb"))

# ----------------------
# Streamlit App
# ----------------------
st.set_page_config(page_title="House Price Prediction", layout="centered")
st.title("🏠 House Price Prediction App")
st.write("Enter property details below to get an estimated price (in Lakhs).")

# User inputs
location = st.selectbox("Select Location", locations)
area_type = st.selectbox("Select Area Type", area_types)
total_sqft = st.number_input("Enter Total Square Feet", min_value=100.0, step=10.0)
bath = st.number_input("Enter Number of Bathrooms", min_value=1.0, step=1.0)
bhk = st.number_input("Enter Number of Bedrooms (BHK)", min_value=1, step=1)

# Prediction button
if st.button("Predict Price"):
    # Create DataFrame from inputs
    user_input = pd.DataFrame([[location, area_type, total_sqft, bath, bhk]],
                              columns=['location', 'area_type', 'total_sqft', 'bath', 'bhk'])

    # Make prediction
    predicted_price = pipe.predict(user_input)[0]

    st.success(f"💰 Estimated Price: **{predicted_price:.2f} Lakhs**")
