In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestRegressor

In [3]:
df=pd.read_csv("bengaluru_house_prices.csv")
scalar = MinMaxScaler()
model = RandomForestRegressor()

In [4]:
df

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,Coomee,1056,2.0,1.0,39.07
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,Theanmp,2600,5.0,3.0,120.00
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,,1440,2.0,3.0,62.00
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,Soiewre,1521,3.0,1.0,95.00
4,Super built-up Area,Ready To Move,Kothanur,2 BHK,,1200,2.0,1.0,51.00
...,...,...,...,...,...,...,...,...,...
13315,Built-up Area,Ready To Move,Whitefield,5 Bedroom,ArsiaEx,3453,4.0,0.0,231.00
13316,Super built-up Area,Ready To Move,Richards Town,4 BHK,,3600,5.0,,400.00
13317,Built-up Area,Ready To Move,Raja Rajeshwari Nagar,2 BHK,Mahla T,1141,2.0,1.0,60.00
13318,Super built-up Area,18-Jun,Padmanabhanagar,4 BHK,SollyCl,4689,4.0,1.0,488.00


In [38]:
df['location']=df['location'].str.strip()

In [39]:
df['location']=df['location'].replace(r'[^A-Za-z0-9 ]','',regex=True)

In [40]:
df['location']=df['location'].str.capitalize()

In [43]:
list(df['location'].unique())

['Electronic city phase ii',
 'Uttarahalli',
 'Lingadheeranahalli',
 'Kothanur',
 'Whitefield',
 'Marathahalli',
 '7th phase jp nagar',
 'Gottigere',
 'Sarjapur',
 'Mysore road',
 'Bisuvanahalli',
 'Raja rajeshwari nagar',
 'Manayata tech park',
 'Kengeri',
 'Binny pete',
 'Bellandur',
 'Thanisandra',
 'Mangammanapalya',
 'Electronic city',
 'Ramagondanahalli',
 'Hebbal',
 'Kanakpura road',
 'Electronics city phase 1',
 'Kundalahalli',
 'Chikkalasandra',
 'Sarjapur  road',
 'Ganga nagar',
 'Yelahanka',
 'Doddathoguru',
 'Himagiri meadows',
 'Adarsh nagar',
 'Bhoganhalli',
 'Lakshminarayana pura',
 'Begur road',
 'Govindaraja nagar ward',
 'Varthur',
 'Gunjur',
 'Hegde nagar',
 'Haralur road',
 'Hennur road',
 'Kothannur',
 'Kalena agrahara',
 'Kodanda reddy layout',
 'Garudachar palya',
 'Dasanapura',
 'Kasavanhalli',
 'Sanjay nagar',
 'Mysore highway',
 'Domlur',
 'Sarjapura  attibele road',
 'Devasthanagalu',
 'T dasarahalli',
 'Yeshwanthpur',
 'Chandapura',
 'Green view layout',
 'S

In [None]:
df['society'].nunique()

In [6]:
df.shape

(13320, 9)

In [7]:
df.describe()

Unnamed: 0,bath,balcony,price
count,13247.0,12711.0,13320.0
mean,2.69261,1.584376,112.565627
std,1.341458,0.817263,148.971674
min,1.0,0.0,8.0
25%,2.0,1.0,50.0
50%,2.0,2.0,72.0
75%,3.0,2.0,120.0
max,40.0,3.0,3600.0


In [8]:
df.dtypes

area_type        object
availability     object
location         object
size             object
society          object
total_sqft       object
bath            float64
balcony         float64
price           float64
dtype: object

In [9]:
df.isna().sum()

area_type          0
availability       0
location           1
size              16
society         5502
total_sqft         0
bath              73
balcony          609
price              0
dtype: int64

In [10]:
df.drop(columns="society",inplace=True)

In [11]:
df.isna().sum()

area_type         0
availability      0
location          1
size             16
total_sqft        0
bath             73
balcony         609
price             0
dtype: int64

In [12]:
df.dropna(inplace=True)

In [13]:
df.duplicated()
df.drop_duplicates(inplace=True)

In [14]:
df.shape

(12146, 8)

In [15]:
def process_sqft(value):
    value_str=str(value)
    if '-' in value_str:
        try:
            lower_str,upper_str=[part.strip() for part in value_str.split('-')]
            lower_value=float(lower_str)
            upper_value=float(upper_str)
            median_value=(lower_value+upper_value)/2
            return median_value
        except ValueError:
            return np.nan
    else:
        try:
            return float(value_str)
        except ValueError:
            return np.nan

In [16]:
def process_size(value):
    value_str=str(value)
    try:
        lower_val,_=[part.strip() for part in value_str.split(' ')]
        return float(lower_val)
    except ValueError:
        return np.nan

In [17]:
# Function to detect outliers using IQR
def detect_outliers_iqr(data, column):
    Q1 = data[column].quantile(0.25)
    Q3 = data[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    outliers = data[(data[column] < lower_bound) | (data[column] > upper_bound)]
    return outliers

In [None]:
num_cols=[]
for i in df:
    if df[i].dtypes==float:
        num_cols.append(i)
for col in num_cols:
    outliers = detect_outliers_iqr(df, col)
    print(f"Feature: {col} → Outliers detected: {len(outliers)}")

Feature: bath → Outliers detected: 831
Feature: balcony → Outliers detected: 0
Feature: price → Outliers detected: 1150


In [21]:
# Define a function to remove outliers using IQR
def remove_outliers(ln, col):
    Q1 = ln[col].quantile(0.25)  # 25th percentile
    Q3 = ln[col].quantile(0.75)  # 75th percentile
    IQR = Q3 - Q1  # Interquartile range
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    return ln[(ln[col] >= lower_bound) & (ln[col] <= upper_bound)]

In [22]:
for col in num_cols:
    df=remove_outliers(df,col)
print("Outliers removed")

Outliers removed


In [24]:
df['total_sqft']=df['total_sqft'].apply(process_sqft)
df['size']=df['size'].apply(process_size)

In [27]:
#Done to correct the data logically

df['price_per_sqft'] = (df['price'] * 100000) / df['total_sqft']
df = remove_outliers(df,'price_per_sqft')

df = df[df['bath'] <= (df['size'] + 2)]
df = df[~(df['total_sqft'] / df['size'] < 300)]


In [28]:
df.shape

(9651, 9)

In [None]:
#Onehot encoding
df_n = pd.get_dummies(df, drop_first=True, columns=['area_type', 'location'])
#Assigning
x = df_n.drop(['price', 'availability'], axis='columns')
y = df_n['price']
#Scaling
x_numeric = x.select_dtypes(include=[np.number])
X_scaled = scalar.fit_transform(x_numeric)
#Train test split
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.3, random_state=100)
#Model training
model.fit(X_train, y_train)
cross_val_sc = cross_val_score(RandomForestRegressor(), X_scaled, y, cv=10)


In [30]:
def predict_price(location, sqft, bath, size):
    input_features = {col: 0 for col in x.columns}

    if 'total_sqft' in input_features:
        input_features['total_sqft'] = sqft
    if 'bath' in input_features:
        input_features['bath'] = bath
    if 'size' in input_features:
        input_features['size'] = size

    loc_col = f'location_{location}'
    if loc_col not in input_features:
        print(f"Warning: Location '{location}' not found. Using default 'other' category.")
        loc_col = 'location_other'
        if loc_col not in input_features:
            raise ValueError("Neither the input location nor 'other' are in the training features.")
    input_features[loc_col] = 1

    input_df = pd.DataFrame([input_features])
    input_numeric = input_df[x_numeric.columns]

    input_scaled = scalar.transform(input_numeric)

    return model.predict(input_scaled)[0]



In [31]:
predicted_val = predict_price('9th Phase JP Nagar', 2000, 2, 2)
print(f"Predicted price: {predicted_val:.4f}")

print(f"Cross Validation Score : {cross_val_sc.mean()}")
print(f"Stand alone score : {model.score(X_test,y_test)}")

Predicted price: 46.1422
Cross Validation Score : 0.99837287668402
Stand alone score : 0.9977901013451252


In [32]:
feature_list = x_numeric.columns.tolist()

dict1={'model': model, 'scaler': scalar, 'columns': feature_list}

In [33]:
feature_list

['size', 'total_sqft', 'bath', 'balcony', 'price_per_sqft']

In [34]:
import pickle
with open("house_prices_model.pkl",'wb') as obj1:
  pickle.dump(dict1,obj1)

In [35]:
with open("house_prices_model.pkl",'rb') as obj2:
  var1=pickle.load(obj2)