In [26]:
import pandas as pd

from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

import warnings
warnings.filterwarnings('ignore')

Data Upload

In [2]:
df = pd.read_csv(r"C:\Users\Saidabrorkhon\Downloads\archive 2\Indian_housing_Delhi_data.csv")

Data exploration

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 16 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   house_type        5000 non-null   object 
 1   house_size        5000 non-null   object 
 2   location          5000 non-null   object 
 3   city              5000 non-null   object 
 4   latitude          5000 non-null   float64
 5   longitude         5000 non-null   float64
 6   price             5000 non-null   int64  
 7   currency          5000 non-null   object 
 8   numBathrooms      4975 non-null   float64
 9   numBalconies      2737 non-null   float64
 10  isNegotiable      179 non-null    object 
 11  priceSqFt         0 non-null      float64
 12  verificationDate  5000 non-null   object 
 13  description       4715 non-null   object 
 14  SecurityDeposit   5000 non-null   object 
 15  Status            5000 non-null   object 
dtypes: float64(5), int64(1), object(10)
memory

In [5]:
df.head()

Unnamed: 0,house_type,house_size,location,city,latitude,longitude,price,currency,numBathrooms,numBalconies,isNegotiable,priceSqFt,verificationDate,description,SecurityDeposit,Status
0,1 RK Studio Apartment,400 sq ft,Kalkaji,Delhi,28.545561,77.254349,22000,INR,1.0,,,,Posted a day ago,"Fully furnished, loaded with amenities & gadge...",No Deposit,Furnished
1,1 RK Studio Apartment,400 sq ft,Mansarover Garden,Delhi,28.643259,77.132828,20000,INR,1.0,,,,Posted 9 days ago,Here is an excellent 1 BHK Independent Floor a...,No Deposit,Furnished
2,2 BHK Independent Floor,500 sq ft,Uttam Nagar,Delhi,28.618677,77.053352,8500,INR,1.0,,,,Posted 12 days ago,"Zero Brokerage.\n\n2 Room set, Govt bijali Met...",No Deposit,Semi-Furnished
3,3 BHK Independent House,"1,020 sq ft",Model Town,Delhi,28.712898,77.18,48000,INR,3.0,,,,Posted a year ago,Itâs a 3 bhk independent house situated in M...,No Deposit,Furnished
4,2 BHK Apartment,810 sq ft,Sector 13 Rohini,Delhi,28.723539,77.131424,20000,INR,2.0,,,,Posted a year ago,Well designed 2 bhk multistorey apartment is a...,No Deposit,Unfurnished


In [6]:
df.isnull().sum()

house_type             0
house_size             0
location               0
city                   0
latitude               0
longitude              0
price                  0
currency               0
numBathrooms          25
numBalconies        2263
isNegotiable        4821
priceSqFt           5000
verificationDate       0
description          285
SecurityDeposit        0
Status                 0
dtype: int64

Feature engineering

In [10]:
# creating a column called price per square feet before encoding and scaling since there is no missing values of price and house size columns

In [9]:
df['house_size'].nunique

<bound method IndexOpsMixin.nunique of 0         400 sq ft
1         400 sq ft
2         500 sq ft
3       1,020 sq ft
4         810 sq ft
           ...     
4995    5,896 sq ft
4996    6,521 sq ft
4997    1,855 sq ft
4998    2,856 sq ft
4999    2,856 sq ft
Name: house_size, Length: 5000, dtype: object>

In [3]:
df['house_size'] = (
    df['house_size'].str.replace('sq ft', '', regex=False).str.replace(',', '', regex=False).astype(float)                          
)


In [13]:
df['house_size'].nunique

<bound method IndexOpsMixin.nunique of 0        400.0
1        400.0
2        500.0
3       1020.0
4        810.0
         ...  
4995    5896.0
4996    6521.0
4997    1855.0
4998    2856.0
4999    2856.0
Name: house_size, Length: 5000, dtype: float64>

In [4]:
df['priceSqFt'] = df['price'] / df['house_size']

In [6]:
for col in df.columns:
  if df[col].dtype == 'object':
    df[col].fillna(df[col].mode()[0], inplace=True)
  else:
    df[col].fillna(df[col].mean(),inplace=True)

In [7]:
df.isnull().sum()

house_type          0
house_size          0
location            0
city                0
latitude            0
longitude           0
price               0
currency            0
numBathrooms        0
numBalconies        0
isNegotiable        0
priceSqFt           0
verificationDate    0
description         0
SecurityDeposit     0
Status              0
dtype: int64

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 16 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   house_type        5000 non-null   object 
 1   house_size        5000 non-null   float64
 2   location          5000 non-null   object 
 3   city              5000 non-null   object 
 4   latitude          5000 non-null   float64
 5   longitude         5000 non-null   float64
 6   price             5000 non-null   int64  
 7   currency          5000 non-null   object 
 8   numBathrooms      5000 non-null   float64
 9   numBalconies      5000 non-null   float64
 10  isNegotiable      5000 non-null   object 
 11  priceSqFt         5000 non-null   float64
 12  verificationDate  5000 non-null   object 
 13  description       5000 non-null   object 
 14  SecurityDeposit   5000 non-null   object 
 15  Status            5000 non-null   object 
dtypes: float64(6), int64(1), object(9)
memory 

In [32]:
num_col = df.select_dtypes(include='number').columns.drop('price')
cat_col = df.select_dtypes(exclude='number').columns

In [13]:
le = LabelEncoder()
for col in cat_col:
  df[col] = le.fit_transform(df[col])

In [33]:
scaler = StandardScaler()
df[num_col] = scaler.fit_transform(df[num_col])

In [15]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 16 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   house_type        5000 non-null   float64
 1   house_size        5000 non-null   float64
 2   location          5000 non-null   float64
 3   city              5000 non-null   float64
 4   latitude          5000 non-null   float64
 5   longitude         5000 non-null   float64
 6   price             5000 non-null   float64
 7   currency          5000 non-null   float64
 8   numBathrooms      5000 non-null   float64
 9   numBalconies      5000 non-null   float64
 10  isNegotiable      5000 non-null   float64
 11  priceSqFt         5000 non-null   float64
 12  verificationDate  5000 non-null   float64
 13  description       5000 non-null   float64
 14  SecurityDeposit   5000 non-null   float64
 15  Status            5000 non-null   float64
dtypes: float64(16)
memory usage: 625.1 KB


In [34]:
x = df.drop(columns=['price'])
y = df['price']

x_train, x_temp, y_train, y_temp = train_test_split(x,y,test_size=0.2, random_state=42)
x_test, x_val, y_test, y_val = train_test_split(x_temp, y_temp, test_size=0.5, random_state=42)

In [23]:
x_train.shape, x_val.shape

((4000, 15), (500, 15))

In [24]:
y_train.shape, y_val.shape


((4000,), (500,))

In [25]:
model = RandomForestRegressor(random_state=42)

In [35]:
numerical_features = Pipeline(steps=[
  ('imputer', SimpleImputer(strategy='mean')),
  ('scaler', StandardScaler())
])

categorical_features = Pipeline(steps=[
  ('imputer', SimpleImputer(strategy='most_frequent')),
  ('encoder', OneHotEncoder(handle_unknown='ignore'))
])

In [36]:
preprocessor = ColumnTransformer(
  transformers=[
    ('num', numerical_features, num_col),
    ('cat', categorical_features, cat_col)
  ]
)

In [37]:
pipeline = Pipeline(steps=[
  ('preprocessor', preprocessor),
  ('model', model)
])

In [38]:
pipeline

In [39]:
pipeline.fit(x_train, y_train)

In [40]:
y_pred = pipeline.predict(x_val)

In [41]:
mse = mean_squared_error(y_val, y_pred)
r2 = r2_score(y_val, y_pred)
print(mse)
print(r2)

0.005854031296281585
0.9935931971240318


In [42]:
from tabulate import tabulate

In [None]:
headers = ['Model','MSE', 'R2 Score']
table = [['RandomForestRegressor', mse, r2]]
print(tabulate(table, headers=headers, tablefmt='grid'))

In [None]:
def get_change(a, b):
  d = list(a.values())
  d[0] = b
  a = 


In [28]:
def get_change(a, b):
 
  if not isinstance(a, dict):
    return

  if not a: 
    return {}

  items = list(a.items()) 
  first_key = items[0][0] 

  new_dict = a.copy()  
  new_dict[first_key] = b  

  return new_dict

my_dict = {'f': 0, 'g': 1, 'h': 2}
get_change(my_dict, 15)

{'f': 15, 'g': 1, 'h': 2}

In [33]:
my_dict = {'age':20, 'money': 1000}
d = next(iter(my_dict))
my_dict[d] = 21
my_dict

{'age': 21, 'money': 1000}

In [41]:
def get_change(dictionary, new_value):
  first_key = next(iter(dictionary))
  dictionary[first_key] = new_value
  return dictionary

In [42]:
status = {'age': 20, 'money': 2000}
status

{'age': 20, 'money': 2000}