In [15]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


In [16]:
df = pd.read_csv('outliers_treated.csv')

In [17]:
df.isnull().sum()

Unnamed: 0                0
property_type             0
society                   1
sector                    0
price                     0
price_per_sqft            0
area                      0
areaWithType              0
bedRoom                   0
bathroom                  0
balcony                   0
floorNum                 17
facing                 1063
agePossession             0
super_built_up_area    1735
built_up_area          2042
carpet_area            1752
study room                0
servant room              0
store room                0
pooja room                0
others                    0
furnishing_type           0
luxury_score              0
outliers                  0
area_room_ratio           0
dtype: int64

In [18]:
df[df['society'].isnull()]

Unnamed: 0.1,Unnamed: 0,property_type,society,sector,price,price_per_sqft,area,areaWithType,bedRoom,bathroom,...,carpet_area,study room,servant room,store room,pooja room,others,furnishing_type,luxury_score,outliers,area_room_ratio
2580,2693,flat,,sector 78,0.6,3692.0,1625.0,Built Up area: 1625 (150.97 sq.m.),2,2,...,,0,0,0,0,0,0,0,1,812.5


In [19]:
df.drop(index=2580, inplace=True)

In [20]:
df.loc[df['agePossession'] == 'Undefined','agePossession'] = np.nan

In [21]:
df.isnull().sum()

Unnamed: 0                0
property_type             0
society                   0
sector                    0
price                     0
price_per_sqft            0
area                      0
areaWithType              0
bedRoom                   0
bathroom                  0
balcony                   0
floorNum                 16
facing                 1062
agePossession           315
super_built_up_area    1734
built_up_area          2042
carpet_area            1751
study room                0
servant room              0
store room                0
pooja room                0
others                    0
furnishing_type           0
luxury_score              0
outliers                  0
area_room_ratio           0
dtype: int64

In [22]:
x = df.drop(columns=['price','areaWithType','outliers','area_room_ratio','facing','sector','society'])
y = df['price']

In [23]:
from sklearn.model_selection import train_test_split,GridSearchCV
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)

In [24]:
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder,StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer,KNNImputer,IterativeImputer
from sklearn.ensemble import RandomForestRegressor

Imputer methods only take numerical values  
agePossession column is categorical and has missing values.  
so we have to encode first and then impute it outside of regular nested column transformer 

In [25]:
agePossession_pipeline = Pipeline([
    ('OHE',OneHotEncoder(drop='first',sparse_output=False,handle_unknown='ignore')),
    ('imputer',SimpleImputer(strategy='most_frequent'))
])


preprocessor_1 = ColumnTransformer(transformers=
    [
('agePossession_encoding',agePossession_pipeline,['agePossession']),
('OHE',OneHotEncoder(drop='first',sparse_output=False,handle_unknown='ignore'),['property_type']),
('Ordinal',OrdinalEncoder(categories=[['0','1','2','3','3+']]),['balcony']),
('simpleimputer',SimpleImputer(),['floorNum','super_built_up_area','built_up_area','carpet_area'])
    ],remainder='passthrough')


preprocessor_2 = ColumnTransformer(transformers=
    [('agePossession_encoding',agePossession_pipeline,['agePossession']),
('OHE',OneHotEncoder(drop='first',sparse_output=False,handle_unknown='ignore'),['property_type']),
('Ordinal',OrdinalEncoder(categories=[['0','1','2','3','3+']]),['balcony']),
('KNNImputer',KNNImputer(),['floorNum','super_built_up_area','built_up_area','carpet_area'])
    ],remainder='passthrough')


preprocessor_3 = ColumnTransformer(transformers=
    [('agePossession_encoding',agePossession_pipeline,['agePossession']),
('OHE',OneHotEncoder(drop='first',sparse_output=False,handle_unknown='ignore'),['property_type']),
('Ordinal',OrdinalEncoder(categories=[['0','1','2','3','3+']]),['balcony']),
('IterativeImputer',IterativeImputer(),['floorNum','super_built_up_area','built_up_area','carpet_area'])
    ],remainder='passthrough')

In [26]:
pipe = Pipeline(
    [
    ('preprocessor', None),
    ('scaling', StandardScaler()),
    ('model', RandomForestRegressor(random_state=0,n_estimators=100))
    ]
)

In [27]:
param_grid = [
    {
    'preprocessor':[preprocessor_1],
    'preprocessor__simpleimputer__strategy':['mean','median','most_frequent']
     },

    {
    'preprocessor':[preprocessor_2],
    'preprocessor__KNNImputer__n_neighbors':[5,10,20],
    'preprocessor__KNNImputer__weights':['uniform','distance']
    },

    {'preprocessor':[preprocessor_3],
     'preprocessor__IterativeImputer__max_iter':[10,20,30],
     'preprocessor__IterativeImputer__imputation_order':['ascending','descending','random'],
     'preprocessor__IterativeImputer__estimator':[RandomForestRegressor(random_state=0,n_estimators=100)]
    }
]

In [28]:
grid = GridSearchCV(pipe,param_grid=param_grid,cv=5,scoring='neg_mean_absolute_error',verbose=1)
grid.fit(x_train,y_train)

Fitting 5 folds for each of 18 candidates, totalling 90 fits


The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).



In [29]:
grid.best_params_

{'preprocessor': ColumnTransformer(remainder='passthrough',
                   transformers=[('agePossession_encoding',
                                  Pipeline(steps=[('OHE',
                                                   OneHotEncoder(drop='first',
                                                                 handle_unknown='ignore',
                                                                 sparse_output=False)),
                                                  ('imputer',
                                                   SimpleImputer(strategy='most_frequent'))]),
                                  ['agePossession']),
                                 ('OHE',
                                  OneHotEncoder(drop='first',
                                                handle_unknown='ignore',
                                                sparse_output=False),
                                  ['property_type']),
                                 ('Ordinal',
         

# Best Imputing Method is Simple Imputer (Median), Which is weird

In [30]:
y_hat = grid.predict(x_test)
from sklearn.metrics import mean_squared_error,r2_score
mse = mean_squared_error(y_test,y_hat)
r2 = r2_score(y_test,y_hat)
print('MSE:',mse)
print('R2:',r2)

MSE: 0.08154686702739718
R2: 0.9884468592069382


# Adjusted R2

In [31]:

n = x_train.shape[0]  # number of samples
p = x_train.shape[1]  # number of features
adjusted_r2 = 1 - ((1 - r2) * (n - 1)) / (n - p - 1)
print('Adjusted R2:', adjusted_r2)


Adjusted R2: 0.988371113977446
