In [13]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

In [14]:
df = pd.read_csv('gurgaon_properties_missing_value_imputation.csv')

In [15]:
df.shape

(3554, 18)

In [16]:
df.head()

Unnamed: 0,property_type,society,sector,price,price_per_sqft,bedRoom,bathroom,balcony,floorNum,agePossession,built_up_area,study room,servant room,store room,pooja room,others,furnishing_type,luxury_score
0,flat,signature global park 4,sector 36,0.82,7586.0,3.0,2,2,2.0,New Property,850.0,0,0,0,0,0,2,8
1,flat,smart world gems,sector 89,0.95,8597.0,2.0,2,2,4.0,New Property,1226.0,1,1,0,0,0,2,38
2,flat,breez global hill view,sohna road,0.32,5470.0,2.0,2,1,17.0,New Property,1000.0,0,0,0,0,0,2,49
3,flat,bestech park view sanskruti,sector 92,1.6,8020.0,3.0,4,3+,10.0,Relatively New,1615.0,0,1,0,0,1,0,174
4,flat,suncity avenue,sector 102,0.48,9023.0,2.0,2,1,5.0,Relatively New,582.0,0,0,1,0,0,2,159


In [17]:
latlong = pd.read_csv('latlong.csv')

In [18]:
latlong

Unnamed: 0,sector,coordinates
0,sector 1,"28.3663° N, 76.9456° E"
1,sector 2,"28.5095° N, 77.0320° E"
2,sector 3,"28.4909° N, 77.0176° E"
3,sector 4,"28.4738° N, 77.0107° E"
4,sector 5,"28.4794° N, 77.0176° E"
...,...,...
124,sector 113,"28.5287° N, 77.0233° E"
125,sector 114,"28.5334° N, 77.0118° E"
126,sector 115,"28.5385° N, 77.0061° E"
127,gwal pahari,"28.4484° N, 77.0210° E"


In [19]:
latlong['latitude'] = latlong['coordinates'].str.split(',').str.get(0).str.split('°').str.get(0).astype('float')

In [20]:
latlong['longitude'] = latlong['coordinates'].str.split(',').str.get(1).str.split('°').str.get(0).astype('float')

In [21]:
latlong.head()

Unnamed: 0,sector,coordinates,latitude,longitude
0,sector 1,"28.3663° N, 76.9456° E",28.3663,76.9456
1,sector 2,"28.5095° N, 77.0320° E",28.5095,77.032
2,sector 3,"28.4909° N, 77.0176° E",28.4909,77.0176
3,sector 4,"28.4738° N, 77.0107° E",28.4738,77.0107
4,sector 5,"28.4794° N, 77.0176° E",28.4794,77.0176


In [22]:
new_df = df.merge(latlong, on='sector')

In [23]:
new_df.columns

Index(['property_type', 'society', 'sector', 'price', 'price_per_sqft',
       'bedRoom', 'bathroom', 'balcony', 'floorNum', 'agePossession',
       'built_up_area', 'study room', 'servant room', 'store room',
       'pooja room', 'others', 'furnishing_type', 'luxury_score',
       'coordinates', 'latitude', 'longitude'],
      dtype='object')

In [28]:
new_df.head()

Unnamed: 0,property_type,society,sector,price,price_per_sqft,bedRoom,bathroom,balcony,floorNum,agePossession,...,study room,servant room,store room,pooja room,others,furnishing_type,luxury_score,coordinates,latitude,longitude
0,flat,signature global park 4,sector 36,0.82,7586.0,3.0,2,2,2.0,New Property,...,0,0,0,0,0,2,8,"28.4160° N, 76.9914° E",28.416,76.9914
1,flat,signature global park 4,sector 36,1.0,9901.0,3.0,2,3,2.0,New Property,...,0,0,0,0,0,2,128,"28.4160° N, 76.9914° E",28.416,76.9914
2,flat,signature global park 4,sector 36,0.72,6660.0,3.0,2,3,3.0,Under Construction,...,0,0,0,0,0,2,0,"28.4160° N, 76.9914° E",28.416,76.9914
3,house,independent,sector 36,0.66,7166.0,2.0,2,3,4.0,New Property,...,0,0,0,0,0,2,0,"28.4160° N, 76.9914° E",28.416,76.9914
4,house,independent,sector 36,0.75,6938.0,3.0,1,0,1.0,New Property,...,0,0,0,0,0,2,0,"28.4160° N, 76.9914° E",28.416,76.9914


In [25]:
new_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3293 entries, 0 to 3292
Data columns (total 21 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   property_type    3293 non-null   object 
 1   society          3293 non-null   object 
 2   sector           3293 non-null   object 
 3   price            3293 non-null   float64
 4   price_per_sqft   3293 non-null   float64
 5   bedRoom          3293 non-null   float64
 6   bathroom         3293 non-null   int64  
 7   balcony          3293 non-null   object 
 8   floorNum         3293 non-null   float64
 9   agePossession    3293 non-null   object 
 10  built_up_area    3293 non-null   float64
 11  study room       3293 non-null   int64  
 12  servant room     3293 non-null   int64  
 13  store room       3293 non-null   int64  
 14  pooja room       3293 non-null   int64  
 15  others           3293 non-null   int64  
 16  furnishing_type  3293 non-null   int64  
 17  luxury_score  

In [27]:
non_numeric_values = new_df[~new_df['sector'].apply(lambda x: x.isnumeric())]
print(non_numeric_values)


     property_type                  society      sector  price  \
0             flat  signature global park 4   sector 36   0.82   
1             flat  signature global park 4   sector 36   1.00   
2             flat  signature global park 4   sector 36   0.72   
3            house              independent   sector 36   0.66   
4            house              independent   sector 36   0.75   
...            ...                      ...         ...    ...   
3288          flat        vatika xpressions  sector 88b   0.92   
3289          flat        vatika xpressions  sector 88b   0.92   
3290         house              independent  sector 17a   1.52   
3291         house              independent   sector 27   4.25   
3292         house              independent   sector 27   8.00   

      price_per_sqft  bedRoom  bathroom balcony  floorNum       agePossession  \
0             7586.0      3.0         2       2       2.0        New Property   
1             9901.0      3.0         2      

In [24]:
group_df = new_df.groupby('sector').mean()[['price','price_per_sqft','built_up_area','latitude','longitude']]

TypeError: Could not convert flatflatflatflatflatflatflatflatflatflatflatflatflatflatflatflatflatflat to numeric