In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# For visualization
import matplotlib.pyplot as plt

import seaborn as sns

import plotly.express as px
import plotly.graph_objs as go

# For data preprocessing
from geopy.geocoders import Nominatim

In [None]:
# Load one DataFrame per city; all six CSV files sit in the same input folder
INPUT_DIR='../input/housing-prices-in-metropolitan-areas-of-india'

df_banglore=pd.read_csv(f'{INPUT_DIR}/Bangalore.csv')
df_chennai=pd.read_csv(f'{INPUT_DIR}/Chennai.csv')
df_delhi=pd.read_csv(f'{INPUT_DIR}/Delhi.csv')
df_hyderabad=pd.read_csv(f'{INPUT_DIR}/Hyderabad.csv')
df_kolkata=pd.read_csv(f'{INPUT_DIR}/Kolkata.csv')
df_mumbai=pd.read_csv(f'{INPUT_DIR}/Mumbai.csv')

In [None]:
# Preview the first rows of one city's frame to see which columns were loaded
df_mumbai.head()

In [None]:
# Tag every per-city frame with its city label, then stack them into a single frame.
# The dict keeps the original concatenation order (insertion order).
city_frames={
    'Banglore': df_banglore,
    'Chennai': df_chennai,
    'Delhi': df_delhi,
    'Hyderabad': df_hyderabad,
    'Kolkata': df_kolkata,
    'Mumbai': df_mumbai,
}
for city_label, city_frame in city_frames.items():
    # Adds the column on the per-city frame itself, as the one-per-line assignments did
    city_frame['City']=city_label

stacked=pd.concat(list(city_frames.values()))
df=stacked.reset_index(drop=True)
df.head()

In [None]:
# Adding the latitude and longitude for the places
# Nominatim geocoder client from geopy; get_location_code() sends its lookups through it
location=Nominatim(user_agent="http")

def get_location_code(x):
    """Geocode a place name to a ``(latitude, longitude)`` pair, restricted to India.

    Parameters
    ----------
    x
        Place name to look up through the module-level Nominatim client ``location``.

    Returns
    -------
    tuple
        ``(latitude, longitude)`` of the match, or the sentinel pair
        ``("Not found", "Not found")`` when nothing matched or the lookup failed.
    """
    not_found="Not found", "Not found"
    try:
        # country_codes expects ISO 3166-1 alpha-2 codes: India is 'in' ('INR' is the currency code)
        cordinates=location.geocode(x, country_codes='in')
    except Exception:
        # Best effort: one failed request (timeout, service error) must not abort the whole
        # batch. Unlike a bare `except:`, this still lets KeyboardInterrupt stop the run.
        return not_found
    if cordinates is None:
        # geocode() returns None when the query has no match
        return not_found
    return cordinates.latitude, cordinates.longitude

In [None]:
# Geocode each distinct location only once (instead of once per row) to speed up execution.
# This can take a while -- it took the original author about 15 minutes -- so save the result.
unique_location=df['Location'].unique()
location_to_code={place: get_location_code(place) for place in unique_location}

In [None]:
# Look up each row's (latitude, longitude) pair and split the pairs into two new columns
coordinate_pairs=df['Location'].map(location_to_code)
latitudes, longitudes=zip(*coordinate_pairs)
df['Latitude']=latitudes
df['Longitude']=longitudes

In [None]:
# Keep only the rows whose latitude is not the 'Not found' sentinel
was_geocoded=df['Latitude']!='Not found'
df=df[was_geocoded].reset_index(drop=True)

#### For some houses, nothing was mentioned about certain amenities; '9' was used to mark such values, indicating that the information about the apartment is absent. We will drop the rows that contain these values.

In [None]:
# 9 is the "not mentioned" marker for amenity flags only. Limit the replacement to those
# columns so a genuine 9 elsewhere (e.g. a 9-bedroom home) is not turned into NaN and dropped.
# NOTE(review): assumes every column other than the ones listed here is an amenity flag --
# confirm against the dataset's column list.
non_amenity_cols=['Price', 'Area', 'Location', 'No. of Bedrooms', 'City', 'Latitude', 'Longitude']
amenity_cols=df.columns.difference(non_amenity_cols)
df[amenity_cols]=df[amenity_cols].replace(9, np.nan)

# Drop every row that still has a missing value (plain reassignment instead of inplace=True)
df=df.dropna()

In [None]:
# Summary of the cleaned frame: column dtypes and non-null counts
df.info()

In [None]:
# Express prices in lakhs (1 lakh = 10**5 rupees).
# Note: this rescales 'Price' in place, so running the cell twice divides twice.
RUPEES_PER_LAKH=10**5
df['Price']=df['Price'].div(RUPEES_PER_LAKH)

In [None]:
# Persist the cleaned frame (without the index) because it takes a long time to rebuild
output_csv='Indian House Prices.csv'
df.to_csv(output_csv, index=False)

## EDA

In [None]:
# Load the previously saved, combined data set for the EDA section
file_path='../input/indian-house-price-combined/Indian House Prices.csv'
# index_col=False: do not use the first CSV column as the index
df=pd.read_csv(filepath_or_buffer=file_path, index_col=False)

In [None]:
# Drop a leftover index column from an earlier DataFrame.to_csv(), if the file has one
# (read_csv names such a header-less column 'Unnamed: 0').
# Selecting by name rather than by position matters: a file written with index=False has
# no such column, and df.iloc[:,1:] would then silently discard the first real data column.
df=df.loc[:, ~df.columns.str.startswith('Unnamed')]

In [None]:
def plot_city_variation(col_name, title):
    """Plot the per-city spread of one column of the global ``df`` with a jittered catplot.

    col_name -- column of ``df`` drawn on the y-axis; also used as the y-axis label.
    title    -- figure-level title.
    """
    sns.set_style("whitegrid")  # background grid style
    grid=sns.catplot(data=df, x='City', y=col_name, jitter=0.3, height=6, aspect=2)

    # Cosmetics: figure title and axis labels
    grid.fig.suptitle(title, fontsize=20)
    grid.set_xlabels('Cities', fontsize=15)
    grid.set_ylabels(col_name, fontsize=15)

### Variation of House Prices

In [None]:
# 'Price' values of every listing, grouped by city
plot_city_variation('Price', 'House price variation in various cities')

### Variation of House Areas

In [None]:
# 'Area' values of every listing, grouped by city
plot_city_variation('Area', 'House area variation in various cities')

### Number of bedrooms compared to price

In [None]:
def cat_plot(col_name, title):
    """Draw a bar catplot of 'Price' against one column of the global ``df``, split by city.

    col_name -- column of ``df`` used for the x-axis categories; also the x-axis label.
    title    -- figure-level title.
    """
    sns.set_style("whitegrid")  # background grid style
    grid=sns.catplot(data=df, x=col_name, y='Price', hue='City', kind='bar', height=6, aspect=2)

    # Cosmetics: figure title and axis labels
    grid.fig.suptitle(title, fontsize=20)
    grid.set_xlabels(col_name, fontsize=15)
    grid.set_ylabels('Price', fontsize=15)

In [None]:
# Price per bedroom count, one bar per city
cat_plot('No. of Bedrooms','Number of bedrooms compared to price')

#### The black line on each bar is an error bar showing the uncertainty of the mean: by default seaborn draws a 95% bootstrap confidence interval, so the bottom and top of the line are the lower and upper bounds of that interval (not the minimum and maximum values). The height of the bar is the mean value.

In [None]:
# Mean of every numeric column for each (city, bedroom count) pair.
# The first positional parameter of GroupBy.mean() is numeric_only, not a column selector:
# mean('Price') only worked because a non-empty string is truthy. Spell out what is meant.
df_price=df.groupby(['City','No. of Bedrooms']).mean(numeric_only=True).reset_index()
df_price.head()

In [None]:
# Mean price per bedroom count, coloured by city; barmode='relative' stacks the cities' bars
chart_title={
    'text': 'Average house price variation across Cities',
    'xanchor': 'center',
    'yanchor': 'top',
    'x': 0.5,
}
fig=px.bar(df_price, x='No. of Bedrooms', y='Price', color='City', barmode='relative')
fig.update_layout(title=chart_title)

In [None]:
# Mean of every numeric column per city.
# The first positional parameter of GroupBy.mean() is numeric_only, not a column selector:
# mean('Price') only worked because a non-empty string is truthy. Spell out what is meant.
df2=df.groupby(['City']).mean(numeric_only=True).reset_index()
df2

In [None]:
# Pie chart with one slice per city, sized by that city's mean price
pie_title={'text': 'House price across Cities', 'xanchor': 'center', 'yanchor': 'top', 'x': 0.5}
fig=px.pie(df2, names='City', values='Price')
fig.update_layout(title=pie_title)
fig.show()

In [None]:
# One point per listing: area against price, coloured by city, location shown on hover
scatter_title={'text': 'House price Vs. Area across Cities', 'xanchor': 'center', 'yanchor': 'top', 'x': 0.5}
fig=px.scatter(df, x='Area', y='Price', color='City', hover_name='Location')
fig.update_layout(title=scatter_title)
fig.show()

In [None]:
# Mean of every numeric column per (location, city), drawn as a City -> Location sunburst
# whose sectors are sized by mean price.
# The first positional parameter of GroupBy.mean() is numeric_only, not a column selector:
# mean(['Price']) only worked because a non-empty list is truthy. Spell out what is meant.
df3=df.groupby(['Location','City']).mean(numeric_only=True).reset_index()
fig=px.sunburst(data_frame=df3, path=['City','Location'], values='Price')
fig

In [None]:
# Mean of every numeric column per (location, city), drawn as an India -> City -> Location
# treemap whose tiles are sized by mean price.
# The first positional parameter of GroupBy.mean() is numeric_only, not a column selector:
# mean(['Price']) only worked because a non-empty list is truthy. Spell out what is meant.
df4=df.groupby(['Location','City']).mean(numeric_only=True).reset_index()
df4['India']='India'  # constant column that serves as the single root of the treemap path
fig=px.treemap(data_frame=df4, path=['India','City','Location'], values='Price')
fig

In [None]:
import folium
from folium import Choropleth, Circle, Marker
from folium.plugins import HeatMap, MarkerCluster

In [None]:
# Interactive map, initially centred on the hard-coded coordinates below
city_map = folium.Map(location=[19.08,72.74], zoom_start=10, tiles='cartodbpositron')

# One marker per row of df, grouped into clusters
mc=MarkerCluster()
for lat, lon in zip(df['Latitude'], df['Longitude']):
    mc.add_child(Marker((lat, lon)))

city_map.add_child(mc)
city_map

## If you like, don't forget to upvote it!