In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _subfolders, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        full_path = os.path.join(folder, file_name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from geopy.geocoders import Nominatim

In [None]:
# geopy provides the Nominatim geocoder class (geopy.geocoders.Nominatim); no separate
# "Nominatim" package is needed. %pip installs into the running kernel's environment.
# Run this cell before the imports cell if geopy is not already installed.
%pip install -q geopy

In [None]:
# Read the raw per-city listing files (Mumbai, Delhi, Chennai, Hyderabad) in one pass.
r_Mumbai_df, r_Delhi_df, r_Chennai_df, r_Hyderabad_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/housing-prices-in-metropolitan-areas-of-india/Mumbai.csv',
        '../input/housing-prices-in-metropolitan-areas-of-india/Delhi.csv',
        '../input/housing-prices-in-metropolitan-areas-of-india/Chennai.csv',
        '../input/housing-prices-in-metropolitan-areas-of-india/Hyderabad.csv',
    )
)

In [None]:
# Build working frames in which every cell equal to 9 becomes NaN; the raw frames are untouched
# because DataFrame.replace returns a new object.
# NOTE(review): the replacement applies to every column, not only the amenity flags - confirm
# that a literal 9 can never be a legitimate value in the other columns.
df1 = r_Mumbai_df.replace(9, np.nan)
df2 = r_Delhi_df.replace(9, np.nan)
df3 = r_Chennai_df.replace(9, np.nan)
df4 = r_Hyderabad_df.replace(9, np.nan)

In [None]:
# Discard every row that has at least one NaN, for all four cities.
df1, df2, df3, df4 = (city_frame.dropna() for city_frame in (df1, df2, df3, df4))

In [None]:
# Columns that are summed into 'NumOfAmentities' and then dropped from each city frame.
# NOTE(review): "Resale" is included here and is therefore counted as an amenity - confirm intent.
amentities = [
    "Resale",
    "VaastuCompliant",
    "Wardrobe",
    "Refrigerator",
    "Sofa",
    "DiningTable",
    "TV",
    "GolfCourse",
    "Microwave",
    "BED",
    "LiftAvailable",
    "Children'splayarea",
    "Wifi",
    "AC",
    "Gasconnection",
    "WashingMachine",
    "Hospital",
    "MultipurposeRoom",
    "Cafeteria",
    "StaffQuarter",
    "CarParking",
    "PowerBackup",
    "24X7Security",
    "School",
    "ClubHouse",
    "ATM",
    "SportsFacility",
    "Intercom",
    "ShoppingMall",
    "IndoorGames",
    "RainWaterHarvesting",
    "JoggingTrack",
    "LandscapedGardens",
    "SwimmingPool",
    "Gymnasium",
    "MaintenanceStaff",
]

In [None]:
# Add a per-listing total: the row-wise sum over the amenity columns.
# (presumably each amenity column is a 0/1 flag, making this a count - confirm)
for city_frame in (df1, df2, df3, df4):
    amenity_total = city_frame[amentities].sum(axis=1)
    city_frame['NumOfAmentities'] = amenity_total

In [None]:
# The individual amenity columns are now summarised by 'NumOfAmentities'; remove them.
df1 = df1.drop(columns=amentities)
df2 = df2.drop(columns=amentities)
df3 = df3.drop(columns=amentities)
df4 = df4.drop(columns=amentities)

In [None]:
# Express Price in units of 100000 (presumably lakhs of rupees - confirm against the data source).
for city_frame in (df1, df2, df3, df4):
    city_frame['Price'] = city_frame['Price'].div(100000)

In [None]:
#determine the cutoff for outliers
def iqr_fence(x):
    """Return the extreme non-outlier values of ``x`` under the 1.5 * IQR rule.

    The fences are ``Q1 - 1.5 * IQR`` and ``Q3 + 1.5 * IQR``. A value lying exactly
    on a fence is treated as a non-outlier, so a constant series (IQR of 0) yields
    its own value instead of raising on an empty selection.

    Parameters
    ----------
    x : pandas.Series
        Numeric values to analyse.

    Returns
    -------
    list
        ``[u, l]`` where ``u`` is the largest value not above the upper fence and
        ``l`` is the smallest value not below the lower fence (upper first).
        Both are NaN when ``x`` has no usable values.
    """
    Q1 = x.quantile(0.25)
    Q3 = x.quantile(0.75)
    IQR = Q3 - Q1
    Lower_Fence = Q1 - (1.5 * IQR)
    Upper_Fence = Q3 + (1.5 * IQR)
    # Series.max()/min() are vectorised and return NaN on an empty selection,
    # whereas the builtin max()/min() raise ValueError.
    u = x[x <= Upper_Fence].max()
    l = x[x >= Lower_Fence].min()
    return [u, l]

In [None]:
# Removing outliers: keep rows whose Price does not exceed the upper cutoff.
# iqr_fence(...)[0] is itself the largest Price below the upper IQR fence, so the
# comparison must be inclusive (<=) or those legitimate rows would be dropped too.
# Only the upper cutoff is applied; the lower cutoff (index 1) is not used.
df1 = df1[df1['Price'] <= iqr_fence(df1['Price'])[0]]
df2 = df2[df2['Price'] <= iqr_fence(df2['Price'])[0]]
df3 = df3[df3['Price'] <= iqr_fence(df3['Price'])[0]]
df4 = df4[df4['Price'] <= iqr_fence(df4['Price'])[0]]

In [None]:
# Module-level Nominatim geocoder used by geogeneration() below.
# NOTE(review): "testing" is a generic user_agent; Nominatim's usage policy asks for an
# application-specific identifier - confirm before running large geocoding jobs.
geolocator = Nominatim(user_agent="testing")

def geogeneration(df):
    """Add 'Latitude' and 'Longitude' columns to ``df`` by geocoding ``df['Location']``.

    Each distinct location value is sent to the module-level ``geolocator`` once and
    the answer is reused for repeated values, which avoids redundant network requests.
    Locations the geocoder cannot resolve (``geocode`` returns ``None``) get NaN in
    both columns, so the columns stay numeric and ``dropna()`` can remove those rows.
    Errors raised by the geocoder itself are not caught and propagate to the caller.

    A running row counter is printed every 100 rows as a progress indicator.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'Location' column. It is modified in place.

    Returns
    -------
    None
    """
    resolved = {}  # location value -> (latitude, longitude)
    lat = []
    long = []
    for t, place in enumerate(df['Location']):
        if place not in resolved:
            location = geolocator.geocode(place, timeout=None)
            if location is None:
                # No match for this location: record missing coordinates.
                resolved[place] = (np.nan, np.nan)
            else:
                resolved[place] = (location.latitude, location.longitude)
        if t % 100 == 0:
            print(t)
        place_lat, place_long = resolved[place]
        lat.append(place_lat)
        long.append(place_long)
    df['Latitude'] = lat
    df['Longitude'] = long

In [None]:
# Geocode the listings of each city in turn (adds Latitude/Longitude to each frame in place).
for city_frame in (df1, df2, df3, df4):
    geogeneration(city_frame)

In [None]:
# Preview the first rows of the Mumbai frame after the geocoding step.
df1.head()

In [None]:
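# Uncomment to save the geocoded frames to /kaggle/working/.
# The next cell reads files with these same names from the 'mdch-data' input dataset
# (presumably a previously saved copy of this output - confirm).
# df1.to_csv('/kaggle/working/Mumbai_updated.csv')
# df2.to_csv('/kaggle/working/Delhi_updated.csv')
# df3.to_csv('/kaggle/working/Chennai_updated.csv')
# df4.to_csv('/kaggle/working/Hyderabad_updated.csv')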

In [None]:
# Load the saved "*_updated.csv" file for each city (Mumbai, Delhi, Chennai, Hyderabad).
df1, df2, df3, df4 = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/mdch-data/Mumbai_updated.csv',
        '../input/mdch-data/Delhi_updated.csv',
        '../input/mdch-data/Chennai_updated.csv',
        '../input/mdch-data/Hyderabad_updated.csv',
    )
)

In [None]:
# Preview the first rows of the Mumbai frame loaded from Mumbai_updated.csv.
df1.head()

In [None]:
# Remove the 'Unnamed: 0' column and the raw 'Location' text from every city frame.
df1 = df1.drop(columns=['Unnamed: 0', 'Location'])
df2 = df2.drop(columns=['Unnamed: 0', 'Location'])
df3 = df3.drop(columns=['Unnamed: 0', 'Location'])
df4 = df4.drop(columns=['Unnamed: 0', 'Location'])

In [None]:
# Identifying null values in the data set.
# A notebook cell only displays its last expression, so the per-column null counts
# of all four cities are combined into one table (one column per city).
pd.DataFrame({
    'Mumbai': df1.isna().sum(),
    'Delhi': df2.isna().sum(),
    'Chennai': df3.isna().sum(),
    'Hyderabad': df4.isna().sum(),
})

In [None]:
# Keep only complete rows: any row with one or more null values is removed.
df1 = df1.dropna(axis=0, how='any')
df2 = df2.dropna(axis=0, how='any')
df3 = df3.dropna(axis=0, how='any')
df4 = df4.dropna(axis=0, how='any')

In [None]:
# Mumbai: split the data, fit a MinMaxScaler on the training features, and fit a linear regression
x1 = df1.loc[:, 'Area':'Longitude']
y1 = df1['Price']

x_train1, x_test1, y_train1, y_test1 = train_test_split(x1, y1,test_size = 0.2, random_state=365)

# The scaler is fit on the training split only and then applied to both splits.
normalized1 = MinMaxScaler().fit(x_train1)
n1_train1 = normalized1.transform(x_train1)
n1_test1 = normalized1.transform(x_test1)

# NOTE: the regression is fit on the raw (unscaled) TRAIN inputs, not on n1_train1,
# so reg1 expects raw feature values at predict time. For ordinary least squares,
# min-max scaling the inputs does not change the predictions.
reg1 = LinearRegression()
reg1.fit(x_train1,y_train1)

y_pred1 = reg1.predict(x_test1)

# Compare the test targets (y_test1) with the predictions (y_pred1) on a scatter plot
# The closer the points to the 45-degree line, the better the prediction
plt.scatter(y_test1, y_pred1)
# Let's also name the axes
plt.xlabel('Targets (y_test1)',size=18)
plt.ylabel('Predictions (y_pred1)',size=18)
# The x-axis and the y-axis must share the same range,
# otherwise the '45-degree line' cannot be interpreted
plt.xlim(0,350)
plt.ylim(0,350)
plt.show()

In [None]:
# Evaluation metrics for the Mumbai model on the held-out test split
mae1 = mean_absolute_error(y_test1, y_pred1)
mse1 = mean_squared_error(y_test1, y_pred1)
r21 = r2_score(y_test1, y_pred1)

print(
    'Model performance for testing set',
    '----------------------------------',
    f'Mean Absolute Error is {mae1}',
    f'Mean Squared Error is {mse1}',
    f'R2 score is {r21}',
    sep='\n',
)

In [None]:
# Delhi: split the data, fit a MinMaxScaler on the training features, and fit a linear regression
x2 = df2.loc[:, 'Area':'Longitude']
y2 = df2['Price']

x_train2, x_test2, y_train2, y_test2 = train_test_split(x2, y2,test_size = 0.2, random_state=0)

# The scaler is fit on the training split only and then applied to both splits.
normalized2 = MinMaxScaler().fit(x_train2)
n2_train2 = normalized2.transform(x_train2)
n2_test2 = normalized2.transform(x_test2)

# NOTE: the regression is fit on the raw (unscaled) TRAIN inputs, not on n2_train2,
# so reg2 expects raw feature values at predict time. For ordinary least squares,
# min-max scaling the inputs does not change the predictions.
reg2 = LinearRegression()
reg2.fit(x_train2,y_train2)

y_pred2 = reg2.predict(x_test2)

# Compare the test targets (y_test2) with the predictions (y_pred2) on a scatter plot
# The closer the points to the 45-degree line, the better the prediction
plt.scatter(y_test2, y_pred2)
# Let's also name the axes
plt.xlabel('Targets (y_test)',size=18)
plt.ylabel('Predictions (y_pred)',size=18)
# The x-axis and the y-axis must share the same range,
# otherwise the '45-degree line' cannot be interpreted
plt.xlim(0,350)
plt.ylim(0,350)
plt.show()

In [None]:
# Evaluation metrics for the Delhi model on the held-out test split
mae2 = mean_absolute_error(y_test2, y_pred2)
mse2 = mean_squared_error(y_test2, y_pred2)
r22 = r2_score(y_test2, y_pred2)

print(
    'Model performance for testing set',
    '----------------------------------',
    f'Mean Absolute Error is {mae2}',
    f'Mean Squared Error is {mse2}',
    f'R2 score is {r22}',
    sep='\n',
)

In [None]:
# Chennai: split the data, fit a MinMaxScaler on the training features, and fit a linear regression
x3 = df3.loc[:, 'Area':'Longitude']
y3 = df3['Price']

x_train3, x_test3, y_train3, y_test3 = train_test_split(x3, y3,test_size = 0.2, random_state=0)

# The scaler is fit on the Chennai training split only and then applied to both
# Chennai splits (the test split must use normalized3, not another city's scaler).
normalized3 = MinMaxScaler().fit(x_train3)
n3_train1 = normalized3.transform(x_train3)
n3_test3 = normalized3.transform(x_test3)

# NOTE: the regression is fit on the raw (unscaled) TRAIN inputs, not on n3_train1,
# so reg3 expects raw feature values at predict time. For ordinary least squares,
# min-max scaling the inputs does not change the predictions.
reg3 = LinearRegression()
reg3.fit(x_train3,y_train3)

y_pred3 = reg3.predict(x_test3)

# Compare the test targets (y_test3) with the predictions (y_pred3) on a scatter plot
# The closer the points to the 45-degree line, the better the prediction
plt.scatter(y_test3, y_pred3)
# Let's also name the axes
plt.xlabel('Targets (y_test)',size=18)
plt.ylabel('Predictions (y_pred)',size=18)
# The x-axis and the y-axis must share the same range,
# otherwise the '45-degree line' cannot be interpreted
plt.xlim(0,350)
plt.ylim(0,350)
plt.show()

In [None]:
# Evaluation metrics for the Chennai model on the held-out test split
mae3 = mean_absolute_error(y_test3, y_pred3)
mse3 = mean_squared_error(y_test3, y_pred3)
r23 = r2_score(y_test3, y_pred3)

print(
    'Model performance for testing set',
    '----------------------------------',
    f'Mean Absolute Error is {mae3}',
    f'Mean Squared Error is {mse3}',
    f'R2 score is {r23}',
    sep='\n',
)

In [None]:
# Hyderabad: split the data, fit a MinMaxScaler on the training features, and fit a linear regression
x4 = df4.loc[:, 'Area':'Longitude']
y4 = df4['Price']

x_train4, x_test4, y_train4, y_test4 = train_test_split(x4, y4,test_size = 0.2, random_state=0)

# The scaler is fit on the Hyderabad training split only and then applied to both
# Hyderabad splits (the test split must use normalized4, not another city's scaler).
normalized4 = MinMaxScaler().fit(x_train4)
n4_train4 = normalized4.transform(x_train4)
n4_test4 = normalized4.transform(x_test4)

# NOTE: the regression is fit on the raw (unscaled) TRAIN inputs, not on n4_train4,
# so reg4 expects raw feature values at predict time. For ordinary least squares,
# min-max scaling the inputs does not change the predictions.
reg4 = LinearRegression()
reg4.fit(x_train4,y_train4)

y_pred4 = reg4.predict(x_test4)

# Compare the test targets (y_test4) with the predictions (y_pred4) on a scatter plot
# The closer the points to the 45-degree line, the better the prediction
plt.scatter(y_test4, y_pred4)
# Let's also name the axes
plt.xlabel('Targets (y_test)',size=18)
plt.ylabel('Predictions (y_pred)',size=18)
# The x-axis and the y-axis must share the same range,
# otherwise the '45-degree line' cannot be interpreted
plt.xlim(0,350)
plt.ylim(0,350)
plt.show()

In [None]:
# Evaluation metrics for the Hyderabad model on the held-out test split
mae4 = mean_absolute_error(y_test4, y_pred4)
mse4 = mean_squared_error(y_test4, y_pred4)
r24 = r2_score(y_test4, y_pred4)

print(
    'Model performance for testing set',
    '----------------------------------',
    f'Mean Absolute Error is {mae4}',
    f'Mean Squared Error is {mse4}',
    f'R2 score is {r24}',
    sep='\n',
)

In [None]:
import pickle

# Persist each fitted city model to its own pickle file in the working directory.
# The `with` block guarantees every file handle is flushed and closed, which the
# bare open(...) calls did not.
for fitted_model, pickle_path in (
    (reg1, 'mumbai.pkl'),
    (reg2, 'delhi.pkl'),
    (reg3, 'chennai.pkl'),
    (reg4, 'hyderabad.pkl'),
):
    with open(pickle_path, 'wb') as pickle_file:
        pickle.dump(fitted_model, pickle_file)

In [None]:
# Reload the Mumbai model from disk; the `with` block closes the file handle afterwards.
# NOTE: pickle.load can execute arbitrary code - only load pickle files you created yourself.
with open('mumbai.pkl', 'rb') as pickle_file:
    loaded_model = pickle.load(pickle_file)

In [None]:
# Run the model reloaded from mumbai.pkl on the Mumbai test features (x_test1).
result = loaded_model.predict(x_test1)