In [209]:
# https://www.99acres.com/property-in-delhi-ncr-ffid-page-0
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

In [210]:
df1 = pd.read_csv("99acres.csv")

In [211]:
df1.shape

(6700, 4)

## Explaoratory Data Analysis

In [212]:
df2 = df1.drop(['location'], axis=1)

In [213]:
# Check again duplicate data
duplicate = df2[df2.duplicated()]
duplicate.shape

(1409, 3)

In [214]:
# check null values
df2.isnull().sum()

price        0
area         0
bedroom    812
dtype: int64

In [215]:
df3 = df2.copy()
df4 = df3.dropna()

In [216]:
# Again check null values
df4.isnull().sum()

price      0
area       0
bedroom    0
dtype: int64

In [217]:
df4.shape

(5888, 3)

In [218]:
def cleaned_area():
    results_list = []
    for i, row in df4.iterrows():
        area_list = row['area'].split(" sq.ft.")
        area_str = area_list[0]

        area_sub_list = area_str.split("-")
        # area_sub_list = area_sub_list[0]

        if(len(area_sub_list)) == 2:
            min_area = area_sub_list[0].replace(",", "")
            max_area = area_sub_list[1].replace(",", "")
            final_area = (float(min_area)+float(max_area))//2
        else:
            final_area = float(area_sub_list[0].replace(",", ""))
        results_list.append(final_area)
    return results_list

In [219]:
def cleaned_price():
    results_list = []
    for i, row in df4.iterrows():
        price_list = row['price'].split("₹ ") # Split price here with this icon
        # Now we have two or three items in each row
        if(len(price_list) == 3):
            split_min_max_price = price_list[1].split("-") # Second element split with the space because first element is blank.
            if(len(split_min_max_price) == 2):
                min_price_list = split_min_max_price[0].strip().split(" ")
                max_price_list = split_min_max_price[1].strip().split(" ")
                if min_price_list[-1] == "Lac" and max_price_list[-1] == "Cr":
                    # Lac And Cr
                    min_price_string_in_cr = float(min_price_list[0])/100
                    max_price_string_in_cr = float(max_price_list[0])
                    final_price = (min_price_string_in_cr+max_price_string_in_cr)/2
                elif(max_price_list[-1] == "Lac"):
                    # Only Lac
                    min_price_string_in_cr = float(min_price_list[0])/100
                    max_price_string_in_cr = float(max_price_list[0])/100
                    final_price = (min_price_string_in_cr+max_price_string_in_cr)/2
                else:
                    # Only Cr
                    final_price = (float(min_price_list[0]) + float(max_price_list[0]))/2
            else:
                single_amount_with_unit = split_min_max_price[0].split()
                if (single_amount_with_unit[-1] == "Lac"):
                    final_price = float(single_amount_with_unit[0])/100
                else:
                    final_price = float(single_amount_with_unit[0])
        elif(len(price_list) == 2):
            split_min_max_price = price_list[1].split("-")
            if(len(split_min_max_price) == 2):
                min_price_list = split_min_max_price[0].strip().split(" ")
                max_price_list = split_min_max_price[1].strip().split(" ")
                if min_price_list[-1] == "Lac" and max_price_list[-1] == "Cr":
                    # Lac And Cr
                    min_price_string_in_cr = float(min_price_list[0])/100
                    max_price_string_in_cr = float(max_price_list[0])
                    final_price = (min_price_string_in_cr+max_price_string_in_cr)/2
                elif(max_price_list[-1] == "Lac"):
                    # Only Lac
                    min_price_string_in_cr = float(min_price_list[0])/100
                    max_price_string_in_cr = float(max_price_list[0])/100
                    final_price = (min_price_string_in_cr+max_price_string_in_cr)/2
                else:
                    # Only Cr
                    final_price = (float(min_price_list[0]) + float(max_price_list[0]))/2
            elif(max_price_list[-1] == "Lac"):
                # Only Lac
                min_price_string_in_cr = float(min_price_list[0])/100
                max_price_string_in_cr = float(max_price_list[0])/100
                final_price = (min_price_string_in_cr+max_price_string_in_cr)/2
            else:
                # Only Cr
                final_price = (float(min_price_list[0]) + float(max_price_list[0]))/2
        else:
            final_price = np.NaN
        results_list.append(float(final_price))
    return results_list

In [220]:
def separate_bedroom_and_bath():
    bedrooms = df4['bedroom'].to_list()
    results_list_1 = []
    results_list_2 = []
    for i in bedrooms:
        seaprate_from_bath_list = i.split(" Baths")
        if(len(seaprate_from_bath_list) == 2):
            bhk, baths = seaprate_from_bath_list[0].split("BHK")
        else:
            bhk = seaprate_from_bath_list[0].split(" ")[0]
            baths = bhk
        
        results_list_1.append(int(bhk.strip()))
        results_list_2.append(int(baths.strip()))
        
    return results_list_1, results_list_2

In [221]:
bhk, baths = separate_bedroom_and_bath()
df4['bhk'] = bhk
df4['baths'] = baths
df4['area'] = cleaned_area()
df4['price'] = cleaned_price()

In [222]:
df5 = df4.copy()

In [223]:
# Check Shape of data
df5.shape

(5888, 5)

In [224]:
# Check NA values
df5.isnull().sum()

price      132
area         0
bedroom      0
bhk          0
baths        0
dtype: int64

In [225]:
# Fill na values with mean of price
# df6['price_1'].fillna((df6['price_1'].mean()), inplace=True)
df6 = df5.dropna()

In [226]:
# Check NA values
df6.isnull().sum()

price      0
area       0
bedroom    0
bhk        0
baths      0
dtype: int64

In [227]:
df6.describe()

Unnamed: 0,price,area,bhk,baths
count,5756.0,5756.0,5756.0,5756.0
mean,3.291213,3322.558026,3.375782,3.489576
std,5.425275,9398.818564,1.246938,1.469669
min,0.11,1.0,1.0,1.0
25%,0.8,1440.0,3.0,3.0
50%,1.7,2097.0,3.0,3.0
75%,3.75,3045.0,4.0,4.0
max,87.0,392040.0,20.0,24.0


In [228]:
df7 = df6.drop(['bedroom'], axis=1)

In [229]:
Q1 = df7.quantile(0.25)
Q3 = df7.quantile(0.75)
IQR = Q3 - Q1

In [230]:
df8 = df7[~((df7 < (Q1 - 1.5 * IQR)) |(df7 > (Q3 + 1.5 * IQR))).any(axis=1)]
df8.shape

(4926, 4)

In [231]:
df7

Unnamed: 0,price,area,bhk,baths
0,1.900,2630.0,4,5
1,2.800,2588.0,3,3
2,1.700,1508.0,2,2
3,9.000,10500.0,5,5
4,1.775,1229.0,2,2
...,...,...,...,...
6695,1.750,2630.0,4,5
6696,7.020,8800.0,5,5
6697,4.100,4857.0,4,4
6698,2.100,3880.0,5,5


### Feature Scalling

In [232]:
from sklearn.preprocessing import MinMaxScaler
min_max_scaler = MinMaxScaler()
# With Min Max Scaler
df_min_max_scaled = df8.copy()
col_names = ['bhk', 'baths', 'area']
features = df_min_max_scaled[col_names]
df_min_max_scaled[col_names] = min_max_scaler.fit_transform(features.values)
x = df_min_max_scaled[['bhk', 'baths', 'area']]
y = df8['price']

In [233]:
# Standardization
from sklearn.preprocessing import StandardScaler
data = df8[['bhk', 'baths', 'area']]
scaler = StandardScaler()
scaler.fit(data)
x = scaler.transform(data)
y = df8['price']

In [234]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=0.75, random_state=100)

## Model training

In [235]:
# Linear Regression
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
lr = LinearRegression()
lr.fit(x_train, y_train)
y_pred = lr.predict(x_test)
r2_score(y_test, y_pred)

0.44613654434672967

In [236]:
# KNN
from sklearn.neighbors import KNeighborsRegressor
neigh = KNeighborsRegressor(n_neighbors=20)
neigh.fit(x_train, y_train)
y_pred = neigh.predict(x_test)
r2_score(y_test, y_pred)

0.533648541330372