In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PolynomialFeatures
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor 
from sklearn.svm import SVR
from sklearn.model_selection import cross_val_predict  # For K-Fold Cross Validation
from sklearn.metrics import r2_score  # For find accuracy with R2 Score
from sklearn.metrics import mean_squared_error  # For MSE
from math import sqrt

In [None]:
# Path of the MagicBricks CSV inside the Kaggle input directory.
file = '/kaggle/input/delhi-house-price-prediction/MagicBricks.csv'

# Read the CSV into a DataFrame and preview its first five rows.
data = pd.read_csv(filepath_or_buffer=file)
data.head(n=5)

In [None]:
# Preview the last rows of the frame (complements the head() preview).
data.tail()

In [None]:
# Print a quick structural summary of the frame: size, column names,
# total number of missing cells and per-column cardinality.
n_rows, n_cols = data.shape
total_missing = data.isnull().sum().values.sum()  # summed over every column

print('Rows :', n_rows)
print('Columns :', n_cols)
print('\n Features', list(data.columns))
print('\n Missing Values', total_missing)
print('\nUnique Values', data.nunique())

- The dataset consists of 11 features (columns) in total.
- It contains 1,259 rows.

In [None]:
# Show column dtypes and non-null counts to spot missing values and text columns.
data.info()

In [None]:
# List the column labels of the frame.
data.columns

In [None]:
# Print the distinct values of each categorical column, one array per line.
for column in ('Type', 'Transaction', 'Status', 'Furnishing'):
    print(data[column].unique())

In [None]:
# Bar charts of how often each category occurs, one panel per categorical column.
# Each panel is titled with its column name so the figure can be read on its own
# (the original panels were unlabeled and indistinguishable at a glance).
categorical_columns = ['Type', 'Transaction', 'Status', 'Furnishing']

fig, axes = plt.subplots(2, 2, figsize=(12, 12))
for ax, column in zip(axes.ravel(), categorical_columns):
    data[column].value_counts().plot(kind='bar', ax=ax)
    ax.set_title(column)
    ax.set_ylabel('Count')
fig.tight_layout()
plt.show()

- Builder floors are sold more often than apartments.
- Resale properties are sold more often than new properties.
- Ready-to-move properties are far more common than almost-ready ones, since customers mostly opt for them.
- Semi-furnished properties are opted for more often than unfurnished or furnished ones.

In [None]:
# Pairwise Pearson correlation between the numeric columns.
# The frame still holds text columns at this point (e.g. 'Type', 'Locality'),
# and recent pandas versions raise on them instead of silently skipping them,
# so the numeric columns are selected explicitly.
corr = data.select_dtypes(include='number').corr()
corr

- The price of a house depends mainly on the **number of bathrooms**, **BHK**, and **Area**.

In [None]:
# Heatmap of the correlation matrix, colour scale centred on zero.
plt.figure(figsize=(8, 8))

# Diverging colour map built from hues 220 and 20.
heat_cmap = sns.diverging_palette(220, 20, as_cmap=True)

sns.heatmap(
    corr,
    cmap=heat_cmap,
    vmax=1,
    center=0,
    square=True,
    linewidths=.8,
    cbar_kws={'shrink': .82},
)

In [None]:
# Rank the columns of the correlation matrix by their correlation with Price,
# strongest positive first.
corr['Price'].sort_values(ascending=False)

In [None]:
# Switch seaborn to the "ticks" style, then draw pairwise plots of the frame.
sns.set(style="ticks")

# NOTE(review): a palette is passed without a `hue` variable, so it presumably
# has no visible effect - confirm whether hue='Type' (or similar) was intended.
pair_palette = ["#FA5858", "#58D3F7"]
sns.pairplot(data, palette=pair_palette)

In [None]:
# Scatter plot of Price (x) against Bathroom (y), coloured by property Type.
sns.scatterplot(x='Price',y = 'Bathroom',data=data, hue='Type')

In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) for the frame.
data.describe()

In [None]:
# Box plots of the main numeric columns to eyeball their spread and outliers.
# The column is passed as `x=` because seaborn no longer accepts the data
# variable as a positional argument; a horizontal box per panel is kept.
boxplot_columns = ['Area', 'Parking', 'Price', 'Bathroom']

fig, axes = plt.subplots(2, 2, figsize=(12, 12))
for ax, column in zip(axes.ravel(), boxplot_columns):
    sns.boxplot(x=column, data=data, ax=ax)
plt.show()

In [None]:
# Median Price within each group of Type, Bathroom and Furnishing, printed in turn.
for group_key in ('Type', 'Bathroom', 'Furnishing'):
    print(data.groupby(group_key)['Price'].median())

In [None]:
# Count of missing values in each column.
data.isnull().sum()

In [None]:
# Drop every row that has at least one missing value.
# Note: this rebinds `data`, so the unfiltered frame is only recoverable by
# re-running the load cell.
data = data.dropna()

In [None]:
# Total number of missing cells across the whole frame.
data.isnull().sum().values.sum()

In [None]:
# Preview the first rows of the current frame.
data.head()

In [None]:
# Minimum Price and minimum Bathroom count per Locality.
# Several columns must be selected with a list (double brackets); selecting
# them with a bare tuple is no longer supported by pandas. The groupby result
# is already a DataFrame, so no extra pd.DataFrame(...) wrapper is needed.
Loc = data.groupby('Locality')[['Price', 'Bathroom']].min()
Loc.head(10)

In [None]:
# Remove the columns that are not used as model features, then preview the result.
unused_columns = ['Locality', 'Furnishing', 'Status', 'Transaction']
data = data.drop(columns=unused_columns)
data.head()


In [None]:
# One-hot encode the categorical column(s) listed in `cat`; the first level of
# each is dropped, so it is represented by all-zero dummy columns.
cat = ['Type']
data = pd.get_dummies(data, columns=cat, drop_first=True)

# Report how many columns the frame has after encoding.
print(data.shape[1])


In [None]:
# Renumber the rows after the earlier dropna().
# NOTE(review): without drop=True the old row labels are kept as a new 'index'
# column, which then becomes a model feature (the hand-built sample further
# down supplies an 'index' value for this reason). Consider reset_index(drop=True)
# and removing 'index' from that sample together.
data = data.reset_index()
data.head()

In [None]:
# Features are every column except the target; the target is Price.
X = data.drop(['Price'], axis=1)
Y = data['Price']

# Split BEFORE scaling so that the scaler never sees the held-out rows.
# Fitting it on the full data would leak test-set statistics into training.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, random_state = 0)

# Learn mean/std from the training rows only, then apply that same
# transformation to the training rows, the test rows and the full matrix.
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

# The full matrix is kept (scaled with the training statistics) because the
# model cells pass `X` to cross_val_predict.
X = sc.transform(X)

In [None]:
# Baseline model: ordinary linear regression fitted on the training split.
model_1 = LinearRegression().fit(X_train, Y_train)
pred = model_1.predict(X_test)

# Hold-out R2 on the test split.
holdout_r2 = r2_score(Y_test, pred)
print('R2 Score for Linear Regression: ', holdout_r2)

# Root-mean-squared error on the test split (same units as Price).
rmse = sqrt(mean_squared_error(Y_test, pred))
print('RMSE value for Linear Regression: ', rmse)

# 10-fold cross-validated predictions over the full data, scored with R2.
pred_cv = cross_val_predict(model_1, X, Y, cv=10)
cv_r2 = r2_score(Y, pred_cv)
print('R2 score for Linear Regression( Cross Validation ): ', cv_r2)

# Side-by-side view of actual and predicted prices for the first test rows.
res = pd.DataFrame({'Actual': Y_test, 'Predicted': pred})
print('\n', res.head(10))

In [None]:
# Degree-2 polynomial regression: expand the features, then fit a linear model.
poly = PolynomialFeatures(degree=2)

# Fit the expansion on the training features only and reuse it unchanged on
# the test features; a transformer should never be re-fitted on test data.
x_train = poly.fit_transform(X_train)
x_test = poly.transform(X_test)

poly_reg = LinearRegression()
poly_reg.fit(x_train,Y_train)

pred = poly_reg.predict(x_test)
print('R2 score for Polynomial Regression',r2_score(Y_test, pred))


In [None]:
# Decision tree regressor limited to depth 4. A fixed seed makes tie-breaking
# between equally good splits repeatable across runs.
model_2 = DecisionTreeRegressor(max_depth=4, random_state=0)
model_2.fit(X_train,Y_train)
pred = model_2.predict(X_test)

# The labels below name the model actually evaluated in this cell
# (they previously said "Linear Regression").
print('R2 Score for Decision Tree: ',r2_score(Y_test, pred))

rmse = sqrt(mean_squared_error(Y_test,pred))
print('RMSE value for Decision Tree: ',rmse)

# 10-fold cross-validated predictions over the full data, scored with R2.
pred_cv = cross_val_predict(model_2, X, Y, cv=10)
print('R2 score for Decision Tree( Cross Validation ): ',r2_score(Y, pred_cv))

# Side-by-side view of actual and predicted prices for the first test rows.
res = pd.DataFrame({'Actual':Y_test, 'Predicted':pred})
print('\n',res.head(10))

In [None]:
# Random forest of 400 trees, each limited to depth 7. The seed is fixed so
# that the scores and the final prediction are reproducible on a re-run.
model_3 = RandomForestRegressor(n_estimators=400, max_depth=7, random_state=0)
model_3.fit(X_train,Y_train)
pred = model_3.predict(X_test)

# The labels below name the model actually evaluated in this cell
# (they previously said "Linear Regression").
print('R2 Score for Random Forest: ',r2_score(Y_test, pred))

rmse = sqrt(mean_squared_error(Y_test,pred))
print('RMSE value for Random Forest: ',rmse)

# 10-fold cross-validated predictions over the full data, scored with R2.
pred_cv = cross_val_predict(model_3, X, Y, cv=10)
print('R2 score for Random Forest( Cross Validation ): ',r2_score(Y, pred_cv))

# Side-by-side view of actual and predicted prices for the first test rows.
res = pd.DataFrame({'Actual':Y_test, 'Predicted':pred})
print('\n',res.head(10))

In [None]:
# NOTE(review): accuracy_score is a classification metric and is not used in
# this cell; the import is kept in case other cells rely on it.
from sklearn.metrics import accuracy_score

# Support vector regressor with a linear kernel.
model_4 = SVR(gamma='auto', kernel='linear', C=1)
model_4.fit(X_train,Y_train)

pred = model_4.predict(X_test)

# The labels below name the model actually evaluated in this cell
# (they previously said "Linear Regression").
print('R2 Score for SVR: ',r2_score(Y_test, pred))

rmse = sqrt(mean_squared_error(Y_test,pred))
print('RMSE value for SVR: ',rmse)

# Cross-validate THIS model; the original call passed model_3 (the random
# forest), so the reported score did not belong to the SVR.
pred_cv = cross_val_predict(model_4, X, Y, cv=10)
print('R2 score for SVR( Cross Validation ): ',r2_score(Y, pred_cv))

# Side-by-side view of actual and predicted prices for the first test rows.
res = pd.DataFrame({'Actual':Y_test, 'Predicted':pred})
print('\n',res.head(10))

In [None]:
# Show a single row as a reminder of the column layout before building a
# hand-made sample.
data.head(1)

In [None]:
# A single hand-written listing to run through the trained model.
# The keys mirror the feature columns and are kept in the same order.
sample_listing = {
    'index': 1,
    'Area': 750,
    'BHK': 2,
    'Bathroom': 2,
    'Parking': 1.0,
    'Per_Sqft': 6667.0,
    'Type_Builder_Floor': 0,
}

# One-row DataFrame built from the record above.
input_data = pd.DataFrame([sample_listing])

In [None]:
# Display the one-row sample frame.
input_data

In [None]:
# Scale the sample with the scaler that was already fitted on the training
# features. Calling fit_transform here would re-fit the scaler on this single
# row, turning every feature into 0 and discarding the training statistics.
input_data = sc.transform(input_data)

# The result is a 2-D array with one row, which is what predict() expects.
input_data.shape

In [None]:
# Predict the price of the scaled sample with the random forest (model_3).
model_3.predict(input_data)