In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
input_file_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
df = pd.read_csv("/kaggle/input/delhi-house-price-prediction/MagicBricks.csv")

In [None]:
print("The Number of rows and columns in the dataset are:",df.shape)

In [None]:
df.head()

In [None]:
df.info()

In [None]:
df.isnull().sum()

* There are some missing values in the columns: Bathroom, Furnishing, Parking and Type

In [None]:
df.columns

# Exploratory Data Analysis

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Distribution of the raw Area column, drawn through an explicit Axes object.
fig, ax = plt.subplots()
ax.hist(df["Area"])
ax.set_xlabel("Area")
ax.set_ylabel("Count of Records")
ax.set_title("Histogram of Area")
plt.show()

* The column is right skewed

In [None]:
# Same distribution on a natural-log scale, with finer (50) bins.
log_area = np.log(df["Area"])
fig, ax = plt.subplots()
ax.hist(log_area, bins=50)
ax.set_xlabel("Log (Area)")
ax.set_ylabel("Count of Records")
ax.set_title("Histogram of Log(Area)")
plt.show()

* The histogram of log(Area) shows a clear concentration of data points.

In [None]:
# Pass the column by keyword: seaborn's only positional parameter is `data`,
# so a bare Series is not treated as the x variable on current releases.
sns.countplot(x=df["BHK"])
plt.title("Count of Records by BHK")
plt.show()

* There are more 3 BHK houses followed by 2 BHK, 4 BHK and 1 BHK.
* There are few records available for 5, 6, 7 and 10 BHK as well.

In [None]:
# Pass the column by keyword: seaborn's only positional parameter is `data`,
# so a bare Series is not treated as the x variable on current releases.
sns.countplot(x=df["Bathroom"])
plt.title("Count of Records by Bathroom")
plt.show()

* Most of the houses have 2 bathrooms.
* About 350 of the houses have 3 bathrooms.
* There are about 150 houses which have 1 bathroom or 4 bathrooms.
* Some houses do have more than 4 bathrooms as well.

In [None]:
# Pass the column by keyword: seaborn's only positional parameter is `data`,
# so a bare Series is not treated as the x variable on current releases.
sns.countplot(x=df["Furnishing"])
plt.title("Count of Records by Furnishing")
plt.show()

* Majority of the houses are Semi-Furnished

In [None]:
df.columns

In [None]:
df.head()

In [None]:
# Pass the column by keyword: seaborn's only positional parameter is `data`,
# so a bare Series is not treated as the x variable on current releases.
sns.countplot(x=df["Parking"])
plt.title("Count of Records by Parking")
plt.show()

In [None]:
df["Parking"].value_counts()

* The Parking column has some anomalous values such as 39 and 114. We might need to check the validity of these entries.

In [None]:
# Distribution of the raw target; labelled so the figure stands on its own.
plt.hist(df["Price"])
plt.xlabel("Price")
plt.ylabel("Count of Records")
plt.title("Histogram of Price")
plt.show()

In [None]:
# Target on a natural-log scale with 50 bins; labelled so the figure stands on its own.
plt.hist(np.log(df["Price"]), bins=50)
plt.xlabel("Log (Price)")
plt.ylabel("Count of Records")
plt.title("Histogram of Log(Price)")
plt.show()

* Target variable "Price" is right skewed as well.

In [None]:
# Pass the column by keyword: seaborn's only positional parameter is `data`,
# so a bare Series is not treated as the x variable on current releases.
sns.countplot(x=df["Status"])
plt.title("Count of Records by Status")
plt.show()

* Most of the houses are in ready to move condition.

In [None]:
# Pass the column by keyword: seaborn's only positional parameter is `data`,
# so a bare Series is not treated as the x variable on current releases.
sns.countplot(x=df["Transaction"])
plt.title("Count of Records by Transaction")
plt.show()

* Majority of the houses are for Resale

In [None]:
# Pass the column by keyword: seaborn's only positional parameter is `data`,
# so a bare Series is not treated as the x variable on current releases.
sns.countplot(x=df["Type"])
plt.title("Count of Records by Type")
plt.show()

* There seems to be an almost equal number of houses of each type: Apartment and Builder Floor

In [None]:
# Distribution of the raw Per_Sqft column; labelled so the figure stands on its own.
plt.hist(df["Per_Sqft"])
plt.xlabel("Per_Sqft")
plt.ylabel("Count of Records")
plt.title("Histogram of Per_Sqft")
plt.show()

In [None]:
# Per_Sqft on a natural-log scale with 50 bins; labelled so the figure stands on its own.
plt.hist(np.log(df["Per_Sqft"]), bins=50)
plt.xlabel("Log (Per_Sqft)")
plt.ylabel("Count of Records")
plt.title("Histogram of Log(Per_Sqft)")
plt.show()

In [None]:
df.describe()

* There might be outliers in the data, especially in the columns: Area, Parking, Price and Per_Sqft

# How does the Target variable change with the independent features

In [None]:
df.columns

### Continuous numeric features

In [None]:
# Price against Area, addressing the columns by name through `data=`.
sns.scatterplot(data=df, x="Area", y="Price")
plt.show()

* The house price rises as the Area increases.
* There are some outliers with more than 10000 Area.

In [None]:
# Price against Per_Sqft, addressing the columns by name through `data=`.
sns.scatterplot(data=df, x="Per_Sqft", y="Price")
plt.show()

In [None]:
# Both axes are log-transformed; label them as such, because seaborn would otherwise
# reuse the raw column names "Per_Sqft" and "Price" taken from the Series.
sns.scatterplot(x=np.log(df["Per_Sqft"]), y=np.log(df["Price"]))
plt.xlabel("Log (Per_Sqft)")
plt.ylabel("Log (Price)")
plt.show()

* There is no visible trend between the raw values of Per_Sqft and Price. However, their logs show a nearly linear relationship.

### Discrete numeric features

In [None]:
df.columns

In [None]:
# Median Price per BHK value. `values="Price"` restricts the aggregation to the one
# column that is plotted; without it the median is attempted on every remaining
# column, including non-numeric ones, which recent pandas releases refuse to aggregate.
median_price_by_bhk = pd.pivot_table(data=df, index="BHK", values="Price", aggfunc="median")
sns.barplot(x=median_price_by_bhk.index, y=median_price_by_bhk["Price"])
plt.show()

* House prices rise as the number of BHK increases up to 5, and then decline sharply for 6, 7 and 10 BHK.

In [None]:
# Median Price per Bathroom count. `values="Price"` restricts the aggregation to the
# one column that is plotted; without it the median is attempted on every remaining
# column, including non-numeric ones, which recent pandas releases refuse to aggregate.
median_price_by_bathroom = pd.pivot_table(data=df, index="Bathroom", values="Price", aggfunc="median")
sns.barplot(x=median_price_by_bathroom.index, y=median_price_by_bathroom["Price"])
plt.show()

* The house prices normally increase as the number of bathroom increase.

In [None]:
# Median Price per Parking value. `values="Price"` restricts the aggregation to the
# one column that is plotted; without it the median is attempted on every remaining
# column, including non-numeric ones, which recent pandas releases refuse to aggregate.
median_price_by_parking = pd.pivot_table(data=df, index="Parking", values="Price", aggfunc="median")
sns.barplot(x=median_price_by_parking.index, y=median_price_by_parking["Price"])
plt.show()

* Prices increase up to 3 parking slots and decrease beyond that.
* There is only 1 record with Parking value as 9. This could be an outlier.

### Categorical Features

In [None]:
df.columns

In [None]:
# Median Price per Furnishing level. `values="Price"` restricts the aggregation to the
# one column that is plotted; without it the median is attempted on every remaining
# column, including non-numeric ones, which recent pandas releases refuse to aggregate.
median_price_by_furnishing = pd.pivot_table(data=df, index="Furnishing", values="Price", aggfunc="median")
sns.barplot(x=median_price_by_furnishing.index, y=median_price_by_furnishing["Price"])
plt.show()

In [None]:
# Median Price per Status. `values="Price"` restricts the aggregation to the one
# column that is plotted; without it the median is attempted on every remaining
# column, including non-numeric ones, which recent pandas releases refuse to aggregate.
median_price_by_status = pd.pivot_table(data=df, index="Status", values="Price", aggfunc="median")
sns.barplot(x=median_price_by_status.index, y=median_price_by_status["Price"])
plt.show()

In [None]:
# Median Price per Transaction kind. `values="Price"` restricts the aggregation to the
# one column that is plotted; without it the median is attempted on every remaining
# column, including non-numeric ones, which recent pandas releases refuse to aggregate.
median_price_by_transaction = pd.pivot_table(data=df, index="Transaction", values="Price", aggfunc="median")
sns.barplot(x=median_price_by_transaction.index, y=median_price_by_transaction["Price"])
plt.show()

In [None]:
# Median Price per Type. `values="Price"` restricts the aggregation to the one
# column that is plotted; without it the median is attempted on every remaining
# column, including non-numeric ones, which recent pandas releases refuse to aggregate.
median_price_by_type = pd.pivot_table(data=df, index="Type", values="Price", aggfunc="median")
sns.barplot(x=median_price_by_type.index, y=median_price_by_type["Price"])
plt.show()

* The ready to move houses have a higher price.
* Other than the Status, no other categorical column seems to have any major impact on the average price of the houses.

# Model Building

### Feature Engineering

#### 1. Handling Outliers

In [None]:
df.columns

In [None]:
# Keyword form draws the box along the x-axis; passed positionally, the Series
# would be taken as seaborn's `data` argument instead.
sns.boxplot(x=df["Area"])
plt.show()

In [None]:
# Keyword form draws the box along the x-axis; passed positionally, the Series
# would be taken as seaborn's `data` argument instead.
sns.boxplot(x=df["BHK"])
plt.show()

In [None]:
# Keyword form draws the box along the x-axis; passed positionally, the Series
# would be taken as seaborn's `data` argument instead.
sns.boxplot(x=df["Parking"])
plt.show()

In [None]:
# Keyword form draws the box along the x-axis; passed positionally, the Series
# would be taken as seaborn's `data` argument instead.
sns.boxplot(x=df["Price"])
plt.show()

In [None]:
# Keyword form draws the box along the x-axis; passed positionally, the Series
# would be taken as seaborn's `data` argument instead.
sns.boxplot(x=df["Per_Sqft"])
plt.show()

In [None]:
# Correlate the numeric columns only: the frame also holds text columns, and
# DataFrame.corr raises on those from pandas 2.0 onwards instead of skipping them.
corr = df.select_dtypes(include="number").corr()
# Rank every numeric feature by its correlation with the target.
corr["Price"].sort_values(ascending=False)

In [None]:
# Keep only the rows that fall below every outlier threshold. A missing value never
# satisfies a `<` comparison, so rows with no Parking value are dropped here as well.
# .copy() makes df_reduced an independent frame, so the column assignments further
# down neither raise SettingWithCopyWarning nor risk touching df.
df_reduced = df.loc[
    (df["Price"] < 1e8)
    & (df["Area"] < 10000)
    & (df["BHK"] < 8)
    & (df["Parking"] < 9)
    & (df["Per_Sqft"] < 50000)
].copy()

In [None]:
print("Shape of orginal dataset:", df.shape)
print("Shape of reduced dataset:", df_reduced.shape)

#### 2. Handling Missing Values

In [None]:
df_reduced.info()

* There are no more missing values

In [None]:
df_reduced.head()

In [None]:
# Reassign rather than drop in place, and tolerate re-running this cell:
# errors="ignore" turns the drop into a no-op once Locality is already gone.
df_reduced = df_reduced.drop(columns="Locality", errors="ignore")

In [None]:
df_reduced.head()

In [None]:
# Columns handled as categorical: they are cast to str and then one-hot encoded below.
cat_cols = ["BHK", "Bathroom", "Furnishing", "Parking","Status", "Transaction", "Type"]
# Continuous columns that get a natural-log transform below; this includes the target, Price.
num_cols = ["Area", "Price", "Per_Sqft"]

In [None]:
# Cast every categorical column to str in a single column-wise assignment.
df_reduced[cat_cols] = df_reduced[cat_cols].astype(str)

In [None]:
# Replace each continuous column (target included) with its natural log in one assignment.
df_reduced[num_cols] = np.log(df_reduced[num_cols])

In [None]:
df_reduced.head()

In [None]:
# One-hot encode the string-typed columns; drop_first=True omits the first level of
# each so the dummy columns of one feature are not perfectly collinear.
df_reduced = pd.get_dummies(data=df_reduced, drop_first=True)

In [None]:
df_reduced.head()

In [None]:
# Separate the predictors (X) from the log-transformed Price target (y).
X = df_reduced.drop(columns="Price")
y = df_reduced["Price"].copy()

In [None]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise the training features; the scaler learns its mean/std from this split.
scaler = StandardScaler()
X_train = pd.DataFrame(
    scaler.fit_transform(X_train),
    columns=X_train.columns,
    index=X_train.index,  # keep the row labels so X_train stays aligned with y_train
)

In [None]:
X_train.head()

### Linear Regression

In [None]:
from sklearn.linear_model import LinearRegression

lr = LinearRegression()
lr.fit(X_train, y_train)

In [None]:
from sklearn.metrics import mean_squared_error, r2_score

In [None]:
# In-sample predictions of the linear model (scored on the data it was fitted on).
lr_predictions = lr.predict(X_train)

# sklearn metrics take (y_true, y_pred); use that order for both calls.
print("RMSE:", np.sqrt(mean_squared_error(y_train, lr_predictions)))
print("R-Squared:", r2_score(y_train, lr_predictions))

In [None]:
plt.scatter(x=y_train, y=lr_predictions)
plt.show()

In [None]:
plt.hist(lr_predictions-y_train)
plt.show()

**This looks like a decent model:**

1. R-Squared is 0.8139
2. The predictions and the actual labels have a linear relation
3. The difference between actual and predicted values is normally distributed

### Random Forest

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Seed the forest so its bootstrap samples, and therefore every score reported for it,
# are reproducible across runs (42 matches the seed used for the train/test split).
rf = RandomForestRegressor(random_state=42)
rf.fit(X_train, y_train)

In [None]:
# In-sample predictions of the random forest (scored on the data it was fitted on).
rf_predictions = rf.predict(X_train)

# sklearn metrics take (y_true, y_pred); use that order for both calls.
print("RMSE:", np.sqrt(mean_squared_error(y_train, rf_predictions)))
print("R-Squared:", r2_score(y_train, rf_predictions))

In [None]:
plt.scatter(x=y_train, y=rf_predictions)
plt.show()

In [None]:
plt.hist(rf_predictions-y_train)
plt.show()

* The RMSE and R-Squared look better for Random Forest
* We need to check for Overfitting though

### Cross-Validation

In [None]:
from sklearn.model_selection import cross_val_score

# 10-fold cross-validation of the random forest on the training split.
# The "neg_mean_squared_error" scorer reports negated MSE, hence the sign flip
# before taking the square root to obtain one RMSE per fold.
cv_rf_scores = cross_val_score(rf, X_train, y_train,
                        scoring="neg_mean_squared_error",cv=10)

cv_rf_rmse = np.sqrt(-cv_rf_scores)

In [None]:
def display_scores(score):
    """Print a set of scores together with their mean and standard deviation.

    Parameters
    ----------
    score : numpy.ndarray
        Scores to summarise (e.g. per-fold RMSE values); must provide
        ``mean()`` and ``std()``.

    Returns
    -------
    None
        The summary is written to standard output.
    """
    # Read the parameter itself; the body previously referenced an undefined
    # name `scores`, which raised NameError on every call.
    print("Scores:", score)
    print("Mean:", score.mean())
    print("Standar Deviation:", score.std())

In [None]:
display_scores(cv_rf_rmse)

In [None]:
# Same 10-fold cross-validation for the linear model, scored with negated MSE.
cv_lr_scores = cross_val_score(lr, X_train, y_train,
                               scoring="neg_mean_squared_error",cv=10)

# Flip the sign back and take the square root to get one RMSE per fold.
cv_lr_rmse = np.sqrt(-cv_lr_scores)

display_scores(cv_lr_rmse)

* Both Linear Regression and Random Forest show similar cross validation scores

# Predicting on the test set

In [None]:
# Apply the scaler that was fitted on the training split. Re-fitting it on X_test
# would standardise the test features with different means/variances from the ones
# the models were trained on, and would leak test-set statistics into the evaluation.
X_test = pd.DataFrame(
    scaler.transform(X_test),
    columns=X_test.columns,
    index=X_test.index,  # keep the row labels so X_test stays aligned with y_test
)

In [None]:
# Held-out performance of the linear model.
lr_test_predictions = lr.predict(X_test)

# sklearn metrics take (y_true, y_pred); use that order for both calls.
print("RMSE:", np.sqrt(mean_squared_error(y_test, lr_test_predictions)))
print("R-Squared:", r2_score(y_test, lr_test_predictions))

In [None]:
# Held-out performance of the random forest.
rf_test_predictions = rf.predict(X_test)

# sklearn metrics take (y_true, y_pred); use that order for both calls.
print("RMSE:", np.sqrt(mean_squared_error(y_test, rf_test_predictions)))
print("R-Squared:", r2_score(y_test, rf_test_predictions))

* We see that the prediction accuracy decreases on the test set, however Random Forest still preserves a higher accuracy

# Gradient Boosting, XGBoost, LGBM

In [None]:
from sklearn.ensemble import GradientBoostingRegressor

# Seed the booster so repeated runs of the notebook report the same scores
# (42 matches the seed used for the train/test split).
gbr = GradientBoostingRegressor(random_state=42)
gbr.fit(X_train, y_train)

In [None]:
# In-sample predictions of gradient boosting; metrics are called as (y_true, y_pred).
gbr_predictions = gbr.predict(X_train)
print("RMSE:", np.sqrt(mean_squared_error(y_train, gbr_predictions)))
print("R-Squared:", r2_score(y_train, gbr_predictions))

In [None]:
# Held-out performance of gradient boosting; metrics are called as (y_true, y_pred).
gbr_test_predictions = gbr.predict(X_test)
print("RMSE:", np.sqrt(mean_squared_error(y_test, gbr_test_predictions)))
print("R-Squared:", r2_score(y_test, gbr_test_predictions))

In [None]:
from xgboost import XGBRegressor

xgbr = XGBRegressor()
xgbr.fit(X_train, y_train)

In [None]:
# In-sample predictions of XGBoost; metrics are called as (y_true, y_pred).
xgbr_predictions = xgbr.predict(X_train)
print("RMSE:", np.sqrt(mean_squared_error(y_train, xgbr_predictions)))
print("R-Squared:", r2_score(y_train, xgbr_predictions))

In [None]:
# Held-out performance of XGBoost; metrics are called as (y_true, y_pred).
xgbr_test_predictions = xgbr.predict(X_test)
print("RMSE:", np.sqrt(mean_squared_error(y_test, xgbr_test_predictions)))
print("R-Squared:", r2_score(y_test, xgbr_test_predictions))

In [None]:
from lightgbm import LGBMRegressor

lgbmr = LGBMRegressor()
lgbmr.fit(X_train, y_train)

In [None]:
# In-sample predictions of LightGBM; metrics are called as (y_true, y_pred).
lgbmr_predictions = lgbmr.predict(X_train)
print("RMSE:", np.sqrt(mean_squared_error(y_train, lgbmr_predictions)))
print("R-Squared:", r2_score(y_train, lgbmr_predictions))

In [None]:
# Held-out performance of LightGBM. Predict with lgbmr itself: the cell previously
# called xgbr.predict, so it re-reported the XGBoost test scores under the LGBM name.
lgbmr_test_predictions = lgbmr.predict(X_test)
# sklearn metrics take (y_true, y_pred); use that order for both calls.
print("RMSE:", np.sqrt(mean_squared_error(y_test, lgbmr_test_predictions)))
print("R-Squared:", r2_score(y_test, lgbmr_test_predictions))