In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for root_dir, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(root_dir, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the training set and preview its first rows.
TRAIN_CSV_PATH = '/kaggle/input/house-price-prediction-challenge/train.csv'
data = pd.read_csv(TRAIN_CSV_PATH)
data.head()

|Column|	Description|
|---|---|
|POSTED_BY|	Category marking who has listed the property|
|UNDER_CONSTRUCTION|	Under Construction or Not|
|RERA|	Rera approved or Not|
|BHK_NO.|	Number of Rooms|
|BHK_OR_RK|	Type of property|
|SQUARE_FT|	Total area of the house in square feet|
|READY_TO_MOVE|	Category marking Ready to move or Not|
|RESALE|	Category marking Resale or not|
|ADDRESS|	Address of the property|
|LONGITUDE|	Longitude of the property|
|LATITUDE|	Latitude of the property|

In [None]:
# Print the frame summary (columns, dtypes, non-null counts) for the loaded data.
data.info()

In [None]:
# Extract the city: the last comma-separated token of each address.
# The vectorised `.str` accessor propagates a missing address as NaN (the
# previous Python lambda raised on it), and `.str.strip()` drops whitespace
# around the token so the city can match the names used for tier mapping.
data['ADDRESS'] = data['ADDRESS'].str.split(',').str[-1].str.strip()

# Show the 25 most frequent cities.
data['ADDRESS'].value_counts(ascending=False).head(25)

Map every city to tier1, tier2 or tier3, based on https://en.wikipedia.org/wiki/Classification_of_Indian_cities. Any city that is not listed as tier1 or tier2 is treated as tier3.

In [None]:
# City names per tier. Built once as frozensets so each lookup is a hash probe
# instead of rebuilding and scanning a long list on every call.
# NOTE(review): 'Maharashtra' is a state, not a city; it is kept in tier1
# because it occurs as an ADDRESS value -- confirm this is the intended tier.
_TIER1_CITIES = frozenset([
    'Ahmedabad', 'Bangalore', 'Chennai', 'Delhi', 'Hyderabad', 'Kolkata', 'Mumbai', 'Pune', 'Maharashtra',
])
_TIER2_CITIES = frozenset([
    'Agra', 'Ajmer', 'Aligarh', 'Amravati', 'Amritsar', 'Asansol', 'Aurangabad', 'Bareilly',
    'Belgaum', 'Bhavnagar', 'Bhiwandi', 'Bhopal', 'Bhubaneswar', 'Bikaner', 'Bilaspur', 'Bokaro Steel City',
    'Chandigarh', 'Coimbatore', 'Cuttack', 'Dehradun', 'Dhanbad', 'Bhilai', 'Durgapur', 'Dindigul', 'Erode',
    'Faridabad', 'Firozabad', 'Ghaziabad', 'Gorakhpur', 'Gulbarga', 'Guntur', 'Gwalior', 'Gurgaon', 'Guwahati',
    'Hamirpur', 'Hubli–Dharwad', 'Indore', 'Jabalpur', 'Jaipur', 'Jalandhar', 'Jammu', 'Jamnagar', 'Jamshedpur',
    'Jhansi', 'Jodhpur', 'Kakinada', 'Kannur', 'Kanpur', 'Karnal', 'Kochi', 'Kolhapur', 'Kollam', 'Kozhikode',
    'Kurnool', 'Ludhiana', 'Lucknow', 'Madurai', 'Malappuram', 'Mathura', 'Mangalore', 'Meerut', 'Moradabad',
    'Mysore', 'Nagpur', 'Nanded', 'Nashik', 'Nellore', 'Noida', 'Patna', 'Pondicherry', 'Purulia', 'Prayagraj',
    'Raipur', 'Rajkot', 'Rajahmundry', 'Ranchi', 'Rourkela', 'Ratlam', 'Salem', 'Sangli', 'Shimla', 'Siliguri',
    'Solapur', 'Srinagar', 'Surat', 'Thanjavur', 'Thiruvananthapuram', 'Thrissur', 'Tiruchirappalli', 'Tirunelveli',
    'Tiruvannamalai', 'Ujjain', 'Bijapur', 'Vadodara', 'Varanasi', 'Vasai-Virar City', 'Vijayawada', 'Visakhapatnam',
    'Vellore', 'Warangal',
])


def map_city(city):
    """Return the tier label ('tier1', 'tier2' or 'tier3') for a city name.

    Parameters
    ----------
    city : str
        City name. Leading and trailing whitespace is ignored.

    Returns
    -------
    str
        'tier1' or 'tier2' when the name is in the corresponding set,
        otherwise 'tier3'. Non-string input (e.g. NaN) also yields 'tier3'.
    """
    if not isinstance(city, str):
        # Missing or malformed values cannot match any known city.
        return 'tier3'
    city = city.strip()
    if city in _TIER1_CITIES:
        return 'tier1'
    if city in _TIER2_CITIES:
        return 'tier2'
    return 'tier3'
    
# Derive the tier of every listing from its city name.
data['city_tier'] = data['ADDRESS'].map(map_city)

# EDA

In [None]:
# Store the natural log of the target as PRICE_IN_LACS and plot its histogram.
data['PRICE_IN_LACS'] = data['TARGET(PRICE_IN_LACS)'].pipe(np.log)
sns.histplot(data=data, x='PRICE_IN_LACS', bins=20)

In [None]:
# Store the natural log of SQUARE_FT as `area` and plot its histogram.
data['area'] = data['SQUARE_FT'].pipe(np.log)
sns.histplot(data=data, x='area', bins=20)

In [None]:
def plot_numerical(feature):
    """Show a seaborn `lmplot` of `feature` (x) against TARGET(PRICE_IN_LACS) (y).

    Reads the notebook-level `data` frame; `feature` must be one of its columns.
    """
    sns.lmplot(data=data, x=feature, y='TARGET(PRICE_IN_LACS)')
    plt.show()
    
def plot_categorical(feature, figsize=None):
    """Plot median/mean/min price per category of `feature`, with row counts.

    Reads the notebook-level `data` frame, groups it by `feature`, and draws
    one group of three bars (median, mean, min of TARGET(PRICE_IN_LACS)) per
    category. The number of rows per category is drawn as a dashed red line
    on a secondary y-axis and written above each mean bar.

    Parameters
    ----------
    feature : str
        Name of the column in `data` to group by.
    figsize : tuple of (float, float), optional
        Figure size passed to `plt.subplots`. Defaults to (8, 5).
    """
    stats = data.groupby([feature])['TARGET(PRICE_IN_LACS)'].describe()[['mean', '50%', 'min', 'count']]

    labels = stats.index.values
    x = np.arange(len(labels))
    width = 0.9            # total width of one group of three bars
    bar_width = width / 3
    fig, ax1 = plt.subplots(figsize=figsize if figsize is not None else (8, 5))

    # Plot bars for median, mean and min house price.
    # The offsets are the LEFT edges of the bars, so align='edge' is required
    # for the three-bar group to be centred on its tick at `x`.
    ax1.bar(x - width/2, stats['50%'], bar_width, align='edge', label='median')
    mean_bars = ax1.bar(x - width/6, stats['mean'], bar_width, align='edge', label='mean')
    ax1.bar(x + width/6, stats['min'], bar_width, align='edge', label='min')

    ax1.set_ylabel('PRICE_IN_LACS', fontsize=12)
    ax1.set_title(feature, fontsize=15)
    ax1.set_xticks(x)
    ax1.set_xticklabels(labels, rotation=0)
    ax1.legend()

    # Plot counts of data points on a secondary axis, centred on each tick.
    ax2 = ax1.twinx()
    ax2.set_ylabel('Counts', fontsize=12)
    ax2.plot(x, stats['count'], color='red', linestyle='dashed')

    # Annotate each mean bar with the number of data points in its category.
    for rect, count in zip(mean_bars, stats['count']):
        ax1.annotate(str(int(count)),
                     xy=(rect.get_x() + rect.get_width()/2, rect.get_height()),
                     xytext=(0, 3), textcoords="offset points",
                     ha='center', va='bottom', color='red')
    plt.show()

In [None]:
# Continuous features versus the raw price.
for numeric_column in ('area', 'LONGITUDE', 'LATITUDE'):
    plot_numerical(numeric_column)

In [None]:
# Text-valued categorical features versus the raw price.
for category_column in ('POSTED_BY', 'city_tier'):
    plot_categorical(category_column)

In [None]:
# Flag-like and low-cardinality features, plotted as categories versus the raw price.
for category_column in ('UNDER_CONSTRUCTION', 'RERA', 'BHK_OR_RK', 'READY_TO_MOVE', 'RESALE', 'BHK_NO.'):
    plot_categorical(category_column)

In [None]:
# Mean price and listing count per city, keeping only cities with more than 20 listings.
df = (
    data.groupby(['ADDRESS'])['TARGET(PRICE_IN_LACS)']
    .describe()[['mean', 'count']]
    .loc[lambda stats: stats['count'] > 20]
)

labels = df.index.values
x = np.arange(len(labels))
width = 0.9
fig, ax1 = plt.subplots(figsize=(20, 5))

# one bar per city: mean house price
rects = ax1.bar(x, df['mean'], width, label='mean')

ax1.set_title('ADDRESS', fontsize=15)
ax1.set_ylabel('PRICE_IN_LACS', fontsize=12)
ax1.set_xticks(x)
ax1.set_xticklabels(labels, rotation=90)
ax1.legend()

# listing counts as a dashed line on a secondary axis
ax2 = ax1.twinx()
ax2.plot(x, df['count'], color='red', linestyle='dashed')
ax2.set_ylabel('Counts', fontsize=12)

# write the listing count above each bar
for rect, count in zip(rects, df['count']):
    bar_top = int(round(rect.get_height()))
    ax1.annotate(str(int(count)),
                 xy=(rect.get_x() + rect.get_width()/2, bar_top),
                 xytext=(0, 3), textcoords="offset points",
                 ha='center', va='bottom', color='red')
plt.show()

**Columns that are contributing towards high house prices-**
* Area - +ve
* BHK - +ve with some exceptions
* UNDER_CONSTRUCTION - yes
* RERA - yes
* BHK_OR_RK - BHK
* READY_TO_MOVE - no
* RESALE - no
* ADDRESS - Bangalore, Gurgaon, Mumbai, Mohali, Chennai, Goa, Maharashtra, Ranchi, Pune, Noida, Meerut
* POSTED_BY - Builder has generally high while Owner has the least
* city_tier - cities in 'tier1' have highest house prices

In [None]:
# Columns that hold text labels and must be encoded before modelling.
categorical_features = [
    'POSTED_BY',
    'BHK_OR_RK',
    'city_tier',
    'ADDRESS',
]

# Columns that are already numeric (0/1 flags, counts, coordinates, areas).
numerical_features = [
    'UNDER_CONSTRUCTION',
    'RERA',
    'BHK_NO.',
    'SQUARE_FT',
    'READY_TO_MOVE',
    'RESALE',
    'LONGITUDE',
    'LATITUDE',
    'area',
]

# Label encoding categorical features for correlation

In [None]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
import joblib

In [None]:
# Label-encode every categorical feature on a copy of the data (used for the
# correlation plots) and persist each fitted encoder plus its class names.
df = data.copy()
path = '/kaggle/working'

# create the directory for the saved encoders once, before the loop
encoding_dir = os.path.join(path, "TextEncoding")
os.makedirs(encoding_dir, exist_ok=True)

for feature in categorical_features:
    le = LabelEncoder()

    # perform label encoding
    le.fit(df[feature])

    # save the encoder; passing a file name lets joblib open AND close the
    # file itself (the previous `open(...)` handle was never closed)
    joblib.dump(le, os.path.join(encoding_dir, "le_{}.sav".format(feature)))

    # transform training data
    df[feature] = le.transform(df[feature])

    # get classes & drop the first one to avoid the dummy variable trap
    columns = [feature + ' ' + str(cls) for cls in le.classes_][1:]

    # save classes
    joblib.dump(columns, os.path.join(encoding_dir, "le_{}_classes.sav".format(feature)))

# CORRELATION

In [None]:
# Spearman correlation between the numerical features, rounded to 2 decimals.
numeric_corr = data[numerical_features].corr(method='spearman').round(2)

plt.figure(figsize=(10, 5))
sns.heatmap(numeric_corr, annot=True, mask=None, cmap='GnBu')
plt.show()

In [None]:
# Spearman correlation across the label-encoded categorical features, the
# numerical features and the raw target, rounded to 2 decimals.
corr_columns = categorical_features + numerical_features + ['TARGET(PRICE_IN_LACS)']
full_corr = df[corr_columns].corr(method='spearman').round(2)

plt.figure(figsize=(10, 10))
sns.heatmap(full_corr, annot=True, mask=None, cmap='GnBu')
plt.show()

**Observations-**
* UNDER_CONSTRUCTION and READY_TO_MOVE carry exactly the same information, so one of them is redundant
* BHK_NO. and SQUARE_FT are correlated with each other

# Removing features using VIF

In [None]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [None]:
# Calculating VIF for every feature that is not explicitly excluded.
vif_excluded = ('READY_TO_MOVE', 'area', 'RESALE', 'LATITUDE', 'LONGITUDE')
vif_columns = [column for column in categorical_features + numerical_features
               if column not in vif_excluded]

# Matrix of the selected (label-encoded) columns; column order matches `vif_columns`.
vif_matrix = df[vif_columns].values

vif = pd.DataFrame({
    "variables": vif_columns,
    "VIF": [variance_inflation_factor(vif_matrix, position)
            for position in range(len(vif_columns))],
})
print(vif)

# Looking at Outlier

In [None]:
# Box plot with the individual points overlaid for the raw numerical columns.
NumericData = data.loc[:, ['BHK_NO.', 'TARGET(PRICE_IN_LACS)', 'SQUARE_FT']]
NumericMelt = pd.melt(NumericData)

plt.figure(figsize=(15,10))
plt.title("Boxplots for Numerical variables")

long_form = dict(x='variable', y='value', data=NumericMelt)
bp = sns.boxplot(**long_form)
bp = sns.stripplot(jitter=True, edgecolor='gray', **long_form)
bp.set_xticklabels(bp.get_xticklabels(), rotation=90)
plt.show()

In [None]:
# Percentage of outliers present in each variable.
# A value is an outlier when it lies strictly outside
# [Q1 - 1.5*IQR, Q3 + 1.5*IQR] (the usual box-plot whisker rule).
outlier_percentage = {}
for feature in ['area', 'SQUARE_FT', 'BHK_NO.', 'TARGET(PRICE_IN_LACS)', 'PRICE_IN_LACS']:
    # quantiles and counts do not depend on row order, so no sort is needed
    tempData = data[feature]
    Q1, Q3 = tempData.quantile([0.25, 0.75])
    IQR = Q3 - Q1
    Lower_range = Q1 - (1.5 * IQR)
    Upper_range = Q3 + (1.5 * IQR)
    # reuse the bounds computed above instead of repeating the formula inline
    is_outlier = (tempData < Lower_range) | (tempData > Upper_range)
    outlier_percentage[feature] = round((is_outlier.sum() / tempData.shape[0]) * 100, 2)
outlier_percentage

**Observations-**
* Taking the log of *SQUARE_FT* reduces the share of outliers from 6% to 4% in the *area* column. We will remove the remaining outliers by dropping those rows
* Taking log of *TARGET(PRICE_IN_LACS)* reduces the outlier from 10% to 3% in *PRICE_IN_LACS* column

In [None]:
# Box plot with the individual points overlaid for the log-transformed columns.
NumericData = data.loc[:, ['PRICE_IN_LACS', 'area']]
NumericMelt = pd.melt(NumericData)

plt.figure(figsize=(15,10))
plt.title("Boxplots for Numerical variables")

long_form = dict(x='variable', y='value', data=NumericMelt)
bp = sns.boxplot(**long_form)
bp = sns.stripplot(jitter=True, edgecolor='gray', **long_form)
bp.set_xticklabels(bp.get_xticklabels(), rotation=90)
plt.show()

In [None]:
# Outlier treatment by removal or replacement.
# Bounds are computed on the full `data`; rows are dropped cumulatively from
# `df_outlier`, so every listed feature's filter is kept (filtering `data`
# again on each pass would keep only the last feature's filter).
df_outlier = data.copy()
for feature in ['area']:
    Q1, Q3 = data[feature].quantile([0.25, 0.75])
    IQR = Q3 - Q1
    Lower_range = Q1 - (1.5 * IQR)
    Upper_range = Q3 + (1.5 * IQR)
    # by replacing values
    #df_outlier.loc[data[feature]>(Upper_range), feature] = Upper_range
    #df_outlier.loc[data[feature]<(Lower_range), feature] = Lower_range

    # by dropping rows: keep values inside the whisker range, bounds included,
    # which matches the "strictly outside" rule used to count outliers
    within_range = df_outlier[feature].between(Lower_range, Upper_range)
    df_outlier = df_outlier[within_range].reset_index(drop=True)

# Handling Categorical Features (Label Encoding & One Hot Encoding)

In [None]:
# Label-encode every categorical feature and one-hot encode those with more
# than two classes (except ADDRESS, which stays label-encoded). The fitted
# encoders and class-name lists are saved under <path>/TextEncoding.
# NOTE(review): encoding starts from `data`, not from the outlier-filtered
# `df_outlier` built earlier -- confirm that keeping the outlier rows is intended.
df = data.copy()
path = '/kaggle/working'

# create the directory for the saved encoders once, before the loop
encoding_dir = os.path.join(path, "TextEncoding")
os.makedirs(encoding_dir, exist_ok=True)

encoded_parts = []
for feature in categorical_features:
    le = LabelEncoder()

    # perform label encoding
    le.fit(df[feature])
    # save the encoder; passing a file name lets joblib open and close the
    # file itself (the previous `open(...)` handles were never closed)
    joblib.dump(le, os.path.join(encoding_dir, "le_{}.sav".format(feature)))

    # transform training data
    df[feature] = le.transform(df[feature])

    # get classes & drop the first one to avoid the dummy variable trap
    columns = [feature + ' ' + str(cls) for cls in le.classes_][1:]

    # save classes
    joblib.dump(columns, os.path.join(encoding_dir, "le_{}_classes.sav".format(feature)))

    if len(le.classes_) > 2 and feature != 'ADDRESS':
        # `sparse` was renamed to `sparse_output` in scikit-learn 1.2 and
        # later removed; fall back to the old name on older releases.
        try:
            ohe = OneHotEncoder(sparse_output=False)
        except TypeError:
            ohe = OneHotEncoder(sparse=False)

        # perform one hot encoding
        ohe.fit(df[[feature]])
        # save the encoder
        joblib.dump(ohe, os.path.join(encoding_dir, "ohe_{}.sav".format(feature)))

        # transform training data, dropping the first encoded column to
        # avoid the dummy variable trap; reuse df's index so the later
        # column-wise concat aligns row by row
        tempData = pd.DataFrame(ohe.transform(df[[feature]])[:, 1:],
                                columns=columns, index=df.index)
    else:
        tempData = df[[feature]]

    encoded_parts.append(tempData)

# one frame holding all encoded categorical features, built in a single concat
encodedData = pd.concat(encoded_parts, axis=1)

In [None]:
# Keep the numerical features and both target columns, then append the
# encoded categorical columns side by side.
kept_columns = numerical_features + ['TARGET(PRICE_IN_LACS)', 'PRICE_IN_LACS']
df = df.loc[:, kept_columns]
df = pd.concat([df, encodedData], axis=1)
df.info()

# Training Model

In [None]:
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn import metrics, preprocessing
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from xgboost import XGBRegressor
from sklearn.metrics import r2_score

In [None]:
train_data = df.copy()

# Columns that must not be fed to the model (both targets plus dropped features).
excluded_columns = ('READY_TO_MOVE', 'ADDRESS', 'TARGET(PRICE_IN_LACS)',
                    'SQUARE_FT', 'PRICE_IN_LACS')
feature_cols = [column for column in train_data.columns if column not in excluded_columns]
print('features used: ', feature_cols)

# RESCALING
#scaler = MinMaxScaler()
#scaler.fit(train_data[feature_cols])
#train_data[feature_cols] = scaler.transform(train_data[feature_cols])

In [None]:
X = train_data[feature_cols]
y = train_data['PRICE_IN_LACS']            # log-scale target used for training
y1 = train_data['TARGET(PRICE_IN_LACS)']   # raw target used for reporting

validation_size = 0.2

# Split the features and BOTH targets in one call so all of them are
# guaranteed to share the same rows (two separate calls stay aligned only
# while their arguments are kept identical).
X_train, X_test, y_train, y_test, y_train1, y_test1 = train_test_split(
    X, y, y1, test_size=validation_size, random_state=0,
    stratify=X[['RESALE', 'UNDER_CONSTRUCTION', 'RERA']])

# Same rows as X_train / X_test; names kept for backward compatibility.
X_train1, X_test1 = X_train, X_test

# Model: XGB

In [None]:
# Gradient-boosted regressor trained on the log-scale target.
# `eval_metric` is set on the estimator: passing it to `fit()` is deprecated
# since xgboost 1.6 and rejected by xgboost >= 2.0.
model = XGBRegressor(
    n_estimators=500,
    learning_rate=0.02,
    #min_child_weight=3,
    #max_depth = 3,
    #subsample = 0.8,
    eval_metric="rmse",
    seed=7)


model = model.fit(
    X_train,
    y_train,
    # NOTE(review): to enable early stopping, supply an evaluation set here
    # (eval_set=[(X_test, y_test)]) and configure early_stopping_rounds on the
    # estimator -- check the installed xgboost version for where it is accepted.
    verbose=False)

In [None]:
# Report metrics for the training split first, then the validation split.
# "Actual" compares exp(prediction) with the raw price; "Scaled Data"
# compares the prediction with the log-scale price.
# `y_pred` is left holding the validation predictions after the loop.
for split_title, X_split, y_log_true, y_raw_true in (
    ('Train metrics...', X_train, y_train, y_train1),
    ('Validation metrics...', X_test, y_test, y_test1),
):
    y_pred = model.predict(X_split)
    y_pred_raw = np.exp(y_pred)  # back-transform from log scale

    print(split_title)
    print('RMSE Actual: ', np.sqrt(mean_squared_error(y_raw_true, y_pred_raw)))
    print('RMSE Scaled Data: ', np.sqrt(mean_squared_error(y_log_true, y_pred)))
    print('r2_score: ', round(r2_score(y_raw_true, y_pred_raw)*100, 2))

In [None]:
import plotly.graph_objects as go

# Compare the last 150 validation predictions (back-transformed from log
# scale) with the corresponding raw prices.
last_rows = slice(-150, None)
predicted_tail = np.exp(y_pred[last_rows])
actual_tail = y_test1[last_rows]

fig = go.Figure()
fig.add_trace(go.Scatter(x=list(range(len(y_pred[last_rows]))),
                         y=predicted_tail,
                         mode='lines',
                         name='Prediction'))
fig.add_trace(go.Scatter(x=list(range(len(y_test[last_rows]))),
                         y=actual_tail,
                         mode='lines',
                         name='True value'))

fig.show()