In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found beneath the Kaggle input directory.
input_files = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_files:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import numpy as np
import pandas as pd

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

Reading data

In [None]:
# Load the training CSV and preview its first rows.
TRAIN_CSV = '/kaggle/input/house-price-prediction-challenge/train.csv'
train = pd.read_csv(TRAIN_CSV)
train.head()

Understanding data

In [None]:
# (rows, columns) of the training frame as loaded.
train.shape

In [None]:
# Column dtypes and non-null counts for the training frame.
train.info()

In [None]:
# Number of missing values in each column.
train.isna().sum()

In [None]:
# Give the target column a short name.
# rename() is a no-op when the cell is executed a second time, whereas the
# previous copy-then-drop sequence raised KeyError on re-run because the
# original column had already been removed.
train = train.rename(columns={'TARGET(PRICE_IN_LACS)': 'price'})

In [None]:
# Summary statistics of the numeric columns.
train.describe()

**Exploratory Data Analysis (EDA)**

In [None]:
# Number of listings per POSTED_BY value.
# The column is passed by keyword: a positional data vector is deprecated in
# seaborn 0.11 and is interpreted as `data` (not `x`) from seaborn 0.12 on.
sns.countplot(x='POSTED_BY', data=train)

In [None]:
# Count plots for four columns laid out on a 2x2 grid.
# Columns are passed via x=/data= because seaborn 0.12+ no longer accepts the
# data vector as the first positional argument; explicit axes replace the
# implicit plt.subplot state machine.
grid_columns = ['UNDER_CONSTRUCTION', 'RERA', 'READY_TO_MOVE', 'RESALE']
fig, axes = plt.subplots(2, 2, figsize=(12, 10))
for ax, column in zip(axes.ravel(), grid_columns):
    sns.countplot(x=column, data=train, ax=ax)
plt.show()

In [None]:
# READY_TO_MOVE and RESALE counts restricted to rows where RERA == 0.
# `countplot('COL', data=...)` raises TypeError on seaborn >= 0.12 because the
# first positional parameter is now `data`; the column is passed as x= instead.
rera_unapproved = train[train['RERA'] == 0]
fig, (ax_ready, ax_resale) = plt.subplots(1, 2, figsize=(15, 6))
sns.countplot(x='READY_TO_MOVE', data=rera_unapproved, ax=ax_ready)
ax_ready.set_title('Ready to move against unapproved RERA')
sns.countplot(x='RESALE', data=rera_unapproved, ax=ax_resale)
ax_resale.set_title('Resale against unapproved RERA')
plt.show()

In [None]:
# Number of listings per BHK_NO. value.
# Keyword form is required on seaborn >= 0.12 (first positional argument is `data`).
sns.countplot(x='BHK_NO.', data=train)

In [None]:
# Keep only rows below three hand-picked cut-offs on BHK_NO., price and SQUARE_FT.
# .copy() makes the result an independent frame so that adding columns to it
# later does not trigger pandas' SettingWithCopyWarning.
keep_rows = (
    (train['BHK_NO.'] < 7)
    & (train['price'] < 15000)
    & (train['SQUARE_FT'] < 300000)
)
train = train[keep_rows].copy()

In [None]:
# (rows, columns) after the row filter applied in the previous cell.
train.shape

In [None]:
# Derive `city` from the second comma-separated component of ADDRESS.
# - An address without a comma used to raise IndexError; it now falls back to
#   the whole address text.
# - Surrounding whitespace is stripped so " X" and "X" count as the same city.
# - assign()/drop() build a new frame instead of writing into a filtered slice.
address_parts = train['ADDRESS'].str.split(',')
city = address_parts.str[1].fillna(address_parts.str[0]).str.strip()
train = train.assign(city=city).drop(columns='ADDRESS')
train.head()

In [None]:
# Ten most frequent cities with their listing counts.
train['city'].value_counts().head(10)

In [None]:
# Names of the ten most frequent cities, most frequent first.
city_counts = train['city'].value_counts()
top_cities = city_counts.head(10).index.tolist()

In [None]:
# For each top city: (mean price / mean SQUARE_FT) * 100000, truncated to int.
# The price column was renamed from TARGET(PRICE_IN_LACS) earlier in the notebook.
city_masks = {name: train['city'] == name for name in top_cities}
cost_per_sqft = [
    int((train.loc[mask, 'price'].mean() / train.loc[mask, 'SQUARE_FT'].mean()) * 100000)
    for mask in city_masks.values()
]
cost_per_sqft

In [None]:
# Bar chart of the per-city cost figures computed above.
fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(x=top_cities, y=cost_per_sqft, ax=ax)
ax.set_title('Cost per sqft in top cities in India')
plt.show()

In [None]:
# Rows whose city is one of the top cities.
in_top_city = train['city'].isin(top_cities)
city_df = train.loc[in_top_city]

In [None]:
# Horizontal bars of price per city, split by BHK_NO. (seaborn's default bar estimate).
fig, ax = plt.subplots(figsize=(12, 14))
ax.set_title('Average price of BHK based on city')
sns.barplot(data=city_df, x='price', y='city', hue='BHK_NO.', ax=ax)

Data Pre-processing

In [None]:
# One-hot encode the two categorical columns (first level dropped), append the
# dummy columns, then remove the columns that are not used as model features.
posted_by = pd.get_dummies(train['POSTED_BY'], drop_first=True)
types = pd.get_dummies(train['BHK_OR_RK'], prefix='type', drop_first=True)

unused_columns = ['POSTED_BY', 'BHK_OR_RK', 'city', 'LONGITUDE', 'LATITUDE']
train = pd.concat([train, posted_by, types], axis=1).drop(columns=unused_columns)
train.head()

In [None]:
# (rows, columns) after encoding and dropping the unused columns.
train.shape

Separating data into X and y

In [None]:
# Target vector first, then the feature matrix (everything except the target).
y = train['price']
X = train.drop(columns='price')

Splitting data into train and test

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for evaluation; the seed fixes the split.
split_parts = train_test_split(X, y, test_size=0.3, random_state=42)
X_train, X_test, y_train, y_test = split_parts

Scaling data using MinMaxScaler

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Fit the scaler on the training split only, then apply it to both splits.
scale = MinMaxScaler().fit(X_train)
X_train, X_test = scale.transform(X_train), scale.transform(X_test)

### Model Development

Linear Regression model

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

# Fit ordinary least squares and predict on both splits.
lr = LinearRegression().fit(X_train, y_train)
pred_train_lr, pred_test_lr = lr.predict(X_train), lr.predict(X_test)

# Report R^2 on the training and held-out splits.
print('Linear regression prediction score')
train_r2_lr = r2_score(y_train, pred_train_lr)
test_r2_lr = r2_score(y_test, pred_test_lr)
print('Train prediction score:', train_r2_lr)
print('Test prediction score:', test_r2_lr)

In [None]:
# Density of actual vs. linear-regression-predicted prices on the training split.
# sns.distplot is deprecated; kdeplot draws the same KDE-only curves. The
# labels were never shown before because no legend was drawn.
fig, ax = plt.subplots(figsize=(12, 6))
sns.kdeplot(x=y_train, color='b', label='actual price', ax=ax)
sns.kdeplot(x=pred_train_lr, color='r', label='predicted price', ax=ax)
ax.legend()
plt.show()

In [None]:
# Density of actual vs. linear-regression-predicted prices on the held-out split.
# sns.distplot is deprecated; kdeplot draws the same KDE-only curves. The
# labels were never shown before because no legend was drawn.
fig, ax = plt.subplots(figsize=(12, 6))
sns.kdeplot(x=y_test, color='b', label='actual price', ax=ax)
sns.kdeplot(x=pred_test_lr, color='r', label='predicted price', ax=ax)
ax.legend()
plt.show()

Random Forest Regressor Model

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Depth-limited random forest with a fixed seed; predict on both splits.
rf = RandomForestRegressor(max_depth=10, random_state=42).fit(X_train, y_train)
pred_train_rf, pred_test_rf = rf.predict(X_train), rf.predict(X_test)

# Report R^2 on the training and held-out splits.
print('Random Forest regressor prediction score')
train_r2_rf = r2_score(y_train, pred_train_rf)
test_r2_rf = r2_score(y_test, pred_test_rf)
print('Train prediction score:', train_r2_rf)
print('Test prediction score:', test_r2_rf)

In [None]:
# Density of actual vs. random-forest-predicted prices on the training split.
# sns.distplot is deprecated; kdeplot draws the same KDE-only curves. The
# labels were never shown before because no legend was drawn.
fig, ax = plt.subplots(figsize=(12, 6))
sns.kdeplot(x=y_train, color='b', label='actual price', ax=ax)
sns.kdeplot(x=pred_train_rf, color='r', label='predicted price', ax=ax)
ax.legend()
plt.show()

In [None]:
# Density of actual vs. random-forest-predicted prices on the held-out split.
# sns.distplot is deprecated; kdeplot draws the same KDE-only curves. The
# labels were never shown before because no legend was drawn.
fig, ax = plt.subplots(figsize=(12, 6))
sns.kdeplot(x=y_test, color='b', label='actual price', ax=ax)
sns.kdeplot(x=pred_test_rf, color='r', label='predicted price', ax=ax)
ax.legend()
plt.show()

Gradient Boosting Regressor Model

In [None]:
from sklearn.ensemble import GradientBoostingRegressor

# Gradient boosting with early stopping (n_iter_no_change=5).
# Early stopping holds out a random validation subset, so without a seed the
# fitted model, the scores below and the final submission change on every run.
# random_state=42 matches the seed used for the split and the random forest.
gbr = GradientBoostingRegressor(
    n_estimators=300,
    learning_rate=0.02,
    n_iter_no_change=5,
    random_state=42,
)
gbr.fit(X_train, y_train)
pred_train_gbr = gbr.predict(X_train)
pred_test_gbr = gbr.predict(X_test)

# Report R^2 on the training and held-out splits.
print('Gradient Boosting regressor prediction score')
print('Train prediction score:', r2_score(y_train, pred_train_gbr))
print('Test prediction score:', r2_score(y_test, pred_test_gbr))

In [None]:
# Density of actual vs. gradient-boosting-predicted prices on the training split.
# sns.distplot is deprecated; kdeplot draws the same KDE-only curves. The
# labels were never shown before because no legend was drawn.
fig, ax = plt.subplots(figsize=(12, 6))
sns.kdeplot(x=y_train, color='b', label='actual price', ax=ax)
sns.kdeplot(x=pred_train_gbr, color='r', label='predicted price', ax=ax)
ax.legend()
plt.show()

In [None]:
# Density of actual vs. gradient-boosting-predicted prices on the held-out split.
# sns.distplot is deprecated; kdeplot draws the same KDE-only curves. The
# labels were never shown before because no legend was drawn.
fig, ax = plt.subplots(figsize=(12, 6))
sns.kdeplot(x=y_test, color='b', label='actual price', ax=ax)
sns.kdeplot(x=pred_test_gbr, color='r', label='predicted price', ax=ax)
ax.legend()
plt.show()

Prediction on the given test data

In [None]:
# Load the competition test CSV and preview its first rows.
TEST_CSV = '/kaggle/input/house-price-prediction-challenge/test.csv'
test = pd.read_csv(TEST_CSV)
test.head()

In [None]:
# (rows, columns) of the test frame, followed by its column dtypes and non-null counts.
print(test.shape)
test.info()

In [None]:
# Mirror the training preprocessing: derive `city` from the second
# comma-separated component of ADDRESS, then drop ADDRESS.
# An address without a comma used to raise IndexError and abort the whole
# prediction run; it now falls back to the whole address text.
address_parts = test['ADDRESS'].str.split(',')
city = address_parts.str[1].fillna(address_parts.str[0]).str.strip()
test = test.assign(city=city).drop(columns='ADDRESS')

In [None]:
# One-hot encode the test frame with exactly the dummy columns created for the
# training frame (`posted_by` / `types` from the training pre-processing cell).
# Encoding the test file on its own with drop_first=True drops whichever level
# sorts first in the *test* data, so a category missing from the test file
# would silently shift or remove columns. Here every level is encoded first and
# the result is then reindexed to the training columns; a column with no
# matching rows is filled with 0.
posted_by_test = (
    pd.get_dummies(test['POSTED_BY'])
    .reindex(columns=posted_by.columns, fill_value=0)
)
types_test = (
    pd.get_dummies(test['BHK_OR_RK'], prefix='type')
    .reindex(columns=types.columns, fill_value=0)
)
test = pd.concat([test, posted_by_test, types_test], axis=1)
test = test.drop(['POSTED_BY', 'BHK_OR_RK', 'city', 'LONGITUDE', 'LATITUDE'], axis=1)
test.head()

In [None]:
# Scale the test features with the scaler fitted on the training split and
# predict with the gradient-boosting model.
# - The scaled array gets its own name, so `test` stays a DataFrame and this
#   cell can be re-run without scaling already-scaled data.
# - Selecting X.columns pins the feature order to the one used in training and
#   fails with a clear KeyError if a training feature is missing.
test_scaled = scale.transform(test[X.columns])
final_pred = gbr.predict(test_scaled)

In [None]:
# Build the single-column submission frame, write it to CSV and display it.
submission = pd.DataFrame({"TARGET(PRICE_IN_LACS)": final_pred})
submission.to_csv('target price of house.csv', index=False)
submission