In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('darkgrid')
import warnings
warnings.filterwarnings('ignore')

import re
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Load Data

In [None]:
# Read the training CSV into a DataFrame and preview its first rows.
train_csv_path = '/kaggle/input/house-price-prediction-challenge/train.csv'
train = pd.read_csv(train_csv_path)
train.head()

In [None]:
train.info()

In [None]:
# Read the test CSV into a DataFrame and preview its first rows.
test_csv_path = '/kaggle/input/house-price-prediction-challenge/test.csv'
test = pd.read_csv(test_csv_path)
test.head()

In [None]:
train['POSTED_BY'].value_counts()

In [None]:
train.isna().sum()

In [None]:
test.isna().sum()

# EDA

In [None]:
# Draw one count plot per low-cardinality column on a 4x2 grid.
count_cols = ['POSTED_BY', 'UNDER_CONSTRUCTION', 'RERA', 'BHK_NO.', 'BHK_OR_RK', 'READY_TO_MOVE', 'RESALE']
fig, axes = plt.subplots(4, 2, figsize=(20, 15))
panels = axes.ravel()

for ax, col in zip(panels, count_cols):
    # Pass the column through x=/data= explicitly: newer seaborn releases treat
    # the first positional argument as `data`, not as the variable to count.
    sns.countplot(x=col, data=train, ax=ax)

# Seven columns on an eight-panel grid: hide the panel that stays empty.
for ax in panels[len(count_cols):]:
    ax.set_visible(False)

In [None]:
train['BHK_OR_RK'].value_counts()

In [None]:
test['BHK_OR_RK'].value_counts()

In [None]:
# Remove the same two columns from both frames so they keep matching features.
unused_cols = ['ADDRESS', 'BHK_OR_RK']
train = train.drop(columns=unused_cols)
test = test.drop(columns=unused_cols)

In [None]:
train.info()

In [None]:
test.info()

In [None]:
# Scatter the two coordinate columns, coloured by the target price column.
sns.scatterplot(
    data=train,
    x='LATITUDE',
    y='LONGITUDE',
    hue='TARGET(PRICE_IN_LACS)',
)

In [None]:
# Annotated correlation heatmap of the training columns.
plt.figure(figsize=(12, 10))
# POSTED_BY is still a string column at this point, and DataFrame.corr() raises
# on non-numeric data in pandas >= 2.0, so restrict to numeric/bool columns first.
numeric_corr = train.select_dtypes(include=['number', 'bool']).corr()
sns.heatmap(numeric_corr, annot=True, cmap='coolwarm')

# Preprocess

**`READY_TO_MOVE` and `UNDER_CONSTRUCTION` are perfectly correlated, so one of them carries no extra information. We drop `READY_TO_MOVE` to remove the redundant feature and reduce the risk of overfitting.**

In [None]:
# Remove READY_TO_MOVE from both frames so their feature sets stay aligned.
redundant_col = 'READY_TO_MOVE'
train = train.drop(columns=redundant_col)
test = test.drop(columns=redundant_col)

In [None]:
# Encode the POSTED_BY labels as integer codes, identically for train and test.
posted_by_codes = {'Owner': 0, 'Dealer': 1, 'Builder': 2}

# The explicit astype(int) guarantees an integer column instead of relying on
# replace() to downcast the object column (deprecated since pandas 2.2), and it
# fails immediately if a label outside the mapping is present.
train['POSTED_BY'] = train['POSTED_BY'].replace(posted_by_codes).astype(int)
test['POSTED_BY'] = test['POSTED_BY'].replace(posted_by_codes).astype(int)

In [None]:
train.head()

In [None]:
# Separate the target column (y) from the remaining feature columns (X).
target_col = 'TARGET(PRICE_IN_LACS)'
y = train[target_col]
X = train.drop(columns=target_col)

In [None]:
X.head()

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the labelled rows for evaluation. The seed is fixed so the
# split, and every score computed from it, is identical on each run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
X_train.head()

In [None]:
X_test.head()

# Creating Models

**We train several models and then select the best one.**

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from sklearn.metrics import mean_squared_error

In [None]:
# Candidate regressors keyed by display name. All of them are seeded so that
# the comparison between models is reproducible from run to run.
SEED = 42

models = {
    'RandomForestRegressor': RandomForestRegressor(random_state=SEED),
    'DecisionTreeRegressor': DecisionTreeRegressor(random_state=SEED),
    'XGBRegressor': XGBRegressor(random_state=SEED),
    # verbose=0 silences CatBoost's per-iteration training log.
    'CatBoostRegressor': CatBoostRegressor(random_seed=SEED, verbose=0),
}


In [None]:
models

In [None]:

# Fit every candidate on the training split and print its RMSE on both the
# training split and the held-out split.
for name, model in models.items():
    model.fit(X_train, y_train)
    train_pred = model.predict(X_train)
    test_pred = model.predict(X_test)
    train_error, test_error = (
        np.sqrt(mean_squared_error(actual, predicted))
        for actual, predicted in ((y_train, train_pred), (y_test, test_pred))
    )
    print(f"{name}: \n ")
    print(f"Training error: {train_error} \n")
    print(f"Testing error: {test_error} \n")
    print('\n')
    

# **As we can see, RandomForestRegressor performs best. It is surprising that it performs better than CatBoost and XGBoost.**

In [None]:
# `test_pred` is left over from the comparison loop and holds the predictions
# of the LAST model fitted (CatBoostRegressor), not of the model judged best.
# Query the random forest explicitly so this preview matches the conclusion.
models['RandomForestRegressor'].predict(X_test)[:5]

In [None]:
y_test[:5]

# Hyperparameter Tuning

In [None]:
from sklearn.model_selection import RandomizedSearchCV

# The split criterion is left at its default (squared error). The explicit
# 'mse' spelling was deprecated in scikit-learn 1.0 and removed in 1.2.
rfr = RandomForestRegressor(random_state=42)

# Candidate values sampled by the randomized search.
params = dict(
    n_estimators=[10, 50, 100, 200],
    max_depth=[10, 50, 100, 200],
    # An integer min_samples_split must be at least 2; a value of 1 makes every
    # candidate that samples it fail to fit.
    min_samples_split=[2, 5, 10, 20],
    min_samples_leaf=[1, 5, 10, 20],
)

# 5-fold cross-validated random search; seeded so the same candidates are drawn
# on every run.
grid = RandomizedSearchCV(rfr, param_distributions=params, cv=5, random_state=42)


In [None]:
# Tune on the training split only. Fitting on the full X, y would let the
# refitted best estimator see the X_test rows, so an RMSE computed on X_test
# afterwards would be optimistically biased.
grid.fit(X_train, y_train)

In [None]:
grid.best_estimator_

In [None]:
# RMSE of the tuned model on the X_test rows.
# NOTE(review): this is an honest hold-out estimate only if the search was
# fitted on data that excludes X_test -- confirm against the fit cell.
pred = grid.best_estimator_.predict(X_test)
tuned_mse = mean_squared_error(y_test, pred)
np.sqrt(tuned_mse)

# The score has improved significantly after hyperparameter tuning

In [None]:
# Predict on the competition test set. Selecting the columns by the training
# feature names passes them in exactly the order the model was fitted on and
# raises a clear KeyError if an expected feature is missing from `test`.
final_predictions = grid.best_estimator_.predict(test[X.columns])

In [None]:
# Read the sample submission file and preview its first rows.
sample_csv_path = '/kaggle/input/house-price-prediction-challenge/sample_submission.csv'
sample = pd.read_csv(sample_csv_path)
sample.head()

In [None]:
sample.shape

In [None]:
final_predictions.shape

In [None]:
# Put the model's predictions into the submission's target column and preview.
sample = sample.assign(**{'TARGET(PRICE_IN_LACS)': final_predictions})
sample.head()

In [None]:
# Write the submission to the working directory without the index column.
submission_path = 'predictions.csv'
sample.to_csv(submission_path, index=False)

# Upvote and Comment if you liked my notebook :)