# Predicting house price
All the features given are useful.
However, using longitude and latitude directly is a bit tricky. They would first have to be processed into something like the distance from the city center, or from the nearest major city, before they could serve as features. We leave them out for now.

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for full_path in (os.path.join(root, name) for name in names):
        print(full_path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Ensure repeatability

In [None]:
# Pin both Python's built-in pseudo-random generator and NumPy's global
# generator to the same fixed seed so that repeated runs are reproducible.
import random

seed_value = 0
random.seed(a=seed_value)
np.random.seed(seed=seed_value)

# Read data

In [None]:
# Load the competition's train and test files, then show their shapes.
train_csv = '../input/house-price-prediction-challenge/train.csv'
test_csv = '../input/house-price-prediction-challenge/test.csv'
house_data, test_data = (pd.read_csv(csv_path) for csv_path in (train_csv, test_csv))
(house_data.shape, test_data.shape)

In [None]:
# Preview the first five rows of the raw training data.
house_data.head(n=5)

# Find the features we are interested in

In [None]:
# Focus on the features of interest.
# The predictor list is written once and the target is appended to it, so the
# two lists cannot drift apart when a feature is added or removed.
target_col = "TARGET(PRICE_IN_LACS)"
feature_list_minus_y = ["UNDER_CONSTRUCTION","RERA","BHK_NO.","SQUARE_FT","READY_TO_MOVE","RESALE","ADDRESS","BHK_OR_RK"]
feature_list = feature_list_minus_y + [target_col]

# Explicit .copy() calls make every frame independent of the frame it was
# sliced from, so later column assignments do not raise chained-assignment
# warnings and cannot leak into another frame.
house_data = house_data[feature_list].copy()
house_test = test_data.copy()  # a real copy of the test data with all the fields
test_data = test_data[feature_list_minus_y].copy()

In [None]:
# Check for missing values with missingno's bar chart of the training frame.
import missingno as msno
msno.bar(df=house_data)

**Good. There are no null values.**

Process the data. We can extract the city from the address: it is the part after the last comma.

In [None]:
import re
def getCity(addr):
    """Return the city part of an address string.

    The city is taken to be whatever follows the last comma, e.g.
    ``"Ksfc Layout,Bangalore"`` -> ``"Bangalore"``. Surrounding whitespace is
    removed so that ``"X, Pune"`` and ``"Y,Pune"`` map to the same city
    instead of producing two distinct categories.

    Parameters
    ----------
    addr : str
        Comma-separated address. If it contains no comma, the whole
        (stripped) string is returned.

    Returns
    -------
    str
        The text after the last comma, stripped of leading/trailing whitespace.
    """
    # rsplit with maxsplit=1 only cuts at the last comma; [-1] is the tail.
    return addr.rsplit(',', 1)[-1].strip()

In [None]:
# Look at the first five rows of the selected columns before preprocessing.
house_data.head(n=5)

In [None]:
#Define a function for scaling the numeric columns and encoding the categorical ones

def standardizeData(X, scaler=None):
    """Turn a raw housing frame into a model-ready feature frame.

    Steps performed, in order:

    1. A ``City`` column is derived from ``ADDRESS`` with ``getCity`` and
       ``ADDRESS`` is dropped.
    2. The numeric columns are rescaled with a min-max scaler. Despite the
       function's name this is min-max scaling to [0, 1], not z-score
       standardization.
    3. ``City`` and ``BHK_OR_RK`` are one-hot encoded with ``pd.get_dummies``.

    Parameters
    ----------
    X : pandas.DataFrame
        Must contain ``ADDRESS``, ``BHK_OR_RK`` and the six numeric columns
        listed in ``numerical_features`` below. Any other column is passed
        through unchanged. The frame is not modified; a copy is processed.
    scaler : fitted scaler with a ``transform`` method, optional
        When omitted, a new ``MinMaxScaler`` is fitted on ``X`` itself. Note
        that calling the function this way on a training frame and again on a
        test frame scales each with its own minimum and maximum. To scale
        both consistently, fit one scaler on the training frame's numeric
        columns and pass it to every call.

    Returns
    -------
    pandas.DataFrame
        New frame with scaled numeric columns and dummy columns in place of
        ``City`` and ``BHK_OR_RK``. The set of dummy columns depends on the
        values present in ``X``.
    """
    from sklearn.preprocessing import MinMaxScaler

    numerical_features = ["UNDER_CONSTRUCTION","RERA","BHK_NO.","SQUARE_FT","READY_TO_MOVE","RESALE"]

    # Work on a copy so the caller's frame is left untouched and no
    # chained-assignment warning is triggered by the column writes below.
    X_ss = X.copy()

    # The last comma-separated token of the address is the city.
    X_ss['City'] = X_ss['ADDRESS'].apply(getCity)
    X_ss = X_ss.drop("ADDRESS", axis=1)  # the rest of the address is not used for now

    if scaler is None:
        # Original behaviour: fit the scaling on this frame.
        scaler = MinMaxScaler(feature_range=(0, 1))
        X_ss[numerical_features] = scaler.fit_transform(X_ss[numerical_features])
    else:
        # Reuse a scaling that was fitted elsewhere (e.g. on the training data).
        X_ss[numerical_features] = scaler.transform(X_ss[numerical_features])

    return pd.get_dummies(X_ss, columns=["City","BHK_OR_RK"])

In [None]:
# Scale the numeric columns and one-hot encode the categorical ones for both
# the training frame and the full test frame.
# NOTE(review): each call fits its own scaler and derives its own dummy
# columns, so the two frames are not guaranteed to share scaling or columns.
pd.set_option('mode.chained_assignment', None)  # default is 'warn'
house_data = standardizeData(X=house_data)
house_test = standardizeData(X=house_test)

In [None]:
# Inspect the first five rows after scaling and one-hot encoding.
house_data.head(n=5)

# First use only training data to see the accuracy

In [None]:
# Estimate model quality using only the labelled training file.
# Scoring a model on the very rows it was fitted on gives an over-optimistic
# error, so 20% of the rows are held out and used only for scoring.
from sklearn.model_selection import train_test_split

from xgboost import XGBRegressor
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_log_error

# Fixed random_state keeps the split identical between runs.
mytrain, mytest = train_test_split(house_data, test_size=0.2, random_state=0)

# Prepare X and y for the fitting part and the held-out part.
y_mytrain = mytrain["TARGET(PRICE_IN_LACS)"]
X_mytrain = mytrain.drop("TARGET(PRICE_IN_LACS)", axis=1)
y_mytest = mytest["TARGET(PRICE_IN_LACS)"]
X_mytest = mytest.drop("TARGET(PRICE_IN_LACS)", axis=1)

xgbr = XGBRegressor(n_estimators=100, learning_rate=0.1, random_state=0, n_jobs=3)
xgbr.fit(X_mytrain, y_mytrain)

myprediction = xgbr.predict(X_mytest)
# The log-based metric is undefined for negative values, so floor predictions at 0.
myprediction = np.clip(myprediction, 0, None)

# The reported number is the root of the mean squared *log* error (RMSLE).
print("RMSLE on hold-out set: %.2f" % np.sqrt(mean_squared_log_error(y_mytest, myprediction)))


# Train for entire data 

In [None]:
# Now prepare the entire training set for the final model.
# Separate the target column from the feature columns.
y_train = house_data["TARGET(PRICE_IN_LACS)"]
X_train = house_data.drop(columns="TARGET(PRICE_IN_LACS)")

# POSTED_BY is not one of the selected training features, so remove it from
# the test frame. errors="ignore" keeps this cell re-runnable after the
# column has already been removed.
house_test = house_test.drop(columns="POSTED_BY", errors="ignore")

# Process test data columns
Because the categorical columns are encoded with `get_dummies`, the resulting columns depend on the values present in each data set, so the train and test frames can end up with different columns.

In [None]:

# Training columns that the test frame lacks (e.g. dummy columns for cities
# that only occur in the training data). Kept for inspection.
missing_cols = set(X_train.columns) - set(house_test.columns)

# Give the test frame exactly the training columns, in the training order.
# Columns absent from the test frame are created and filled with 0, and
# columns unknown to the training frame are discarded. A single reindex does
# this in one step instead of inserting the missing columns one at a time.
house_test = house_test.reindex(columns=X_train.columns, fill_value=0)
house_test.head()

# Train the model and predict

In [None]:
# XGBRegressor is imported in the earlier evaluation cell.
xgb_params = dict(n_estimators=100, learning_rate=0.1, random_state=0, n_jobs=3)
model = XGBRegressor(**xgb_params)
model.fit(X_train, y_train)

# Predict on the aligned test frame and clip any negative values to zero.
testPrediction = np.clip(model.predict(house_test), a_min=0, a_max=None)




# Generate output file for submission

In [None]:
# Write one row per test record: the test frame's index and its predicted price.
# NOTE(review): the 'Id' / 'SalePrice' headers are presumably what the
# competition expects - confirm against its sample submission file.
submission_columns = {'Id': house_test.index, 'SalePrice': testPrediction}
output = pd.DataFrame(data=submission_columns)
output.to_csv(path_or_buf='submission.csv', index=False)
print('Done!')