## 1. Import the NumPy and Pandas Libraries

In [15]:
import numpy as np
import pandas as pd

## 2. Exploratory Data Analysis

In [16]:
# Load the house-sales CSV (expected in the working directory) into a DataFrame
csv_path = 'kc_house_data.csv'
data = pd.read_csv(csv_path)

In [17]:
# Unpack the frame's dimensions: rows are data points, columns are features
row_count, column_count = data.shape
print(row_count)
print(column_count)
# Show the distinct column dtypes present in the data set
print(data.dtypes.unique())

21613
21
[dtype('int64') dtype('O') dtype('float64')]


We have both numerical (`int64`, `float64`) and categorical (`object`) columns in this dataset.

## 3. Check Categorical Features
We can get the list of categorical columns using the code below.

In [18]:
# Collect the object-dtype columns, then show their names as a plain list
categorical_columns = data.select_dtypes(include=['O']).columns
categorical_columns.tolist()

['date']


We have only one categorical column, `date`, which we will ignore. If you are interested in fine-tuning this model further, you can preprocess this column.


In [19]:
# View a small random sample of rows. random_state pins the draw so that
# re-running the notebook shows the same five rows every time.
data.sample(5, random_state=0)

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
16050,8068000440,20141226T000000,399000.0,3,1.75,1620,10000,1.5,0,0,...,6,1620,0,1918,0,98178,47.5091,-122.262,1880,10000
16078,626400020,20140918T000000,734000.0,4,2.5,3490,18521,2.0,0,0,...,9,3490,0,1990,0,98077,47.7406,-122.07,2850,18521
12131,1042500013,20140520T000000,219950.0,3,1.5,1650,9936,1.0,0,0,...,7,1090,560,1967,0,98003,47.3285,-122.328,1560,9890
10719,2475900565,20150309T000000,392500.0,3,1.0,1390,10500,1.5,0,0,...,6,1390,0,1940,0,98024,47.567,-121.893,1350,9800
18218,6798100661,20140616T000000,340000.0,3,2.5,1212,1174,3.0,0,0,...,7,1212,0,2004,0,98125,47.7145,-122.311,1256,1226


We do not need the `id` column either, so it is left out when fitting the data to the ML model.

## 4. Missing Value Columns List

In [20]:
# Get the dimensions of the data set as a (rows, columns) tuple
data.shape

(21613, 21)

In [21]:
# Count the non-missing entries in every column; a column with no missing
# values has a count equal to the number of rows
data.notna().sum()

id               21613
date             21613
price            21613
bedrooms         21613
bathrooms        21613
sqft_living      21613
sqft_lot         21613
floors           21613
waterfront       21613
view             21613
condition        21613
grade            21613
sqft_above       21613
sqft_basement    21613
yr_built         21613
yr_renovated     21613
zipcode          21613
lat              21613
long             21613
sqft_living15    21613
sqft_lot15       21613
dtype: int64

This dataset contains no missing values, so we can proceed to build the machine learning model using XGBoost.

## 5. Split the Dataset

In [22]:
# Separate the predictor columns (X) from the target column (y)
feature_columns = [
    'sqft_living', 'grade', 'sqft_above', 'sqft_living15', 'bathrooms', 'view',
    'sqft_basement', 'lat', 'waterfront', 'yr_built', 'bedrooms',
]
X_data = data[feature_columns]
# Work with plain NumPy arrays from here on
X = X_data.values
y = data['price'].values

In [23]:
# Split the data set into a training set and a test set.
# train_test_split lives in sklearn.model_selection; the old
# sklearn.cross_validation module was deprecated in scikit-learn 0.18 and
# removed in 0.20, so importing from it fails on current releases.
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; random_state fixes the shuffle so the
# split is reproducible across runs
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [24]:
# Get the training set size as (rows, features)
X_train.shape


(17290, 11)

In [25]:
# Get the test set size as (rows, features)
X_test.shape

(4323, 11)

About 17k records are used for training and about 4k records for evaluating model performance.

## 6. XGBoost Regression Model

In [26]:
# import xgboost library
import xgboost

In [27]:
# Configure a gradient-boosted tree regressor. Hyper-parameters are collected
# in one place so they are easy to read and tune.
xgb_params = dict(
    n_estimators=100,
    learning_rate=0.08,
    gamma=0,
    subsample=0.75,
    colsample_bytree=1,
    max_depth=7,
)
xgb = xgboost.XGBRegressor(**xgb_params)
# Fit on the training split; the fitted estimator is shown as the cell output
xgb.fit(X_train, y_train)

XGBRegressor(base_score=0.5, colsample_bylevel=1, colsample_bytree=1, gamma=0,
       learning_rate=0.08, max_delta_step=0, max_depth=7,
       min_child_weight=1, missing=None, n_estimators=100, nthread=-1,
       objective='reg:linear', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=0.75)

To learn more about XGBoost, see:
- https://xgboost.readthedocs.io/en/latest/python/python_intro.html
- https://xgboost.readthedocs.io/en/latest/python/python_api.html

In [28]:
# Predict target values for the held-out test set with the fitted model
y_pred = xgb.predict(X_test)

## 7. Compute Error Metrics

In [29]:
# Evaluate the predictions against the held-out test targets
from sklearn import metrics
from sklearn.metrics import explained_variance_score

# The mean squared error feeds both the MSE and the RMSE lines, so compute it once
mse = metrics.mean_squared_error(y_test, y_pred)
metric_report = [
    ('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred)),
    ('Mean Squared Error:', mse),
    ('Root Mean Squared Error:', np.sqrt(mse)),
    ('Explained Variance', explained_variance_score(y_test, y_pred)),
    ('R Square :', metrics.r2_score(y_test, y_pred)),
]
# Print each metric as "<label> <value>"
for label, value in metric_report:
    print(label, value)


Mean Absolute Error: 77681.5082372
Mean Squared Error: 17074811199.4
Root Mean Squared Error: 130670.621026
Explained Variance 0.856434286502
R Square : 0.856422820167
