In [1]:
import pandas as pd

import matplotlib.pyplot as plt

In [2]:
# Read the housing CSV into a DataFrame.
housing_data = pd.read_csv(filepath_or_buffer='datasets/housing.csv')

# Peek at five randomly chosen rows.
housing_data.sample(n=5)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
12285,-116.89,33.86,2.0,6900.0,1238.0,1950.0,980.0,3.0417,146300.0,INLAND
2007,-119.79,36.75,33.0,3161.0,934.0,3530.0,846.0,1.123,46700.0,INLAND
19352,-122.95,38.73,37.0,1548.0,328.0,863.0,287.0,2.9792,151300.0,<1H OCEAN
11363,-117.94,33.74,24.0,4248.0,840.0,3118.0,798.0,4.2222,207200.0,<1H OCEAN
1492,-122.02,37.94,23.0,3516.0,661.0,1465.0,623.0,4.2569,213100.0,NEAR BAY


In [3]:
# Discard every row that contains at least one missing value.
housing_data = housing_data.dropna(axis=0, how='any')

In [4]:
# Row and column counts of the frame after rows with missing values were dropped.
housing_data.shape

(20433, 10)

In [5]:
# Find the rows whose median_house_value is exactly 500001
# (presumably a capped/top-coded value -- TODO confirm against the data source),
# then drop them by index label.
is_capped = housing_data['median_house_value'] == 500001
capped_labels = housing_data[is_capped].index
housing_data = housing_data.drop(capped_labels)

In [6]:
# Preview the first rows after removing the rows valued at 500001.
housing_data.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [7]:
# List the distinct category values present in the ocean_proximity column.
housing_data['ocean_proximity'].unique()

array(['NEAR BAY', '<1H OCEAN', 'INLAND', 'NEAR OCEAN', 'ISLAND'],
      dtype=object)

In [8]:
# One-hot encoding: swap the categorical ocean_proximity column for one
# indicator column per category.
housing_data = pd.get_dummies(data=housing_data, columns=['ocean_proximity'])

In [9]:
# Row and column counts of the frame after one-hot encoding.
housing_data.shape

(19475, 14)

In [10]:
 # Show five random rows of the encoded frame.
 # NOTE(review): this cell is indented by one stray space; IPython appears to
 # tolerate it, but it may break if the cell is exported to a plain script.
 housing_data.sample(5)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity_<1H OCEAN,ocean_proximity_INLAND,ocean_proximity_ISLAND,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN
16578,-121.43,37.74,52.0,876.0,170.0,426.0,179.0,3.0865,119800.0,0,1,0,0,0
13418,-117.49,34.05,20.0,1483.0,249.0,660.0,194.0,3.9464,207700.0,0,1,0,0,0
9020,-118.8,34.15,9.0,1143.0,179.0,647.0,180.0,6.8474,356700.0,0,0,0,0,1
3994,-118.57,34.17,35.0,2072.0,318.0,908.0,342.0,6.0928,327300.0,1,0,0,0,0
12025,-117.47,33.94,34.0,559.0,139.0,532.0,137.0,3.0687,88500.0,0,1,0,0,0


In [11]:
# Median of the house-value column.
house_values = housing_data['median_house_value']
median = house_values.median()

In [12]:
# Display the computed median house value.
median

173800.0

In [13]:
# Label a row True only when its house value is strictly greater than the median.
housing_data['above_median'] = housing_data['median_house_value'] > median

In [14]:
# Show five random rows, now including the above_median column.
housing_data.sample(5)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity_<1H OCEAN,ocean_proximity_INLAND,ocean_proximity_ISLAND,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN,above_median
3124,-117.82,35.03,30.0,2555.0,510.0,1347.0,467.0,3.3693,71800.0,0,1,0,0,0,False
14965,-116.99,32.74,18.0,3341.0,611.0,1952.0,602.0,3.9844,215300.0,1,0,0,0,0,True
5098,-118.31,33.97,52.0,1595.0,325.0,823.0,302.0,3.2188,124200.0,1,0,0,0,0,False
19628,-121.04,37.3,6.0,2657.0,486.0,1409.0,392.0,3.3824,115500.0,0,1,0,0,0,False
15555,-117.08,33.12,33.0,674.0,208.0,565.0,188.0,1.875,114300.0,1,0,0,0,0,False


In [15]:
# Features: every column except the raw house value and the label derived from it.
non_feature_columns = ['median_house_value', 'above_median']
X = housing_data.drop(non_feature_columns, axis=1)

# Target: the boolean above_median label.
y = housing_data.loc[:, 'above_median']

In [16]:
#Binary classification problem - the label is True if the house value is above the median house value, and False if it is at or below it

In [17]:
# List the feature columns kept in X.
X.columns

Index([u'longitude', u'latitude', u'housing_median_age', u'total_rooms',
       u'total_bedrooms', u'population', u'households', u'median_income',
       u'ocean_proximity_<1H OCEAN', u'ocean_proximity_INLAND',
       u'ocean_proximity_ISLAND', u'ocean_proximity_NEAR BAY',
       u'ocean_proximity_NEAR OCEAN'],
      dtype='object')

In [18]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing. The split is random, so random_state
# is pinned: without it every run produces a different partition and the
# scores reported further down cannot be reproduced.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [19]:
# Shapes of the training and test feature sets.
X_train.shape, X_test.shape

((15580, 13), (3895, 13))

In [20]:
# Shapes of the training and test label vectors.
y_train.shape, y_test.shape

((15580,), (3895,))

In [21]:
from sklearn.linear_model import LogisticRegression

# Build the classifier with the liblinear solver, then fit it on the training
# split. The fitted estimator is what ends up bound to logistic_model.
classifier = LogisticRegression(solver='liblinear')
logistic_model = classifier.fit(X_train, y_train)

In [22]:
# Score of the fitted model on the training split.
# The message is formatted into ONE string: with two comma-separated values
# inside the parentheses, Python 2 treats them as a tuple and prints
# "('Training score:', 0.81...)" instead of the intended text.
training_score = logistic_model.score(X_train, y_train)
print('Training score: {0}'.format(training_score))

('Training score:', 0.8195121951219512)


In [23]:
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score

In [24]:
# Accuracy of the model's predictions on the held-out test set.
# print is called with a single parenthesised argument so the cell produces the
# same output under Python 2 and also runs under Python 3, where the bare
# print statement is a SyntaxError.
test_accuracy = accuracy_score(y_test, logistic_model.predict(X_test))
print('accuracy score for the classification model: {0:.2f}'.format(test_accuracy))

accuracy score for the classification model: 0.83


In [25]:
# Confusion matrix of true test labels versus the model's predictions.
# print is called with a single parenthesised argument so the cell produces the
# same output under Python 2 and also runs under Python 3, where the bare
# print statement is a SyntaxError.
test_confusion = confusion_matrix(y_test, logistic_model.predict(X_test))
print('Confusion matrix for the classification model: \n{0}'.format(test_confusion))

Confusion matrix for the classification model: 
[[1621  351]
 [ 330 1593]]


In [26]:
# Precision and recall on the held-out test set.
# Predict once and reuse the result for both metrics instead of calling
# predict separately for each line.
test_predictions = logistic_model.predict(X_test)

# print is called with a single parenthesised argument so the cell produces the
# same output under Python 2 and also runs under Python 3, where the bare
# print statement is a SyntaxError.
print('precision score for the classification model: {0:.2f}'.format(precision_score(y_test, test_predictions)))
print('recall score for the classification model: {0:.2f}'.format(recall_score(y_test, test_predictions)))

precision score for the classification model: 0.82
recall score for the classification model: 0.83


In [28]:
# Predicted labels for every row of the test feature set.
y_pred = logistic_model.predict(X=X_test)

In [29]:
# Put the predictions next to the true labels in one frame for side-by-side
# inspection.
comparison_columns = {'predicted': y_pred, 'actual': y_test}
df_pred_actual = pd.DataFrame(data=comparison_columns)

# Show the first ten rows of the comparison.
df_pred_actual.head(n=10)

Unnamed: 0,actual,predicted
17473,True,True
16576,False,False
17356,True,True
118,True,True
15044,True,True
8014,True,True
6533,False,False
13100,True,False
20375,True,True
10219,False,True
