In [2]:
import pandas as pd

import matplotlib.pyplot as plt

In [3]:
# Load the housing dataset from the CSV file (path is relative to the working directory).
housing_data = pd.read_csv(filepath_or_buffer='datasets/housing.csv')

# Preview five randomly chosen rows; the draw is unseeded, so the rows differ on each run.
housing_data.sample(n=5)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
58,-122.28,37.82,52.0,1387.0,341.0,1074.0,304.0,1.2171,80300.0,NEAR BAY
167,-122.25,37.8,29.0,2468.0,864.0,1335.0,773.0,1.3929,193800.0,NEAR BAY
1159,-121.53,39.52,30.0,1030.0,161.0,448.0,159.0,2.4821,73800.0,INLAND
19559,-120.98,37.6,36.0,1437.0,,1073.0,320.0,2.1779,58400.0,INLAND
15298,-117.35,33.16,10.0,1684.0,515.0,902.0,449.0,3.7891,206300.0,NEAR OCEAN


In [5]:
# Discard every row that contains at least one missing value
# (for example, a row whose total_bedrooms entry is empty).
housing_data = housing_data.dropna(axis=0, how='any')

In [6]:
# Flag the rows whose median_house_value equals 500001
# (presumably the dataset's top-coded price cap -- TODO confirm), then drop them by index label.
capped_rows = housing_data['median_house_value'] == 500001
housing_data = housing_data.drop(index=housing_data.index[capped_rows.values])

In [8]:
# Show the first five rows of the filtered frame.
housing_data.head(n=5)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [9]:
# Convert the categorical ocean_proximity column to numeric indicator columns (one-hot encoding).
# dtype is pinned to 'uint8' because pandas >= 2.0 emits bool indicators by default;
# pinning keeps the indicator columns as 0/1 integers on every pandas version.
housing_data = pd.get_dummies(housing_data, columns=['ocean_proximity'], dtype='uint8')

In [11]:
# Spot-check five random rows to see the new ocean_proximity_* indicator columns.
housing_data.sample(n=5)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity_<1H OCEAN,ocean_proximity_INLAND,ocean_proximity_ISLAND,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN
8560,-118.41,33.92,22.0,2340.0,584.0,1141.0,554.0,4.5729,337500.0,1,0,0,0,0
18020,-121.95,37.24,32.0,1382.0,239.0,705.0,251.0,6.0957,405400.0,1,0,0,0,0
19621,-121.11,37.43,42.0,412.0,75.0,227.0,75.0,2.5,74200.0,0,1,0,0,0
351,-122.16,37.76,45.0,2299.0,514.0,1437.0,484.0,2.5122,95500.0,0,0,0,1,0
11224,-117.94,33.81,25.0,1731.0,482.0,1127.0,455.0,3.256,214300.0,1,0,0,0,0


In [12]:
# The median house value is the threshold for the classification target:
# predict whether a given neighborhood's house price is above or below the median.
median = housing_data.loc[:, 'median_house_value'].median()
median

173800.0

In [13]:
# Boolean target column: True where median_house_value is strictly greater than the median.
housing_data['above_median'] = housing_data['median_house_value'].gt(median)

In [14]:
# Spot-check five random rows to see the new above_median column.
housing_data.sample(n=5)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity_<1H OCEAN,ocean_proximity_INLAND,ocean_proximity_ISLAND,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN,above_median
1038,-120.93,38.5,15.0,1248.0,234.0,529.0,216.0,3.3393,107200.0,0,1,0,0,0,False
1830,-122.31,37.92,43.0,2116.0,407.0,900.0,361.0,4.1587,212200.0,0,0,0,1,0,True
19379,-120.87,37.77,9.0,4838.0,920.0,2460.0,923.0,3.5959,142700.0,0,1,0,0,0,False
9808,-121.95,36.6,32.0,3152.0,504.0,793.0,426.0,7.1198,469900.0,0,0,0,0,1,True
16189,-121.29,37.97,52.0,1610.0,480.0,1025.0,440.0,1.2962,110200.0,0,1,0,0,0,False


In [15]:
# Target: the boolean above_median label.
Y = housing_data['above_median']
# Features: every column except the label and median_house_value, the price the label
# was computed from (keeping it would hand the answer to the model).
X = housing_data.drop(columns=['median_house_value', 'above_median'])

In [16]:
# List the column labels of the feature frame X.
X.columns

Index(['longitude', 'latitude', 'housing_median_age', 'total_rooms',
       'total_bedrooms', 'population', 'households', 'median_income',
       'ocean_proximity_<1H OCEAN', 'ocean_proximity_INLAND',
       'ocean_proximity_ISLAND', 'ocean_proximity_NEAR BAY',
       'ocean_proximity_NEAR OCEAN'],
      dtype='object')

In [17]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing (an 80/20 split is typical).
# train_test_split shuffles the rows before splitting; random_state pins that shuffle
# so the split -- and every score computed from it -- is the same on each run.
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

In [18]:
# (rows, columns) of the training and test feature frames.
(x_train.shape, x_test.shape)

((15580, 13), (3895, 13))

In [19]:
# Lengths of the training and test label series.
(y_train.shape, y_test.shape)

((15580,), (3895,))

In [20]:
from sklearn.linear_model import LogisticRegression

# Classifier for the above/below-median label.
# liblinear is the solver algorithm; it is a good fit for small datasets with a binary output.
logistic_model = LogisticRegression(solver='liblinear')
# fit returns the estimator itself; assigning it back keeps the cell free of output.
logistic_model = logistic_model.fit(x_train, y_train)

In [22]:
# Accuracy on the training data: the fraction of training rows predicted correctly.
train_accuracy = logistic_model.score(x_train, y_train)
print("Training score: ", train_accuracy)

Training score:  0.8186136071887035


In [24]:
# Predict the above_median label for every row of the held-out test features.
y_pred = logistic_model.predict(X=x_test)

In [25]:
# Put predictions next to the true labels; the frame takes its row index from y_test.
df_pred_actual = pd.DataFrame(
    data={'predicted': y_pred, 'actual': y_test},
    columns=['predicted', 'actual'],
)

# Show the first ten predicted/actual pairs.
df_pred_actual.head(n=10)

Unnamed: 0,predicted,actual
12975,False,False
2924,False,False
10179,True,True
765,True,False
20376,True,True
11058,False,True
19562,False,False
11034,True,True
14412,True,True
4576,True,True


In [26]:
from sklearn.metrics import accuracy_score

# Accuracy on the held-out test data: the fraction of test rows predicted correctly.
test_accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Testing score: ", test_accuracy)

Testing score:  0.8246469833119384
