# Classification using Logistic regression
Logistic regression fits an S-shaped (sigmoid) curve to the data.  
It classifies an item according to whether the predicted probability is above or below a threshold.  
Here, we will build a classifier to predict whether the price of a house is above or below the median.

In [1]:
import pandas as pd
import matplotlib.pyplot as py

In [2]:
# Read the California housing CSV (the path is relative to the notebook's
# working directory) and preview five randomly sampled rows.
data = pd.read_csv("datasets/california_housing.csv")
data.sample(5)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
5448,-118.42,34.01,29.0,1996.0,489.0,960.0,449.0,3.6611,344200.0,<1H OCEAN
16387,-121.27,38.02,32.0,342.0,58.0,138.0,52.0,2.9821,155000.0,INLAND
13482,-117.35,34.12,22.0,5640.0,889.0,3157.0,887.0,4.1581,126500.0,INLAND
20205,-119.2,34.28,22.0,2362.0,601.0,1127.0,499.0,3.4006,219400.0,NEAR OCEAN
1770,-122.35,37.95,45.0,2142.0,431.0,1318.0,431.0,3.0737,111600.0,NEAR BAY


## Data cleanup

In [3]:
# Discard every row that contains at least one missing value, then show the
# remaining (rows, columns) count.
data = data.dropna()
data.shape

(20433, 10)

In [6]:
# Count, per column, the rows whose median_house_value is exactly 500001.
# NOTE(review): 500001 looks like the dataset's top-coded (capped) value - confirm.
data.loc[data["median_house_value"]==500001].count()

longitude             958
latitude              958
housing_median_age    958
total_rooms           958
total_bedrooms        958
population            958
households            958
median_income         958
median_house_value    958
ocean_proximity       958
dtype: int64

In [7]:
# Keep only the rows whose median_house_value differs from 500001.
value_is_500001 = data["median_house_value"] == 500001
data = data.loc[~value_is_500001]

In [8]:
# Check the (rows, columns) count after removing the 500001-valued rows.
data.shape

(19475, 10)

In [9]:
# List the distinct categories found in the ocean_proximity column.
data["ocean_proximity"].unique()

array(['NEAR BAY', '<1H OCEAN', 'INLAND', 'NEAR OCEAN', 'ISLAND'],
      dtype=object)

In [10]:
# One-hot encode ocean_proximity: the text column is replaced by one
# indicator column per category (named ocean_proximity_<category>).
data = pd.get_dummies(data,columns=["ocean_proximity"])

In [11]:
# Preview five random rows to inspect the new indicator columns.
data.sample(5)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity_<1H OCEAN,ocean_proximity_INLAND,ocean_proximity_ISLAND,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN
3101,-117.7,35.64,8.0,2683.0,416.0,1154.0,399.0,5.8625,109400.0,0,1,0,0,0
5326,-118.45,34.05,28.0,801.0,399.0,936.0,406.0,2.1875,181300.0,1,0,0,0,0
2883,-118.98,35.39,22.0,1812.0,457.0,1592.0,420.0,1.4146,49100.0,0,1,0,0,0
20045,-119.02,36.06,41.0,2279.0,538.0,1908.0,511.0,1.3952,43100.0,0,1,0,0,0
6092,-117.85,34.11,27.0,1748.0,403.0,985.0,416.0,3.1133,180600.0,0,1,0,0,0


In [14]:
# Median of median_house_value over the remaining rows; this is the
# threshold that separates the two classes.
median = data["median_house_value"].median()
median

173800.0

In [17]:
# Boolean target: True when the house value is strictly greater than the median.
data['above_median'] = data['median_house_value'].gt(median)

In [18]:
# Preview five random rows to check the new above_median column.
data.sample(5)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity_<1H OCEAN,ocean_proximity_INLAND,ocean_proximity_ISLAND,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN,above_median
17537,-121.89,37.35,47.0,2879.0,631.0,2229.0,606.0,3.2599,183100.0,1,0,0,0,0,True
8994,-118.36,33.99,43.0,2657.0,548.0,1145.0,524.0,4.1375,287100.0,1,0,0,0,0,True
2106,-119.74,36.75,47.0,2236.0,418.0,1042.0,397.0,2.9545,59600.0,0,1,0,0,0,False
1418,-122.04,37.99,36.0,2765.0,495.0,1478.0,441.0,4.125,136200.0,0,0,0,1,0,False
12925,-121.3,38.66,21.0,3824.0,634.0,1818.0,600.0,3.712,139000.0,0,1,0,0,0,False


Let's set up the data: the features `X` and the target `Y`.

In [21]:
# Target: the boolean above/below-median label.
Y = data["above_median"]
# Features: every other column except median_house_value, which the label
# was computed from.
X = data.drop(columns=["median_house_value", "above_median"])

In [24]:
# Show the names of the feature columns in X.
X.columns

Index(['longitude', 'latitude', 'housing_median_age', 'total_rooms',
       'total_bedrooms', 'population', 'households', 'median_income',
       'ocean_proximity_<1H OCEAN', 'ocean_proximity_INLAND',
       'ocean_proximity_ISLAND', 'ocean_proximity_NEAR BAY',
       'ocean_proximity_NEAR OCEAN'],
      dtype='object')

### Training the model

In [30]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing. The fixed random_state makes the
# shuffle deterministic, so the split (and every score computed from it) is
# the same after a kernel restart instead of changing on each run.
x_train, x_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42
)

In [31]:
# Shapes of the training and test feature sets.
x_train.shape,x_test.shape

((15580, 13), (3895, 13))

In [32]:
# Shapes of the training and test label vectors.
y_train.shape,y_test.shape

((15580,), (3895,))

In [33]:
from sklearn.linear_model import LogisticRegression
logistic_model = LogisticRegression(solver='liblinear').fit(x_train,y_train)
# solver: the algorithm used under the hood to fit the logistic regression
# liblinear: a good choice when the dataset is small
# and the classification is binary

In [38]:
# Mean accuracy of the fitted model on the data it was trained on.
train_accuracy = logistic_model.score(x_train, y_train)
print("Training score: ", train_accuracy)

Training score:  0.8220795892169448


In [40]:
# Predict the above/below-median label for every row of the held-out test set.
y_pred = logistic_model.predict(x_test)

## Evaluating the scores

In [50]:
# Put each test row's prediction next to its true label, then preview five
# random rows of the comparison.
comparison_columns = {"predicted": y_pred, "actual": y_test}
df_pred_actual = pd.DataFrame(comparison_columns)
df_pred_actual.sample(5)

Unnamed: 0,predicted,actual
19269,True,False
10775,True,True
8077,True,True
17917,True,True
19414,False,False


In [51]:
from sklearn.metrics import accuracy_score

# Accuracy on the held-out test set: the fraction of predictions that match
# the true labels.
test_accuracy = accuracy_score(y_test, y_pred)
print("Testing score : ", test_accuracy)

Testing score :  0.8164313222079589
