# Import Library

In [234]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.metrics import mutual_info_score
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression,Ridge

# Import Data

In [187]:
df = pd.read_csv('housing.csv')
df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


# Rename

In [188]:
df.columns = df.columns.str.lower().str.replace(' ', '_')

categorical_columns = list(df.dtypes[df.dtypes == 'object'].index)

for c in categorical_columns:
    df[c] = df[c].str.lower().str.replace(' ', '_')

In [189]:
df.head().T

Unnamed: 0,0,1,2,3,4
longitude,-122.23,-122.22,-122.24,-122.25,-122.25
latitude,37.88,37.86,37.85,37.85,37.85
housing_median_age,41.0,21.0,52.0,52.0,52.0
total_rooms,880.0,7099.0,1467.0,1274.0,1627.0
total_bedrooms,129.0,1106.0,190.0,235.0,280.0
population,322.0,2401.0,496.0,558.0,565.0
households,126.0,1138.0,177.0,219.0,259.0
median_income,8.3252,8.3014,7.2574,5.6431,3.8462
median_house_value,452600.0,358500.0,352100.0,341300.0,342200.0
ocean_proximity,near_bay,near_bay,near_bay,near_bay,near_bay


In [190]:
df.isna().sum()

longitude               0
latitude                0
housing_median_age      0
total_rooms             0
total_bedrooms        207
population              0
households              0
median_income           0
median_house_value      0
ocean_proximity         0
dtype: int64

# Fillna

In [191]:
df.total_bedrooms = pd.to_numeric(df.total_bedrooms, errors='coerce')            # sum values are null and be treated as string
df.total_bedrooms.fillna(0, inplace=True)
df.total_bedrooms.isna().sum()

0

# Create columns

In [192]:
df['rooms_per_household'] = df['total_rooms']/df['households']
df['bedrooms_per_room'] = df['total_bedrooms']/df['households']
df['population_per_household'] = df['population']/df['households']
df.head().T

Unnamed: 0,0,1,2,3,4
longitude,-122.23,-122.22,-122.24,-122.25,-122.25
latitude,37.88,37.86,37.85,37.85,37.85
housing_median_age,41.0,21.0,52.0,52.0,52.0
total_rooms,880.0,7099.0,1467.0,1274.0,1627.0
total_bedrooms,129.0,1106.0,190.0,235.0,280.0
...,...,...,...,...,...
median_house_value,452600.0,358500.0,352100.0,341300.0,342200.0
ocean_proximity,near_bay,near_bay,near_bay,near_bay,near_bay
rooms_per_household,6.984127,6.238137,8.288136,5.817352,6.281853
bedrooms_per_room,1.02381,0.97188,1.073446,1.073059,1.081081


# Question 1

In [193]:
# What is the most frequent observation (mode) for the column ocean_proximity?

df.ocean_proximity.mode()

0    <1h_ocean
Name: ocean_proximity, dtype: object

# Convert median_house_price to binary

In [194]:
mean_house = df.median_house_value.mean()
mean_house

206855.81690891474

In [195]:
df['above_average'] = np.where(df['median_house_value'] > mean_house,1,0)

In [196]:
df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity,rooms_per_household,bedrooms_per_room,population_per_household,above_average
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,near_bay,6.984127,1.02381,2.555556,1
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,near_bay,6.238137,0.97188,2.109842,1
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,near_bay,8.288136,1.073446,2.80226,1
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,near_bay,5.817352,1.073059,2.547945,1
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,near_bay,6.281853,1.081081,2.181467,1


# Split the data

In [197]:
# Split your data in train/val/test sets, with 60%/20%/20% distribution.
# Use Scikit-Learn for that (the train_test_split function) and set the seed to 42.

df_full_train, df_test = train_test_split(df,test_size=0.2, random_state=42)
df_train, df_val = train_test_split(df_full_train,test_size=0.25, random_state=42)

In [198]:
len(df_train), len(df_val), len(df_test)

(12384, 4128, 4128)

In [199]:
# remove shuffle index for nicer view
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

In [200]:
y_train = df_train.above_average.values
y_val = df_val.above_average.values
y_test = df_test.above_average.values

del df_train['above_average']
del df_val['above_average']
del df_test['above_average']
del df_train['median_house_value']
del df_val['median_house_value']
del df_test['median_house_value']


df_train

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity,rooms_per_household,bedrooms_per_room,population_per_household
0,-119.67,34.43,39.0,1467.0,381.0,1404.0,374.0,2.3681,<1h_ocean,3.922460,1.018717,3.754011
1,-118.32,33.74,24.0,6097.0,794.0,2248.0,806.0,10.1357,near_ocean,7.564516,0.985112,2.789082
2,-121.62,39.13,41.0,1317.0,309.0,856.0,337.0,1.6719,inland,3.908012,0.916914,2.540059
3,-118.63,34.24,9.0,4759.0,924.0,1884.0,915.0,4.8333,<1h_ocean,5.201093,1.009836,2.059016
4,-122.30,37.52,38.0,2769.0,387.0,994.0,395.0,5.5902,near_ocean,7.010127,0.979747,2.516456
...,...,...,...,...,...,...,...,...,...,...,...,...
12379,-118.29,33.79,16.0,1867.0,571.0,951.0,498.0,3.3427,<1h_ocean,3.748996,1.146586,1.909639
12380,-121.34,38.04,16.0,3295.0,565.0,2279.0,576.0,3.6083,inland,5.720486,0.980903,3.956597
12381,-116.99,32.74,18.0,3341.0,611.0,1952.0,602.0,3.9844,<1h_ocean,5.549834,1.014950,3.242525
12382,-117.87,33.84,16.0,1545.0,354.0,730.0,350.0,4.5112,<1h_ocean,4.414286,1.011429,2.085714


# Question 2

In [201]:
df_full_train = df_full_train.reset_index(drop=True)
df_full_train.isnull().sum()

longitude                   0
latitude                    0
housing_median_age          0
total_rooms                 0
total_bedrooms              0
population                  0
households                  0
median_income               0
median_house_value          0
ocean_proximity             0
rooms_per_household         0
bedrooms_per_room           0
population_per_household    0
above_average               0
dtype: int64

## Correlation

In [202]:
df_train[['total_bedrooms','population']].corrwith(df_train.households).abs()

total_bedrooms    0.979399
population        0.906841
dtype: float64

In [203]:
df_train[['total_bedrooms','population_per_household']].corrwith(df_train.total_rooms).abs()

total_bedrooms              0.931546
population_per_household    0.029452
dtype: float64

# Question 3

## Mutual info score

In [204]:
df_full_train.dtypes

longitude                   float64
latitude                    float64
housing_median_age          float64
total_rooms                 float64
total_bedrooms              float64
population                  float64
households                  float64
median_income               float64
median_house_value          float64
ocean_proximity              object
rooms_per_household         float64
bedrooms_per_room           float64
population_per_household    float64
above_average                 int32
dtype: object

In [205]:
mutual_info_score(df_full_train.above_average,df_full_train.ocean_proximity)

0.1019224615118327

# Question 4 - LR

## One-hot encoding.

In [206]:
numerical = ['longitude', 'latitude', 'housing_median_age', 'total_rooms',
       'total_bedrooms', 'population', 'households', 'median_income',
       'rooms_per_household',
       'bedrooms_per_room', 'population_per_household']

In [207]:
categorical = ['ocean_proximity']

In [208]:
# Use Scikit-Learn to encode categorical features

dv = DictVectorizer(sparse=False)
dv

DictVectorizer(sparse=False)

In [209]:
train_dict = df_train[categorical + numerical].to_dict(orient = 'records')
X_train = dv.fit_transform(train_dict)
X_train[1]

array([ 9.85111663e-01,  8.06000000e+02,  2.40000000e+01,  3.37400000e+01,
       -1.18320000e+02,  1.01357000e+01,  0.00000000e+00,  0.00000000e+00,
        0.00000000e+00,  0.00000000e+00,  1.00000000e+00,  2.24800000e+03,
        2.78908189e+00,  7.56451613e+00,  7.94000000e+02,  6.09700000e+03])

In [210]:
val_dict = df_val[categorical + numerical].to_dict(orient = 'records')
X_val = dv.fit_transform(val_dict)
X_val

array([[1.04395604e+00, 1.82000000e+02, 3.50000000e+01, ...,
        6.54395604e+00, 1.90000000e+02, 1.19100000e+03],
       [9.87179487e-01, 3.90000000e+02, 2.30000000e+01, ...,
        4.28717949e+00, 3.85000000e+02, 1.67200000e+03],
       [1.05663940e+00, 1.58900000e+03, 1.40000000e+01, ...,
        3.91692889e+00, 1.67900000e+03, 6.22400000e+03],
       ...,
       [9.03780069e-01, 5.82000000e+02, 3.10000000e+01, ...,
        5.29209622e+00, 5.26000000e+02, 3.08000000e+03],
       [1.05537975e+00, 6.32000000e+02, 3.40000000e+01, ...,
        5.87341772e+00, 6.67000000e+02, 3.71200000e+03],
       [9.72602740e-01, 2.92000000e+02, 3.20000000e+01, ...,
        5.40410959e+00, 2.84000000e+02, 1.57800000e+03]])

## Training logistic regression with Scikit-Learn

* Train a model with Scikit-Learn
* Apply it to the validation dataset
* Calculate the accuracy

In [211]:
y_train

array([1, 1, 0, ..., 1, 0, 0])

In [212]:
model = LogisticRegression(solver="liblinear", C=1.0, max_iter=1000, random_state=42)
model.fit(X_train,y_train)

LogisticRegression(max_iter=1000, random_state=42, solver='liblinear')

In [213]:
model.intercept_[0]

-0.14808582797638123

In [214]:
model.coef_[0].round(3)

array([ 5.980e-01,  4.000e-03,  3.600e-02,  1.330e-01,  9.400e-02,
        1.257e+00,  4.700e-01, -1.748e+00,  4.100e-02,  1.930e-01,
        8.950e-01, -2.000e-03,  1.100e-02, -1.120e-01,  1.000e-03,
       -0.000e+00])

## Calc acc on val

In [215]:
y_pred = model.predict_proba(X_val)[:, 1]
y_pred

array([0.07061483, 0.18721027, 0.95253211, ..., 0.96511377, 0.85345041,
       0.4449008 ])

In [216]:
churn_decision = (y_pred >= 0.5)

In [217]:
original_acc = (y_val == churn_decision).mean()
original_acc

0.8374515503875969

# Question 5

In [218]:
#Let's find the least useful feature using the feature elimination technique.

## Eliminate total_rooms

In [219]:
numerical = ['longitude', 'latitude', 'housing_median_age', 'median_income',
       'rooms_per_household', 'bedrooms_per_room', 'population_per_household',
       # 'total_rooms',
       'total_bedrooms', 
       'population', 
       'households', 
       ]
categorical = ['ocean_proximity']

In [220]:
# Use Scikit-Learn to encode categorical features

dv = DictVectorizer(sparse=False)

train_dict = df_train[categorical + numerical].to_dict(orient = 'records')
X_train = dv.fit_transform(train_dict)

val_dict = df_val[categorical + numerical].to_dict(orient = 'records')
X_val = dv.fit_transform(val_dict)

model = LogisticRegression(solver="liblinear", C=1.0, max_iter=1000, random_state=42)
model.fit(X_train,y_train)

y_pred = model.predict_proba(X_val)[:, 1]
churn_decision = (y_pred >= 0.5)
(y_val == churn_decision).mean() - original_acc

-0.0007267441860464574

## Eliminate total_bedrooms

In [221]:
numerical = ['longitude', 'latitude', 'housing_median_age', 'median_income',
       'rooms_per_household', 'bedrooms_per_room', 'population_per_household',
       'total_rooms',
       # 'total_bedrooms', 
       'population', 
       'households', 
       ]
categorical = ['ocean_proximity']

In [222]:
# Use Scikit-Learn to encode categorical features

dv = DictVectorizer(sparse=False)

train_dict = df_train[categorical + numerical].to_dict(orient = 'records')
X_train = dv.fit_transform(train_dict)

val_dict = df_val[categorical + numerical].to_dict(orient = 'records')
X_val = dv.fit_transform(val_dict)

model = LogisticRegression(solver="liblinear", C=1.0, max_iter=1000, random_state=42)
model.fit(X_train,y_train)

y_pred = model.predict_proba(X_val)[:, 1]
churn_decision = (y_pred >= 0.5)
(y_val == churn_decision).mean() - original_acc

-0.0009689922480620172

## Eliminate population

In [223]:

numerical = ['longitude', 'latitude', 'housing_median_age', 'median_income',
       'rooms_per_household', 'bedrooms_per_room', 'population_per_household',
       'total_rooms',
       'total_bedrooms', 
    #    'population', 
       'households', 
       ]
categorical = ['ocean_proximity']
# Use Scikit-Learn to encode categorical features

dv = DictVectorizer(sparse=False)

train_dict = df_train[categorical + numerical].to_dict(orient = 'records')
X_train = dv.fit_transform(train_dict)

val_dict = df_val[categorical + numerical].to_dict(orient = 'records')
X_val = dv.fit_transform(val_dict)
model = LogisticRegression(solver="liblinear", C=1.0, max_iter=1000, random_state=42)
model.fit(X_train,y_train)

y_pred = model.predict_proba(X_val)[:, 1]
churn_decision = (y_pred >= 0.5)
(y_val == churn_decision).mean() - original_acc

-0.007994186046511587

## Eliminate households

In [224]:
numerical = ['longitude', 'latitude', 'housing_median_age', 'median_income',
       'rooms_per_household', 'bedrooms_per_room', 'population_per_household',
       'total_rooms',
       'total_bedrooms', 
       'population', 
       # 'households', 
       ]
categorical = ['ocean_proximity']

In [225]:
# Use Scikit-Learn to encode categorical features

dv = DictVectorizer(sparse=False)

train_dict = df_train[categorical + numerical].to_dict(orient = 'records')
X_train = dv.fit_transform(train_dict)

val_dict = df_val[categorical + numerical].to_dict(orient = 'records')
X_val = dv.fit_transform(val_dict)

model = LogisticRegression(solver="liblinear", C=1.0, max_iter=1000, random_state=42)
model.fit(X_train,y_train)

y_pred = model.predict_proba(X_val)[:, 1]
churn_decision = (y_pred >= 0.5)
(y_val == churn_decision).mean() - original_acc

-0.0033914728682170603

# Question 6

alpha= [0, 0.01, 0.1, 1, 10]

In [267]:
np.random.seed(42)

n = len(df)

# Split your data in train/val/test sets, with 60%/20%/20% distribution.
n_val = int(0.2*n)
n_test = int(0.2*n)
n_train = n - (n_val + n_test)

idx = np.arange(n)

# Shuffle the initial dataset, use seed 42.
np.random.shuffle(idx)

df_shuffled = df.iloc[idx]
df_shuffled

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity,rooms_per_household,bedrooms_per_room,population_per_household,above_average
20046,-119.01,36.06,25.0,1505.0,0.0,1392.0,359.0,1.6812,47700.0,inland,4.192201,0.000000,3.877437,0
3024,-119.46,35.14,30.0,2943.0,0.0,1565.0,584.0,2.5313,45800.0,inland,5.039384,0.000000,2.679795,0
15663,-122.44,37.80,52.0,3830.0,0.0,1310.0,963.0,3.4801,500001.0,near_bay,3.977155,0.000000,1.360332,1
20484,-118.72,34.28,17.0,3051.0,0.0,1705.0,495.0,5.7376,218600.0,<1h_ocean,6.163636,0.000000,3.444444,1
9814,-121.93,36.62,34.0,2351.0,0.0,1063.0,428.0,3.7250,278000.0,near_ocean,5.492991,0.000000,2.483645,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11284,-117.96,33.78,35.0,1330.0,201.0,658.0,217.0,6.3700,229200.0,<1h_ocean,6.129032,0.926267,3.032258,1
11964,-117.43,34.02,33.0,3084.0,570.0,1753.0,449.0,3.0500,97800.0,inland,6.868597,1.269488,3.904232,0
5390,-118.38,34.03,36.0,2101.0,569.0,1756.0,527.0,2.9344,222100.0,<1h_ocean,3.986717,1.079696,3.332068,1
860,-121.96,37.58,15.0,3575.0,597.0,1777.0,559.0,5.7192,283500.0,<1h_ocean,6.395349,1.067979,3.178891,1


In [268]:
df_train = df_shuffled.iloc[:n_train].copy()
df_val = df_shuffled.iloc[n_train:n_train+n_val].copy()
df_test = df_shuffled.iloc[n_train+n_val:].copy()

# Apply the log transformation to the median_house_value variable using the np.log1p() function.
y_train = np.log1p(df_train.median_house_value.values)
y_val = np.log1p(df_val.median_house_value.values)
y_test = np.log1p(df_test.median_house_value.values)

# Make sure that the target value ('median_house_value') is not in your dataframe.
del df_train['median_house_value']
del df_val['median_house_value']
del df_test['median_house_value']
del df_train['above_average']
del df_val['above_average']
del df_test['above_average']

In [273]:
df_train

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity,rooms_per_household,bedrooms_per_room,population_per_household
20046,-119.01,36.06,25.0,1505.0,0.0,1392.0,359.0,1.6812,inland,4.192201,0.000000,3.877437
3024,-119.46,35.14,30.0,2943.0,0.0,1565.0,584.0,2.5313,inland,5.039384,0.000000,2.679795
15663,-122.44,37.80,52.0,3830.0,0.0,1310.0,963.0,3.4801,near_bay,3.977155,0.000000,1.360332
20484,-118.72,34.28,17.0,3051.0,0.0,1705.0,495.0,5.7376,<1h_ocean,6.163636,0.000000,3.444444
9814,-121.93,36.62,34.0,2351.0,0.0,1063.0,428.0,3.7250,near_ocean,5.492991,0.000000,2.483645
...,...,...,...,...,...,...,...,...,...,...,...,...
10292,-117.81,33.88,19.0,2265.0,283.0,904.0,279.0,9.2327,<1h_ocean,8.118280,1.014337,3.240143
16722,-120.68,35.48,15.0,2608.0,525.0,1351.0,502.0,2.7798,<1h_ocean,5.195219,1.045817,2.691235
11730,-120.91,38.98,13.0,7689.0,1415.0,3264.0,1198.0,3.6530,inland,6.418197,1.181135,2.724541
5993,-117.72,34.09,36.0,1473.0,328.0,785.0,299.0,3.2566,inland,4.926421,1.096990,2.625418


In [269]:
y_train

array([10.77270764, 10.7320612 , 13.12236738, ..., 11.89683321,
       11.93032573, 12.9487264 ])

In [270]:
def rmse(y, y_pred):
    se = (y - y_pred) ** 2
    mse = se.mean()
    return np.sqrt(mse)

In [271]:
list_emp = []
for r in [0, 0.01, 0.1, 1, 10]:

    model = Ridge(alpha=r, solver="sag", random_state=42)
    model.fit(X_train, y_train)

    y_pred = model.predict(X_val)
    score = rmse(y_val, y_pred)
    
    list_emp.append([r, score])

In [272]:
list_emp

[[0, 0.5736402705379842],
 [0.01, 0.5736402705381727],
 [0.1, 0.5736402705400104],
 [1, 0.5736402705581525],
 [10, 0.5736402707398084]]