In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split

from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error

In [2]:
!wget https://raw.githubusercontent.com/alexeygrigorev/datasets/master/housing.csv

--2022-09-26 15:01:52--  https://raw.githubusercontent.com/alexeygrigorev/datasets/master/housing.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.108.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1423529 (1.4M) [text/plain]
Saving to: ‘housing.csv’


2022-09-26 15:01:53 (10.4 MB/s) - ‘housing.csv’ saved [1423529/1423529]



In [3]:
df_housing = pd.read_csv('housing.csv')
df_housing.head(3)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY


In [4]:
df_housing.describe()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
count,20640.0,20640.0,20640.0,20640.0,20433.0,20640.0,20640.0,20640.0,20640.0
mean,-119.569704,35.631861,28.639486,2635.763081,537.870553,1425.476744,499.53968,3.870671,206855.816909
std,2.003532,2.135952,12.585558,2181.615252,421.38507,1132.462122,382.329753,1.899822,115395.615874
min,-124.35,32.54,1.0,2.0,1.0,3.0,1.0,0.4999,14999.0
25%,-121.8,33.93,18.0,1447.75,296.0,787.0,280.0,2.5634,119600.0
50%,-118.49,34.26,29.0,2127.0,435.0,1166.0,409.0,3.5348,179700.0
75%,-118.01,37.71,37.0,3148.0,647.0,1725.0,605.0,4.74325,264725.0
max,-114.31,41.95,52.0,39320.0,6445.0,35682.0,6082.0,15.0001,500001.0


In [6]:
df_housing.isnull().sum()

longitude               0
latitude                0
housing_median_age      0
total_rooms             0
total_bedrooms        207
population              0
households              0
median_income           0
median_house_value      0
ocean_proximity         0
dtype: int64

## Data preparation

In [7]:
df_housing['total_bedrooms'] = df_housing['total_bedrooms'].fillna(0)

In [9]:
# Create a new column rooms_per_household by dividing the column total_rooms by the column households from dataframe.
df_housing['rooms_per_household'] = df_housing['total_bedrooms'] / df_housing['households'] 
# Create a new column bedrooms_per_room by dividing the column total_bedrooms by the column total_rooms from dataframe.
df_housing['bedrooms_per_room'] = df_housing['total_bedrooms'] / df_housing['total_rooms']
# Create a new column population_per_household by dividing the column population by the column households from dataframe.
df_housing['population_per_household'] = df_housing['population'] / df_housing['households']

In [10]:
df_housing.head(3)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity,rooms_per_household,bedrooms_per_room,population_per_household
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY,1.02381,0.146591,2.555556
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY,0.97188,0.155797,2.109842
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY,1.073446,0.129516,2.80226


### Question 1

In [11]:
df_housing['ocean_proximity'].value_counts()

<1H OCEAN     9136
INLAND        6551
NEAR OCEAN    2658
NEAR BAY      2290
ISLAND           5
Name: ocean_proximity, dtype: int64

### Question 2

In [12]:
average = df_housing['median_house_value'].mean()
df_housing['above_average'] = (df_housing.median_house_value > average).astype(int)

In [17]:
df_full_train , df_test = train_test_split(df_housing, test_size = 0.2, random_state = 42) 
df_train, df_val  = train_test_split(df_full_train, test_size = 0.25, random_state = 42)

df_train = df_train.reset_index(drop = True)
df_val = df_val.reset_index(drop = True)
df_test = df_test.reset_index(drop = True)

y_train = df_train.median_house_value.values
y_val = df_val.median_house_value.values
y_test = df_test.median_house_value.values

del df_train['median_house_value']
del df_val['median_house_value']
del df_test['median_house_value']

In [19]:
df_train.corr().unstack().sort_values(ascending = False).to_frame()

Unnamed: 0,Unnamed: 1,0
longitude,longitude,1.000000
latitude,latitude,1.000000
population_per_household,population_per_household,1.000000
bedrooms_per_room,bedrooms_per_room,1.000000
rooms_per_household,rooms_per_household,1.000000
...,...,...
housing_median_age,total_rooms,-0.363522
median_income,bedrooms_per_room,-0.616617
bedrooms_per_room,median_income,-0.616617
longitude,latitude,-0.925005


### Question 3

In [20]:
average = df_full_train['median_house_value'].mean()
df_full_train['above_average'] = (df_full_train.median_house_value > average).astype(int)

In [21]:
from sklearn.metrics import mutual_info_score
round(mutual_info_score(df_full_train.ocean_proximity, df_full_train.above_average), 2)

0.1

### Question 4

In [22]:
y_train = df_train.above_average.values
y_val = df_val.above_average.values
y_test = df_test.above_average.values

del df_train['above_average']
del df_val['above_average']
del df_test['above_average']

In [23]:
num = ['longitude', 'latitude', 'housing_median_age', 'total_rooms',
       'total_bedrooms', 'population', 'households', 'median_income',
       'ocean_proximity', 'rooms_per_household', 'bedrooms_per_room',
       'population_per_household']
cat = ['ocean_proximity']

In [24]:
train_dict = df_train[num + cat].to_dict(orient = 'records')

  """Entry point for launching an IPython kernel.


In [26]:
dv = DictVectorizer(sparse = False)
X_train = dv.fit_transform(train_dict)
X_train.shape

(12384, 16)

In [27]:
val_dict = df_val[num + cat].to_dict(orient = 'records')
X_val = dv.transform(val_dict)
model = LogisticRegression(solver="liblinear", C=1.0, max_iter=1000, random_state=42)
model.fit(X_train, y_train)

  """Entry point for launching an IPython kernel.


LogisticRegression(max_iter=1000, random_state=42, solver='liblinear')

In [28]:
y_pred = model.predict(X_val)
accuracy = np.round(accuracy_score(y_val, y_pred),2)
accuracy

0.84

### Question 5

In [29]:
features = num + cat

In [30]:
origin_accuracy = accuracy

for c in features:
  sub = features.copy()
  sub.remove(c)

  train_dict = df_train[sub].to_dict(orient = 'records')

  dv = DictVectorizer(sparse = False)
  dv.fit(train_dict)

  X_train  = dv.transform(train_dict)

  model = LogisticRegression(solver="liblinear", C=1.0, max_iter=1000, random_state=42)
  model.fit(X_train, y_train)

  val_dict = df_val[sub].to_dict(orient = 'records')
  X_val = dv.transform(val_dict)
  y_pred = model.predict(X_val)
  score = accuracy_score(y_val, y_pred)

  print(f'Features: {c}, Diff: {origin_accuracy - score}, Score: {score}')

  import sys
  import sys


Features: longitude, Diff: 0.008846899224806193, Score: 0.8311531007751938


  import sys


Features: latitude, Diff: 0.006666666666666599, Score: 0.8333333333333334


  import sys


Features: housing_median_age, Diff: 0.010058139534883659, Score: 0.8299418604651163


  import sys


Features: total_rooms, Diff: 0.0020639534883720723, Score: 0.8379360465116279


  import sys


Features: total_bedrooms, Diff: 0.0032751937984495383, Score: 0.8367248062015504


  import sys


Features: population, Diff: 0.01296511627906971, Score: 0.8270348837209303


  import sys


Features: households, Diff: 0.00642441860465115, Score: 0.8335755813953488




Features: median_income, Diff: 0.056569767441860486, Score: 0.7834302325581395
Features: ocean_proximity, Diff: 0.0020639534883720723, Score: 0.8379360465116279


  import sys
  import sys


Features: rooms_per_household, Diff: 0.005697674418604581, Score: 0.8343023255813954


  import sys


Features: bedrooms_per_room, Diff: 0.004728682170542564, Score: 0.8352713178294574




Features: population_per_household, Diff: 0.0042441860465115555, Score: 0.8357558139534884
Features: ocean_proximity, Diff: 0.0020639534883720723, Score: 0.8379360465116279


### Question 6

In [32]:
data = df_housing.drop('above_average',axis = 1)

In [33]:
data['median_house_value']=np.log1p(data['median_house_value'])

In [34]:
df_train_full, df_test = train_test_split(data, test_size=0.2, random_state=42)
df_train, df_val = train_test_split(df_train_full, test_size=0.25, random_state=42)

df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

y_train = df_train.median_house_value.values
y_val = df_val.median_house_value.values
y_test = df_test.median_house_value.values

del df_train['median_house_value']
del df_val['median_house_value']
del df_test['median_house_value']

In [35]:
train_dicts = df_train[num + cat].to_dict(orient ='records')
val_dicts = df_val[num + cat].to_dict(orient ='records')
test_dicts = df_test[num + cat].to_dict(orient ='records')

  """Entry point for launching an IPython kernel.
  
  This is separate from the ipykernel package so we can avoid doing imports until


In [36]:
dv_r = DictVectorizer(sparse = False)
dv_r.fit(train_dicts)
X_train = dv_r.transform(train_dicts)

dv_r.fit(val_dicts)
X_val = dv_r.transform(val_dicts)

In [40]:
alpha = [0, 0.01, 0.1, 1, 10]

for al in alpha:
  model = Ridge(alpha=al, solver="sag", random_state=42)
  model.fit(X_train, y_train)

  y_pred = model.predict(X_val)
  rmse_score = np.sqrt(mean_squared_error(y_val, y_pred))

  print(f'alpha: {al} rmse_score: {round(rmse_score, 10)}')

alpha: 0 rmse_score: 0.5242046072
alpha: 0.01 rmse_score: 0.5242046072
alpha: 0.1 rmse_score: 0.5242046073
alpha: 1 rmse_score: 0.5242046089
alpha: 10 rmse_score: 0.5242046249
