In [1]:
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression, Ridge
from sklearn.metrics import mutual_info_score, accuracy_score

In [2]:
path = r'../data/housing.csv'
df = pd.read_csv(path)

In [3]:
# shape
df.shape

(20640, 10)

In [4]:
# top 5 records
df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [5]:
# missing values
df.isnull().mean()

longitude             0.000000
latitude              0.000000
housing_median_age    0.000000
total_rooms           0.000000
total_bedrooms        0.010029
population            0.000000
households            0.000000
median_income         0.000000
median_house_value    0.000000
ocean_proximity       0.000000
dtype: float64

For this assignment use only the following features

- 'latitude'
- 'longitude'
- 'housing_median_age'
- 'total_rooms'
- 'total_bedrooms'
- 'population'
- 'households'
- 'median_income'
- 'median_house_value'
- 'ocean_proximity'

In [6]:
cols_to_use = ['latitude',
'longitude',
'housing_median_age',
'total_rooms',
'total_bedrooms',
'population',
'households',
'median_income',
'median_house_value',
'ocean_proximity']


In [7]:
new_df = df[cols_to_use].copy()

### Data preparation

In [8]:
### Train test split
train_df, val_df = train_test_split(new_df, test_size=0.2, random_state=42)
train_df, test_df = train_test_split(train_df, test_size=0.25, random_state=42)

In [9]:
train_df.shape, val_df.shape, test_df.shape

((12384, 10), (4128, 10), (4128, 10))

In [10]:
def create_new_fields(x):
    x['rooms_per_household'] = x['total_rooms'] / x ['households']
    x['bedrooms_per_room'] = x['total_bedrooms'] / x['total_rooms']
    x['population_per_household'] = x['population'] / x['households']

In [11]:
def data_prep(x, field, fill_method='zero', model='logreg'):
    x = x.copy().reset_index(drop=True)
    if fill_method == 'zero':
        x[field].fillna(0, inplace=True)
    else:
        raise Exception('Invalid fill method')
    create_new_fields(x)
    if model == 'logreg':
        mean_val = x['median_house_value'].mean() 
        y = x['median_house_value'].apply(lambda x: 1 if x > mean_val else 0)
    else:
        y = np.log1p(x['median_house_value'])
    del x['median_house_value']
    return x, y

### q1

 Most frequent value for ocean_proximity.


In [12]:
new_df['ocean_proximity'].value_counts()


<1H OCEAN     9136
INLAND        6551
NEAR OCEAN    2658
NEAR BAY      2290
ISLAND           5
Name: ocean_proximity, dtype: int64

### q2
Features with biggest correlation.

In [13]:
# prepare train, val and test splits
train_df, train_y = data_prep(train_df, 'total_bedrooms')
val_df, val_y = data_prep(val_df, 'total_bedrooms')
test_df, test_y = data_prep(test_df, 'total_bedrooms')

In [14]:
train_df.head()

Unnamed: 0,latitude,longitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity,rooms_per_household,bedrooms_per_room,population_per_household
0,34.43,-119.67,39.0,1467.0,381.0,1404.0,374.0,2.3681,<1H OCEAN,3.92246,0.259714,3.754011
1,33.74,-118.32,24.0,6097.0,794.0,2248.0,806.0,10.1357,NEAR OCEAN,7.564516,0.130228,2.789082
2,39.13,-121.62,41.0,1317.0,309.0,856.0,337.0,1.6719,INLAND,3.908012,0.234624,2.540059
3,34.24,-118.63,9.0,4759.0,924.0,1884.0,915.0,4.8333,<1H OCEAN,5.201093,0.194158,2.059016
4,37.52,-122.3,38.0,2769.0,387.0,994.0,395.0,5.5902,NEAR OCEAN,7.010127,0.139762,2.516456


In [15]:
train_df.shape, val_df.shape, test_df.shape

((12384, 12), (4128, 12), (4128, 12))

In [16]:
cor_cols = ['total_bedrooms', 'total_rooms', 
            'population', 'households',
           'population_per_household']
res = train_df[cor_cols].corr().abs()
res.where(np.triu(np.ones(res.shape), k=1).astype(bool)).stack().sort_values(ascending=False)

total_bedrooms  households                  0.979399
                total_rooms                 0.931546
total_rooms     households                  0.921441
population      households                  0.906841
total_bedrooms  population                  0.877340
total_rooms     population                  0.853219
population      population_per_household    0.064998
total_bedrooms  population_per_household    0.034301
households      population_per_household    0.032522
total_rooms     population_per_household    0.029452
dtype: float64

### q3
Value of mutual information.

In [17]:
def mutual_info_house_score(series, y):
    return mutual_info_score(series, y)

In [18]:
cat_cols = train_df.select_dtypes(include='object').columns
scores = train_df[cat_cols].apply(mutual_info_house_score, args=(train_y,))
scores.sort_values(ascending=False)

ocean_proximity    0.101384
dtype: float64

### q4
Accuracy of the model


In [19]:
# one hot encode the categorical features
#dicts = train_df[cat_cols].to_dict(orient='records')
dv = DictVectorizer(sparse=False)
#dv.fit(dicts)

In [20]:
def one_hot_and_append_feat(x, cols, is_train=True):
    dicts = x[cols].to_dict(orient='records')
    if is_train:
        dv.fit(dicts)
    one_hot_df = pd.DataFrame(dv.transform(dicts), columns=dv.get_feature_names())
    x = x.reset_index(drop=True)
    x = pd.concat([x, one_hot_df], axis=1)
    return x.drop(['ocean_proximity'], axis=1)

In [21]:
train_df = one_hot_and_append_feat(train_df, cat_cols)
val_df = one_hot_and_append_feat(val_df, cat_cols)
test_df = one_hot_and_append_feat(test_df, cat_cols)

In [22]:
train_df.shape, val_df.shape, test_df.shape

((12384, 16), (4128, 16), (4128, 16))

In [23]:
train_df.head()

Unnamed: 0,latitude,longitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,rooms_per_household,bedrooms_per_room,population_per_household,ocean_proximity=<1H OCEAN,ocean_proximity=INLAND,ocean_proximity=ISLAND,ocean_proximity=NEAR BAY,ocean_proximity=NEAR OCEAN
0,34.43,-119.67,39.0,1467.0,381.0,1404.0,374.0,2.3681,3.92246,0.259714,3.754011,1.0,0.0,0.0,0.0,0.0
1,33.74,-118.32,24.0,6097.0,794.0,2248.0,806.0,10.1357,7.564516,0.130228,2.789082,0.0,0.0,0.0,0.0,1.0
2,39.13,-121.62,41.0,1317.0,309.0,856.0,337.0,1.6719,3.908012,0.234624,2.540059,0.0,1.0,0.0,0.0,0.0
3,34.24,-118.63,9.0,4759.0,924.0,1884.0,915.0,4.8333,5.201093,0.194158,2.059016,1.0,0.0,0.0,0.0,0.0
4,37.52,-122.3,38.0,2769.0,387.0,994.0,395.0,5.5902,7.010127,0.139762,2.516456,0.0,0.0,0.0,0.0,1.0


In [24]:
log_reg = LogisticRegression(solver="liblinear", C=1.0, max_iter=1000, random_state=42)

In [25]:
log_reg.fit(train_df, train_y)

LogisticRegression(max_iter=1000, random_state=42, solver='liblinear')

In [26]:
# prediction on val dataset
val_pred = log_reg.predict_proba(val_df)[:, 1]
val_pred = val_pred > 0.5
orig_acc = round(accuracy_score(val_y, val_pred), 2)
orig_acc

0.83

### q5
Let's find the least useful feature using the feature elimination technique.
Train a model with all these features (using the same parameters as in Q4).
Now exclude each feature from this set and train a model without it. Record the accuracy for each model.
For each feature, calculate the difference between the original accuracy and the accuracy without the feature.
Which of following feature has the smallest difference?
    - total_rooms
    - total_bedrooms
    - population
    - households
- note: the difference doesn't have to be positive

In [27]:
def feat_elimination(x, feat):
    x = x.copy()
    x.drop([feat], axis=1, inplace=True)
    return x

In [28]:
def model(x, y, val_x, val_y):
    lg = LogisticRegression(solver="liblinear", C=1.0, max_iter=1000, random_state=42)
    lg.fit(x, y)
    val_pred = lg.predict_proba(val_x)[:, 1]
    val_pred = val_pred > 0.5
    acc = accuracy_score(val_y, val_pred)
    return abs(orig_acc - acc)

In [29]:
smallest_acc = []
feats = ['total_rooms', 'total_bedrooms', 'population', 'households']
for col in feats:
    X = feat_elimination(train_df, col)
    val_X = feat_elimination(val_df, col)
    acc = model(X, train_y, val_X, val_y)
    smallest_acc.append((col, acc))

In [30]:
sorted(dict(smallest_acc).items(), key=lambda x: x[1])[0]

('total_rooms', 0.001153100775193816)

### q6
Regression with Scikit-Learn. What's the best alpha?
- For this question, we'll see how to use a linear regression model from Scikit-Learn
- We'll need to use the original column 'median_house_value'. Apply the logarithmic transformation to this column.
- Fit the Ridge regression model (model = Ridge(alpha=a, solver="sag", random_state=42)) on the training data.
- This model has a parameter alpha. 
- Let's try the following values: [0, 0.01, 0.1, 1, 10]
- Which of these alphas leads to the best RMSE on the validation set? Round your RMSE scores to 3 decimal digits.


In [31]:
def rmse(y, y_pred):
    return np.sqrt(np.mean((y-y_pred)**2, axis=0))

In [36]:
train_df, val_df = train_test_split(new_df, test_size=0.2, random_state=42)
train_df, test_df = train_test_split(train_df, test_size=0.25, random_state=42)

In [37]:
train_df, train_y = data_prep(train_df, 'total_bedrooms', model='reg')
val_df, val_y = data_prep(val_df, 'total_bedrooms', model='reg')
test_df, test_y = data_prep(test_df, 'total_bedrooms', model='reg')

In [38]:
train_df = one_hot_and_append_feat(train_df, cat_cols)
val_df = one_hot_and_append_feat(val_df, cat_cols)
test_df = one_hot_and_append_feat(test_df, cat_cols)

In [41]:
train_df.shape, val_df.shape, test_df.shape

((12384, 16), (4128, 16), (4128, 16))

In [53]:
alphas = [0, 0.01, 0.1, 1, 10]
rmses = []
for alpha in alphas:
    ridge = Ridge(alpha=alpha, solver="sag", random_state=42)
    ridge.fit(train_df, train_y)
    y_pred = ridge.predict(val_df)
    rmses.append((alpha, round(rmse(val_y, y_pred), 3)))

In [54]:
sorted(rmses, key=lambda x: (x[1], x[0]))

[(0, 0.566), (0.01, 0.566), (0.1, 0.566), (1, 0.566), (10, 0.566)]