In [1]:
import numpy as np
import pandas as pd

### Prepare data

In [2]:
df_orig = pd.read_csv("https://raw.githubusercontent.com/alexeygrigorev/datasets/master/housing.csv")

In [3]:
features = ['latitude',
'longitude',
'housing_median_age',
'total_rooms',
'total_bedrooms',
'population',
'households',
'median_income',
'median_house_value',
'ocean_proximity',]

In [4]:
df_orig.isnull().sum() # check if any null value exists

longitude               0
latitude                0
housing_median_age      0
total_rooms             0
total_bedrooms        207
population              0
households              0
median_income           0
median_house_value      0
ocean_proximity         0
dtype: int64

In [5]:
def pre_clean(df, feats, *, fill=0):
    _df = df.copy()
    _df = _df[feats]
    _df.columns = df.columns.str.lower().str.replace(' ', '_')
    for c in _df.columns[_df.dtypes == "object"].values:
        _df[c] = _df[c].str.lower().str.replace(' ', '_')
    _df.fillna(0)
    return _df

In [6]:
df = pre_clean(df_orig, features)

In [7]:
df["rooms_per_household"] = df.total_rooms / df.households
df["bedrooms_per_room"] = df.total_bedrooms / df.total_rooms
df["population_per_household"] = df.population / df.households

In [8]:
df.head().T

Unnamed: 0,0,1,2,3,4
longitude,37.88,37.86,37.85,37.85,37.85
latitude,-122.23,-122.22,-122.24,-122.25,-122.25
housing_median_age,41.0,21.0,52.0,52.0,52.0
total_rooms,880.0,7099.0,1467.0,1274.0,1627.0
total_bedrooms,129.0,1106.0,190.0,235.0,280.0
population,322.0,2401.0,496.0,558.0,565.0
households,126.0,1138.0,177.0,219.0,259.0
median_income,8.3252,8.3014,7.2574,5.6431,3.8462
median_house_value,452600.0,358500.0,352100.0,341300.0,342200.0
ocean_proximity,near_bay,near_bay,near_bay,near_bay,near_bay


# Question 1: 

### What is the most frequent observation (mode) for the column `ocean_proximity`?

In [9]:
# df.ocean_proximity.value_counts() # check others
df.ocean_proximity.mode()

0    <1h_ocean
Name: ocean_proximity, dtype: object

# Question 2: 
- _Create the correlation matrix for the numerical features of your train dataset._
- _In a correlation matrix, you compute the correlation coefficient between every pair of features in the dataset._
### What are the two features that have the biggest correlation in this dataset?

In [10]:
numeric_col_features = df.columns[df.dtypes != 'object']
df[numeric_col_features].corrwith(df.median_house_value).sort_values(ascending=False)

median_house_value          1.000000
median_income               0.688075
rooms_per_household         0.151948
total_rooms                 0.134153
housing_median_age          0.105623
households                  0.065843
total_bedrooms              0.049686
population_per_household   -0.023737
population                 -0.024650
latitude                   -0.045967
longitude                  -0.144160
bedrooms_per_room          -0.255880
dtype: float64

### Make `median_house_value` binary

- _We need to turn the `median_house_value` variable from numeric into binary_
- _Let's create a variable `above_average` which is `1` if the `median_house_value` is above it's mean value and `0` otherwise_


In [11]:
above_average = (df.median_house_value > df.median_house_value.mean()).astype('int')
df.median_house_value = above_average

### Split the data

- _Split your data in train/val/test sets, with 60%/20%/20% distribution._
- _Use Scikit-Learn for that (the train_test_split function) and set the seed to 42._
- _Make sure that the target value (median_house_value) is not in your dataframe._

In [12]:
from sklearn.model_selection import train_test_split
random_seed = 42
X_full_train, X_test = train_test_split(df, test_size=0.2, random_state=random_seed)
X_train, X_val = train_test_split(X_full_train, test_size=0.25, random_state=random_seed)

y_train = X_train.median_house_value
y_val = X_val.median_house_value
y_test = X_test.median_house_value

# Remove target value from X
for i in [X_train, X_val, X_test]:
    del i['median_house_value']    

# Question 3:
- _Calculate the mutual information score with the (binarized) price for the categorical variable that we have. Use the training set only._
- **What is the value of mutual information?**
- _Round it to 2 decimal digits using round(score, 2)_


In [13]:
X_train[df.columns[df.dtypes=='object']]

Unnamed: 0,ocean_proximity
17244,<1h_ocean
8817,near_ocean
19686,inland
3545,<1h_ocean
17019,near_ocean
...,...
5606,<1h_ocean
16339,inland
14965,<1h_ocean
11117,<1h_ocean


In [14]:
from sklearn.metrics import mutual_info_score
categorical_features = df.columns[df.dtypes=='object']
X_train[categorical_features].apply(lambda x, y: mutual_info_score(x, y), args=[y_train]).round(2)

ocean_proximity    0.1
dtype: float64

# Question 4:

- _Now let's train a logistic regression_
- _Remember that we have one categorical variable ocean_proximity in the data. Include it using one-hot encoding._
- _Fit the model on the training dataset._
    - _To make sure the results are reproducible across different versions of Scikit-Learn, fit the model with these parameters:_
    - _model = LogisticRegression(solver="liblinear", C=1.0, max_iter=1000, random_state=42)_
    - _Calculate the accuracy on the validation dataset and round it to 2 decimal digits._

In [15]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
X_train_dicts = X_train.to_dict(orient='records')
dv = DictVectorizer(sparse=False)
X_train_dv = dv.fit_transform(X_train_dicts)
lr_model = LogisticRegression(solver="liblinear", C=1.0,max_iter=1000,random_state=42)
lr_model.fit(X_train_dv, y_train)

In [16]:
X_val_dv = dv.fit_transform(X_val.to_dict(orient='records'))
y_pred = lr_model.predict_proba(X_val_dv)[:, 1]
y_pred_binary = (y_pred > y_pred.mean()).astype('int')

In [17]:
accuracy = 1 - ((y_pred_binary - y_val).mean())
print("Accuracy: ", accuracy)

Accuracy:  0.9561531007751938


# Question 5

- _Let's find the least useful feature using the feature elimination technique._
- _Train a model with all these features (using the same parameters as in Q4)._
- _Now exclude each feature from this set and train a model without it. Record the accuracy for each model._
- _For each feature, calculate the difference between the original accuracy and the accuracy without the feature._
- **Which of following feature has the smallest difference?**
    - **total_rooms**
    - **total_bedrooms**
    - **population**
    - **households**
    
> **Note**: the difference doesn't have to be positive

In [18]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression

def vectorize(X):
    dv = DictVectorizer(sparse=False)
    return dv.fit_transform(X.to_dict(orient='records'))
    

def train_logistic(X, y):
    X_dv = vectorize(X)
    model = LogisticRegression(solver="liblinear", C=1.0,max_iter=1000,random_state=42)
    return model.fit(X_dv, y)
    
    
def get_accuracy_for_above_median_housing_value(model, x_real, y_real):
    y_pred = model.predict_proba(x_real)[:,1]
    y_pred_binary = (y_pred > y_pred.mean()).astype('int')
    error = (y_pred_binary - y_real).mean()
    return 1 - error


In [19]:
w_all_feature = train_logistic(X_train, y_train)
all_accuracy = get_accuracy_for_above_median_housing_value(w_all_feature, vectorize(X_val), y_val)
print(f"Original: {all_accuracy=}", end="\n")

features = list(df.columns)
features.remove('median_house_value')
diff = []
for f in features:
    feature_copy = features.copy()
    feature_copy.remove(f)
    X_train_f = X_train[feature_copy]
    X_val_f = X_val[feature_copy]
    w = train_logistic(X_train_f, y_train)
    accuracy = get_accuracy_for_above_median_housing_value(w, vectorize(X_val_f), y_val)
    _diff = all_accuracy - accuracy
    diff.append((f, _diff))
sorted(diff, key=lambda x: x[1])

Original: all_accuracy=0.9561531007751938


[('ocean_proximity', -0.01380813953488369),
 ('population', -0.003875968992248069),
 ('households', -0.002664728682170603),
 ('rooms_per_household', -0.0014534883720930258),
 ('population_per_household', -0.0009689922480620172),
 ('bedrooms_per_room', -0.0004844961240310086),
 ('housing_median_age', -0.00024224806201555982),
 ('longitude', 0.0002422480620154488),
 ('total_bedrooms', 0.0002422480620154488),
 ('total_rooms', 0.0004844961240310086),
 ('latitude', 0.004118217054263518),
 ('median_income', 0.05620155038759689)]

# Question 6

- _For this question, we'll see how to use a linear regression model from Scikit-Learn_
- _We'll need to use the original column 'median_house_value'. Apply the logarithmic transformation to this column._
- _Fit the Ridge regression model (model = Ridge(alpha=a, solver="sag", random_state=42)) on the training data._
- _This model has a parameter alpha. Let's try the following values: [0, 0.01, 0.1, 1, 10]_
- _Which of these alphas leads to the best RMSE on the validation set? Round your RMSE scores to 3 decimal digits._
If there are multiple options, select the smallest alpha.

In [24]:
from sklearn.linear_model import Ridge
from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import train_test_split
df_logi = pre_clean(df_orig.copy().fillna(0), df_orig.columns)
df_logi.median_house_value = np.log1p(df_logi.median_house_value) # apply the log transformation
X_full_train_logi, X_test_logi = train_test_split(df_logi, test_size=0.2)
X_train_logi, X_val_logi = train_test_split(X_full_train_logi, test_size=0.25)

y_train_logi = X_train_logi.median_house_value
y_val_logi = X_val_logi.median_house_value
y_test_logi = X_test_logi.median_house_value

del X_train_logi["median_house_value"]
del X_val_logi["median_house_value"]
del X_test_logi["median_house_value"]

In [32]:
def rmse(y, y_pred):
    error = y_pred - y
    mse = (error ** 2).mean()
    return np.sqrt(mse)

X_train_logi_dv = DictVectorizer(sparse=False).fit_transform(X_train_logi.to_dict(orient="records"))
X_val_logi_dv = DictVectorizer(sparse=False).fit_transform(X_val_logi.to_dict(orient="records"))

alphas = [0, 0.01, 0.1, 1, 10]
for a in alphas:
    model = Ridge(alpha=a, solver="sag", random_state=42)
    w = model.fit(X_train_logi_dv, y_train_logi)
    y_pred = w.predict(X_val_logi_dv)
    print(a, rmse(y_val_logi, y_pred).round(3), sep="\t")

0	0.527
0.01	0.527
0.1	0.527
1	0.527
10	0.527


---