In [2]:
import pandas as pd

In [3]:
df = pd.read_csv("https://raw.githubusercontent.com/alexeygrigorev/datasets/master/housing.csv")
df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [8]:
df.dtypes

longitude             float64
latitude              float64
housing_median_age    float64
total_rooms           float64
total_bedrooms        float64
population            float64
households            float64
median_income         float64
median_house_value    float64
ocean_proximity        object
dtype: object

## Question 1

In [32]:
df["ocean_proximity"].value_counts(normalize=True)

<1H OCEAN     0.442636
INLAND        0.317393
NEAR OCEAN    0.128779
NEAR BAY      0.110950
ISLAND        0.000242
Name: ocean_proximity, dtype: float64

- Split your data in train/val/test sets, with 60%/20%/20% distribution.
- Use Scikit-Learn for that (the train_test_split function) and set the seed to 42.
- Make sure that the target value (median_house_value) is not in your dataframe.

In [4]:
from sklearn.model_selection import train_test_split

In [5]:
df_train_full, df_test = train_test_split(df, test_size=0.2, random_state=42)
df_train, df_val = train_test_split(df_train_full, test_size=0.33, random_state=42)

In [6]:
y_train = df_train.median_house_value
y_val = df_val.median_house_value

In [7]:
del df_train["median_house_value"]
del df_val["median_house_value"]

In [8]:
numerical = list(df_train.columns[df_train.dtypes != "object"])
categorical = list(df_train.columns[df_train.dtypes == "object"])

In [16]:
corr_values = df_train[numerical].corr().unstack().sort_values(ascending = False) 
corr_values[corr_values < 1]

households          total_bedrooms        0.979031
total_bedrooms      households            0.979031
total_rooms         total_bedrooms        0.932113
total_bedrooms      total_rooms           0.932113
households          total_rooms           0.921965
total_rooms         households            0.921965
households          population            0.906282
population          households            0.906282
total_bedrooms      population            0.876005
population          total_bedrooms        0.876005
total_rooms         population            0.852403
population          total_rooms           0.852403
total_rooms         median_income         0.200368
median_income       total_rooms           0.200368
population          longitude             0.091676
longitude           population            0.091676
                    total_bedrooms        0.063111
total_bedrooms      longitude             0.063111
longitude           households            0.048792
households          longitude  

In [17]:
import numpy as np
above_average = np.where(y_train > y_train.mean(), 1,0)
above_average_val = np.where(y_val > y_train.mean(), 1, 0)

## Question 3

In [18]:
from sklearn.metrics import mutual_info_score
round(mutual_info_score(df_train["ocean_proximity"], above_average),2)

0.1

## Question 4

In [21]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [32]:
dv = DictVectorizer(sparse=False)

train_dict = df_train.to_dict(orient='records')

dv = DictVectorizer(sparse=False)
dv.fit(train_dict)

X_train = dv.transform(train_dict)

In [33]:
model = LogisticRegression(solver="liblinear", C=1.0, max_iter=1000, random_state=42)
model.fit(X_train, above_average)

In [39]:
val_dict = df_val.to_dict(orient='records')
X_val = dv.transform(val_dict)

y_pred = model.predict(X_val)
accuracy = np.round(accuracy_score(above_average_val, y_pred),2)
print(accuracy)

0.83


## Question 5

- Let's find the least useful feature using the feature elimination technique.
- Train a model with all these features (using the same parameters as in Q4).
- Now exclude each feature from this set and train a model without it. Record the accuracy for each model.
- For each feature, calculate the difference between the original accuracy and the accuracy without the feature.
- Which of following feature has the smallest difference?
    - total_rooms
    - total_bedrooms
    - population
    - households
> note: the difference doesn't have to be positive

In [43]:
features = ["total_rooms", "total_bedrooms", "population", "households"]
# baseline model 
train_dict = df_train[features].to_dict(orient='records')

dv = DictVectorizer(sparse=False)
dv.fit(train_dict)

X_train = dv.transform(train_dict)

model = LogisticRegression(solver="liblinear", C=1.0, max_iter=1000, random_state=42)
model.fit(X_train, above_average)

val_dict = df_val[features].to_dict(orient='records')
X_val = dv.transform(val_dict)

y_pred = model.predict(X_val)

orig_accuracy = accuracy_score(above_average_val, y_pred)


In [44]:
orig_accuracy

0.7104055790053221

In [45]:
for i in features:
    model_features = features.copy()
    model_features.remove(i)

    train_dict = df_train[model_features].to_dict(orient='records')

    dv = DictVectorizer(sparse=False)
    dv.fit(train_dict)

    X_train = dv.transform(train_dict)

    model = LogisticRegression(solver="liblinear", C=1.0, max_iter=1000, random_state=42)
    model.fit(X_train, above_average)

    val_dict = df_val[model_features].to_dict(orient='records')
    X_val = dv.transform(val_dict)

    y_pred = model.predict(X_val)

    score = accuracy_score(above_average_val, y_pred)

    print(i, orig_accuracy - score, score)

total_rooms 0.08478619930262432 0.6256193797026978
total_bedrooms 0.05175261515874474 0.6586529638465773
population 0.05487245366122229 0.6555331253440998
households 0.040190860708386844 0.6702147182969352


## Question 6
- For this question, we'll see how to use a linear regression model from Scikit-Learn
- We'll need to use the original column 'median_house_value'. Apply the logarithmic transformation to this column.
- Fit the Ridge regression model (model = Ridge(alpha=a, solver="sag", random_state=42)) on the training data.
- This model has a parameter alpha. Let's try the following values: [0, 0.01, 0.1, 1, 10]
- Which of these alphas leads to the best RMSE on the validation set? Round your RMSE scores to 3 decimal digits.

If there are multiple options, select the smallest alpha.

In [54]:
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error

y_train_log = np.log1p(y_train)
y_val_log = np.log1p(y_val)

for a in [0, 0.01, 0.1, 1, 10]:
    model = Ridge(alpha=a, solver="sag", random_state=42)
    model.fit(X_train, y_train_log)
    
    y_pred = model.predict(X_val)
    
    # if squared is set to false returns RMSE
    score = mean_squared_error(y_val_log, y_pred, squared=False)
    
    print(f"The alpha is: {a}. The RMSE: {round(score,3)}")

The alpha is: 0. The RMSE: 0.548
The alpha is: 0.01. The RMSE: 0.548
The alpha is: 0.1. The RMSE: 0.548
The alpha is: 1. The RMSE: 0.548
The alpha is: 10. The RMSE: 0.548
