In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv("https://raw.githubusercontent.com/alexeygrigorev/datasets/master/housing.csv")

In [3]:
df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [4]:
features = [
    'latitude',
    'longitude',
    'housing_median_age',
    'total_rooms',
    'total_bedrooms',
    'population',
    'households',
    'median_income',
    'median_house_value',
    'ocean_proximity'
]

In [5]:
df = df[features]

In [6]:
df.isna().sum()

latitude                0
longitude               0
housing_median_age      0
total_rooms             0
total_bedrooms        207
population              0
households              0
median_income           0
median_house_value      0
ocean_proximity         0
dtype: int64

# Data preparation

In [7]:
df.fillna(0, inplace=True)
df['rooms_per_household'] = df.total_rooms / df.households
df['bedrooms_per_room'] = df.total_bedrooms  / df.total_rooms
df['population_per_household'] = df.population / df.households

In [8]:
df.head()

Unnamed: 0,latitude,longitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity,rooms_per_household,bedrooms_per_room,population_per_household
0,37.88,-122.23,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY,6.984127,0.146591,2.555556
1,37.86,-122.22,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY,6.238137,0.155797,2.109842
2,37.85,-122.24,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY,8.288136,0.129516,2.80226
3,37.85,-122.25,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY,5.817352,0.184458,2.547945
4,37.85,-122.25,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY,6.281853,0.172096,2.181467


# Q1: What is the most frequent observation (mode) for the column ocean_proximity?

In [9]:
df.ocean_proximity.value_counts()

<1H OCEAN     9136
INLAND        6551
NEAR OCEAN    2658
NEAR BAY      2290
ISLAND           5
Name: ocean_proximity, dtype: int64

# Make y binary

In [10]:
df["above_average"] = df.median_house_value > df.median_house_value.mean()

# Split the data

In [53]:
from sklearn.model_selection import train_test_split

df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=42)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=42)

df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

y_train = df_train.above_average
y_val = df_val.above_average
y_test = df_test.above_average

y_reg_train = np.log1p(df_train.above_average)
y_reg_val = np.log1p(df_val.above_average)
y_reg_test = np.log1p(df_test.above_average)


del df_train['median_house_value']
del df_val['median_house_value']
del df_test['median_house_value']
del df_train['above_average']
del df_val['above_average']
del df_test['above_average']

# Q2: What are the two features that have the biggest correlation in this dataset?

In [12]:
df_train.corr()

Unnamed: 0,latitude,longitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,rooms_per_household,bedrooms_per_room,population_per_household
latitude,1.0,-0.925005,0.002477,-0.025914,-0.05973,-0.100272,-0.063529,-0.076805,0.119118,-0.124507,-0.002301
longitude,-0.925005,1.0,-0.099812,0.036449,0.06384,0.09167,0.049762,-0.016426,-0.034814,0.10232,0.011022
housing_median_age,0.002477,-0.099812,1.0,-0.363522,-0.324156,-0.292476,-0.306119,-0.119591,-0.181275,0.129456,0.012167
total_rooms,-0.025914,0.036449,-0.363522,1.0,0.931546,0.853219,0.921441,0.198951,0.168926,-0.194185,-0.029452
total_bedrooms,-0.05973,0.06384,-0.324156,0.931546,1.0,0.87734,0.979399,-0.009833,0.010381,0.078094,-0.034301
population,-0.100272,0.09167,-0.292476,0.853219,0.87734,1.0,0.906841,-0.000849,-0.07621,0.031592,0.064998
households,-0.063529,0.049762,-0.306119,0.921441,0.979399,0.906841,1.0,0.011925,-0.085832,0.058004,-0.032522
median_income,-0.076805,-0.016426,-0.119591,0.198951,-0.009833,-0.000849,0.011925,1.0,0.394154,-0.616617,-0.000454
rooms_per_household,0.119118,-0.034814,-0.181275,0.168926,0.010381,-0.07621,-0.085832,0.394154,1.0,-0.500589,0.001801
bedrooms_per_room,-0.124507,0.10232,0.129456,-0.194185,0.078094,0.031592,0.058004,-0.616617,-0.500589,1.0,-0.002851


In [13]:
corr_abs = df_train.corr().abs() - np.eye(11)
corr_abs[corr_abs == corr_abs.max().max()]

Unnamed: 0,latitude,longitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,rooms_per_household,bedrooms_per_room,population_per_household
latitude,,,,,,,,,,,
longitude,,,,,,,,,,,
housing_median_age,,,,,,,,,,,
total_rooms,,,,,,,,,,,
total_bedrooms,,,,,,,0.979399,,,,
population,,,,,,,,,,,
households,,,,,0.979399,,,,,,
median_income,,,,,,,,,,,
rooms_per_household,,,,,,,,,,,
bedrooms_per_room,,,,,,,,,,,


## max correlation is between households and total_bedrooms

# Q3 Calculate the mutual information score with the (binarized) price for the categorical variable that we have. Use the training set only.

In [14]:
from sklearn.metrics import mutual_info_score

mutual_info_score(y_train, df_train.ocean_proximity)
 

0.10138385763624205

# Q4: Accuracy of LogisticRegression

In [21]:
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import accuracy_score

model = LogisticRegression(solver="liblinear", C=1.0, max_iter=1000, random_state=42)

In [33]:
def prepare_X(df, vectorizer = None):
    row_dicts = df.to_dict(orient='records')
    if vectorizer == None:
        vectorizer = DictVectorizer(sparse=False)
        vectorizer.fit(row_dicts)
    
    return vectorizer.transform(row_dicts), vectorizer


In [37]:
X_train, vectorizer = prepare_X(df_train)
model.fit(X_train, y_train)

X_val,_ = prepare_X(df_val, vectorizer)

preds = model.predict(X_val)
original_accuracy = accuracy_score(preds, y_val)
round(original_accuracy,2)

0.84

# Q5: Which of following feature has the smallest difference?

In [42]:
scores = {}
for column in ['total_rooms', 'total_bedrooms', 'population', 'households']:
    X_train, vectorizer = prepare_X(df_train.drop(column, axis=1))
    model = LogisticRegression(solver="liblinear", C=1.0, max_iter=1000, random_state=42)
    model.fit(X_train, y_train)

    X_val,_ = prepare_X(df_val.drop(column, axis=1), vectorizer)

    preds = model.predict(X_val)
    scores[column] = original_accuracy - accuracy_score(preds, y_val)
    

In [43]:
scores

{'total_rooms': 0.0002422480620154488,
 'total_bedrooms': -0.00024224806201555982,
 'population': 0.01017441860465118,
 'households': 0.002664728682170492}

## total bedrooms has the smallest (even negative) difference

# Q6: Ridge Regression

In [48]:
def rmse(y, y_pred):
    error = y_pred - y
    mse = (error ** 2).mean()
    return np.sqrt(mse)

In [47]:
from sklearn.linear_model import Ridge

In [60]:
scores = {}
for a in [0, 0.01, 0.1, 1, 100]:
    X_train, vectorizer = prepare_X(df_train)
    model = Ridge(alpha=a, solver="sag", random_state=42)
    model.fit(X_train, y_reg_train)

    X_val,_ = prepare_X(df_val, vectorizer)

    preds = model.predict(X_val)
    scores[a] = round(rmse(preds, y_reg_val),3)
     

In [61]:
scores

{0: 0.314, 0.01: 0.314, 0.1: 0.314, 1: 0.314, 100: 0.314}