# Homework 3
### Data Preparation

In [43]:
import pandas as pd
import numpy as np

# Load the raw housing dataset; the path is relative to the notebook's working directory
housing_csv_path = 'homework_data/housing.csv'
df = pd.read_csv(housing_csv_path)

In [44]:
# Columns kept for the homework (the features plus the median_house_value target)
selected_columns = [
    'latitude',
    'longitude',
    'housing_median_age',
    'total_rooms',
    'total_bedrooms',
    'population',
    'households',
    'median_income',
    'median_house_value',
    'ocean_proximity',
]

# Build the prepared frame in a single chain instead of mutating a column subset:
#   1. keep only the selected columns,
#   2. replace missing values with zero,
#   3. append the three derived ratio columns (in this order).
# Chaining avoids `inplace=True` and assigning new columns into a frame that
# pandas may flag as a copy of the loaded data (SettingWithCopyWarning).
df = (
    df[selected_columns]
    .fillna(0)
    .assign(
        rooms_per_household=lambda d: d['total_rooms'].div(d['households']),
        bedrooms_per_room=lambda d: d['total_bedrooms'].div(d['total_rooms']),
        population_per_household=lambda d: d['population'].div(d['households']),
    )
)

### Question 1

In [3]:
# Most frequent ocean_proximity category (mode() returns a Series of the top value(s))
df['ocean_proximity'].mode()

0    <1H OCEAN
Name: ocean_proximity, dtype: object

### Question 2

In [4]:
# Pairwise correlations between the numerical columns;
# ocean_proximity is the only categorical column, so it is left out first
numeric_features = df.drop(columns=['ocean_proximity'])
corr_df = numeric_features.corr()
print(corr_df)

                          latitude  longitude  housing_median_age  \
latitude                  1.000000  -0.924664            0.011173   
longitude                -0.924664   1.000000           -0.108197   
housing_median_age        0.011173  -0.108197            1.000000   
total_rooms              -0.036100   0.044568           -0.361262   
total_bedrooms           -0.065318   0.068082           -0.317063   
population               -0.108785   0.099773           -0.296244   
households               -0.071035   0.055310           -0.302916   
median_income            -0.079809  -0.015176           -0.119034   
median_house_value       -0.144160  -0.045967            0.105623   
rooms_per_household       0.106389  -0.027540           -0.153277   
bedrooms_per_room        -0.104112   0.084836            0.125396   
population_per_household  0.002366   0.002476            0.013191   

                          total_rooms  total_bedrooms  population  households  \
latitude             

From the correlation matrix, we see that `total_bedrooms` and `households` have the highest correlation with each other.

In [45]:
# Create the binary target: 1 when the house value is above the mean, otherwise 0.
# The mean is computed once and compared against the whole column at once;
# a row-wise lambda would recompute the column mean for every single row.
mean_house_value = df['median_house_value'].mean()
df['above_average'] = (df['median_house_value'] > mean_house_value).astype(int)

In [6]:
# Split the data 60/20/20 into train / validation / test sets
from sklearn.model_selection import train_test_split

# above_average is derived from median_house_value, so the raw price must not be
# a feature. It is removed from df in place: after this cell df no longer
# carries the median_house_value column.
del df['median_house_value']

y = df['above_average']
X = df.drop(columns=['above_average'])

# First cut: 60% train, 40% held out
X_train, X_dev, y_train, y_dev = train_test_split(
    X, y, test_size=0.4, random_state=42
)
# Second cut: the held-out 40% is halved into validation and test (20% each)
X_val, X_test, y_val, y_test = train_test_split(
    X_dev, y_dev, test_size=0.5, random_state=42
)

### Question 3

In [7]:
# Mutual information between ocean_proximity and the binary target,
# computed on the training split only and printed rounded to two decimals
from sklearn.metrics import mutual_info_score

mi = mutual_info_score(X_train['ocean_proximity'], y_train)
print(round(mi, 2))

0.1


### Question 4

In [8]:
# One-hot encode ocean_proximity with a DictVectorizer
from sklearn.feature_extraction import DictVectorizer

# Each training row becomes a {column: value} dict; the vectorizer is fitted
# on those dicts and then used to build the dense training matrix.
train_dicts = X_train.to_dict(orient='records')
dv = DictVectorizer(sparse=False)
dv.fit(train_dicts)
X_train = dv.transform(train_dicts)

In [9]:
from sklearn.linear_model import LogisticRegression

# Train the logistic regression on the encoded training matrix
model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
model.fit(X_train, y_train)

# Encode the validation rows with the vectorizer that was fitted on the training
# data. This must be transform(), not fit_transform(): refitting on the validation
# set rebuilds the feature mapping from validation rows, so a category missing
# from validation would change the number/order of columns the model receives.
val_dicts = X_val.to_dict(orient='records')
X_val = dv.transform(val_dicts)

# Validation accuracy, rounded to two decimals
pred = model.predict(X_val)
acc = round(np.mean(pred == y_val), 2)

In [10]:
print(acc)

0.83


### Question 5

In [41]:
# Reference accuracy of the model trained on all features (value reported in Question 4)
baseline_acc = 0.83

# Target columns, or columns derived from the target; these must never be model
# inputs. Some of them may be absent depending on which cells have been run,
# hence errors='ignore' below.
target_cols = ['above_average', 'median_house_value', 'log_med_val']

# Features to eliminate one at a time: every column that is not a target.
# Selecting by name (instead of a positional slice) keeps this correct even if
# extra columns have been added to df.
features = [c for c in df.columns if c not in target_cols]
acc_dict = {}

# The row split depends only on the number of rows and random_state, not on the
# columns, so it is done once here rather than once per eliminated feature.
X = df.drop(columns=target_cols, errors='ignore')
y = df['above_average']
X_train_full, X_dev, y_train, y_dev = train_test_split(X, y, test_size=0.4, random_state=42)
X_val_full, X_test, y_val, y_test = train_test_split(X_dev, y_dev, test_size=0.5, random_state=42)

for col in features:

    # Drop the feature under test from both splits
    train_dicts = X_train_full.drop(columns=col).to_dict(orient='records')
    val_dicts = X_val_full.drop(columns=col).to_dict(orient='records')

    # One-hot encode: fit the vectorizer on the training rows only, then reuse
    # that fitted mapping for validation (transform, not fit_transform) so both
    # matrices share the same columns in the same order.
    dv = DictVectorizer(sparse=False)
    X_train = dv.fit_transform(train_dicts)
    X_val = dv.transform(val_dicts)

    # Train the model
    model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
    model.fit(X_train, y_train)

    # Validate the model and record the absolute change versus the baseline
    pred = model.predict(X_val)
    acc = np.mean(pred == y_val)
    acc_dict[col] = np.abs(acc - baseline_acc)

In [42]:
# Feature whose removal changes the accuracy the least (smallest recorded difference)
min(acc_dict, key=lambda feature: acc_dict[feature])

'total_bedrooms'

According to the dictionary, removing `total_bedrooms` produces the smallest difference in accuracy of all the features.

### Question 6

In [50]:
from sklearn.linear_model import Ridge

# Raw house prices. The train/validation/test split cell removes
# median_house_value from df in place, so on a top-to-bottom run the column is
# no longer there; in that case re-read it from the source CSV.
# NOTE(review): the reloaded column lines up with df by row index, which assumes
# df's rows have not been filtered or reordered since loading.
if 'median_house_value' in df.columns:
    house_value = df['median_house_value']
else:
    house_value = pd.read_csv(
        'homework_data/housing.csv', usecols=['median_house_value']
    )['median_house_value'].fillna(0)

# Apply the logarithmic transformation to the target
df['log_med_val'] = np.log1p(house_value)

# Features are everything except the targets; median_house_value may already
# be gone from df, hence errors='ignore'.
X = df.drop(columns=['above_average', 'median_house_value', 'log_med_val'], errors='ignore')
y = df['log_med_val']

# Split and encode once: neither step depends on alpha, so they stay out of the loop
X_train, X_dev, y_train, y_dev = train_test_split(X, y, test_size=0.4, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_dev, y_dev, test_size=0.5, random_state=42)

# One-hot encode: fit the vectorizer on the training rows only, then reuse that
# fitted mapping for validation (transform, not fit_transform) so both matrices
# share the same columns in the same order.
train_dicts = X_train.to_dict(orient='records')
val_dicts = X_val.to_dict(orient='records')
dv = DictVectorizer(sparse=False)
X_train = dv.fit_transform(train_dicts)
X_val = dv.transform(val_dicts)

# Validation RMSE for each regularization strength
alpha_dict = {}
for a in [0, 0.01, 0.1, 1, 10]:

    # Train the model
    model = Ridge(alpha=a, solver='sag', random_state=42)
    model.fit(X_train, y_train)

    # Validate the model
    pred = model.predict(X_val)
    rmse = np.sqrt(np.mean((y_val - pred) ** 2))
    alpha_dict[a] = rmse

In [52]:
# Alpha with the lowest validation RMSE (the first one listed wins on ties)
best_alpha = min(alpha_dict, key=lambda alpha: alpha_dict[alpha])
print(best_alpha)

0


The best alpha for this regression is 0, i.e. no regularization.