In [2]:
import pandas as pd
import numpy as np
import os
from sklearn.impute import SimpleImputer 
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import StratifiedShuffleSplit

In [None]:
# Load the raw housing table from the local CSV file.
csv_path = "data/housing.csv"
housing_df = pd.read_csv(csv_path)
# Preview the first rows to confirm the load.
housing_df.head(n=5)

In [None]:
# Count the missing values in each column.
# (To list the affected rows instead: housing_df[housing_df.isna().any(axis=1)])
housing_df.isna().sum()

In [None]:
# Set the categorical column aside so the imputer only sees the remaining columns.
housing_num = housing_df.drop(columns=["ocean_proximity"])

In [None]:
# Impute the non-categorical columns (SimpleImputer with its default settings)
# and write the result back into every column except 'ocean_proximity'.
numeric_mask = housing_df.columns != 'ocean_proximity'
imputed_values = SimpleImputer().fit_transform(housing_num)
housing_df.loc[:, numeric_mask] = imputed_values

# Re-count the nulls to confirm the imputation filled them.
housing_df.isna().sum()

In [None]:
# Derive three ratio features from the raw count columns.
households = housing_df["households"]
total_rooms = housing_df["total_rooms"]
housing_df["rooms_per_household"] = total_rooms.div(households)
housing_df["population_per_household"] = housing_df["population"].div(households)
housing_df["bedrooms_per_room"] = housing_df["total_bedrooms"].div(total_rooms)
housing_df.head(n=5)

Standard Scaler

In [None]:
# Pull the target column out of the frame (pop returns it and removes it in
# place) so the scaler is not applied to the label.
labels = housing_df.pop("median_house_value")
housing_df.head(n=5)

In [None]:
# Keep median income out of the scaling step: pop returns the column and
# removes it from the frame in place. It is restored unscaled afterwards.
median_income = housing_df.pop("median_income")
housing_df.head(n=5)

In [None]:
# Standardize every remaining column except the categorical one and write the
# scaled values back in place.
housing_num = housing_df.drop(columns=["ocean_proximity"])
scaled_values = StandardScaler().fit_transform(housing_num)
numeric_mask = housing_df.columns != 'ocean_proximity'
housing_df.loc[:, numeric_mask] = scaled_values
housing_df.head(n=5)

In [None]:
# Re-attach the unscaled median income that was set aside before scaling.
housing_df.loc[:, "median_income"] = median_income
housing_df.head(n=5)

In [None]:
from sklearn.preprocessing import OneHotEncoder

# Apply one-hot encoding to the categorical column.
housing_cat = housing_df[["ocean_proximity"]]
enconder = OneHotEncoder()
arr = enconder.fit_transform(housing_cat).toarray()

# Expand the encoded matrix into one numeric indicator column per category.
# Storing each row's vector as a Python list inside a single object column
# (the previous approach) leaves a column no estimator can consume.
encoded_columns = [f"ocean_proximity_{category}" for category in enconder.categories_[0]]
encoded_df = pd.DataFrame(arr, columns=encoded_columns, index=housing_df.index)

# Replace the text column with its indicator columns.
housing_df = pd.concat([housing_df.drop(columns="ocean_proximity"), encoded_df], axis=1)
housing_df.head()

In [None]:
# Put the target column back now that the feature transformations are done.
housing_df.loc[:, "median_house_value"] = labels
housing_df.head(n=5)

In [None]:
# Bucket median income into five categories; this column is used to stratify
# the shuffle split.
income_bins = [0., 1.5, 3.0, 4.5, 6., np.inf]
income_labels = [1, 2, 3, 4, 5]
housing_df["income_cat"] = pd.cut(
    housing_df["median_income"],
    bins=income_bins,
    labels=income_labels,
)
housing_df.head(n=5)

In [None]:
# Single 80/20 split, stratified on the income category, with a fixed seed.
split = StratifiedShuffleSplit(n_splits=1,
                               test_size=.2,
                               random_state=42)

# Create the train and test sets.
# split() yields positional row numbers, so select with .iloc: label-based
# .loc only gives the same rows while the frame has a default 0..n-1 index.
# .copy() makes both sets independent frames, so later in-place edits do not
# raise SettingWithCopyWarning.
train_index, test_index = next(split.split(housing_df, housing_df["income_cat"]))
strat_train_set = housing_df.iloc[train_index].copy()
strat_test_set = housing_df.iloc[test_index].copy()

In [None]:
# Remove the helper column that was only needed for stratification.
# Reassigning (instead of an in-place drop on frames derived by indexing)
# avoids SettingWithCopyWarning, and errors="ignore" lets the cell be re-run
# without a KeyError once the column is already gone.
strat_train_set = strat_train_set.drop(columns="income_cat", errors="ignore")
strat_test_set = strat_test_set.drop(columns="income_cat", errors="ignore")

In [None]:
# Preview the first rows of the training set.
strat_train_set.head(n=5)

In [None]:
# Print the (rows, columns) shape of the train set, then the test set.
for subset in (strat_train_set, strat_test_set):
    print(subset.shape)

# Random Forest Model

In [None]:
# Separate features from the target for both the training and the test set.
target_col = "median_house_value"

housing = strat_train_set.drop(columns=[target_col])
housing_labels = strat_train_set[target_col].copy()

housing_tst = strat_test_set.drop(columns=[target_col])
housing_labels_tst = strat_test_set[target_col].copy()

In [None]:
# Preview the training features instead of displaying the whole frame.
housing.head()

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Random forest with 100 trees and a fixed random_state.
forest_params = {"n_estimators": 100, "random_state": 42}
rfr = RandomForestRegressor(**forest_params)

In [None]:
# Train on exactly the columns the prediction cell uses: every feature except
# 'ocean_proximity'. Fitting on the full frame while predicting without that
# column gives the model a different number of features at predict time.
rfr.fit(housing.loc[:, housing.columns != 'ocean_proximity'], housing_labels)

In [None]:
# Predict on the test features, leaving out 'ocean_proximity' if it is present.
test_features = housing_tst.drop(columns="ocean_proximity", errors="ignore")
y_pred = rfr.predict(test_features)

In [None]:
from sklearn.metrics import r2_score

In [None]:
# R^2 of the test-set predictions against the true test labels.
r2_score(y_true=housing_labels_tst, y_pred=y_pred)

# Pipelines

In [12]:
# Reload the raw CSV so the pipeline section starts from untouched data.
csv_path = "data/housing.csv"
housing_df = pd.read_csv(csv_path)
housing_df.head(n=5)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [4]:
# Bucket median income into five categories; the column is only used to
# stratify the split below.
housing_df["income_cat"] = pd.cut(housing_df["median_income"],
                                  bins=[0., 1.5, 3.0, 4.5, 6., np.inf],
                                  labels=[1, 2, 3, 4, 5])

# Single 80/20 split, stratified on the income category, with a fixed seed.
split = StratifiedShuffleSplit(n_splits=1,
                               test_size=.2,
                               random_state=42)

# Create the train and test sets.
# split() yields positional row numbers, so rows are selected with .iloc
# (label-based .loc only matches while the index is the default 0..n-1).
train_index, test_index = next(split.split(housing_df, housing_df["income_cat"]))

# Drop the helper column with a non-mutating drop: this returns fresh frames
# and avoids the SettingWithCopyWarning an in-place drop on a slice raises.
strat_train_set = housing_df.iloc[train_index].drop(columns="income_cat")
strat_test_set = housing_df.iloc[test_index].drop(columns="income_cat")

In [13]:
# Training features (label removed), the training labels, and a view of the
# features without the categorical column.
housing = strat_train_set.drop(columns=["median_house_value"])
housing_labels = strat_train_set["median_house_value"].copy()
housing_num = housing.drop(columns=["ocean_proximity"])

In [15]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer

# Numeric branch: fill missing values with the column median, then standardize.
# (A custom 'attribs_adder' step is intentionally left out of this pipeline.)
numeric_steps = [
    ('imputer', SimpleImputer(strategy="median")),
    ('std_scaler', StandardScaler()),
]
num_pipeline_fulltest = Pipeline(numeric_steps)

# Column names routed to each branch.
num_attribs = housing_num.columns.tolist()
cat_attribs = ["ocean_proximity"]

# Numeric columns go through the pipeline above; the categorical column is
# one-hot encoded.
column_branches = [
    ("num", num_pipeline_fulltest, num_attribs),
    ("cat", OneHotEncoder(), cat_attribs),
]
full_pipeline_fulltest = ColumnTransformer(column_branches)

# Fit the preprocessing on the training features and transform them.
housing_prepared_fulltest = full_pipeline_fulltest.fit_transform(housing)

In [17]:
# Show the first prepared row.
housing_prepared_fulltest[0, :]

array([-0.94135046,  1.34743822,  0.02756357,  0.58477745,  0.64037127,
        0.73260236,  0.55628602, -0.8936472 ,  0.        ,  1.        ,
        0.        ,  0.        ,  0.        ])