In [1]:
import pandas as pd
import numpy as np
import os
from sklearn.impute import SimpleImputer 
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import StratifiedShuffleSplit

In [2]:
housing_df = pd.read_csv("data/housing.csv")
housing_df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [3]:
#Check which columns has null values. 
housing_df.isnull().sum()

#housing_df[housing_df.isnull().any(1)]

longitude               0
latitude                0
housing_median_age      0
total_rooms             0
total_bedrooms        207
population              0
households              0
median_income           0
median_house_value      0
ocean_proximity         0
dtype: int64

In [4]:
#Remove the categorizal column so imputer can be applied. 
housing_num = housing_df.drop("ocean_proximity", axis=1)

In [5]:
#Add the transformed columns excluding the categorical one. 
housing_df.loc[:,housing_df.columns != 'ocean_proximity'] = SimpleImputer().fit_transform(housing_num)

housing_df.isnull().sum()
#np.all(np.isnan(housing_num))

longitude             0
latitude              0
housing_median_age    0
total_rooms           0
total_bedrooms        0
population            0
households            0
median_income         0
median_house_value    0
ocean_proximity       0
dtype: int64

In [6]:
#Create 3 new columns. 
housing_df["rooms_per_household"] = housing_df["total_rooms"] / housing_df["households"]
housing_df["population_per_household"] = housing_df["population"] / housing_df["households"]
housing_df["bedrooms_per_room"] = housing_df["total_bedrooms"] / housing_df["total_rooms"]
housing_df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity,rooms_per_household,population_per_household,bedrooms_per_room
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY,6.984127,2.555556,0.146591
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY,6.238137,2.109842,0.155797
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY,8.288136,2.80226,0.129516
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY,5.817352,2.547945,0.184458
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY,6.281853,2.181467,0.172096


Standard Scaler

In [7]:
#Remove the label so scaler can be applied.
labels = housing_df["median_house_value"]
housing_df.drop("median_house_value", axis=1, inplace=True)
housing_df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity,rooms_per_household,population_per_household,bedrooms_per_room
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,NEAR BAY,6.984127,2.555556,0.146591
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,NEAR BAY,6.238137,2.109842,0.155797
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,NEAR BAY,8.288136,2.80226,0.129516
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,NEAR BAY,5.817352,2.547945,0.184458
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,NEAR BAY,6.281853,2.181467,0.172096


In [8]:
#Remove the median income before applying the scaler. 
median_income = housing_df["median_income"]
housing_df.drop("median_income", axis=1, inplace=True)
housing_df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,ocean_proximity,rooms_per_household,population_per_household,bedrooms_per_room
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,NEAR BAY,6.984127,2.555556,0.146591
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,NEAR BAY,6.238137,2.109842,0.155797
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,NEAR BAY,8.288136,2.80226,0.129516
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,NEAR BAY,5.817352,2.547945,0.184458
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,NEAR BAY,6.281853,2.181467,0.172096


In [9]:
#Add the transformed columns excluding the categorical one. 
housing_num = housing_df.drop("ocean_proximity", axis=1)
housing_df.loc[:,housing_df.columns != 'ocean_proximity'] = StandardScaler().fit_transform(housing_num)
housing_df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,ocean_proximity,rooms_per_household,population_per_household,bedrooms_per_room
0,-1.327835,1.052548,0.982143,-0.804819,-0.975228,-0.974429,-0.977033,NEAR BAY,0.628559,-0.049597,-0.973116
1,-1.322844,1.043185,-0.607019,2.04589,1.355088,0.861439,1.669961,NEAR BAY,0.327041,-0.092512,-0.841159
2,-1.332827,1.038503,1.856182,-0.535746,-0.829732,-0.820777,-0.843637,NEAR BAY,1.15562,-0.025843,-1.217873
3,-1.337818,1.038503,1.856182,-0.624215,-0.722399,-0.766028,-0.733781,NEAR BAY,0.156966,-0.050329,-0.430311
4,-1.337818,1.038503,1.856182,-0.462404,-0.615066,-0.759847,-0.629157,NEAR BAY,0.344711,-0.085616,-0.607519


In [10]:
housing_df["median_income"] = median_income
housing_df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,ocean_proximity,rooms_per_household,population_per_household,bedrooms_per_room,median_income
0,-1.327835,1.052548,0.982143,-0.804819,-0.975228,-0.974429,-0.977033,NEAR BAY,0.628559,-0.049597,-0.973116,8.3252
1,-1.322844,1.043185,-0.607019,2.04589,1.355088,0.861439,1.669961,NEAR BAY,0.327041,-0.092512,-0.841159,8.3014
2,-1.332827,1.038503,1.856182,-0.535746,-0.829732,-0.820777,-0.843637,NEAR BAY,1.15562,-0.025843,-1.217873,7.2574
3,-1.337818,1.038503,1.856182,-0.624215,-0.722399,-0.766028,-0.733781,NEAR BAY,0.156966,-0.050329,-0.430311,5.6431
4,-1.337818,1.038503,1.856182,-0.462404,-0.615066,-0.759847,-0.629157,NEAR BAY,0.344711,-0.085616,-0.607519,3.8462


In [11]:
from sklearn.preprocessing import OneHotEncoder

# apply one hot enconding to the categorical column. 
housing_cat = housing_df[["ocean_proximity"]]
enconder = OneHotEncoder()
arr = enconder.fit_transform(housing_cat).toarray()
# add the encoded column back to the dataframe
housing_df["ocean_proximity"] = arr.tolist()
housing_df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,ocean_proximity,rooms_per_household,population_per_household,bedrooms_per_room,median_income
0,-1.327835,1.052548,0.982143,-0.804819,-0.975228,-0.974429,-0.977033,"[0.0, 0.0, 0.0, 1.0, 0.0]",0.628559,-0.049597,-0.973116,8.3252
1,-1.322844,1.043185,-0.607019,2.04589,1.355088,0.861439,1.669961,"[0.0, 0.0, 0.0, 1.0, 0.0]",0.327041,-0.092512,-0.841159,8.3014
2,-1.332827,1.038503,1.856182,-0.535746,-0.829732,-0.820777,-0.843637,"[0.0, 0.0, 0.0, 1.0, 0.0]",1.15562,-0.025843,-1.217873,7.2574
3,-1.337818,1.038503,1.856182,-0.624215,-0.722399,-0.766028,-0.733781,"[0.0, 0.0, 0.0, 1.0, 0.0]",0.156966,-0.050329,-0.430311,5.6431
4,-1.337818,1.038503,1.856182,-0.462404,-0.615066,-0.759847,-0.629157,"[0.0, 0.0, 0.0, 1.0, 0.0]",0.344711,-0.085616,-0.607519,3.8462


In [12]:
#Return the label to the dataset. 
housing_df["median_house_value"] = labels
housing_df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,ocean_proximity,rooms_per_household,population_per_household,bedrooms_per_room,median_income,median_house_value
0,-1.327835,1.052548,0.982143,-0.804819,-0.975228,-0.974429,-0.977033,"[0.0, 0.0, 0.0, 1.0, 0.0]",0.628559,-0.049597,-0.973116,8.3252,452600.0
1,-1.322844,1.043185,-0.607019,2.04589,1.355088,0.861439,1.669961,"[0.0, 0.0, 0.0, 1.0, 0.0]",0.327041,-0.092512,-0.841159,8.3014,358500.0
2,-1.332827,1.038503,1.856182,-0.535746,-0.829732,-0.820777,-0.843637,"[0.0, 0.0, 0.0, 1.0, 0.0]",1.15562,-0.025843,-1.217873,7.2574,352100.0
3,-1.337818,1.038503,1.856182,-0.624215,-0.722399,-0.766028,-0.733781,"[0.0, 0.0, 0.0, 1.0, 0.0]",0.156966,-0.050329,-0.430311,5.6431,341300.0
4,-1.337818,1.038503,1.856182,-0.462404,-0.615066,-0.759847,-0.629157,"[0.0, 0.0, 0.0, 1.0, 0.0]",0.344711,-0.085616,-0.607519,3.8462,342200.0


In [13]:
#Create a new column with the income categories. This column will be used to stratisfy the shuffle split. 
housing_df["income_cat"] = pd.cut(housing_df["median_income"],
                                       bins=[0., 1.5, 3.0, 4.5, 6., np.inf],
                                       labels=[1, 2, 3, 4, 5])
housing_df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,ocean_proximity,rooms_per_household,population_per_household,bedrooms_per_room,median_income,median_house_value,income_cat
0,-1.327835,1.052548,0.982143,-0.804819,-0.975228,-0.974429,-0.977033,"[0.0, 0.0, 0.0, 1.0, 0.0]",0.628559,-0.049597,-0.973116,8.3252,452600.0,5
1,-1.322844,1.043185,-0.607019,2.04589,1.355088,0.861439,1.669961,"[0.0, 0.0, 0.0, 1.0, 0.0]",0.327041,-0.092512,-0.841159,8.3014,358500.0,5
2,-1.332827,1.038503,1.856182,-0.535746,-0.829732,-0.820777,-0.843637,"[0.0, 0.0, 0.0, 1.0, 0.0]",1.15562,-0.025843,-1.217873,7.2574,352100.0,5
3,-1.337818,1.038503,1.856182,-0.624215,-0.722399,-0.766028,-0.733781,"[0.0, 0.0, 0.0, 1.0, 0.0]",0.156966,-0.050329,-0.430311,5.6431,341300.0,4
4,-1.337818,1.038503,1.856182,-0.462404,-0.615066,-0.759847,-0.629157,"[0.0, 0.0, 0.0, 1.0, 0.0]",0.344711,-0.085616,-0.607519,3.8462,342200.0,3


In [14]:
split = StratifiedShuffleSplit(n_splits=1,
                               test_size=.2,
                               random_state=42)

# create the train and test sets 
for train_index, test_index in split.split(housing_df, housing_df["income_cat"]):
    strat_train_set = housing_df.loc[train_index]
    strat_test_set = housing_df.loc[test_index]

In [15]:
for set_ in (strat_train_set, strat_test_set):
            set_.drop("income_cat", axis=1, inplace=True)

In [16]:
strat_train_set.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,ocean_proximity,rooms_per_household,population_per_household,bedrooms_per_room,median_income,median_house_value
12655,-0.943505,1.352188,0.028646,0.567133,0.618069,0.716618,0.540019,"[0.0, 1.0, 0.0, 0.0, 0.0]",0.022972,0.009426,-0.124628,2.1736,72100.0
15502,1.167818,-1.190065,-1.719432,1.230419,0.756409,0.52058,0.702187,"[0.0, 0.0, 0.0, 0.0, 1.0]",0.605503,-0.043035,-0.770667,6.3373,279600.0
2908,0.264392,-0.1226,1.220517,-0.466529,-0.543511,-0.669775,-0.521917,"[0.0, 1.0, 0.0, 0.0, 0.0]",-0.014416,-0.081585,-0.328018,2.875,82700.0
14053,1.217731,-1.349249,-0.368645,-0.347807,-0.04501,-0.46579,-0.043261,"[0.0, 0.0, 0.0, 0.0, 1.0]",-0.623606,-0.116644,0.889117,2.2264,112500.0
20496,0.434096,-0.632923,-0.13027,0.412657,0.257908,0.363397,0.210453,"[1.0, 0.0, 0.0, 0.0, 0.0]",0.269815,0.0093,-0.455626,4.4964,238300.0


In [17]:
print(strat_train_set.shape)
print(strat_test_set.shape)

(16512, 13)
(4128, 13)


# Random Forest Model

In [18]:
housing = strat_train_set.drop("median_house_value", axis=1)
housing_labels = strat_train_set["median_house_value"].copy()
housing_tst = strat_test_set.drop("median_house_value", axis=1)
housing_labels_tst = strat_test_set["median_house_value"].copy()

In [19]:
housing

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,ocean_proximity,rooms_per_household,population_per_household,bedrooms_per_room,median_income
12655,-0.943505,1.352188,0.028646,0.567133,0.618069,0.716618,0.540019,"[0.0, 1.0, 0.0, 0.0, 0.0]",0.022972,0.009426,-0.124628,2.1736
15502,1.167818,-1.190065,-1.719432,1.230419,0.756409,0.520580,0.702187,"[0.0, 0.0, 0.0, 0.0, 1.0]",0.605503,-0.043035,-0.770667,6.3373
2908,0.264392,-0.122600,1.220517,-0.466529,-0.543511,-0.669775,-0.521917,"[0.0, 1.0, 0.0, 0.0, 0.0]",-0.014416,-0.081585,-0.328018,2.8750
14053,1.217731,-1.349249,-0.368645,-0.347807,-0.045010,-0.465790,-0.043261,"[0.0, 0.0, 0.0, 0.0, 1.0]",-0.623606,-0.116644,0.889117,2.2264
20496,0.434096,-0.632923,-0.130270,0.412657,0.257908,0.363397,0.210453,"[1.0, 0.0, 0.0, 0.0, 0.0]",0.269815,0.009300,-0.455626,4.4964
...,...,...,...,...,...,...,...,...,...,...,...,...
15174,1.247679,-1.218157,-1.163225,1.846950,1.653235,0.530294,1.311623,"[1.0, 0.0, 0.0, 0.0, 0.0]",0.496882,-0.100780,-0.426902,5.0900
12661,-0.923539,1.347506,-1.083767,2.413517,2.108804,2.952509,2.402331,"[0.0, 1.0, 0.0, 0.0, 0.0]",0.057771,0.028167,-0.494545,2.8139
19263,-1.572409,1.314733,1.538349,-0.884120,-0.886976,-0.854333,-0.856715,"[1.0, 0.0, 0.0, 0.0, 0.0]",-0.532932,-0.039271,0.291230,3.1797
19140,-1.562427,1.253869,-1.163225,0.238011,0.100486,-0.192044,0.003820,"[1.0, 0.0, 0.0, 0.0, 0.0]",0.350997,-0.063498,-0.439244,4.1964


In [20]:
from sklearn.ensemble import RandomForestRegressor
rfr = RandomForestRegressor(n_estimators=100, random_state=42)

In [24]:
housing_labels.info()

<class 'pandas.core.series.Series'>
Index: 16512 entries, 12655 to 19773
Series name: median_house_value
Non-Null Count  Dtype  
--------------  -----  
16512 non-null  float64
dtypes: float64(1)
memory usage: 258.0 KB


In [23]:
housing.info()

<class 'pandas.core.frame.DataFrame'>
Index: 16512 entries, 12655 to 19773
Data columns (total 12 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   longitude                 16512 non-null  float64
 1   latitude                  16512 non-null  float64
 2   housing_median_age        16512 non-null  float64
 3   total_rooms               16512 non-null  float64
 4   total_bedrooms            16512 non-null  float64
 5   population                16512 non-null  float64
 6   households                16512 non-null  float64
 7   ocean_proximity           16512 non-null  object 
 8   rooms_per_household       16512 non-null  float64
 9   population_per_household  16512 non-null  float64
 10  bedrooms_per_room         16512 non-null  float64
 11  median_income             16512 non-null  float64
dtypes: float64(11), object(1)
memory usage: 1.6+ MB


In [25]:
rfr.fit(housing.loc[:,housing.columns != 'ocean_proximity'], housing_labels)
#rfr.fit(housing, housing_labels)

In [26]:
y_pred = rfr.predict(housing_tst.loc[:,housing_tst.columns != 'ocean_proximity'])

In [27]:
from sklearn.metrics import r2_score

In [28]:
r2_score(housing_labels_tst, y_pred)

0.8118441716700857

# Pipelines

In [12]:
housing_df = pd.read_csv("data/housing.csv")
housing_df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [4]:
housing_df["income_cat"] = pd.cut(housing_df["median_income"],
                                       bins=[0., 1.5, 3.0, 4.5, 6., np.inf],
                                       labels=[1, 2, 3, 4, 5])

split = StratifiedShuffleSplit(n_splits=1,
                               test_size=.2,
                               random_state=42)

# create the train and test sets 
for train_index, test_index in split.split(housing_df, housing_df["income_cat"]):
    strat_train_set = housing_df.loc[train_index]
    strat_test_set = housing_df.loc[test_index]
    
for set_ in (strat_train_set, strat_test_set):
            set_.drop("income_cat", axis=1, inplace=True)

In [13]:
housing = strat_train_set.drop("median_house_value", axis=1) # drop labels for training set
housing_labels = strat_train_set["median_house_value"].copy()
housing_num = housing.drop("ocean_proximity", axis=1)

In [15]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer

num_pipeline_fulltest = Pipeline([
        ('imputer', SimpleImputer(strategy="median")),
        #('attribs_adder', CombinedAttributesAdder()),
        ('std_scaler', StandardScaler()),
    ])

#housing_num_tr_fulltest = num_pipeline_fulltest.fit_transform(housing_num)

num_attribs = list(housing_num)
cat_attribs = ["ocean_proximity"]

full_pipeline_fulltest = ColumnTransformer([
        ("num", num_pipeline_fulltest, num_attribs),
        ("cat", OneHotEncoder(), cat_attribs),
    ])

housing_prepared_fulltest = full_pipeline_fulltest.fit_transform(housing)

In [17]:
housing_prepared_fulltest[0]

array([-0.94135046,  1.34743822,  0.02756357,  0.58477745,  0.64037127,
        0.73260236,  0.55628602, -0.8936472 ,  0.        ,  1.        ,
        0.        ,  0.        ,  0.        ])