In [11]:
import pandas as pd
import numpy as np
from utils_housing import DataFetch

In [2]:
housing_path = DataFetch().fetch_housing_data()

In [3]:
housing_path

'/usr/src/ml/housing/data'

In [8]:
import os
csv_path = os.path.join(housing_path, "housing.csv")
housing_df = pd.read_csv(csv_path)
housing_df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [12]:
#Create a new column with the income categories. This column will be used to stratisfy the shuffle split. 
housing_df["income_cat"] = pd.cut(housing_df["median_income"],
                                       bins=[0., 1.5, 3.0, 4.5, 6., np.inf],
                                       labels=[1, 2, 3, 4, 5])
housing_df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity,income_cat
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY,5
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY,5
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY,5
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY,4
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY,3


In [14]:
from sklearn.model_selection import StratifiedShuffleSplit
split = StratifiedShuffleSplit(n_splits=1,
                                       test_size=.2,
                                       random_state=42)

In [23]:
housing_df.shape

(20640, 11)

In [15]:
housing_df["income_cat"].value_counts()

3    7236
2    6581
4    3639
5    2362
1     822
Name: income_cat, dtype: int64

In [25]:
#generate train and test dataframes with the corresponding indexes. 
for train_index, test_index in split.split(housing_df, housing_df["income_cat"]):
    strat_train_set = housing_df.loc[train_index]
    strat_test_set = housing_df.loc[test_index]

[12655 15502  2908 ... 19263 19140 19773]


In [21]:
print(strat_train_set["income_cat"].value_counts())
strat_test_set["income_cat"].value_counts()

3    5789
2    5265
4    2911
5    1890
1     657
Name: income_cat, dtype: int64


3    1447
2    1316
4     728
5     472
1     165
Name: income_cat, dtype: int64

In [26]:
#Remove the column income_cat in each dataframe. 
for set_ in (strat_train_set, strat_test_set):
    set_.drop("income_cat", axis=1, inplace=True)

In [27]:
#split the features and target variable.
housing = strat_train_set.drop("median_house_value", axis=1)
housing_labels = strat_train_set["median_house_value"].copy()

In [86]:
housing_num = housing.drop("ocean_proximity", axis=1)

In [87]:
#housing_num.info()
#housing_num[housing_num.isnull()]
housing_num[housing_num.isnull().any(1)]

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income
1606,-122.08,37.88,26.0,2947.0,,825.0,626.0,2.9330
10915,-117.87,33.73,45.0,2264.0,,1970.0,499.0,3.4193
19150,-122.70,38.35,14.0,2313.0,,954.0,397.0,3.7813
4186,-118.23,34.13,48.0,1308.0,,835.0,294.0,4.2891
16885,-122.40,37.58,26.0,3281.0,,1145.0,480.0,6.3580
...,...,...,...,...,...,...,...,...
1350,-121.95,38.03,5.0,5526.0,,3207.0,1012.0,4.0767
4691,-118.37,34.07,50.0,2519.0,,1117.0,516.0,4.3667
9149,-118.50,34.46,17.0,10267.0,,4956.0,1483.0,5.5061
16757,-122.48,37.70,33.0,4492.0,,3477.0,1537.0,3.0546


In [88]:
housing_df["total_bedrooms"].median()

435.0

In [89]:
#Deal with null values using the median. 
from sklearn.impute import SimpleImputer 
imputer = SimpleImputer(strategy="median")
#housing_num["tot_bed"] = imputer.fit_transform(housing_num[["total_bedrooms"]])
housing_num = imputer.fit_transform(housing_num)

In [91]:
np.isnan(np.sum(housing_num))
#np.all(np.isnan(housing_num))

False

In [90]:
#housing_num[housing_num.isnull().any(1)]

AttributeError: 'numpy.ndarray' object has no attribute 'isnull'

In [99]:
# number of columns to generte the formulas for the new columns. 
rooms_ix = 3
bedrooms_ix = 4
population_ix = 5
households_ix = 6
rooms_per_household = housing_num[:, rooms_ix] / housing_num[:, households_ix]
population_per_household = housing_num[:, population_ix] / housing_num[:, households_ix]
bedrooms_per_room = housing_num[:, bedrooms_ix] / housing_num[:, rooms_ix]

In [103]:
housing_num = np.c_[housing_num,rooms_per_household,population_per_household,bedrooms_per_room]

In [104]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
housing_num = scaler.fit_transform(housing_num)

In [107]:
housing_num[0]

array([-0.94135046,  1.34743822,  0.02756357,  0.58477745,  0.64037127,
        0.73260236,  0.55628602, -0.8936472 ,  0.01739526,  0.00622264,
       -0.12112176])

In [110]:
num_attribs = list(housing_df)
num_attribs

['longitude',
 'latitude',
 'housing_median_age',
 'total_rooms',
 'total_bedrooms',
 'population',
 'households',
 'median_income',
 'median_house_value',
 'ocean_proximity',
 'income_cat']

In [112]:
from sklearn.preprocessing import OneHotEncoder
housing_cat = housing_df[["ocean_proximity"]]
enconder = OneHotEncoder()
housing_cat = enconder.fit_transform(housing_cat)

In [121]:
housing_cat.toarray()

array([[0., 0., 0., 1., 0.],
       [0., 0., 0., 1., 0.],
       [0., 0., 0., 1., 0.],
       ...,
       [0., 1., 0., 0., 0.],
       [0., 1., 0., 0., 0.],
       [0., 1., 0., 0., 0.]])

In [124]:
housing_num[0]
#housing_prepared_fulltest = 

numpy.ndarray