# End-to-end Machine Learning Project
Chapter 2

_Pg 33_

### Function to fetch the data
_Pg 44_

In [1]:
import os
import tarfile
from six.moves import urllib

# Remote location of the dataset archive and the local directory it is unpacked into.
DOWNLOAD_ROOT = 'https://raw.githubusercontent.com/ageron/handson-ml/master/'
HOUSING_PATH = os.path.join('datasets', 'housing')
HOUSING_URL = DOWNLOAD_ROOT + 'datasets/housing/housing.tgz'

def fetch_housing_data(housing_url=HOUSING_URL, housing_path=HOUSING_PATH):
    """Download the housing archive and unpack it into ``housing_path``.

    Args:
        housing_url: URL of the ``.tgz`` archive to download.
        housing_path: Directory that receives ``housing.tgz`` and the
            extracted files. It is created if it does not exist.

    Raises:
        OSError: If the directory cannot be created or the download fails.
        tarfile.TarError: If the downloaded file is not a readable archive.
    """
    # exist_ok=True replaces the manual isdir() check and is safe to re-run.
    os.makedirs(housing_path, exist_ok=True)
    tgz_path = os.path.join(housing_path, 'housing.tgz')
    urllib.request.urlretrieve(housing_url, tgz_path)
    # The context manager closes the archive even if extraction raises.
    with tarfile.open(tgz_path) as housing_tgz:
        if hasattr(tarfile, 'data_filter'):
            # The archive comes from the network: where the interpreter supports
            # extraction filters, reject members that would escape housing_path.
            housing_tgz.extractall(path=housing_path, filter='data')
        else:
            housing_tgz.extractall(path=housing_path)

In [2]:
fetch_housing_data()

### Load data using Pandas

In [3]:
import os
# Same three values as in the download cell above, repeated here -- presumably so
# this section can run without executing that cell first (TODO confirm).
DOWNLOAD_ROOT = 'https://raw.githubusercontent.com/ageron/handson-ml/master/'
HOUSING_PATH = os.path.join('datasets', 'housing')
HOUSING_URL = DOWNLOAD_ROOT + 'datasets/housing/housing.tgz'

In [4]:
import pandas as pd

def load_housing_data(housing_path=HOUSING_PATH):
    """Read ``housing.csv`` from ``housing_path`` and return it as a DataFrame."""
    return pd.read_csv(os.path.join(housing_path, 'housing.csv'))

In [5]:
housing = load_housing_data()
# The bare frame is displayed below; pandas truncates the 20640 rows in the
# rendered output. Use housing.head() to preview only the first rows.
housing

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY
...,...,...,...,...,...,...,...,...,...,...
20635,-121.09,39.48,25.0,1665.0,374.0,845.0,330.0,1.5603,78100.0,INLAND
20636,-121.21,39.49,18.0,697.0,150.0,356.0,114.0,2.5568,77100.0,INLAND
20637,-121.22,39.43,17.0,2254.0,485.0,1007.0,433.0,1.7000,92300.0,INLAND
20638,-121.32,39.43,18.0,1860.0,409.0,741.0,349.0,1.8672,84700.0,INLAND


### Explore dataset (a bit)

In [6]:
housing.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  float64
 3   total_rooms         20640 non-null  float64
 4   total_bedrooms      20433 non-null  float64
 5   population          20640 non-null  float64
 6   households          20640 non-null  float64
 7   median_income       20640 non-null  float64
 8   median_house_value  20640 non-null  float64
 9   ocean_proximity     20640 non-null  object 
dtypes: float64(9), object(1)
memory usage: 1.6+ MB


In [None]:
housing['ocean_proximity'].value_counts()

In [None]:
housing.describe()

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
housing.hist(bins=50, figsize=(20,15))
plt.show()

In [None]:
housing['median_house_value'].hist(bins=50, figsize=(8,5))
plt.show()

### Locate points (ArcGIS)

In [None]:
from arcgis.gis import GIS
my_gis = GIS()
# my_gis.map()

In [None]:
housingloc = housing[['longitude', 'latitude']]

In [None]:
housingloc.head()

In [None]:
housingloc[1000:2000].tail(5)

In [None]:
housingmap = my_gis.content.import_data(housingloc[1000:2000])

In [None]:
map1 = my_gis.map('California')

In [None]:
map1.add_layer(housingmap)

In [None]:
map1

### Locate points (Matplotlib)

In [None]:
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
# Font sizes for axis labels and tick labels (set globally through mpl.rc).
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

# Longitude / latitude columns of the full frame.
x = housing['longitude']
y = housing['latitude']

plt.figure(figsize=(8,8))
plt.plot(x, y, 'ro')  # 'ro' = red circle markers, no connecting line
plt.axis('scaled')  # same scale on both axes
plt.draw()

### Locate points (Folium)
https://python-visualization.github.io/folium/

In [None]:
# Bounding box of the data (longitude / latitude extremes); passed to
# m.fit_bounds() further down to frame the Folium map.
lomin = housing['longitude'].min()
lomax = housing['longitude'].max()
lamin = housing['latitude'].min()
lamax = housing['latitude'].max()
print(lomin, lomax, lamin, lamax)

In [None]:
import folium

In [None]:
# Base map rendered at half width / half height. No location or zoom is passed;
# the view is fitted to the data bounds in a later cell. The commented-out lines
# are alternative options that are currently disabled.
# NOTE(review): the built-in 'Stamen ...' tile names may not be available in
# newer folium releases -- confirm against the installed version.
m = folium.Map(
#     location=[45.372, -121.6972],
#     bounds = [(32.54, -124.35), (41.95, -114.31)],
#     zoom_start=5,
#     tiles='OpenStreetMap'
    tiles='Stamen Toner',
#     tiles='Stamen Terrain'
#     tiles = 'Stamen Watercolor'
#     tiles = 'Mapbox Bright'
#     tiles = 'Mapbox Control Room'
#     tiles = 'CartoDB positron'
#     tiles = 'CartoDB dark_matter'
    width='50%', height='50%',
)

In [None]:
m.fit_bounds([(lamin, lomin), (lamax, lomax)])

In [None]:
# [latitude, longitude] pairs for the first 2000 rows. Selecting the two columns
# and converting once avoids building a Series per row with iterrows().
listofpoints = housing[['latitude', 'longitude']].iloc[:2000].values.tolist()

In [None]:
# Preview only the first few points; displaying the whole list floods the output.
listofpoints[:5]

In [None]:
# Draw every point as a small filled crimson circle on the map.
circle_style = dict(radius=5, color='crimson', fill=True, fill_color='crimson')
for point in listofpoints:
    folium.Circle(location=point, **circle_style).add_to(m)

In [None]:
# Add a default marker at each [latitude, longitude] pair.
for lat, lon in listofpoints:
    folium.Marker([lat, lon]).add_to(m)

In [None]:
m

### Locate points (Geopandas)

In [None]:
import geopandas

In [None]:
cali = geopandas.read_file('datasets/shapefiles/California/GU_StateOrTerritory.shp')

In [None]:
cali.head()

In [None]:
cali.plot()

In [None]:
fig, ax = plt.subplots()
ax.set_aspect('equal')

# Plot on the explicit axes and read the coordinates straight from `housing`,
# so this cell does not depend on x / y left behind by the Matplotlib section.
ax.plot(housing['longitude'], housing['latitude'], 'ro')
cali.plot(ax=ax, color='#1f77b4', edgecolor='#bbbbbb')

plt.show()

### Create a Test Set
_Pg 49_

In [None]:
import numpy as np
# to make this notebook's output identical at every run
np.random.seed(42)

##### Split the data into train and test sets

In [None]:
import numpy as np
def split_train_test(data, test_ratio):
    """Randomly split ``data`` into a train part and a test part.

    Rows are shuffled with ``np.random.permutation`` (so the result depends on
    NumPy's global random state). The first ``int(len(data) * test_ratio)``
    shuffled positions form the test part, the rest the train part.

    Returns:
        A ``(train, test)`` tuple selected from ``data`` with ``.iloc``.
    """
    n_rows = len(data)
    order = np.random.permutation(n_rows)
    n_test = int(n_rows * test_ratio)
    test_part, train_part = order[:n_test], order[n_test:]
    return data.iloc[train_part], data.iloc[test_part]

In [None]:
train_set, test_set = split_train_test(housing, 0.2)

In [None]:
print('Train set: {} instances. Test set: {} instances.'.format(len(train_set), len(test_set)))

In [None]:
train_set.head()

In [None]:
test_set.head()

##### Ensure the test set remains consistent across multiple runs

In [None]:
from zlib import crc32

def test_set_check(identifier, test_ratio):
    return crc32(np.int64(identifier)) & 0xffffffff < test_ratio * 2**32

def split_train_test_by_id(data, test_ratio, id_column):
    """Split ``data`` into train and test parts based on a hash of ``id_column``.

    Each value of ``data[id_column]`` is passed to ``test_set_check``; rows for
    which it returns True form the test part, all other rows the train part.

    Returns:
        A ``(train, test)`` tuple selected from ``data`` with ``.loc``.
    """
    def _goes_to_test(id_):
        return test_set_check(id_, test_ratio)

    test_mask = data[id_column].apply(_goes_to_test)
    return data.loc[~test_mask], data.loc[test_mask]

##### Create an identifier column

In [None]:
# reset_index() turns the row index into an 'index' column, used here as the id.
housing_with_id = housing.reset_index()
train_set, test_set = split_train_test_by_id(housing_with_id, 0.2, 'index')

In [None]:
# Alternative id built from the coordinates instead of the row index.
# NOTE(review): test_set_check passes the id through np.int64 before hashing, so
# the fractional part of this float id is presumably dropped -- confirm intended.
housing_with_id["id"] = housing["longitude"] * 1000 + housing["latitude"]
train_set, test_set = split_train_test_by_id(housing_with_id, 0.2, "id")

##### Using Scikit-learn to split the data into train and test sets

In [None]:
from sklearn.model_selection import train_test_split

train_set, test_set = train_test_split(housing, test_size=0.2, random_state=42)

### Creating Income categories
_Pg 52_

Median income is a continuous numerical attribute, so we need to group it into discrete categories.

In [None]:
%matplotlib inline
housing['median_income'].hist()
# housing['median_income'].hist(bins=50)

In [None]:
# creating income categories
housing["income_cat"] = np.ceil(housing["median_income"] / 1.5)
# Cap the category at 5.0 and assign the result back. Calling
# .where(..., inplace=True) on the selected column is a chained in-place update,
# which does not modify `housing` when pandas Copy-on-Write is active.
housing["income_cat"] = housing["income_cat"].where(housing["income_cat"] < 5, 5.0)

In [None]:
housing["income_cat"].hist()

In [None]:
housing[["income_cat"]].head()

##### Stratified sampling based on Income category

In [None]:
from sklearn.model_selection import StratifiedShuffleSplit

split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
# n_splits=1, so take the single (train, test) pair directly.
train_index, test_index = next(split.split(housing, housing["income_cat"]))
# split() yields positional indices, so select with .iloc: .loc only matches them
# while the frame still has its default 0..n-1 index. The explicit copies make
# both sets independent frames.
strat_train_set = housing.iloc[train_index].copy()
strat_test_set = housing.iloc[test_index].copy()

In [None]:
strat_test_set.head()

##### Income categories proportions in the test dataset:

In [None]:
# strat_test_set["income_cat"].value_counts()
strat_test_set["income_cat"].value_counts() / len(strat_test_set)

##### Income categories proportions in the whole dataset:

In [None]:
# housing["income_cat"].value_counts()
housing["income_cat"].value_counts() / len(housing)

##### Remove the Income category so the data is back to its original state

In [None]:
# Reassign rather than dropping in place; errors="ignore" keeps the cell
# re-runnable once the column has already been removed.
strat_train_set = strat_train_set.drop(columns="income_cat", errors="ignore")
strat_test_set = strat_test_set.drop(columns="income_cat", errors="ignore")

### Discover and Visualize the Data
_Pg 53_

Make sure to put the test set aside and only work with the **training set**. Create a copy of it to play without harming the training set.

In [None]:
housing = strat_train_set.copy()

In [None]:
housing.plot(kind="scatter", x="longitude", y="latitude")

In [None]:
housing.plot(kind="scatter", x="longitude", y="latitude", alpha=0.1)

In [None]:
# Marker size (s) encodes population, marker colour (c) encodes the median house value.
housing.plot(
    kind="scatter",
    x="longitude",
    y="latitude",
    alpha=0.4,
    s=housing["population"] / 100,
    label="Population",
    figsize=(10, 7),
    c="median_house_value",
    cmap=plt.get_cmap("jet"),
    colorbar=True,
)
plt.legend()
plt.tight_layout()

### Looking for Correlations
_Pg 56_

Standard Correlation Coefficient : Pearson's **r**

In [None]:
# Restrict to numeric columns first: on pandas >= 2.0, corr() raises on the
# string column ocean_proximity instead of silently skipping it.
corr_matrix = housing.select_dtypes(include="number").corr()

In [None]:
# corr_matrix["median_house_value"]
corr_matrix["median_house_value"].sort_values(ascending=False)

Using the pandas `scatter_matrix` function:

In [None]:
from pandas.plotting import scatter_matrix
attributes = ['median_house_value', 'median_income', 'total_rooms', 'housing_median_age']
scatter_matrix(housing[attributes], figsize=(12, 8))

Correlation scatterplot of median_house_value and median_income:

In [None]:
housing.plot(kind='scatter', x='median_income', y='median_house_value', alpha=0.1)

### Experimenting with Attribute Combinations
_Pg 59_

In [None]:
# Rooms per household more important than total rooms in a district
# Bedrooms per room more important than total bedrooms in a district
# Population per household
housing['rooms_per_household'] = housing['total_rooms'] / housing['households']
housing['bedrooms_per_room'] = housing['total_bedrooms'] / housing['total_rooms']
housing['population_per_household'] = housing['population'] / housing['households']

In [None]:
# Recompute with the new ratio columns included. Restrict to numeric columns:
# on pandas >= 2.0, corr() raises on the string column ocean_proximity.
corr_matrix = housing.select_dtypes(include="number").corr()

In [None]:
# corr_matrix["median_house_value"]
corr_matrix["median_house_value"].sort_values(ascending=False)

In [None]:
attributes = ['median_house_value', 'median_income', 'rooms_per_household', 'bedrooms_per_room']
scatter_matrix(housing[attributes], figsize=(12, 8))