In [1]:
# Load libraries
import os            # operating system lib
import tarfile       # archives lib
import urllib        # url handling
import pandas as pd  # Pandas
import numpy as np   # NumPy

# Define variables
# Base URL of the repository that hosts the dataset archive
DOWNLOAD_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml2/master/"
# Local directory that receives the archive and its extracted contents
HOUSING_PATH = os.path.join("datasets", "housing")
HOUSING_URL = DOWNLOAD_ROOT + "datasets/housing/housing.tgz"

# Define function
def fetch_housing_data(housing_url=HOUSING_URL, housing_path=HOUSING_PATH):
    """Download the housing archive and extract it into ``housing_path``.

    Parameters
    ----------
    housing_url : str
        URL of the ``.tgz`` archive to download.
    housing_path : str
        Directory in which ``housing.tgz`` is saved and extracted. It is
        created (including parents) when it does not exist yet.

    Returns
    -------
    None

    Notes
    -----
    Errors raised by the download or by opening/extracting the archive are
    not caught here and propagate to the caller.
    """
    # ``import urllib`` alone does not load the ``request`` submodule, so
    # import it explicitly instead of relying on another library having
    # imported it as a side effect.
    import urllib.request

    os.makedirs(housing_path, exist_ok=True)
    tgz_path = os.path.join(housing_path, "housing.tgz")
    urllib.request.urlretrieve(housing_url, tgz_path)

    # The context manager closes the archive even if extraction fails.
    with tarfile.open(tgz_path) as housing_tgz:
        if hasattr(tarfile, "data_filter"):
            # The archive comes from the network: where the running Python
            # supports extraction filters, refuse members that would escape
            # ``housing_path`` or carry unsafe metadata.
            housing_tgz.extractall(path=housing_path, filter="data")
        else:
            housing_tgz.extractall(path=housing_path)

In [2]:
# Fetch the data only when the extracted CSV is not already on disk, so that
# re-running the notebook does not download the archive again
if not os.path.isfile(os.path.join(HOUSING_PATH, "housing.csv")):
    fetch_housing_data()

In [3]:
# Define function to load the data using pandas
def load_housing_data(housing_path=HOUSING_PATH):
    """Read ``housing.csv`` from ``housing_path`` and return it as a DataFrame."""
    return pd.read_csv(os.path.join(housing_path, "housing.csv"))

In [4]:
# Load the fetched CSV into a DataFrame (read from the default HOUSING_PATH)
housing = load_housing_data()

In [5]:
# Seed NumPy's global random number generator so that code drawing from it
# gives consistent results from run to run
np.random.seed(42)

In [None]:
# Import sklearn library
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as the test set; the fixed random_state keeps the
# split identical from run to run
train_set, test_set = train_test_split(
    housing,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Check the results. DataFrame.info() prints its report and returns None, so
# call it once per frame instead of building a (None, None) tuple that the
# notebook would echo as the cell's output.
train_set.info()
test_set.info()

In [68]:
# Derive target values from the train set
housing_y = train_set["median_house_value"].copy()

# Derive training values
# Start from the numeric predictors: drop the categorical var and the response variable
housing_x = train_set.drop(["ocean_proximity", "median_house_value"], axis=1)
# Fill missing values with each column's training-set median: LinearRegression
# raises on NaN input, and total_bedrooms has missing values (see the
# housing.info() output in this notebook). This is a no-op on complete data.
# NOTE(review): the recorded outputs presumably come from a kernel where the
# gaps were already handled in a cell that is not shown - confirm the strategy.
housing_x = housing_x.fillna(housing_x.median())
# Convert the categorical var to dummies, drop the fifth dummy var
# ("NEAR OCEAN") and append the remaining ones to the numeric predictors
housing_x = pd.concat(
    [housing_x, pd.get_dummies(train_set["ocean_proximity"]).drop("NEAR OCEAN", axis=1)],
    axis=1,
)

In [71]:
# Column names of the feature matrix, dummy variables included
housing_x.columns.tolist()

['longitude',
 'latitude',
 'housing_median_age',
 'total_rooms',
 'total_bedrooms',
 'population',
 'households',
 'median_income',
 '<1H OCEAN',
 'INLAND',
 'ISLAND',
 'NEAR BAY']

In [73]:
#### Regression ####
from sklearn.linear_model import LinearRegression

# Fit a linear regression of the target on all regressors; fit() stays the
# last expression so the fitted estimator is echoed as the cell output
lin_reg = LinearRegression()
lin_reg.fit(X=housing_x, y=housing_y)

LinearRegression()

In [74]:
# Take the first five training rows together with their observed targets
x_sample = housing_x.head(5)
y_sample = housing_y.head(5)

# Model predictions for those five rows
print("Predictions(y-hat):", lin_reg.predict(x_sample))

Predictions: [188628.07724361 290379.8948687  250985.48476348 146878.07878194
 165789.41368924]


In [77]:
# Observed target values for the same five rows, for comparison with the predictions
print("Observed y:", y_sample.tolist())

Observed: [103000.0, 382100.0, 172600.0, 93400.0, 96500.0]


In [79]:
from sklearn.model_selection import cross_val_score

# Create a function to display RMSE for all 10 folds, average and the standard deviation

def display_scores(scores):
    """Print the given scores, then their mean and their standard deviation.

    ``scores`` must provide ``mean()`` and ``std()`` methods (for example a
    NumPy array). Nothing is returned.
    """
    print("Scores:", scores)
    # Each statistic is looked up and printed in turn, right after the raw scores
    for label, statistic in (("Mean:", "mean"), ("Standard deviation:", "std")):
        print(label, getattr(scores, statistic)())

In [86]:
# 10-fold cross-validation of the linear regression model on the training data
lin_scores = cross_val_score(
    estimator=lin_reg,
    X=housing_x,
    y=housing_y,
    scoring="neg_mean_squared_error",
    cv=10,
)
# The scores are negated MSE values: flip the sign, then take the square root
# to obtain one RMSE per fold
lin_rmse_scores = np.sqrt(np.negative(lin_scores))
display_scores(lin_rmse_scores)

Scores: [65581.45520648 71711.35784404 68143.02388491 66855.55244479
 69440.38017435 65640.36503235 65861.37192245 69898.33048393
 73117.94692191 69704.17693297]
Mean: 68595.3960848178
Standard deviation: 2496.5245371673072


In [15]:
# Add three ratio features as new columns of `housing` (the frame is modified in place)
housing["rooms_per_household"] = housing["total_rooms"].div(housing["households"])
housing["bedrooms_per_room"] = housing["total_bedrooms"].div(housing["total_rooms"])
housing["population_per_household"] = housing["population"].div(housing["households"])

# Summarise the columns, including the three new ones
housing.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 13 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   longitude                 20640 non-null  float64
 1   latitude                  20640 non-null  float64
 2   housing_median_age        20640 non-null  float64
 3   total_rooms               20640 non-null  float64
 4   total_bedrooms            20433 non-null  float64
 5   population                20640 non-null  float64
 6   households                20640 non-null  float64
 7   median_income             20640 non-null  float64
 8   median_house_value        20640 non-null  float64
 9   ocean_proximity           20640 non-null  object 
 10  rooms_per_household       20640 non-null  float64
 11  bedrooms_per_room         20433 non-null  float64
 12  population_per_household  20640 non-null  float64
dtypes: float64(12), object(1)
memory usage: 2.0+ MB


In [16]:
# Import sklearn library
from sklearn.model_selection import train_test_split

# Split the extended dataset again: 20% of the rows go to the test set and
# the fixed random_state keeps the split identical from run to run
train_set, test_set = train_test_split(
    housing,
    test_size=0.2,
    random_state=42,
)

In [17]:
# Derive target values from the train set
housing_y = train_set["median_house_value"].copy()

# Derive training values
# Drop the categorical var and the response variable
housing_x = train_set.drop(["ocean_proximity", "median_house_value"], axis=1)
# Fill missing values with each column's training-set median: LinearRegression
# raises on NaN input, and the housing.info() output above reports missing
# values in total_bedrooms and bedrooms_per_room. This is a no-op on complete data.
# NOTE(review): the recorded outputs presumably come from a kernel where the
# gaps were already handled in a cell that is not shown - confirm the strategy.
housing_x = housing_x.fillna(housing_x.median())


In [18]:
# Preview the feature matrix (the notebook abbreviates long frames when displaying them)
housing_x

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,rooms_per_household,bedrooms_per_room,population_per_household
14196,-117.03,32.71,33.0,3126.0,627.0,2300.0,623.0,3.2596,5.017657,0.200576,3.691814
8267,-118.16,33.77,49.0,3382.0,787.0,1314.0,756.0,3.8125,4.473545,0.232703,1.738095
17445,-120.48,34.66,4.0,1897.0,331.0,915.0,336.0,4.1563,5.645833,0.174486,2.723214
14265,-117.11,32.69,36.0,1421.0,367.0,1418.0,355.0,1.9425,4.002817,0.258269,3.994366
2271,-119.80,36.78,43.0,2382.0,431.0,874.0,380.0,3.5542,6.268421,0.180940,2.300000
...,...,...,...,...,...,...,...,...,...,...,...
11284,-117.96,33.78,35.0,1330.0,201.0,658.0,217.0,6.3700,6.129032,0.151128,3.032258
11964,-117.43,34.02,33.0,3084.0,570.0,1753.0,449.0,3.0500,6.868597,0.184825,3.904232
5390,-118.38,34.03,36.0,2101.0,569.0,1756.0,527.0,2.9344,3.986717,0.270823,3.332068
860,-121.96,37.58,15.0,3575.0,597.0,1777.0,559.0,5.7192,6.395349,0.166993,3.178891


In [21]:
#### Regression ####
from sklearn.linear_model import LinearRegression

# Fit a linear regression of the target on the numeric regressors; fit() stays
# the last expression so the fitted estimator is echoed as the cell output
lin_reg = LinearRegression()
lin_reg.fit(X=housing_x, y=housing_y)

LinearRegression()

In [22]:
from sklearn.model_selection import cross_val_score

# Create a function to display RMSE for all 10 folds, average and the standard deviation
# (same helper as the display_scores defined earlier in this notebook)

def display_scores(scores):
    """Print the given scores, then their mean and their standard deviation.

    ``scores`` must provide ``mean()`` and ``std()`` methods (for example a
    NumPy array). Nothing is returned.
    """
    print("Scores:", scores)
    # Each statistic is looked up and printed in turn, right after the raw scores
    for label, statistic in (("Mean:", "mean"), ("Standard deviation:", "std")):
        print(label, getattr(scores, statistic)())

In [23]:
# 10-fold cross-validation of the linear regression model on the training data
lin_scores = cross_val_score(
    estimator=lin_reg,
    X=housing_x,
    y=housing_y,
    scoring="neg_mean_squared_error",
    cv=10,
)
# The scores are negated MSE values: flip the sign, then take the square root
# to obtain one RMSE per fold
lin_rmse_scores = np.sqrt(np.negative(lin_scores))
display_scores(lin_rmse_scores)

Scores: [65503.19601742 71822.27077741 67676.62308014 66640.50945839
 69192.93621885 66072.79696621 65711.8862025  69367.83077433
 73674.48726125 69735.41305941]
Mean: 68539.79498159104
Standard deviation: 2594.954397765056
