# Quick and dirty implementation

In [None]:
# Colab-only setup: mount Google Drive so files under /content/drive are readable.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd
from sklearn.linear_model import LinearRegression

In [None]:
# TODO (carried over from the original note): upload the file to the GitHub repo.
housing_csv_path = '/content/drive/MyDrive/JTL312_Intro_to_ML/week_3/housing.csv'
housing_df = pd.read_csv(housing_csv_path)

In [None]:
# Summarise column dtypes and non-null counts to spot missing values.
housing_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  float64
 3   total_rooms         20640 non-null  float64
 4   total_bedrooms      20433 non-null  float64
 5   population          20640 non-null  float64
 6   households          20640 non-null  float64
 7   median_income       20640 non-null  float64
 8   median_house_value  20640 non-null  float64
 9   ocean_proximity     20640 non-null  object 
dtypes: float64(9), object(1)
memory usage: 1.6+ MB


Drop all the records which have at least one null value

In [None]:
# Keep only complete rows. Rebinding the name (instead of inplace=True) yields a
# new frame and avoids mutating the object that was loaded from disk.
housing_df = housing_df.dropna()

Drop non-numerical columns

In [None]:
# Remove the non-numeric 'ocean_proximity' column, which the regression cannot use as-is.
# errors='ignore' keeps the cell re-runnable: a second execution is a no-op
# instead of raising KeyError because the column is already gone.
housing_df = housing_df.drop(columns=['ocean_proximity'], errors='ignore')

In [None]:
# Reproducible random sample of 10 rows (fixed seed) to eyeball the cleaned data.
housing_df.sample(10, random_state=42)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
14425,-114.47,34.4,19.0,7650.0,1901.0,1129.0,463.0,1.82,80100.0
16398,-118.48,34.16,32.0,2108.0,309.0,769.0,274.0,8.7172,500001.0
7721,-118.32,34.19,37.0,1335.0,249.0,485.0,240.0,4.1731,352100.0
1411,-118.28,33.75,18.0,393.0,189.0,429.0,188.0,1.8393,187500.0
1336,-122.47,37.75,45.0,2399.0,426.0,911.0,423.0,4.4312,361000.0
16275,-118.14,33.94,31.0,2841.0,774.0,1612.0,708.0,2.9205,196600.0
18023,-117.15,34.07,15.0,1852.0,316.0,906.0,298.0,5.3526,129800.0
10090,-117.49,33.93,26.0,2970.0,576.0,2156.0,558.0,3.9522,124600.0
18305,-121.98,37.36,35.0,1440.0,267.0,743.0,259.0,5.0866,254600.0
18745,-119.32,37.06,15.0,3111.0,651.0,276.0,107.0,5.1314,179200.0


Separate out X, y

In [None]:
# Target vector: the column the model should predict.
y_serie = housing_df.loc[:, 'median_house_value']
# Feature matrix: every remaining column.
X_df = housing_df.drop(columns='median_house_value')

## Build a Linear Regression Model

In [None]:
# Ordinary least-squares regressor with default settings.
lr_model = LinearRegression()

Why is `random_state` not a parameter of the `LinearRegression()` class?

In [None]:
# Fit on the complete feature matrix and target (no hold-out set at this stage).
lr_model.fit(X_df, y_serie)

In [None]:
# Learned weights (one per feature column) and the intercept term.
lr_model.coef_, lr_model.intercept_

(array([-4.27301205e+04, -4.25097369e+04,  1.15790031e+03, -8.24972507e+00,
         1.13820707e+02, -3.83855780e+01,  4.77013513e+01,  4.02975217e+04]),
 -3585395.747892311)

Check some attributes

https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LinearRegression.html

Attributes ending with an underscore are estimated from the fitted data.

In [None]:
# rank_ is the rank of the feature matrix; coef_ and intercept_ are the fitted parameters.
lr_model.rank_, lr_model.coef_, lr_model.intercept_

(8,
 array([-4.27301205e+04, -4.25097369e+04,  1.15790031e+03, -8.24972507e+00,
         1.13820707e+02, -3.83855780e+01,  4.77013513e+01,  4.02975217e+04]),
 -3585395.747892311)

In [None]:
# Column names recorded during fit, alongside the (rows, columns) shape of the features.
lr_model.feature_names_in_, X_df.shape

(array(['longitude', 'latitude', 'housing_median_age', 'total_rooms',
        'total_bedrooms', 'population', 'households', 'median_income'],
       dtype=object),
 (20433, 8))

## Evaluate the model

In [None]:
from sklearn.metrics import mean_squared_error as mse

In [None]:
# In-sample predictions: the inputs are the same rows the model was fitted on.
yhat_serie = lr_model.predict(X_df)

In [None]:
# RMSE on the data used for fitting. The square root is taken explicitly because
# the `squared=False` argument of mean_squared_error was deprecated in
# scikit-learn 1.4 and removed in 1.6; this form works on old and new versions.
lr_rmse = mse(y_serie, yhat_serie) ** 0.5
lr_rmse

69556.14839566678

In [None]:
# Mean and spread of the true target, used as a yardstick for the RMSE.
y_mean = y_serie.mean()
y_std = y_serie.std()

In [None]:
# Display the target's mean and standard deviation.
y_mean, y_std

(206864.41315519012, 115435.6670985836)

In [None]:
# RMSE as a fraction of the target's standard deviation (a scale-free error measure).
lr_rmse / y_std

0.6025533541229064

In [None]:
# Mean and spread of the predictions. ddof=1 is passed explicitly so the value is a
# sample standard deviation, comparable with y_serie.std(): pandas' Series.std
# defaults to ddof=1, whereas NumPy's ndarray.std (what predict returns) defaults to ddof=0.
yhat_serie.mean(), yhat_serie.std(ddof=1)

(206864.4131551898, 92123.19636363993)

Happy with the estimation?

## Performance on unseen data: Splitting data into train and test

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 10% of the rows for testing; the fixed seed makes the split reproducible.
X_train_df, X_test_df, y_train_serie, y_test_serie = train_test_split(
    X_df,
    y_serie,
    train_size=0.90,
    random_state=42,
)

In [None]:
# Split sizes and the realised training fraction (expected to be close to 0.90).
len(X_train_df), len(X_test_df), len(X_train_df) / len(X_df)

(18389, 2044, 0.8999657416923604)

Read more about `train_test_split` in the documentation:

https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html

Read more about stratified sampling. What is the effect of using stratified versus unstratified sampling? If there is an effect, explain why.

In [None]:
# Fresh, unfitted estimator; rebinding lr_model discards the previously fitted model.
lr_model = LinearRegression()

In [None]:
# Fit on the training split only, so the test split stays unseen.
lr_model.fit(X_train_df, y_train_serie)

In [None]:
# Predictions for the held-out test split.
yhat_test_serie = lr_model.predict(X_test_df)

In [None]:
# RMSE on the held-out test split. The square root is taken explicitly because
# the `squared=False` argument of mean_squared_error was deprecated in
# scikit-learn 1.4 and removed in 1.6; this form works on old and new versions.
lr_test_rmse = mse(y_test_serie, yhat_test_serie) ** 0.5
lr_test_rmse

69858.222591068

In [None]:
# RMSE on the training split, for comparison with the test RMSE. The square root is
# taken explicitly because the `squared=False` argument of mean_squared_error was
# deprecated in scikit-learn 1.4 and removed in 1.6.
lr_train_rmse = mse(y_train_serie, lr_model.predict(X_train_df)) ** 0.5
lr_train_rmse

69525.79064626202

What is the interpretation of the last two results? Why are rmse_test and rmse_train almost identical?

## Performance on unseen data, part 2: Data splitting using Cross Validation

In [None]:
from sklearn.model_selection import cross_val_score

In [None]:
# Fresh, unfitted estimator to hand to cross_val_score.
lr_model = LinearRegression()

In [None]:
# 10-fold cross-validated RMSE. The scorer reports *negative* RMSE (scikit-learn
# scorers follow a "higher is better" convention), so the sign is flipped for display.
-cross_val_score(
    lr_model,
    X_df,
    y_serie,
    scoring="neg_root_mean_squared_error",
    cv=10,
)

array([71478.60497092, 72064.17636317, 69934.78240462, 70084.93672224,
       69688.23029563, 68550.9871822 , 69928.85771359, 66542.3638184 ,
       69421.23052088, 68834.2052676 ])

In [None]:
# Display the training features produced by the earlier train/test split.
X_train_df

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income
5089,-122.60,38.24,16.0,1410.0,209.0,741.0,229.0,4.7250
16988,-118.15,34.08,44.0,1053.0,251.0,941.0,256.0,3.1250
9662,-118.26,34.15,18.0,2481.0,756.0,1763.0,675.0,2.8088
18554,-121.85,37.33,19.0,735.0,158.0,597.0,134.0,4.5208
12092,-117.94,33.88,46.0,1747.0,312.0,770.0,296.0,5.4217
...,...,...,...,...,...,...,...,...
11396,-117.68,34.15,24.0,1033.0,189.0,486.0,204.0,4.1719
12084,-118.26,33.99,36.0,2016.0,505.0,1807.0,464.0,1.6901
5439,-117.90,34.11,23.0,4776.0,1316.0,4797.0,1187.0,2.1667
866,-118.36,33.88,28.0,1313.0,319.0,827.0,308.0,2.6500


Out-of-the-box machine learning tools don't provide the prediction accuracy we desire. Assuming no other ML algorithms are available to us, how do we proceed?

Ans: