Description:

We work with the housing data set from California (1990) and try to find a model that best describes median house values. 


Packages

In [1]:
import os

import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeRegressor






In [2]:
# Record the library versions this notebook was run with.
print(f"Pandas version: {pd.__version__}")
print(f"Numpy version: {np.__version__}")
print(f"Matplotlib version: {mpl.__version__}")
print(f"Sklearn version: {sklearn.__version__}")

Pandas version: 1.4.2
Numpy version: 1.21.5
Matplotlib version: 3.5.1
Sklearn version: 1.1.1


We import the housing dataset

In [3]:
# Location of the housing CSV. Set the HOUSING_CSV environment variable to point
# at your own copy; the fallback is the original author's local path. It is a raw
# string so the backslashes are kept literally instead of being parsed as
# (invalid) escape sequences such as "\G" and "\h".
HOUSING_CSV = os.environ.get("HOUSING_CSV", r"D:\Github\housing\housing.csv")
housing = pd.read_csv(HOUSING_CSV)

FileNotFoundError: [Errno 2] No such file or directory: 'D:\\Github\\housing\\housing.csv'

Exploratory data analysis

In [None]:
# Preview the first ten rows of the dataset.
housing.head(10)

In [None]:
# (number of rows, number of columns)
housing.shape

In [None]:
# Column labels of the dataset.
housing.columns

This reveals that there are 10 columns and 20,640 rows. The column names are

['longitude', 'latitude', 'housing_median_age', 'total_rooms',
'total_bedrooms', 'population', 'households', 'median_income',
'median_house_value', 'ocean_proximity'].


In [None]:
# Per-column dtype and non-null count; a non-null count below the row count means missing values.
housing.info()

In [None]:
# Summary statistics of the numeric columns.
housing.describe()

Clearly, some values are missing in the total_bedrooms column.

In [None]:
# One histogram per numeric column, 50 bins each, on a large figure so the panels stay readable.
housing.hist(bins =50, figsize= (20,15))
plt.show()

In [None]:
# Plain random 80/20 split of the full dataset.
# random_state is fixed so the same rows land in each set on every run; note that
# a fixed seed only reproduces the split while the dataset itself is unchanged.
train_set, test_set = train_test_split(housing, test_size=0.2, random_state=42)

In [None]:
# Size of the random test split (test_size=0.2 of the rows).
test_set.shape

In [None]:
# Size of the random training split (the remaining rows).
train_set.shape

In [None]:
# Bucket median_income into five ordered categories (labels 1-5).
# The last edge is np.inf (floating-point infinity) so the top bucket is open-ended.
income_bin_edges = [0., 1.5, 3.0, 4.5, 6., np.inf]
income_bin_labels = [1, 2, 3, 4, 5]
housing["income_cat"] = pd.cut(
    housing["median_income"],
    bins=income_bin_edges,
    labels=income_bin_labels,
)

In [None]:
# Distribution of rows across the income categories.
housing["income_cat"].hist()

In [None]:
# Stratified 80/20 split: income_cat is the stratification key, so the split
# preserves the income-category proportions in both sets.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

# n_splits=1, so split() yields exactly one (train, test) pair of index arrays.
train_index, test_index = next(split.split(housing, housing["income_cat"]))

# The indices are positional, so select rows with .iloc (.loc only matched while
# the frame still had its default 0..n-1 index). income_cat existed only to drive
# the stratification; drop it so it is not later imputed and fed to the models
# as an extra feature.
strat_train_set = housing.iloc[train_index].drop(columns="income_cat")
strat_test_set = housing.iloc[test_index].drop(columns="income_cat")

In [None]:
# Display the stratified training set.
strat_train_set

In [None]:
# Display the stratified test set.
strat_test_set

In [None]:
# Dtypes and non-null counts of the stratified training set.
strat_train_set.info()

In [None]:
# Dtypes and non-null counts of the stratified test set.
strat_test_set.info()

In [None]:
# From here on, `housing` refers to a copy of the stratified training set (it no
# longer refers to the full dataset). Working on a copy means the exploration
# below cannot modify strat_train_set itself.
housing = strat_train_set.copy()

In [None]:
# Pairwise correlation matrix of the numeric columns. The numeric columns are
# selected explicitly because ocean_proximity holds text: older pandas silently
# skipped such columns in corr(), but pandas >= 2.0 raises on them.
corr_matrix = housing.select_dtypes(include="number").corr()

In [None]:
# Correlation of each numeric column with the target, median_house_value.
corr_matrix["median_house_value"]

Median house value seems to be very strongly correlated with median_income. The following scatter plot
shows this correlation.

In [None]:
# Scatter plot of income against house value; the low alpha makes dense regions appear darker.
housing.plot(kind = "scatter", x= "median_income", y = "median_house_value", alpha = 0.1)

In [None]:
# Derive three ratio features from the raw count columns.
housing = housing.assign(
    rooms_per_household=housing["total_rooms"] / housing["households"],
    bedrooms_per_room=housing["total_bedrooms"] / housing["total_rooms"],
    population_per_household=housing["population"] / housing["households"],
)

In [None]:
# Recompute the correlations now that the ratio features exist. The numeric
# columns are selected explicitly because ocean_proximity holds text: older
# pandas silently skipped such columns in corr(), but pandas >= 2.0 raises.
corr_matrix = housing.select_dtypes(include="number").corr()
# Strongest positive correlation with the target first.
corr_matrix["median_house_value"].sort_values(ascending=False)

In [None]:
# Separate predictors from the target, starting again from the stratified training set.
# drop() returns a new frame without the target column; strat_train_set is left untouched.
housing = strat_train_set.drop(columns="median_house_value")
# .copy() gives the labels their own data, independent of strat_train_set.
housing_labels = strat_train_set["median_house_value"].copy()

In [None]:
# Display the target values (median_house_value) of the training set.
housing_labels

In [None]:
# Imputer that replaces each missing value with the median of its column
# (in this dataset the column with missing values is total_bedrooms).
imputer = SimpleImputer(strategy = "median")

We drop the ocean_proximity column because the median imputer can only work on numeric columns.

In [None]:
# Every predictor except the categorical ocean_proximity column.
housing_num = housing.drop(columns="ocean_proximity")

In [None]:
# Display the predictors that will be passed to the imputer.
housing_num

In [None]:
# Learn the median of every column in housing_num.
imputer.fit(housing_num)

These are the per-column medians that the imputer learned.

In [None]:
# The fill value learned for each column of housing_num.
imputer.statistics_

In [None]:
# Fill the missing values using the learned medians.
X = imputer.transform(housing_num)

In [None]:
# Wrap the imputed values back into a DataFrame, reusing the column labels and
# row index of housing_num so the rows stay aligned with housing_labels.
housing_tr = pd.DataFrame(
    data=X,
    index=housing_num.index,
    columns=housing_num.columns,
)

In [None]:
# Display the imputed predictors.
housing_tr

In [None]:
# Check the non-null counts after imputation.
housing_tr.info()

In [None]:
# Number of rows in each ocean_proximity category.
housing.ocean_proximity.value_counts()

This reveals that there are 5 different categories of ocean proximity in the table. We use the one-hot encoder to
turn this column into a sparse matrix with one binary column per category.

In [None]:
# One-hot encoder for the categorical ocean_proximity column.
cat_encoder = OneHotEncoder()

In [None]:
# Double brackets keep ocean_proximity as a one-column DataFrame, i.e. 2-D input for the encoder.
housing_cat = housing[["ocean_proximity"]]
# Learn the categories and encode the column in one step.
housing_cat_1hot= cat_encoder.fit_transform(housing_cat)
# Display the encoded matrix.
housing_cat_1hot

In [None]:
# Dense view of the one-hot matrix.
housing_cat_1hot.toarray()

In [None]:
# Display the encoded matrix again.
housing_cat_1hot

In [None]:
# Display the imputed numeric predictors again, before the one-hot columns are added.
housing_tr

In [None]:
# Check the container type returned by the encoder.
type(housing_cat_1hot)

In [None]:
# Start the model-ready frame from a copy of the imputed predictors. A plain
# assignment would only create a second name for the same object, so adding
# columns to housing_prep would silently modify housing_tr as well.
housing_prep = housing_tr.copy()
housing_prep

In [None]:
# Append the one-hot columns, naming each one after the category it encodes.
ocean_categories = cat_encoder.categories_[0]
housing_prep[ocean_categories] = housing_cat_1hot.toarray()

In [None]:
# Display the prepared predictors, including the one-hot columns.
housing_prep

In [None]:


# Create an (unfitted) ordinary least-squares linear regression model...
lin_reg = LinearRegression()
# ...and fit it to the prepared predictors and the median house values.
lin_reg.fit(housing_prep, housing_labels)

Here, we measure this model's RMSE on the whole training set.


In [None]:
# Evaluate the linear model on the same data it was trained on.
housing_predictions = lin_reg.predict(housing_prep)
lin_mse = mean_squared_error(y_true=housing_labels, y_pred=housing_predictions)
# Take the root so the error is in the same units as median_house_value.
lin_rmse = np.sqrt(lin_mse)

In [None]:
# Training RMSE of the linear model; the author judged this value unacceptably large.
lin_rmse

In [None]:
# Decision tree regressor. random_state is fixed (same seed as the splits and
# the random forest) so the fitted tree and its scores are reproducible.
tree_reg = DecisionTreeRegressor(random_state=42)
tree_reg.fit(housing_prep, housing_labels)

In [None]:
# Evaluate the decision tree on the same data it was trained on.
housing_predictions1 = tree_reg.predict(housing_prep)
tree_mse = mean_squared_error(y_true=housing_labels, y_pred=housing_predictions1)
tree_rmse = np.sqrt(tree_mse)
# A training error of zero does not mean the model is perfect: it suggests the
# tree has overfit the training data, which cross-validation can confirm.
print(tree_rmse)

In [None]:
# 10-fold cross-validation of the decision tree.
scores = cross_val_score(
    estimator=tree_reg,
    X=housing_prep,
    y=housing_labels,
    scoring="neg_mean_squared_error",
    cv=10,
)
# The scorer returns negated MSE (higher is better), so flip the sign before taking the root.
tree_rmse_scores = np.sqrt(np.negative(scores))

In [None]:
# RMSE of the decision tree on each of the 10 validation folds.
print(tree_rmse_scores)

In [None]:
# 10-fold cross-validation of the linear model.
lin_scores = cross_val_score(
    estimator=lin_reg,
    X=housing_prep,
    y=housing_labels,
    scoring="neg_mean_squared_error",
    cv=10,
)
# The scorer returns negated MSE (higher is better), so flip the sign before taking the root.
lin_rmse_scores = np.sqrt(np.negative(lin_scores))

In [None]:
# Random forest with 10 trees; random_state is fixed so the fitted forest is reproducible.
# NOTE(review): the original author reported that the code did not run without
# these arguments - the cause is not established here; confirm before changing them.
forest_reg = RandomForestRegressor(n_estimators=10, random_state=42)
forest_reg.fit(housing_prep, housing_labels)

In [None]:
# Evaluate the random forest on the same data it was trained on.
housing_predictions2 = forest_reg.predict(housing_prep)
forest_mse = mean_squared_error(y_true=housing_labels, y_pred=housing_predictions2)
forest_rmse = np.sqrt(forest_mse)


In [None]:
# Training RMSE of the random forest.
print(forest_rmse)