In [None]:
# Basics for loading and handling the data 
import numpy as np
import pandas as pd
import os
# Show the entries found under the input directory
print(os.listdir("../input"))

# To plot figures
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
# Use larger fonts for axis labels and tick labels in every figure
plt.rcParams.update({
    'axes.labelsize': 14,
    'xtick.labelsize': 12,
    'ytick.labelsize': 12,
})

# For modelling the data
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor

To load the data, let's take a look inside the 'housing' folder:

In [None]:
# Show the files found inside the housing folder
print(os.listdir("../input/housing"))

We'll be using 'housing.csv', so let's set that as our data path and take a look at a brief description of the data set:

In [None]:
# Location of the CSV file, relative to the notebook's working directory
data_path = "../input/housing/housing.csv"
# Read the file into a DataFrame; later cells work on `housing`
housing = pd.read_csv(data_path)
# Print a summary of the columns: names, non-null counts and dtypes
housing.info()

With the information displayed above, we can see that there are 20640 rows and 10 columns of data for each block of homes: longitude, latitude, median age, total rooms, total bedrooms, population, households, median income, median house value, and ocean proximity. 

What we're aiming to do is explain the median house value through the explanatory variables in the other columns. 

For our first look at the data, it'd be smart to plot some histograms and see how the values range across the column categories:

In [None]:
# Plot a 50-bin histogram of each numeric column on one 20x15 figure
housing.hist(bins=50, figsize=(20,15))
plt.show()

Right off the bat, we can see two issues which would probably hamper our results: the values recorded for the median age of the houses and for the median house value are capped, at 52 years and $500,001 respectively (roughly 50 years and half a million dollars). They show up as the tall spikes at the right-hand edge of those two histograms. These caps in the data would distort the predictive accuracy of our model by leading it to believe that homes cannot be older than about 50 years and that the maximum value for a home cannot be greater than half a million dollars.

The best way to deal with this type of issue is to remove the observations that sit at these caps. Our model will be more accurate without them since there are surely homes older than 50 years and homes worth more than half a million dollars; these data points are simply erroneous and reflect arbitrary limits imposed on the data collection process.

Let's take a look at how many of these counts we'll be eliminating:

In [None]:
# Most common house values above 450000 (the capped value shows up here)
print(housing.loc[housing['median_house_value'] > 450000, 'median_house_value'].value_counts().head())
print("")
print("")

# Most common median ages above 45 (the capped value shows up here)
print(housing.loc[housing['housing_median_age'] > 45, 'housing_median_age'].value_counts().head())

On the basis of median housing value, we'll remove 965 entries.

On the basis of housing median age, we'll remove 1273 entries.

In [None]:
# Remove the capped observations: keep only blocks whose median_house_value is
# below 500001 and whose housing_median_age is below 52.
# The trailing .copy() makes `housing` an independent DataFrame, so adding
# columns to it later does not trigger pandas' SettingWithCopyWarning.
housing = housing.loc[housing['median_house_value'] < 500001.0, :]
housing = housing.loc[housing['housing_median_age'] < 52, :].copy()

Next we check for missing data. Missing data can occur sometimes when an aspect of the observation is unavailable. Maybe there are missing records that make it impossible to determine the age of homes in a neighborhood or maybe we are unable to enter every home in the state to count the number of bedrooms. These are reasonable limitations.

To see if we have missing data for any variables, let's run a command to identify the rows that contain at least one missing value:

In [None]:
# Gather every row with at least one missing value, then preview the first five
sample_incomplete_rows = housing.loc[housing.isna().any(axis="columns")]
missing = sample_incomplete_rows.head()
print(missing)

From the output above, we can see that we do have missing observations. Notably we are missing some entries for total bedrooms. The solution to this dilemma is - rather than throwing out all observation rows with missing data - to simply fill in missing variables with the dataset median for that variable if it is numerical (e.g., total bedrooms) or most frequent if the variable is categorical (e.g., ocean proximity). 

We will take care of this in the pipeline we assemble to treat the data, but first let's create context variables for total rooms, total bedrooms, and population.

What we are interested in (in explaining value) is total rooms per household, total bedrooms per total rooms, and population per household.

The reasoning is that, intuitively, these modifications which place the variables within context are better at explaining the value of a home.

The code for creating these columns in the data is very basic:

In [None]:
# Derive the per-household / per-room ratio features.
# `assign` returns a new DataFrame instead of writing into `housing` in place,
# so no SettingWithCopyWarning is raised even though `housing` was produced by
# a filtered selection, and re-running this cell gives the same result.
housing = housing.assign(
    rooms_per_household=housing["total_rooms"] / housing["households"],
    bedrooms_per_room=housing["total_bedrooms"] / housing["total_rooms"],
    population_per_household=housing["population"] / housing["households"],
)

Now we are ready to create and run a pipeline for the final treatment of the data. The process will proceed as follows:

1) Median House value will be separated from the explanatory variables.

2) Numerical data will have the imputer fill in median values for missing data.

3) Categorical data will have the imputer fill in most frequent values for missing data.

4) OneHotEncoder will then transform the categorical data into binary values so that the data can be run through a model.

5) The explanatory variables will be placed back together and ready for a regression.

In [None]:
# Data-preparation pipeline
#   * numeric columns:     fill missing values with the column median
#   * categorical columns: fill missing values with the most frequent category
#                          (in case ocean proximity entries are missing),
#                          then one-hot encode them into binary columns

# Split the target (labels) from the explanatory variables (predictors)
housing_labels = housing["median_house_value"].copy()
housing_x = housing.drop(columns="median_house_value")

# Subsets of the predictors by type
housing_cat = housing_x[["ocean_proximity"]]
housing_num = housing_x.select_dtypes(include=[np.number])

# Column names handled by each sub-pipeline
num_attribs = housing_num.columns.tolist()
cat_attribs = ["ocean_proximity"]

# Numeric branch: median imputation only (no scaling step; we don't need to
# standardize this data)
num_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy="median")),
])

# Categorical branch: most-frequent imputation followed by one-hot encoding
cat_pipeline = Pipeline(steps=[
    ('imputermode', SimpleImputer(strategy="most_frequent")),
    ('lab_encoder', OneHotEncoder()),
])

# Full pipeline: route each group of columns through its own branch
full_pipeline = ColumnTransformer(transformers=[
    ("num_pipeline", num_pipeline, num_attribs),
    ("cat_pipeline", cat_pipeline, cat_attribs),
])

housing_prepared = full_pipeline.fit_transform(housing_x)

Our data has been prepared and we can now run a regression using RandomForestRegressor from Scikit-learn. This is a great entry-level regression model for the purpose of introducing someone to this type of analysis. We'll also stick to some standard settings: 100 trees in the forest (n_estimators=100) and 3-fold cross-validation (cv=3) to score the model.

In [None]:
# This box is for the regression


# Now we can test a RandomForestRegressor model
# (random_state is fixed so repeated runs build the same forest)
forest_reg = RandomForestRegressor(n_estimators=100, random_state=42)
forest_reg.fit(housing_prepared, housing_labels)
# NOTE(review): these are predictions on the same rows the model was just
# fitted on, and no later cell in this notebook reads `housing_predictions`.
housing_predictions = forest_reg.predict(housing_prepared)



# 3-fold cross-validation. The scorer is "neg_mean_squared_error", so each
# returned score is a negated MSE.
forest_scores = cross_val_score(forest_reg, housing_prepared, housing_labels,
                                scoring="neg_mean_squared_error", cv=3)
# Flip the sign back and take the square root to get one RMSE per fold
forest_rmse_scores = np.sqrt(-forest_scores)

def display_scores(scores):
    """Print a set of scores followed by their mean and standard deviation.

    Parameters
    ----------
    scores : object exposing ``mean()`` and ``std()`` (e.g. a NumPy array)
        The values to summarise.

    Returns
    -------
    None
        The summary is written to standard output only.
    """
    print("Scores:", scores)
    average = scores.mean()
    print("Mean:", average)
    spread = scores.std()
    print("Standard deviation:", spread)

# Report the per-fold RMSE values together with their mean and standard deviation
display_scores(forest_rmse_scores)

There we have it. Our model can predict the median price of homes on a block with a root mean squared error (RMSE) of about $65,000. Because we took the square root of the mean squared error, this figure is already expressed in dollars: a typical prediction misses the true median value by roughly $65,000. It is not the square of a $255 error. That is a sizeable miss relative to house values that top out at half a million dollars, so there is clearly room to improve. We could possibly achieve better and more trustworthy results using a combination of TensorFlow and train/test splitting of the dataset, but this regression as it stands is a great introduction to data preparation and machine learning using Scikit-learn. 