In [2]:
def estimate_home_value(size_in_sqft, number_of_bedrooms):

    # Assume all homes are worth at least $50,000
    value = 50000

    # Adjust the value estimate based on the size of the house
    value = value + (size_in_sqft * 92.1)

    # Adjust the value estimate based on the number of bedrooms
    value = value + (number_of_bedrooms * 10000)

    return value

# Estimate the value of our house:
# - 5 bedrooms
# - 3800 sq ft
# Actual value: $450,000

value = estimate_home_value(3800, 5)

print("Estimated valued:")
print(value)

Estimated valued:
449980.0


In [1]:
import pandas
import webbrowser
import os

# Read the dataset into a data table using Pandas
data_table = pandas.read_csv("ml_house_data_set.csv")

# Create a web page view of the data for easy viewing
html = data_table[0:100].to_html()

# Save the html to a temporary file
with open("data.html", "w") as f:
    f.write(html)

# Open the web page in our web browser
full_filename = os.path.abspath("data.html")
webbrowser.open("file://{}".format(full_filename))

True

In [3]:
import pandas as pd
# Varies with sklearn version
from sklearn.cross_validation import train_test_split
#from sklearn.model_selection import train_test_split

from sklearn import ensemble
from sklearn.metrics import mean_absolute_error
from sklearn.externals import joblib

# Load the data set
df = pd.read_csv("ml_house_data_set.csv")

# Remove the fields from the data set that we don't want to include in our model
del df['house_number']
del df['unit_number']
del df['street_name']
del df['zip_code']

# Replace categorical data with one-hot encoded data
features_df = pd.get_dummies(df, columns=['garage_type', 'city'])

# Remove the sale price from the feature data
del features_df['sale_price']

# Create the X and y arrays
X = features_df.as_matrix()
y = df['sale_price'].as_matrix()

X,y



(array([[1978L, 1L, 4L, ..., 0, 0, 0],
        [1958L, 1L, 3L, ..., 0, 0, 0],
        [2002L, 1L, 3L, ..., 0, 0, 0],
        ..., 
        [1983L, 1L, 1L, ..., 0, 0, 0],
        [1981L, 1L, 3L, ..., 0, 0, 0],
        [1980L, 1L, 3L, ..., 0, 0, 0]], dtype=object),
 array([  270897.,   302404.,  2519996., ...,    98280.,    98278.,
          186480.]))

In [4]:
features_df.to_csv('SampleX.csv')
df['sale_price'].to_csv('SampleY.csv')

In [5]:
# Split the data set in a training set (70%) and a test set (30%)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

X_train, X_test, y_train, y_test 

(array([[2013L, 2L, 5L, ..., 0, 0, 0],
        [1979L, 1L, 3L, ..., 0, 0, 0],
        [1959L, 1L, 3L, ..., 0, 0, 0],
        ..., 
        [2007L, 2L, 5L, ..., 0, 0, 0],
        [1967L, 1L, 2L, ..., 0, 0, 0],
        [2005L, 1L, 2L, ..., 0, 0, 0]], dtype=object),
 array([[1990L, 1L, 4L, ..., 0, 0, 0],
        [2006L, 1L, 2L, ..., 0, 0, 0],
        [1973L, 1L, 2L, ..., 0, 1, 0],
        ..., 
        [1973L, 1L, 2L, ..., 0, 0, 0],
        [1987L, 2L, 3L, ..., 0, 0, 0],
        [1976L, 1L, 4L, ..., 0, 0, 0]], dtype=object),
 array([ 538023.,  932398.,  214198., ...,  338938.,  378004.,  304924.]),
 array([ 264598.,  338943.,  146159., ...,  173876.,  463676.,   88198.]))

In [6]:
# Fit regression model
model = ensemble.GradientBoostingRegressor(
    n_estimators=1000,
    learning_rate=0.1,
    max_depth=6,
    min_samples_leaf=9,
    max_features=0.1,
    loss='huber',
    random_state=0
)
model.fit(X_train, y_train)

GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.1, loss='huber', max_depth=6,
             max_features=0.1, max_leaf_nodes=None,
             min_impurity_split=1e-07, min_samples_leaf=9,
             min_samples_split=2, min_weight_fraction_leaf=0.0,
             n_estimators=1000, presort='auto', random_state=0,
             subsample=1.0, verbose=0, warm_start=False)

In [None]:
#http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingRegressor.html

In [7]:
# Save the trained model to a file so we can use it in other programs. Create a folder named model in the directory.
joblib.dump(model, 'model/trained_house_classifier_model.pkl')

['model/trained_house_classifier_model.pkl']

In [8]:
# Find the error rate on the training set
mae = mean_absolute_error(y_train, model.predict(X_train))
print("Training Set Mean Absolute Error: %.4f" % mae)

# Find the error rate on the test set
mae = mean_absolute_error(y_test, model.predict(X_test))
print("Test Set Mean Absolute Error: %.4f" % mae)


Training Set Mean Absolute Error: 48187.1991
Test Set Mean Absolute Error: 59174.2563


In [9]:
import numpy as np
from sklearn.externals import joblib

# These are the feature labels from our data set
feature_labels = np.array(['year_built', 'stories', 'num_bedrooms', 'full_bathrooms', 'half_bathrooms', 'livable_sqft', 'total_sqft', 'garage_sqft', 'carport_sqft', 'has_fireplace', 'has_pool', 'has_central_heating', 'has_central_cooling', 'garage_type_attached', 'garage_type_detached', 'garage_type_none', 'city_Amystad', 'city_Brownport', 'city_Chadstad', 'city_Clarkberg', 'city_Coletown', 'city_Davidfort', 'city_Davidtown', 'city_East Amychester', 'city_East Janiceville', 'city_East Justin', 'city_East Lucas', 'city_Fosterberg', 'city_Hallfort', 'city_Jeffreyhaven', 'city_Jenniferberg', 'city_Joshuafurt', 'city_Julieberg', 'city_Justinport', 'city_Lake Carolyn', 'city_Lake Christinaport', 'city_Lake Dariusborough', 'city_Lake Jack', 'city_Lake Jennifer', 'city_Leahview', 'city_Lewishaven', 'city_Martinezfort', 'city_Morrisport', 'city_New Michele', 'city_New Robinton', 'city_North Erinville', 'city_Port Adamtown', 'city_Port Andrealand', 'city_Port Daniel', 'city_Port Jonathanborough', 'city_Richardport', 'city_Rickytown', 'city_Scottberg', 'city_South Anthony', 'city_South Stevenfurt', 'city_Toddshire', 'city_Wendybury', 'city_West Ann', 'city_West Brittanyview', 'city_West Gerald', 'city_West Gregoryview', 'city_West Lydia', 'city_West Terrence'])

# Load the trained model created with train_model.py
model = joblib.load('model/trained_house_classifier_model.pkl')

# Create a numpy array based on the model's feature importances
importance = model.feature_importances_

# Sort the feature labels based on the feature importance rankings from the model
feauture_indexes_by_importance = importance.argsort()

# Print each feature label, from most important to least important (reverse order)
for index in feauture_indexes_by_importance:
    print("{} - {:.2f}%".format(feature_labels[index], (importance[index] * 100.0)))


city_Julieberg - 0.00%
city_Martinezfort - 0.00%
city_New Michele - 0.00%
city_New Robinton - 0.00%
city_Davidtown - 0.07%
city_West Brittanyview - 0.08%
city_Rickytown - 0.09%
city_West Terrence - 0.10%
city_Port Daniel - 0.11%
city_Amystad - 0.12%
city_Fosterberg - 0.12%
city_East Justin - 0.12%
city_Leahview - 0.14%
city_Lake Jennifer - 0.14%
city_Toddshire - 0.14%
city_Clarkberg - 0.18%
city_Jenniferberg - 0.18%
city_West Lydia - 0.19%
city_Wendybury - 0.19%
city_Lake Carolyn - 0.20%
city_West Gerald - 0.20%
city_Joshuafurt - 0.20%
city_Brownport - 0.21%
city_East Lucas - 0.22%
city_Davidfort - 0.27%
city_South Stevenfurt - 0.27%
city_Port Jonathanborough - 0.27%
city_Port Adamtown - 0.29%
city_Scottberg - 0.29%
city_East Janiceville - 0.31%
city_Justinport - 0.34%
city_East Amychester - 0.34%
city_Lake Dariusborough - 0.36%
city_West Gregoryview - 0.37%
city_Richardport - 0.37%
city_Lake Christinaport - 0.37%
city_Morrisport - 0.39%
city_Hallfort - 0.41%
city_North Erinville - 0.5

In [10]:
import pandas as pd
from sklearn.cross_validation import train_test_split
from sklearn import ensemble
from sklearn.metrics import mean_absolute_error
from sklearn.externals import joblib

# Load the data set
df = pd.read_csv("ml_house_data_set.csv")

# Remove the fields from the data set that we don't want to include in our model
del df['house_number']
del df['unit_number']
del df['street_name']
del df['zip_code']

# Replace categorical data with one-hot encoded data
features_df = pd.get_dummies(df, columns=['garage_type', 'city'])

# Remove the sale price from the feature data
del features_df['sale_price']

# Create the X and y arrays
X = features_df.as_matrix()
y = df['sale_price'].as_matrix()

# Split the data set in a training set (70%) and a test set (30%)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

# Fit regression model
model = ensemble.GradientBoostingRegressor(
    n_estimators=1000,
    learning_rate=0.1,
    max_depth=6,
    min_samples_leaf=9,
    max_features=0.1,
    loss='huber'
)
model.fit(X_train, y_train)

# Save the trained model to a file so we can use it in other programs
joblib.dump(model, 'model/trained_house_classifier_model.pkl')

# Find the error rate on the training set
mse = mean_absolute_error(y_train, model.predict(X_train))
print("Training Set Mean Absolute Error: %.4f" % mse)

# Find the error rate on the test set
mse = mean_absolute_error(y_test, model.predict(X_test))
print("Test Set Mean Absolute Error: %.4f" % mse)



Training Set Mean Absolute Error: 48779.4809
Test Set Mean Absolute Error: 58957.8640


In [11]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.cross_validation import train_test_split
from sklearn.metrics import mean_absolute_error

# Load the data set
df = pd.read_csv("ml_house_data_set.csv")

# Remove the fields from the data set that we don't want to include in our model
del df['house_number']
del df['unit_number']
del df['street_name']
del df['zip_code']

# Replace categorical data with one-hot encoded data
features_df = pd.get_dummies(df, columns=['garage_type', 'city'])
del features_df['sale_price']

X = features_df.as_matrix()
y = df['sale_price'].as_matrix()

# Split the data set in a training set (70%) and a test set (30%)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

In [12]:
# Fit regression model
regr_1 = DecisionTreeRegressor(max_depth=4)

regr_2 = AdaBoostRegressor(DecisionTreeRegressor(max_depth=50),n_estimators=70)

regr_1.fit(X_train, y_train)
regr_2.fit(X_train, y_train)

# Predict
y_1 = regr_1.predict(X_test)
y_2 = regr_2.predict(X_test)

mse = mean_absolute_error(y_test, y_1)
print("Mean Absolute Error (DT) : %.4f" % mse)

mse = mean_absolute_error(y_test, y_2)
print("Mean Absolute Error (AdaBoost) : %.4f" % mse)

Mean Absolute Error (DT) : 111283.3403
Mean Absolute Error (AdaBoost) : 60062.8151


In [13]:
from sklearn import linear_model

reg = linear_model.LinearRegression()
reg.fit (X_train,y_train)
pred=reg.predict(X_test)
mse = mean_absolute_error(y_test,pred)
print("Mean Absolute Error : %.4f" % mse)

Mean Absolute Error : 96230.1727


In [14]:
from sklearn.linear_model import SGDRegressor

reg = linear_model.SGDRegressor()
reg.fit (X_train,y_train)
pred=reg.predict(X_test)
mse = mean_absolute_error(y_test,pred)
print("Mean Absolute Error : %.4f" % mse)


Mean Absolute Error : 1230961360496583.2500


In [15]:
#Let's find if it underfits...
# Find the error rate on the training set
mse = mean_absolute_error(y_train, reg.predict(X_train))
print("Training Set Mean Absolute Error: %.4f" % mse)

# Find the error rate on the test set
mse = mean_absolute_error(y_test, reg.predict(X_test))
print("Test Set Mean Absolute Error: %.4f" % mse)

Training Set Mean Absolute Error: 1232390011428614.5000
Test Set Mean Absolute Error: 1230961360496583.2500


In [16]:
from sklearn.ensemble import RandomForestRegressor

reg = RandomForestRegressor()
reg.fit (X_train,y_train)
pred=reg.predict(X_test)
mse = mean_absolute_error(y_test,pred)
print("Mean Absolute Error : %.4f" % mse)

Mean Absolute Error : 64905.4873


In [17]:
## Naive Bayes

from sklearn.linear_model import BayesianRidge

reg = BayesianRidge()
reg.fit (X_train,y_train)
pred=reg.predict(X_test)
mse = mean_absolute_error(y_test,pred)
print("Mean Absolute Error : %.4f" % mse)

Mean Absolute Error : 96231.9639


In [18]:
from sklearn.svm import SVR

reg = SVR()
reg.fit (X_train,y_train)
pred=reg.predict(X_test)
mse = mean_absolute_error(y_test,pred)
print("Mean Absolute Error : %.4f" % mse)

Mean Absolute Error : 171395.1292


In [None]:
import pandas
from sklearn import ensemble
from sklearn.cross_validation import train_test_split
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import mean_absolute_error
from sklearn.cross_validation import train_test_split

# Load the data set
df = pandas.read_csv("ml_house_data_set.csv")

# Remove the fields from the data set that we don't want to include in our model
del df['house_number']
del df['unit_number']
del df['street_name']
del df['zip_code']

# Replace categorical data with one-hot encoded data
features_df = pandas.get_dummies(df, columns=['garage_type', 'city'])
del features_df['sale_price']

X = features_df.as_matrix()
y = df['sale_price'].as_matrix()

# Split the data set in a training set (70%) and a test set (30%)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

# Create the model
model = ensemble.GradientBoostingRegressor()

# Parameters we want to try
param_grid = {
    'n_estimators': [500, 3000],
    'max_depth': [4, 6],
    'min_samples_leaf': [3, 9],
    'learning_rate': [0.1, 0.05],
    'max_features': [0.3, 0.1],
    'loss': ['ls', 'huber']
}

#param_grid = {
#    'n_estimators': [500, 1000, 3000],
#    'max_depth': [4, 6],
#   'min_samples_leaf': [3, 5, 9, 17],
#    'learning_rate': [0.1, 0.05, 0.02, 0.01],
#    'max_features': [1.0, 0.3, 0.1],
#    'loss': ['ls', 'lad', 'huber']
#}

# Define the grid search we want to run. Run it with four cpus in parallel. -1 all cores
gs_cv = GridSearchCV(model, param_grid, n_jobs=-1)

# Run the grid search - on only the training data!
gs_cv.fit(X_train, y_train)

# Print the parameters that gave us the best result!
print(gs_cv.best_params_)

# After running a .....long..... time, the output will be something like
# {'loss': 'huber', 'learning_rate': 0.1, 'min_samples_leaf': 9, 'n_estimators': 3000, 'max_features': 0.1, 'max_depth': 6}

# That is the combination that worked best.

# Find the error rate on the training set using the best parameters
mse = mean_absolute_error(y_train, gs_cv.predict(X_train))
print("Training Set Mean Absolute Error: %.4f" % mse)

# Find the error rate on the test set using the best parameters
mse = mean_absolute_error(y_test, gs_cv.predict(X_test))
print("Test Set Mean Absolute Error: %.4f" % mse)





Other Datasets
Auto MPG : https://archive.ics.uci.edu/ml/datasets/auto+mpg
    
Auto MPG CSV format : https://www.kaggle.com/uciml/autompg-dataset    
        
Bike Sharing : https://archive.ics.uci.edu/ml/datasets/Bike+Sharing+Dataset
        
Actual Feed: https://www.motivateco.com/use-our-data/
        
CSV Format: https://www.kaggle.com/marklvl/bike-sharing-dataset