In [9]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.svm import SVC, SVR
from sklearn.linear_model import LogisticRegression, Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn import datasets
import pickle

In [2]:
# Load the California housing data as pandas objects: X receives the feature
# table and y the target column.
# (Alternative dataset: datasets.load_iris(return_X_y=True, as_frame=True).)
X, y = datasets.fetch_california_housing(
    return_X_y=True,
    as_frame=True,
)

In [3]:
# Peek at the first five rows of the feature table.
X.head(n=5)

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25


## Let's train a simple Ridge regression model to predict the median house value

In [4]:
# Hold out 20% of the rows as a test set; the fixed random_state keeps the
# split identical across runs. The model itself is trained in a later cell.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [5]:
# Peek at the first five training targets.
y_train.head(n=5)

14196    1.030
8267     3.821
17445    1.726
14265    0.934
2271     0.965
Name: MedHouseVal, dtype: float64

In [6]:
# Fit a Ridge regressor on the training split. `fit` returns the estimator
# itself, so construction and fitting can be chained.
model = Ridge(random_state=42).fit(X_train, y_train)
# Leave the fitted estimator as the last expression so the cell displays it.
model

Ridge(random_state=42)

In [7]:
# Evaluate the Ridge model on the held-out test split and report its R2 score.
ridge_r2 = model.score(X_test, y_test)
print(ridge_r2)

0.5758549611440126


### An R2 score of about 0.58 is good enough for this example. Let's deploy the app to Render.

Let's save the trained model to disk and push it to GitHub.

In [11]:
filename = "best_model.pickle"
# Save the fitted model. The `with` block closes the file handle as soon as
# the dump finishes, so the pickle is completely written to disk before the
# file is committed or opened elsewhere (a bare `open(...)` passed straight to
# `pickle.dump` is never closed explicitly).
with open(filename, "wb") as model_file:
    pickle.dump(model, model_file)
# The model can be loaded as follows. This has been done in 'app.py'.
# with open(filename, "rb") as model_file:
#     loaded_model = pickle.load(model_file)

Follow the steps below to set up a GitHub repository for this example project.
1. Create an account on GitHub (if you do not have one).
2. Create a new repository that will store the files on GitHub. Follow GitHub's "Creating a new repository" instructions to do so.

### Now let's compare the performance of the Ridge regression model with a DecisionTree model.

In [71]:
# Fit a decision tree regressor on the same training split and print its R2
# score on the test split for comparison with the Ridge model.
# NOTE(review): this cell was originally marked as a scratch/test block, but
# the later cells that inspect and save `dt` depend on it.
dt = DecisionTreeRegressor(random_state=42).fit(X_train, y_train)
dt_r2 = dt.score(X_test, y_test)
print(dt_r2)

0.622075845135081


Given the configuration hyperparameters (random_states), the DecisionTree model seems to perform better than the simple Ridge regression model.
NOTE: Although this is a simple comparison between two models, ideally we want to have a comprehensive analysis. For our example, we will use this simple comparison.

Let's deploy the DecisionTree model as a web app so that we can make predictions.

We are going to save the model so that we can deploy it later on Render. A python script called 'app.py' will load the saved model and then provide a simple interface to tweak the values of the input features.

In [77]:
# Show the full hyperparameter configuration of the fitted decision tree.
dt_params = dt.get_params()
print(dt_params)

{'ccp_alpha': 0.0, 'criterion': 'squared_error', 'max_depth': None, 'max_features': None, 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'random_state': 42, 'splitter': 'best'}


In [68]:
# Save the model to disk.
import pickle

In [79]:
filename = "best_model.pickle"
# Save the fitted decision tree, replacing any model previously written to the
# same file. The `with` block closes the file handle as soon as the dump
# finishes, so the pickle is completely written to disk before it is committed
# (a bare `open(...)` passed straight to `pickle.dump` is never closed
# explicitly).
with open(filename, "wb") as model_file:
    pickle.dump(dt, model_file)
# Save the best model configuration.
# NOTE: the dict is passed positionally, so NumPy stores it as a pickled
# object array under the default key "arr_0". To read it back:
#     np.load("best_model_params.npz", allow_pickle=True)["arr_0"].item()
np.savez_compressed("best_model_params.npz", dt.get_params())

# The model can be loaded as follows. This has been done in 'app.py'.
# with open(filename, "rb") as model_file:
#     loaded_model = pickle.load(model_file)

NOTE: Make sure to add the newly saved model to the git commit.

# Exercise: Are there better models?
Go through the scikit-learn documentation, try training other models, and see if you can get better performance than the decision tree.

In [81]:
# Solution: fit a random forest on the same training split and print its R2
# score on the test split.
from sklearn.ensemble import RandomForestRegressor

rf = RandomForestRegressor(random_state=42).fit(X_train, y_train)
rf_r2 = rf.score(X_test, y_test)
print(rf_r2)

0.8051230593157366


Save the best model and push it to GitHub.