## Intro to scikit learn

we are going to cover:

0. An end to end scikit learn workflow
1. getting the data ready
2. choose the right estimator/algorithm for our problem
3. fit the model/algorithm and use it to make predictions on our data
4. Evaluate the model
5. Improve the model
6. Save and load a trained model
7. Putting it all together!

# 0. an end to end scikit learn workflow

In [1]:
#get the data ready
import pandas as pd
import numpy as np

In [2]:
heart_disease = pd.read_csv("data/heart-disease.csv")

FileNotFoundError: [Errno 2] File b'data/heart-disease.csv' does not exist: b'data/heart-disease.csv'

In [None]:
heart_disease

In [None]:
#create X(features matrix)
X = heart_disease.drop("target", axis=1)
#we are dropping the the target column with its data

#create Y(labels)
Y = heart_disease["target"]

In [None]:
X

In [None]:
Y

In [None]:
#choose the right model for the problem and the hyperparameters
from sklearn.ensemble import RandomForestClassifier

In [None]:
clf = RandomForestClassifier(n_estimators=100)
#we'll keep the default hyperparameters

clf.get_params()

In [None]:
#fit the model to the training data

from sklearn.model_selection import train_test_split

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2)
#test_size=0.2 means 20% for testing and 80% for training

In [None]:
#fit it
clf.fit(X_train, Y_train);

In [3]:
#make a prediction
y_preds = clf.predict(X_test)

NameError: name 'clf' is not defined

In [4]:
y_preds

NameError: name 'y_preds' is not defined

In [5]:
Y_test

NameError: name 'Y_test' is not defined

In [6]:
#evaluate the model on the training data and test data

clf.score(X_train, Y_train)

NameError: name 'clf' is not defined

In [None]:
clf.score(X_test, Y_test)

In [7]:

from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

print(classification_report(Y_test, y_preds))

NameError: name 'Y_test' is not defined

In [None]:
confusion_matrix(Y_test, y_preds)

In [None]:
accuracy_score(Y_test, y_preds)

In [None]:
# improve the model

#try different n_estimators
np.random.seed(42)

for i in range(10, 100, 10):
    print(f"trying model with {i} estimators")
    clf = RandomForestClassifier(n_estimators=i).fit(X_train, Y_train)
    print(f"model accuracy on test set: {clf.score(X_test, Y_test) * 100 : .2f}%")
    print("")

In [None]:
# save the model and load it
import pickle

pickle.dump(clf, open("random_forest_model_1.pkl", "wb"))

In [None]:
loaded_model = pickle.load(open("random_forest_model_1.pkl", "rb"))

In [None]:
loaded_model.score(X_test, Y_test)

In [None]:
#new problem
car_Sales = pd.read_csv("data/car-sales-extended.csv")

In [None]:
car_Sales.head()

In [None]:
x = car_Sales.drop("Price", axis=1)
y = car_Sales["Price"]

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

In [None]:
#Building machine learning model
#using a regrssion model

from sklearn.ensemble import RandomForestRegressor

model = RandomForestRegressor()
model.fit(x_train, y_train)
model.score(x_test, y_test)

In [None]:
#turning categories into number
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

categorical_features = ["Make", "Colour", "Doors"]
one_hot = OneHotEncoder()
transformer = ColumnTransformer([("one_hot",one_hot,categorical_features)], remainder="passthrough")
transformed_x = transformer.fit_transform(x)
transformed_x


In [None]:
#lets refit the model
np.random.seed(420)
x_train, x_test, y_train, y_test = train_test_split(transformed_x, y, test_size=0.2)

In [None]:
model.fit(x_train, y_train)
model.score(x_test, y_test)

# what if there were missing values hmmmmmmmm

1. fill them with somme values(also known as imputation)
2. or remove the samples with missing data

In [None]:
car_sales_missing = pd.read_csv("data/car-sales-extended-missing-data.csv")
car_sales_missing.head(10)

In [None]:
len(car_sales_missing)

In [None]:
car_sales_missing.isna().sum()

In [None]:
x = car_sales_missing.drop("Price", axis=1)
y = car_sales_missing["Price"]

In [None]:
# lets try and convert the data to number
categorical_features = ["Make", "Colour", "Doors"]
one_hot = OneHotEncoder()
transformer = ColumnTransformer([("one_hot",one_hot,categorical_features)], remainder="passthrough")
transformed_x = transformer.fit_transform(x)
transformed_x

## option 1: fill missing data with pandas

In [8]:
#fill make column
car_sales_missing["Make"].fillna("missing", inplace=True)

#fill colour column
car_sales_missing.Colour.fillna("missing", inplace=True)

#fill odometer column
car_sales_missing["Odometer (KM)"].fillna(car_sales_missing["Odometer (KM)"].mean(), inplace=True)

#fill doors column
car_sales_missing["Doors"].fillna(4, inplace=True)

NameError: name 'car_sales_missing' is not defined

In [None]:
car_sales_missing.isna().sum()

In [9]:
#remove rows with missing prices
car_sales_missing.dropna(inplace=True)

NameError: name 'car_sales_missing' is not defined

In [None]:
car_sales_missing.isna().sum()

In [10]:
x = car_sales_missing.drop("Price", axis=1)
y = car_sales_missing["Price"]

NameError: name 'car_sales_missing' is not defined

In [None]:
# lets try and convert the data to number
categorical_features = ["Make", "Colour", "Doors"]
one_hot = OneHotEncoder()
transformer = ColumnTransformer([("one_hot",one_hot,categorical_features)], remainder="passthrough")
transformed_x = transformer.fit_transform(x)
transformed_x

In [11]:
x

NameError: name 'x' is not defined

## option 2: fill missing data with scikit learn

In [12]:
car_mis = pd.read_csv("data/car-sales-extended-missing-data.csv")


FileNotFoundError: [Errno 2] File b'data/car-sales-extended-missing-data.csv' does not exist: b'data/car-sales-extended-missing-data.csv'

In [13]:
car_mis.head()

NameError: name 'car_mis' is not defined

In [None]:
car_mis.isna().sum()

In [14]:
car_mis.dropna(subset=["Price"], inplace=True)
car_mis.isna().sum()

NameError: name 'car_mis' is not defined

In [15]:
#split into x and y
x = car_mis.drop("Price", axis=1)
y = car_mis["Price"]

NameError: name 'car_mis' is not defined

In [None]:
#fill missing values with sci kit learn
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

#fill categorical data with missing and numerical data with mean
cat_imputer = SimpleImputer(strategy="constant", fill_value="missing")
door_imputer = SimpleImputer(strategy="constant", fill_value=4)
num_imputer = SimpleImputer(strategy="mean")

#define columns
cat_features = ["Make", "Colour"]
door_features = ["Doors"]
num_features = ["Odometer (KM)"]

imputer = ColumnTransformer([
    ("cat_imputer", cat_imputer, cat_features),
    ("door_imputer", door_imputer, door_features),
    ("num_imputer", num_imputer, num_features)
])

In [16]:
filled_X = imputer.fit_transform(x)

NameError: name 'imputer' is not defined

In [None]:
filled_X

In [17]:
car_sales_filled = pd.DataFrame(filled_X, columns=["Make", "Colour", "Doors", "Odometer (KM)"])

NameError: name 'filled_X' is not defined

In [18]:
car_sales_filled

NameError: name 'car_sales_filled' is not defined

In [None]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer



In [19]:
# lets try and convert the data to number
categorical_features = ["Make", "Colour", "Doors"]
one_hot = OneHotEncoder()
transformer = ColumnTransformer([("one_hot",one_hot,categorical_features)], remainder="passthrough")
transformed_x = transformer.fit_transform(car_sales_filled)
transformed_x

NameError: name 'OneHotEncoder' is not defined

In [20]:
#now we've got data with no missing values and converted to numbers
#lets fit in model
np.random.seed(420)

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(transformed_x, y, test_size=0.2)


model = RandomForestRegressor(n_estimators=100)
model.fit(x_train, y_train)
model.score(x_test, y_test)

NameError: name 'transformed_x' is not defined

In [None]:
len(car_Sales)

## choosing the right estimator/algo for our problem

* classification - predicting is a sample is one thing or another
* regression - predicting a number

In [None]:
# picking machine learning model for regression

In [21]:
from sklearn.datasets import load_boston

boston = load_boston()
boston

{'data': array([[6.3200e-03, 1.8000e+01, 2.3100e+00, ..., 1.5300e+01, 3.9690e+02,
         4.9800e+00],
        [2.7310e-02, 0.0000e+00, 7.0700e+00, ..., 1.7800e+01, 3.9690e+02,
         9.1400e+00],
        [2.7290e-02, 0.0000e+00, 7.0700e+00, ..., 1.7800e+01, 3.9283e+02,
         4.0300e+00],
        ...,
        [6.0760e-02, 0.0000e+00, 1.1930e+01, ..., 2.1000e+01, 3.9690e+02,
         5.6400e+00],
        [1.0959e-01, 0.0000e+00, 1.1930e+01, ..., 2.1000e+01, 3.9345e+02,
         6.4800e+00],
        [4.7410e-02, 0.0000e+00, 1.1930e+01, ..., 2.1000e+01, 3.9690e+02,
         7.8800e+00]]),
 'target': array([24. , 21.6, 34.7, 33.4, 36.2, 28.7, 22.9, 27.1, 16.5, 18.9, 15. ,
        18.9, 21.7, 20.4, 18.2, 19.9, 23.1, 17.5, 20.2, 18.2, 13.6, 19.6,
        15.2, 14.5, 15.6, 13.9, 16.6, 14.8, 18.4, 21. , 12.7, 14.5, 13.2,
        13.1, 13.5, 18.9, 20. , 21. , 24.7, 30.8, 34.9, 26.6, 25.3, 24.7,
        21.2, 19.3, 20. , 16.6, 14.4, 19.4, 19.7, 20.5, 25. , 23.4, 18.9,
        35.4, 24.7, 3

In [22]:
boston_df = pd.DataFrame(boston["data"], columns=boston["feature_names"])
boston_df["target"] = pd.Series(boston["target"])
boston_df

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,target
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.0900,1.0,296.0,15.3,396.90,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.90,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.90,5.33,36.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
501,0.06263,0.0,11.93,0.0,0.573,6.593,69.1,2.4786,1.0,273.0,21.0,391.99,9.67,22.4
502,0.04527,0.0,11.93,0.0,0.573,6.120,76.7,2.2875,1.0,273.0,21.0,396.90,9.08,20.6
503,0.06076,0.0,11.93,0.0,0.573,6.976,91.0,2.1675,1.0,273.0,21.0,396.90,5.64,23.9
504,0.10959,0.0,11.93,0.0,0.573,6.794,89.3,2.3889,1.0,273.0,21.0,393.45,6.48,22.0


In [23]:
#how many samples?
len(boston_df)

506

In [24]:
# lets try ridge regression model
from sklearn.linear_model import Ridge

#setup random seed
np.random.seed(420)

#create data
x = boston_df.drop("target", axis=1)
y = boston_df["target"]

#split into train and test split
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

#instantiate model
model = Ridge()
model.fit(x_train, y_train)

#check the score
model.score(x_test, y_test)

0.814556114773719

how do we improve score?

what if this model isnt that good

In [25]:
#lets try random forest regressor
from sklearn.ensemble import RandomForestRegressor

model = RandomForestRegressor(n_estimators=10)
model.fit(x_train, y_train)
model.score(x_test, y_test)

0.9090060829211958

## choosing and estimator for classification

In [26]:
heart_disease = pd.read_csv("data/heart-disease.csv")
heart_disease.head()

FileNotFoundError: [Errno 2] File b'data/heart-disease.csv' does not exist: b'data/heart-disease.csv'

In [27]:
len(heart_disease)

NameError: name 'heart_disease' is not defined

In [None]:
#trying out linear SVC
from sklearn.svm import LinearSVC

#setup random seed
np.random.seed(69)

#make the data
x = heart_disease.drop("target", axis=1)
y = heart_disease["target"]

In [28]:
#splitting the data into train and test sets
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

#instantiate the model
clf = LinearSVC()
clf.fit(x_train, y_train)

#evaluate the model
clf.score(x_test, y_test)

NameError: name 'LinearSVC' is not defined

if not working well using another

In [29]:
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(n_estimators=100)
clf.fit(x_train, y_train)
clf.score(x_test, y_test)

ValueError: Unknown label type: 'continuous'

## Different names

* `x` = features, feature variables, data
* `y` = labels, targets, target variables

## predict using machine learning models

two ways:

   1. `predict()`
   2. `predict_proba()`


In [30]:
clf.predict(x_test)

NotFittedError: This RandomForestClassifier instance is not fitted yet. Call 'fit' with appropriate arguments before using this method.

In [31]:
#comparing predictions to truth labels to evaluate model
y_preds = clf.predict(x_test)
np.mean(y_preds == y_test)

NotFittedError: This RandomForestClassifier instance is not fitted yet. Call 'fit' with appropriate arguments before using this method.

In [32]:
clf.score(x_test, y_test)

NotFittedError: This RandomForestClassifier instance is not fitted yet. Call 'fit' with appropriate arguments before using this method.

In [None]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test, y_preds)

In [33]:
#make predictions with predict_proba()
#it returns probabilities of a classification label

clf.predict_proba(x_test[:5])

NotFittedError: This RandomForestClassifier instance is not fitted yet. Call 'fit' with appropriate arguments before using this method.

In [None]:
#predictions for a regression model is using mae
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split

x = boston_df.drop("target", axis=1)
y = boston_df["target"]

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

reg = RandomForestRegressor(n_estimators=100)
reg.fit(x_train, y_train)
reg.score(x_test, y_test)

In [34]:
y_preds = reg.predict(x_test)

NameError: name 'reg' is not defined

In [35]:
np.array(y_test[:10])

array([22.6, 19.6, 18. , 20.6,  7.2,  7.2, 14. , 17.2,  9.6,  8.5])

In [36]:
mean_absolute_error(y_test, y_preds)

NameError: name 'mean_absolute_error' is not defined

## Evaluating a model

Three ways to evaluate a scikit model
1. Estimator `score()` method
2. `scoring` parameter
3. Problem specific metric functions

## 1. Evaluating using `score()` method

In [37]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

np.random.seed(69)

x = heart_disease.drop("target", axis=1)
y = heart_disease["target"]

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

clf = RandomForestClassifier(n_estimators=100)
clf.fit(x_train, y_train)

NameError: name 'heart_disease' is not defined

In [38]:
clf.score(x_test, y_test)

NotFittedError: This RandomForestClassifier instance is not fitted yet. Call 'fit' with appropriate arguments before using this method.

lets do the same for a regression problem

In [39]:
#predictions for a regression model is using mae
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split

x = boston_df.drop("target", axis=1)
y = boston_df["target"]

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

reg = RandomForestRegressor(n_estimators=100)
reg.fit(x_train, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=100,
                      n_jobs=None, oob_score=False, random_state=None,
                      verbose=0, warm_start=False)

In [40]:
model.score(x_test, y_test)

0.9630458943117183

## evaluation using the `scoring` parameter

In [41]:
##for classification
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

np.random.seed(69)

x = heart_disease.drop("target", axis=1)
y = heart_disease["target"]

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

clf = RandomForestClassifier(n_estimators=100)
clf.fit(x_train, y_train);

NameError: name 'heart_disease' is not defined

In [None]:
clf.score(x_test, y_test)

In [42]:
cross_val_score(clf, x, y, cv=5)



ValueError: Unknown label type: 'continuous'

In [None]:
np.random.seed(69)

clf_single_score = clf.score(x_test, y_test)
clf_cross_val_score = np.mean(cross_val_score(clf, x, y, cv=5))

clf_single_score, clf_cross_val_score

In [43]:
cross_val_score(clf, x, y, cv=5, scoring=None)



ValueError: Unknown label type: 'continuous'

## classification model evaluation metrics

1. Accuracy
2. Area under ROC curve
3. Confusion matrix
4. Classification report

**Accuracy**

In [None]:
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.ensemble import RandomForestClassifier

np.random.seed(69)

x = heart_disease.drop("target", axis=1)
y = heart_disease["target"]

clf = RandomForestClassifier(n_estimators=100)
cross_val_score = cross_val_score(clf, x, y, cv=5)

In [44]:
print(f"Heart disease classifier accuracy: {np.mean(cross_val_score) *100 : .2f}%")

TypeError: unsupported operand type(s) for /: 'function' and 'int'

**Area under receiver operating characteristic curve(AUC/ROC)**

* Area under curve
* Receiver operating characteristic

ROC curves are a comparison of a model's true positive rate(tpr) versus false postive rate(fpr).

* True postive = model predicts 1 when truth is 1
* False positive = model predicts 1 when truth is 0
* True negative = model predicts 0 when truth is 0
* False negative = model predicts 0 when truth is 1

In [45]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

In [46]:
from sklearn.metrics import roc_curve

clf.fit(x_train, y_train)
y_probs = clf.predict_proba(x_test)

y_probs[:10]

ValueError: Unknown label type: 'continuous'

In [47]:
y_probs_positive = y_probs[:, 1]
y_probs_positive[:10]

NameError: name 'y_probs' is not defined

In [None]:
#calculate fpr, tpr and threshold
fpr, tpr, thresholds = roc_curve(y_test, y_probs_positive)

In [48]:
#create plot
import matplotlib.pyplot as plt 

def plot_roc_curve(fpr, tpr):
    """
    Plots the roc curve given the false positive rate(fpr)
    and the true positive rate(tpr).
    """
    plt.plot(fpr, tpr, color="orange", label="ROC")
    #plot line with no predictive power(baseline)
    plt.plot([0, 1], [0, 1], color="darkblue", linestyle="--", label="Guessing")
    
    #customize
    plt.xlabel("False positive rate(fpr)")
    plt.ylabel("True positive rate(tpr)")
    plt.title("Receiver operating characteristics (ROC)")
    plt.legend()
    plt.show()
    
plot_roc_curve(fpr, tpr)

NameError: name 'fpr' is not defined

In [None]:
from sklearn.metrics import roc_auc_score
roc_auc_score(y_test, y_probs_positive)

**Confusion matrix**

A confusion matrix is a quick way to compare the labels a model predicts and the actual labels it was supposed to predict. 

In [None]:
from sklearn.metrics import confusion_matrix

y_preds = clf.predict(x_test)
confusion_matrix(y_test, y_preds)

In [49]:
pd.crosstab(y_test, y_preds, rownames=["Actual labels"], colnames=["Predicted labels"])

NameError: name 'y_preds' is not defined

In [None]:
import sys
!conda install --yes --prefix {sys.prefix} seaborn

In [50]:
import seaborn as sea

sea.set(font_scale=1.5)
sea.heatmap(confusion_matrix(y_test, y_preds))

NameError: name 'y_preds' is not defined

In [None]:
def plot_conf_mat(conf_mat):
    """
    Plots seaborn heatmap fpr conf mat
    """
    fig, ax = plt.subplots(figsize=(3,3))
    ax = sea.heatmap(conf_mat, annot=True, cbar=False)
    
    plt.xlabel("True labels")
    plt.ylabel("Predicted labels")
    
    #to fix the annotation bug
    bottom, top = ax.get_ylim()
    ax.set_ylim(bottom + 0.5, top - 0.5)

plot_conf_mat(confusion_matrix(y_test, y_preds))

In [None]:
from sklearn.metrics import classification_report
print(classification_report(y_test, y_preds))

## to summarize classification metrics

* **Accuracy** is a good measure to start with all classes are balanced(same amount of samples which are balanced 0 or 1)
* **Precision** and **recall** become more important when classes are imbalanced
* if false positive predictions are worse than false negative, aim for higher precision
* if false negative predictions are worse than false positive, aim for higher recall
* **F1-score** is a combination of precision and recall

## Randomized search tuning hyperparameters

In [51]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier

#parameter we can change
grid = {"n_estimators": [10, 100, 200, 500, 1000, 1200],
       "max_depth": [None, 5, 10, 20, 30],
       "max_features": ["auto", "sqrt"],
       "min_samples_split": [2, 4, 6],
       "min_samples_leaf": [1, 2, 4]}

np.random.seed(69)

heart_disease_shuffled = heart_disease.sample(frac=1)


NameError: name 'heart_disease' is not defined

In [52]:
#splitting into x and y
x = heart_disease_shuffled.drop("target", axis=1)
y = heart_disease_shuffled["target"]

from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

clf = RandomForestClassifier(n_jobs=1)

#setup randomizedsearchcv
rs_clf = RandomizedSearchCV(estimator=clf, param_distributions=grid, n_iter=10 ,#noof models to try
                           cv=5, verbose=2)

rs_clf.fit(x_train, y_train);

NameError: name 'heart_disease_shuffled' is not defined

In [127]:
rs_clf.best_params_

{'n_estimators': 1000,
 'min_samples_split': 4,
 'min_samples_leaf': 1,
 'max_features': 'sqrt',
 'max_depth': 5}

In [128]:
rs_y_preds = rs_clf.predict(x_test)

In [129]:
rs_clf.score(x_test, y_test)

0.819672131147541

In [130]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [131]:
def evalute_preds(y_true, y_preds):
    """
    Performs evaluation comparison on y_true labels vs y_preds labels on classification
    """
    accuracy = accuracy_score(y_true, y_preds)
    precision = precision_score(y_true, y_preds)
    recall = recall_score(y_true, y_preds)
    f1 = f1_score(y_true, y_preds)
    
    print(f"Accuracy: {accuracy * 100 :.2f}%")
    print(f"Precision: {precision:.2f}")
    print(f"Recall: {recall:.2f}")
    print(f"F1: {f1:.2f}")
    

In [132]:
evalute_preds(y_test, rs_y_preds)

Accuracy: 81.97%
Precision: 0.78
Recall: 0.94
F1: 0.85


## PUTTING ALL OF IT TOGETHER

In [134]:
data = pd.read_csv("data/car-sales-extended-missing-data.csv")
data.head()

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Honda,White,35431.0,4.0,15323.0
1,BMW,Blue,192714.0,5.0,19943.0
2,Honda,White,84714.0,4.0,28343.0
3,Toyota,White,154365.0,4.0,13434.0
4,Nissan,Blue,181577.0,3.0,14043.0


In [135]:
data.isna().sum()

Make             49
Colour           50
Odometer (KM)    50
Doors            50
Price            50
dtype: int64

Steps to do:
1. fill missing data
2. convert data into numbers
3. build a model on the data

In [136]:
#getting data ready
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing  import OneHotEncoder

#modelling
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, GridSearchCV

#setup random seed
import numpy as np
np.random.seed(69)

data = pd.read_csv("data/car-sales-extended-missing-data.csv")
data.dropna(subset=["Price"], inplace=True)

#define different features and transfomer pipeline
cat_features = ["Make", "Colour"]
cat_transformer = Pipeline(steps=[
    ("imputer",SimpleImputer(strategy="constant",fill_value="missing")),
    ("onehot",OneHotEncoder(handle_unknown="ignore"))
])

door_features = ["Doors"]
door_transformer = Pipeline(steps=[
    ("imputer",SimpleImputer(strategy="constant", fill_value=4))
])

numeric_features = ["Odometer (KM)"]
numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="mean"))
])

#setup preprocessing (fill missing data and converting into numbers)
preprocessor = ColumnTransformer(
                transformers= [
                    ("cat", cat_transformer, cat_features),
                    ("door", door_transformer, door_features),
                    ("num", numeric_transformer, numeric_features)
                ]
)

#create preprocessing and modelling pipeline
model = Pipeline(steps=[
    ("preprocessing", preprocessor),
    ("model", RandomForestRegressor())
])

x = data.drop("Price", axis=1)
y = data["Price"]

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)
model.fit(x_train, y_train)
model.score(x_test, y_test)



0.26495483071670833

In [137]:
#using GridSearchCV
pipe_grid = {"preprocessing__num__imputer__strategy": ["mean", "median"],
           "model__n_estimators": [100, 1000],
           "model__max_depth": [None, 5],
           "model__max_features": ["auto"],
           "model__min_samples_split": [2, 4],
           }
gs_model = GridSearchCV(model, pipe_grid, cv=5, verbose=2)
gs_model.fit(x_train, y_train)

Fitting 5 folds for each of 16 candidates, totalling 80 fits
[CV] model__max_depth=None, model__max_features=auto, model__min_samples_split=2, model__n_estimators=100, preprocessing__num__imputer__strategy=mean 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  model__max_depth=None, model__max_features=auto, model__min_samples_split=2, model__n_estimators=100, preprocessing__num__imputer__strategy=mean, total=   0.3s
[CV] model__max_depth=None, model__max_features=auto, model__min_samples_split=2, model__n_estimators=100, preprocessing__num__imputer__strategy=mean 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.3s remaining:    0.0s


[CV]  model__max_depth=None, model__max_features=auto, model__min_samples_split=2, model__n_estimators=100, preprocessing__num__imputer__strategy=mean, total=   0.3s
[CV] model__max_depth=None, model__max_features=auto, model__min_samples_split=2, model__n_estimators=100, preprocessing__num__imputer__strategy=mean 
[CV]  model__max_depth=None, model__max_features=auto, model__min_samples_split=2, model__n_estimators=100, preprocessing__num__imputer__strategy=mean, total=   0.3s
[CV] model__max_depth=None, model__max_features=auto, model__min_samples_split=2, model__n_estimators=100, preprocessing__num__imputer__strategy=mean 
[CV]  model__max_depth=None, model__max_features=auto, model__min_samples_split=2, model__n_estimators=100, preprocessing__num__imputer__strategy=mean, total=   0.3s
[CV] model__max_depth=None, model__max_features=auto, model__min_samples_split=2, model__n_estimators=100, preprocessing__num__imputer__strategy=mean 
[CV]  model__max_depth=None, model__max_features=

[CV]  model__max_depth=None, model__max_features=auto, model__min_samples_split=4, model__n_estimators=100, preprocessing__num__imputer__strategy=median, total=   0.3s
[CV] model__max_depth=None, model__max_features=auto, model__min_samples_split=4, model__n_estimators=100, preprocessing__num__imputer__strategy=median 
[CV]  model__max_depth=None, model__max_features=auto, model__min_samples_split=4, model__n_estimators=100, preprocessing__num__imputer__strategy=median, total=   0.3s
[CV] model__max_depth=None, model__max_features=auto, model__min_samples_split=4, model__n_estimators=100, preprocessing__num__imputer__strategy=median 
[CV]  model__max_depth=None, model__max_features=auto, model__min_samples_split=4, model__n_estimators=100, preprocessing__num__imputer__strategy=median, total=   0.3s
[CV] model__max_depth=None, model__max_features=auto, model__min_samples_split=4, model__n_estimators=1000, preprocessing__num__imputer__strategy=mean 
[CV]  model__max_depth=None, model__ma

[CV]  model__max_depth=5, model__max_features=auto, model__min_samples_split=2, model__n_estimators=1000, preprocessing__num__imputer__strategy=mean, total=   1.9s
[CV] model__max_depth=5, model__max_features=auto, model__min_samples_split=2, model__n_estimators=1000, preprocessing__num__imputer__strategy=mean 
[CV]  model__max_depth=5, model__max_features=auto, model__min_samples_split=2, model__n_estimators=1000, preprocessing__num__imputer__strategy=mean, total=   2.0s
[CV] model__max_depth=5, model__max_features=auto, model__min_samples_split=2, model__n_estimators=1000, preprocessing__num__imputer__strategy=median 
[CV]  model__max_depth=5, model__max_features=auto, model__min_samples_split=2, model__n_estimators=1000, preprocessing__num__imputer__strategy=median, total=   2.0s
[CV] model__max_depth=5, model__max_features=auto, model__min_samples_split=2, model__n_estimators=1000, preprocessing__num__imputer__strategy=median 
[CV]  model__max_depth=5, model__max_features=auto, mod

[Parallel(n_jobs=1)]: Done  80 out of  80 | elapsed:  1.6min finished


GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('preprocessing',
                                        ColumnTransformer(n_jobs=None,
                                                          remainder='drop',
                                                          sparse_threshold=0.3,
                                                          transformer_weights=None,
                                                          transformers=[('cat',
                                                                         Pipeline(memory=None,
                                                                                  steps=[('imputer',
                                                                                          SimpleImputer(add_indicator=False,
                                                                                                        copy=True,
                   

In [139]:
gs_model.best_params_

{'model__max_depth': 5,
 'model__max_features': 'auto',
 'model__min_samples_split': 2,
 'model__n_estimators': 1000,
 'preprocessing__num__imputer__strategy': 'median'}

In [140]:
gs_model.score(x_test, y_test)

0.3275340646897247