In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
%matplotlib inline

# Sklearn end to end workflow,
 #1. Getting the data ready
 #2. Choose the right estimator/algorithm for our problems
 #3. Fit the model/algorithm and use it to take predictions on data
 #4. Evaluating a model
 #5. Improve a model
 #6. Save and load a trained model

# 1. Getting our data ready to be used with machine learning 
    Three main things we have to do:
        1. Split the data into features and labels (usually 'x' & 'y')
        2. Filling (called imputing) or disregarding missing values
        3. Converting non-numerical values to numerical values (also called feature encoding)

### 1.1 Make sure it is all numerical

In [None]:
car_sales = pd.read_csv('../input/scikitconf/car-sales-extended.csv')
car_sales.head()

In [None]:
# Load the variant of the car-sales data that contains missing values.
# The result is bound to `car_sales_missing`, the name every later cell reads.
car_sales_missing = pd.read_csv('../input/scikitconf/car-sales-extended-missing-data.csv')
car_sales_missing.head()

In [None]:
from sklearn.model_selection import train_test_split

# Split the data into features (X) and labels (y)
X = car_sales.drop('Price', axis=1)
y = car_sales['Price']

# Split into training and test sets (20% of the rows held out for testing).
# train_test_split returns X_train, X_test, y_train, y_test in that order, so the
# fourth target has to be y_test; unpacking into y_train twice would leave y_test
# undefined and overwrite the training labels with the test labels.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [None]:
# Build machine learning model
from sklearn.ensemble import RandomForestRegressor

model = RandomForestRegressor()

# NOTE(review): X presumably still holds the string columns that are one-hot
# encoded in the next cells, so this fit is expected to fail — confirm against
# the data. The error is caught and shown so the notebook still runs top to
# bottom while the failure stays visible.
try:
    model.fit(X_train, y_train)
    print(model.score(X_test, y_test))
except ValueError as err:
    print(f"Could not fit the model on the raw features: {err}")

In [None]:
# One-hot encoding turns the categorical features into numbers
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

one_hot = OneHotEncoder()
categorical_features = ['Make', 'Colour', 'Doors']

# Encode only the listed columns; remainder="passthrough" keeps the other columns
transformer = ColumnTransformer(
    [("one_hot", one_hot, categorical_features)],
    remainder="passthrough",
)

transformed_X = transformer.fit_transform(X)
transformed_X

In [None]:
pd.DataFrame(transformed_X)

In [None]:
dummies = pd.get_dummies(car_sales[['Make','Colour','Doors']])
dummies

In [None]:
# Fit the model

np.random.seed(42)
X_train, X_test, y_train, y_test = train_test_split(transformed_X,y, test_size=0.2)
model.fit(X_train,y_train)

In [None]:
model.score(X_test,y_test)

In [None]:
# What if there were missing values?
#   1. Fill them with some value (known as imputation).
#   2. Remove the samples with missing data altogether.

# Count the missing values in each column
car_sales_missing.isna().sum()

In [None]:
#Create X and Y
X = car_sales_missing.drop('Price',axis=1)
y = car_sales_missing['Price']

## Option 1: Fill missing data with Pandas


In [None]:
# Fill the missing values column by column. Each result is assigned back to the
# column rather than calling fillna(..., inplace=True) on the selected column:
# the chained in-place form works on an intermediate object, raises a
# FutureWarning in pandas 2.x and is not guaranteed to update the DataFrame.

# Fill the 'Make' column with an explicit 'missing' category
car_sales_missing['Make'] = car_sales_missing['Make'].fillna('missing')

# Fill the 'Colour' column with an explicit 'missing' category
car_sales_missing['Colour'] = car_sales_missing['Colour'].fillna('missing')

# Fill the 'Odometer (KM)' column with the column mean
car_sales_missing['Odometer (KM)'] = car_sales_missing['Odometer (KM)'].fillna(
    car_sales_missing['Odometer (KM)'].mean()
)

# Fill the 'Doors' column with 4
car_sales_missing['Doors'] = car_sales_missing['Doors'].fillna(4)

In [None]:
#Check out dataframe after imputation
car_sales_missing.isna().sum()

In [None]:
#Drop rows missing pricing values
car_sales_missing.dropna(inplace=True)

In [None]:
len(car_sales_missing)

In [None]:
X = car_sales_missing.drop('Price',axis=1)
y = car_sales_missing['Price']

In [None]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

# One-hot encode the categorical columns and pass the remaining columns through
categorical_features = ['Make', 'Colour', 'Doors']
one_hot = OneHotEncoder()
transformer = ColumnTransformer([("one_hot", one_hot, categorical_features)], remainder="passthrough")

# Transform the feature matrix X, not the whole DataFrame: car_sales_missing
# still contains the 'Price' label, and remainder="passthrough" would copy it
# into the features.
transformed_X = transformer.fit_transform(X)
transformed_X

### Option 2: Fill missing values with Scikit learn



In [None]:
car_sales_missing = pd.read_csv('../input/scikitconf/car-sales-extended-missing-data.csv')
car_sales_missing.head()

In [None]:
car_sales_missing.isna().sum()

In [None]:
# Drop the rows with no labels
car_sales_missing.dropna(subset=['Price'],inplace=True)
car_sales_missing.isna().sum()

In [None]:
# Split into X & Y
X = car_sales_missing.drop('Price',axis=1)
y = car_sales_missing['Price']

In [None]:
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

# Fill categorical values with 'missing', 'Doors' with 4 and numerical values with the mean.
# The categorical fill value is the plain string 'missing' (no stray leading comma).
cat_imputer = SimpleImputer(strategy='constant', fill_value='missing')
door_imputer = SimpleImputer(strategy='constant', fill_value=4)
num_imputer = SimpleImputer(strategy='mean')

# Define the columns each imputer is applied to
cat_features = ['Make', 'Colour']
door_features = ['Doors']
num_features = ['Odometer (KM)']

# Create an imputer (something that fills missing data), one entry per column group
imputer = ColumnTransformer([
    ('cat_imputer', cat_imputer, cat_features),
    ('door_imputer', door_imputer, door_features),
    ('num_imputer', num_imputer, num_features)
])

# Transform the data
filled_X = imputer.fit_transform(X)
filled_X

In [None]:
car_sales_filled = pd.DataFrame(filled_X,columns=['Make','Colour','Doors','Odometer (KM)'])

In [None]:
car_sales_filled.head()

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# One-hot encode the categorical columns of the imputed frame;
# columns not listed are passed through as they are
one_hot = OneHotEncoder()
categorical_features = ['Make', 'Colour', 'Doors']
transformer = ColumnTransformer(
    [("one_hot", one_hot, categorical_features)],
    remainder="passthrough",
)

transformed_X = transformer.fit_transform(car_sales_filled)
transformed_X

In [None]:
# Fit the model on the filled, encoded data
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

np.random.seed(42)

X_train, X_test, y_train, y_test = train_test_split(
    transformed_X, y, test_size=0.2
)

# fit() returns the estimator itself, so it can be chained onto the constructor
model = RandomForestRegressor(n_estimators=100).fit(X_train, y_train)
model.score(X_test, y_test)

## 2. Choosing the right estimator/algorithm for problem

    Scikit-learn uses "estimator" as another term for a machine learning model or algorithm.
    * Classification - prediction whether a sample is one or another
    * Regression - predicting a number
    
    Step 1 - check the Scikit-Learn machine learning map...

### 2.1 Picking a machine learning model for a regression model

In [None]:
# Load the Boston housing dataset bundled with scikit-learn.
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in 1.2,
# so this cell only runs on older scikit-learn versions — confirm the environment.
from sklearn.datasets import load_boston
boston = load_boston()
# The trailing ';' below suppresses the cell output
boston;

In [None]:
boston_df = pd.DataFrame(boston['data'],columns = boston['feature_names'])
boston_df['target'] = pd.Series(boston['target'])
boston_df.head()

In [None]:
# Let's try the Ridge regression model
from sklearn.linear_model import Ridge

# Seed NumPy's global random generator before splitting
np.random.seed(42)

# Create the data: every column except 'target' is a feature
y = boston_df['target']
X = boston_df.drop(columns='target')

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Instantiate and fit the Ridge model
model = Ridge().fit(X_train, y_train)

# Check the score of the Ridge model on the test data
model.score(X_test, y_test)

# How can the score be improved?
# What if Ridge was not working?

In [None]:
# The estimator used below is a regressor, so RandomForestRegressor is the class
# to import here (the cell previously imported RandomForestClassifier and only
# worked because an earlier cell had imported the regressor).
from sklearn.ensemble import RandomForestRegressor

np.random.seed(42)

# Create the data
X = boston_df.drop('target', axis=1)
y = boston_df['target']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Instantiate and fit the random forest regressor
rf = RandomForestRegressor(n_estimators=100)
rf.fit(X_train, y_train)

# Score the random forest on the test data
rf.score(X_test, y_test)

In [None]:
#Check the Ridge model again

model.score(X_test,y_test)

### 2.2 Choosing an estimator for a classification problem

In [None]:
heart_disease = pd.read_csv('../input/scikitconf/heart-disease.csv')
heart_disease.head()

### Consulting the map and it shows to try LinearSVC

In [None]:
# Bring in the LinearSVC estimator class
from sklearn.svm import LinearSVC

# Seed the global random generator
np.random.seed(42)

# Features and labels
y = heart_disease['target']
X = heart_disease.drop(columns='target')

# Train/test split with 20% of the rows held out
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Instantiate LinearSVC and fit it to the training data
clf = LinearSVC(max_iter=100000)
clf.fit(X_train, y_train)

# Evaluate LinearSVC on the test data
clf.score(X_test, y_test)

In [None]:
# Bring in the RandomForestClassifier estimator class
from sklearn.ensemble import RandomForestClassifier

# Seed the global random generator
np.random.seed(42)

# Features and labels
y = heart_disease['target']
X = heart_disease.drop(columns='target')

# Train/test split with 20% of the rows held out
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Instantiate the random forest classifier and fit it to the training data
rfc = RandomForestClassifier(n_estimators=100)
rfc.fit(X_train, y_train)

# Evaluate the RandomForestClassifier on the test data
rfc.score(X_test, y_test)

## 3. Fit the model/algorithm on our data and use it to make predictions

###      3.1 Fitting the model to the data

Different name for:

* 'X' = features, features variables, data

* 'y' = labels, targets, target variables

In [None]:
# RandomForestClassifier is the estimator fitted in this section
from sklearn.ensemble import RandomForestClassifier

np.random.seed(42)  # set up the random seed

# Make the data: 'target' is the label, every other column is a feature
y = heart_disease['target']
X = heart_disease.drop(columns='target')

# Split the data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2
)

# Instantiate the classifier, then fit it to the training data
rfc = RandomForestClassifier(n_estimators=100)
rfc.fit(X_train, y_train)

# Evaluate the classifier (use the patterns the model has learned)
rfc.score(X_test, y_test)

### 3.2 Make predictions using a machine learning model

    2 ways to make predictions:
        1. predict()
        2. predict_proba()



In [None]:
# Use a trained model to make predictions.
# Compare the predictions to the true labels to evaluate the model (3 ways to evaluate the result).
# NOTE(review): `clf` is the LinearSVC fitted earlier, not the RandomForestClassifier
# `rfc` fitted in the previous section — confirm which model is intended here.

y_preds = clf.predict(X_test)

# Fraction of predictions equal to the true labels
np.mean(y_preds==y_test)

In [None]:
clf.score(X_test,y_test)

In [None]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test,y_preds)

### Make predictions with predict_proba()


In [None]:
# predict_proba() returns probabilities of a classificaiton label

rfc.predict_proba(X_test[:5])

In [None]:
rfc.predict(X_test[:5])

In [None]:
X_test[:5]

In [None]:
heart_disease['target'].value_counts()

### 'predict()' can be also be used for regression model.

In [None]:
len(boston_df)

In [None]:
from sklearn.ensemble import RandomForestRegressor

np.random.seed(42)

# Create the data
X = boston_df.drop('target', axis=1)
y = boston_df['target']

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Instantiate and fit the model
rfr = RandomForestRegressor()
rfr.fit(X_train, y_train)

# Make predictions with the regressor fitted in this cell.
# (`model` is a different estimator left over from an earlier cell, so predicting
# with it would evaluate the wrong model.)
y_preds = rfr.predict(X_test)

In [None]:
y_preds[:10]

In [None]:
np.array(y_test[:10])

In [None]:
#Compare the predictions to the truth

from sklearn.metrics import mean_absolute_error
mean_absolute_error(y_test,y_preds)

## 4. Evaluating a machine learning model

Three ways to evaluate Scikit-Learn models/estimators:
1. Estimator `score` method
2. The 'scoring' parameter
3. Problem-specific metric functions.


### 4.1 Evaluate a model with the score method



In [None]:
from sklearn.ensemble import RandomForestClassifier

np.random.seed(42)

# Features and labels
y = heart_disease['target']
X = heart_disease.drop(columns='target')

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Instantiate and fit in one step (fit() returns the estimator)
rfc = RandomForestClassifier().fit(X_train, y_train)

# For the random forest classifier, score() is measured as mean accuracy
rfc.score(X_test, y_test)

In [None]:
### Now the same steps for the regression model
from sklearn.ensemble import RandomForestRegressor

np.random.seed(42)

# Create the data
y = boston_df['target']
X = boston_df.drop(columns='target')

# Split the data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2
)

# Instantiate and fit the model
rfr = RandomForestRegressor()
rfr.fit(X_train, y_train)



In [None]:
# The score function differs between models: regressors use the
# coefficient of determination (R^2)

rfr.score(X_test, y_test)

In [None]:
### 4.2 Evaluating a model using the scoring parameter
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

np.random.seed(42)

# Features and labels
y = heart_disease['target']
X = heart_disease.drop(columns='target')

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2
)

# Fit the classifier on the training split (';' hides the estimator repr)
clf = RandomForestClassifier(n_estimators=100)
clf.fit(X_train, y_train);

In [None]:
clf.score(X_test,y_test)

In [None]:
cross_val_score(clf,X,y,cv=5)

In [None]:
np.random.seed(42)

# Score on the single train/test split
clf_single_score = clf.score(X_test, y_test)

# Mean of the 5-fold cross-validation scores
clf_cross_val_score = cross_val_score(clf, X, y, cv=5).mean()

# Show the two scores side by side
(clf_single_score, clf_cross_val_score)


In [None]:
#Default scoring parameter of classifier = mean accuracy

In [None]:
#Scoring parameter set to None by default
cross_val_score(clf,X,y,cv=5,scoring=None)

### 4.2.1 Classification model evaluation metrics

1. Accuracy
2. Area under ROC curve
3. Confusion matrix
4. Classification report


Accuracy

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier

np.random.seed(42)

# Features and labels of the heart-disease data
X = heart_disease.drop("target",axis=1)
y = heart_disease['target']

# 5-fold cross-validation scores of a default RandomForestClassifier.
# NOTE(review): the result is bound to the name `cross_val_score`, which replaces
# the imported function with the array of scores. The following cells read the
# scores under this name, and the function is only available again once it is
# re-imported — consider a separate name such as `cv_scores`.
clf = RandomForestClassifier()
cross_val_score = cross_val_score(clf,X,y,cv=5)

In [None]:
np.mean(cross_val_score)

In [None]:
print(f"Heart Disease Classifier Cross-Validaiton Accuracy: {np.mean(cross_val_score)*100:.2f}%")

**Area under the receiver operating characteristic curve (AUC/ROC)**

* Area under curve (AUC)


* ROC curve


ROC curves are a comparison of a model's true positive rate (tpr) versus a model's false positive rate (fpr).

* True positive = model predicts 1 when truth is 1

* False positive = model predicts 1 when truth is 0

* True negative = model predicts 0 when truth is 0

* False negative = model predicts 0 when truth is 1


In [None]:
#Create X_test... etc

X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2)

In [None]:
from sklearn.metrics import roc_curve

# Fit the classifier
clf.fit(X_train,y_train)

# Make predictions with probabilities
y_probs = clf.predict_proba(X_test)

# Show the first 10 rows of probabilities and the number of rows
y_probs[:10], len(y_probs)

In [None]:
y_probs_positive = y_probs[:,1]

y_probs_positive[:10]

In [None]:
# Calculate fpr, tpr and thresholds

fpr, tpr, thresholds = roc_curve(y_test,y_probs_positive)

#check the false positive rates

fpr

In [None]:
#Create a function for plotting ROC curves

import matplotlib.pyplot as plt

def plot_roc_curve(fpr,tpr):
    """
    Draw a model's ROC curve.

    fpr holds the false positive rates (x axis) and tpr the true positive
    rates (y axis). A dashed diagonal is drawn as the "Guessing" baseline
    and the figure is shown immediately.
    """
    # The model's curve
    plt.plot(fpr, tpr, label='ROC', color='orange')

    # Baseline with no predictive power: the diagonal from (0, 0) to (1, 1)
    diagonal = [0, 1]
    plt.plot(diagonal, diagonal, label='Guessing', color='darkblue', linestyle='--')

    # Title, axis labels and legend
    plt.title("Receiver Operating Characteristic (ROC) Curve")
    plt.ylabel('True Positive rate (tpr)')
    plt.xlabel('False Positive rate (fpr)')
    plt.legend()
    plt.show()

plot_roc_curve(fpr,tpr)

In [None]:
from sklearn.metrics import roc_auc_score

roc_auc_score(y_test, y_probs_positive)

In [None]:
# Plot perfect ROC curve and AUC score 
fpr, tpr, thresholds = roc_curve(y_test,y_test)
plot_roc_curve(fpr,tpr)

In [None]:
# Perfect AUC score

roc_auc_score(y_test,y_test)

## Confusion Matrix

A confusion matrix is a quick way to compare the labels a model predicts and the actual label it was supposed to predict.

In essence,giving you an idea of where the model is getting confused.


In [None]:
from sklearn.metrics import confusion_matrix

y_preds = clf.predict(X_test)

confusion_matrix(y_test,y_preds)

In [None]:
# Visualize confusion matrix with pd.crosstab()

pd.crosstab(y_test,
           y_preds,
           rownames = ['Actural Labels'],
           colnames=['Predicted Labels'])

In [None]:
# Make the confusion matrix more visual with Seaborn's heatmap()
import seaborn as sns

# Set the font scale
sns.set(font_scale=1.5)

# Create the confusion matrix, then plot it with seaborn
conf_mat = confusion_matrix(y_test, y_preds)
sns.heatmap(conf_mat)



In [None]:
def plot_conf_mat(conf_mat):
    """
    Plot a confusion matrix with seaborn's heatmap() on a 3x3 figure,
    with the boxes annotated and the colour bar hidden.
    """
    # New figure; its axes become the current axes that heatmap() draws on
    plt.subplots(figsize=(3, 3))

    heatmap_options = {'annot': True, 'cbar': False}
    sns.heatmap(conf_mat, **heatmap_options)

    plt.ylabel('True Label')
    plt.xlabel('Predicted Label')
    
plot_conf_mat(conf_mat)

In [None]:
# NOTE(review): plot_confusion_matrix was deprecated in scikit-learn 1.0 and removed
# in 1.2 (its replacement is ConfusionMatrixDisplay.from_estimator) — confirm the
# scikit-learn version of the environment.
from sklearn.metrics import plot_confusion_matrix

# NOTE(review): X and y are the complete feature/label sets, which include the rows
# clf was fitted on — confirm whether X_test / y_test was intended.
plot_confusion_matrix(clf,X,y)


In [None]:
from sklearn.metrics import classification_report

print(classification_report(y_test,y_preds))

In [None]:
# Where precision and recall become valuable
n_cases = 10000

# The model predicts every case as 0
disease_preds = np.zeros(n_cases)

# Only one case is actually positive
disease_true = np.zeros(n_cases)
disease_true[0] = 1

report = classification_report(disease_true, disease_preds, output_dict=True)
pd.DataFrame(report)

To summarize classification metrics:
* Accuracy is a good measure to start with if all classes are balanced(e.g. same amount of samples which are labelled with 0 or 1)

* Precision and Recall become more important when classes are imbalanced.
* If false positive predictions are worse than false negatives, aim for higher precision.
* F1-score is a combination of precision and recall.

### 4.2.2 Regression model evaluation metrics

* Model evaluation https://scikit-learn.org/stable/modules/model_evaluation.html

1. R^2 or called Coefficient of determination.
2. Mean absolute error (MAE)
3. Mean squared error (MSE)

**R^2**

What R-squared does: it compares your model's predictions to the mean of the targets. Values can range from negative infinity (a very poor model) to 1. For example, if all your model does is predict the mean of the targets, its R^2 value would be 0. And if your model perfectly predicts a range of numbers, its R^2 value would be 1.


In [None]:
from sklearn.ensemble import RandomForestRegressor

np.random.seed(42)

# Features (kept under the lower-case name `x`) and target
y = boston_df['target']
x = boston_df.drop(columns="target")

X_train, X_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2
)

# Fit the regressor on the training split (';' hides the estimator repr)
model = RandomForestRegressor(n_estimators=100)
model.fit(X_train, y_train);

In [None]:
model.score(X_test,y_test)

In [None]:
from sklearn.metrics import r2_score

#Fill an array with y_test mean

y_test_mean = np.full(len(y_test),y_test.mean())

In [None]:
y_test_mean

In [None]:
r2_score(y_test,y_test_mean)

In [None]:
r2_score(y_test,y_test)

**Mean absolute error (MAE)**

MAE is the average of the absolute differences between predictions and actual values. It gives you an idea of how wrong your model's predictions are.

In [None]:
# Mean absolute error

from sklearn.metrics import mean_absolute_error

y_preds = model.predict(X_test)
mae = mean_absolute_error(y_test,y_preds)
mae

In [None]:
# Recompute the mean absolute error by hand from the individual differences
df = pd.DataFrame(data={"actural values": y_test,
                        "predicted values": y_preds})

df["differences"] = df['predicted values'] - df['actural values']

# Keep the hand-computed value under its own name: binding it to
# `mean_absolute_error` would overwrite the scikit-learn function of that name
# imported in an earlier cell.
mae_by_hand = df["differences"].abs().mean()
mae_by_hand

**Mean squared error**

In [None]:
from sklearn.metrics import mean_squared_error

y_preds = model.predict(X_test)
mse = mean_squared_error(y_test,y_preds)
mse

In [None]:
# MSE by hand: square every difference, then take the mean
Squared = df['differences'].pow(2)
Mean_Squared_Error = Squared.mean()
Mean_Squared_Error

### 4.2.3 Finally using the scoring parameter



In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier

np.random.seed(42)

# Bind the features to `X` (upper case): the cross-validation cells that follow
# pass `X`, so assigning to lower-case `x` would leave them running on whatever
# `X` an earlier cell happened to define.
X = heart_disease.drop('target', axis=1)
y = heart_disease['target']

clf = RandomForestClassifier(n_estimators=100)


In [None]:
np.random.seed(42)
cv_acc = cross_val_score(clf, X, y, cv=5,scoring=None)
cv_acc

In [None]:
#Cross_validated accuracy

print(f'The cross-validate accuracy is: {np.mean(cv_acc)*100:.2f}%')

In [None]:
np.random.seed(42)
cv_acc = cross_val_score(clf, X, y, cv=5,scoring='accuracy')
print(f'The cross-validate accuracy is: {np.mean(cv_acc)*100:.2f}%')

In [None]:
# Precision

cv_precision = cross_val_score(clf,X,y,cv=5,scoring='precision')
np.mean(cv_precision)

In [None]:
# Recall

cv_recall = cross_val_score(clf,X,y,cv=5, scoring = 'recall')
np.mean(cv_recall)

In [None]:
cv_f1 = cross_val_score(clf,X,y,cv=5,scoring='f1')
np.mean(cv_f1)

### How about our regression model?

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score

np.random.seed(42)

# Boston data: 'target' is the label, every other column is a feature
y = boston_df['target']
X = boston_df.drop(columns="target")

# Regressor to be cross-validated in the next cells
model = RandomForestRegressor(n_estimators=100)

In [None]:
np.random.seed(42)
cv_r2 = cross_val_score(model,X,y, cv=5,scoring=None)
cv_r2

In [None]:
np.random.seed(42)
cv_r2 = cross_val_score(model,X,y, cv=5,scoring='r2')
cv_r2

In [None]:
# Mean absolute error
cv_mae = cross_val_score(model, X, y, cv = 5, scoring = 'neg_mean_absolute_error')
cv_mae

In [None]:
# Mean squared error
cv_mse = cross_val_score(model, X,y, cv=5,scoring = 'neg_mean_squared_error')
np.mean(cv_mse)