## Kernel: `Fund-d3`

# Day 3 - Tutorial 2: Supervised Learning

The objective of this exercise is to learn how to apply supervised learning to predict an output variable from multi-dimensional data.

In [0]:
# Import required libraries

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from pandas.plotting import scatter_matrix

import dataiku
from dataiku import pandasutils as pdu

<font color="Red" size=4><b>Step 1 - Quick Look at the Data:</b></font>

In [0]:
# Read the well logs dataset ('w5') from Dataiku into a DataFrame and use 'DEPTH' as its index

mydataset = dataiku.Dataset("w5")
log_data = mydataset.get_dataframe().set_index('DEPTH')

# Last expression of the cell: render the indexed well-log table
log_data


In [0]:
# Display a summary of the dataset (column names, dtypes and non-null counts)

log_data.info()


In [0]:
# Compute summary statistics (count, mean, std, min, quartiles, max) of the variables in the dataset

log_data.describe()


In [0]:
# Compute and draw the histograms for all variables in the dataset

log_data.hist(bins=50, figsize=(8,8), grid=False, color='green', edgecolor = "black", lw=0.1)

# Render the figure explicitly (as the other plotting cells do) so the cell
# does not echo the array-of-Axes repr returned by DataFrame.hist
plt.show()



In [0]:
# Generate a correlation matrix (pairwise Pearson coefficients) of all variables in the dataset

log_data.corr()


In [0]:
# Plot a heat map of the Pearson correlation coefficients between all variables

pearson_corr = log_data.corr()

# annot=True writes each coefficient inside its cell
sns.heatmap(pearson_corr, annot=True)
plt.show()

In [0]:
# Correlation coefficients between the variable 'Pressure' and every variable in the dataset,
# listed from the strongest positive to the strongest negative correlation

pressure_corr = log_data.corr()["Pressure"]
pressure_corr.sort_values(ascending=False)

In [0]:
# Extract four variables and generate a scatter matrix plot.
# The column list has its own name so it is not confused with the
# 'features' DataFrame that Step 2 creates.

scatter_columns = ["Pressure", "AI", "VpVs", "phid"]

# diagonal='hist' draws each variable's histogram on the diagonal panels
scatter_matrix(log_data[scatter_columns], figsize=(10, 10), diagonal='hist',
               hist_kwds={'bins': 50, 'color': 'green'},
               marker='.', color='green')
plt.show()

<font color="Red" size=4><b>Step 2 - Create the Features and Target Dataframes:</b></font>

In [0]:
# Build the 'features' dataframe: all rows of log_data, restricted to the 'AI', 'VpVs' and 'phid' logs

features = log_data.loc[:, ["AI", "VpVs", "phid"]]

# Show the resulting predictor table
features

In [0]:
# Create a new dataframe that only contains the variable 'Pressure' (our target feature), call it 'target'

# Double brackets keep 'target' a one-column DataFrame rather than a Series
target = log_data[["Pressure"]]

target


In [0]:
# Randomly split 'features' and 'target' into train and test subsets [80:20 ratio]

from sklearn.model_selection import train_test_split

# test_size=0.2 -> 20% of the datapoints go to the test set;
# random_state=1 fixes the seed used for the random split
(features_train, features_test,
 target_train, target_test) = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=1,
)

In [0]:
# Let's explore the shape of the 'features_train' and 'features_test' dataframes

print ('features_train', features_train.shape)
print ('features_test', features_test.shape)



<font color="Red" size=4><b>Step 3 - Training and Testing a Random Forest Model</b></font>

In [0]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# Define the random forest model.
# A fixed random_state makes the fitted forest, and the metrics below, reproducible.

rf = RandomForestRegressor(random_state=42)


# Fit the model on the train dataset.
# ravel() passes the target as a 1-D array, matching how the SVR and search cells pass it.

rf.fit(features_train, target_train.values.ravel())


# Now, we can predict the 'target' data using the trained model

target_pred = rf.predict(features_test)


# Check the accuracy of predicted data by using MSE and RMSE metrics
# (RMSE is the square root of MSE, so it is in the same units as the target)

mse = mean_squared_error(target_test, target_pred)
rmse = np.sqrt(mse)

print ("MSE: ", mse)

print ("RMSE: ", rmse)


In [0]:
from sklearn.metrics import r2_score

# Coefficient of determination (r2 score) of the random forest predictions on the test subset

r2_score(y_true=target_test, y_pred=target_pred)


In [0]:
# Create a prediction error plot for the random forest model

from yellowbrick.regressor import prediction_error

# The estimator passed here is a new, unfitted forest: it is given the train arrays
# for fitting and the test arrays for the plot. It is seeded so the figure is
# reproducible, and the targets are raveled to 1-D arrays.
visualizer = prediction_error(RandomForestRegressor(random_state=42),
                              features_train.values, target_train.values.ravel(),
                              features_test.values, target_test.values.ravel(),
                              alpha=0.3, size=(400, 400))

# y = actual 'pressure'
# y^ = predicted 'pressure'

<font color="Red" size=4><b>Step 4 - Training and Testing a Support Vector Regression Model</b></font>

In [0]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.svm import SVR

# Standardize the data: create the scaler and fit it on the training features only
sc = StandardScaler()
sc.fit(features_train)

# Apply the fitted scaler to both subsets (standardization)
features_train_scaled = sc.transform(features_train)
features_test_scaled = sc.transform(features_test)

# Define the SVR model and fit it on the standardized train data; the target is flattened to 1-D
svm = SVR()
svm.fit(features_train_scaled, target_train.values.ravel())

# Predict the 'target' data for the standardized test features
target_pred_svm = svm.predict(features_test_scaled)


In [0]:
# Coefficient of determination (r2 score) of the SVR predictions on the test subset

r2_score(y_true=target_test, y_pred=target_pred_svm)

In [0]:
# Create a prediction error plot for the Support Vector Regression (SVR) Model

# Features are standardized with the scaler fitted on the train subset;
# both targets are passed as 1-D arrays so train and test have the same shape.
visualizer = prediction_error(SVR(),
                              sc.transform(features_train), target_train.values.ravel(),
                              sc.transform(features_test), target_test.values.ravel(),
                              alpha=0.3, size=(500, 400))


<font color="Red" size=4><b>Step 5 - Hyperparameter Tuning Using Grid Search</b></font>

In [0]:
# Hyperparameters of the random forest model to test with grid search:
#   n_estimators -> number of trees in the forest
#   max_features -> maximum number of features to consider when looking for the best split
# 3 values x 2 values = 6 combinations

param_grid = [
    dict(
        n_estimators=[3, 10, 30],
        max_features=[2, 3],
    )
]

In [0]:
from sklearn.model_selection import GridSearchCV

# Define the random forest regressor model.
# It is seeded (as in the randomized search of Step 6) so that the cross-validation
# scores, and therefore the selected hyperparameters, are reproducible.

forest_reg = RandomForestRegressor(random_state=42)


# Now let's create an object of GridSearchCV:
# 5-fold cross-validation, scored with r2, also recording the train scores

grid_search = GridSearchCV(forest_reg, param_grid, cv=5, scoring='r2', return_train_score=True)


# Fit the data into the GridSearchCV object (target flattened to a 1-D array)

grid_search.fit(features_train, target_train.values.flatten())


# We can now extract the best parameters after tuning

print(grid_search.best_estimator_)


In [0]:
# Predict the 'target' data for the test features with the tuned grid-search model
target_opt_pred = grid_search.predict(features_test)

# r2 score of that model on the testing data
r2_score(y_true=target_test, y_pred=target_opt_pred)


In [0]:
# Tabulate the results of the hyperparameter tuning (grid search) as a dataframe.
# Expect 6 rows: 3 choices for 'n_estimators' times 2 choices for 'max_features'

grid_cv_results = grid_search.cv_results_
pd.DataFrame(grid_cv_results)

<font color="Red" size=4><b>Step 6 - Hyperparameter Tuning Using Randomized Search</b></font>

In [0]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint


# Generate a dictionary with the hyperparameter distributions of the random forest model to sample from.
# scipy's randint(low, high) excludes 'high': n_estimators is drawn from 1..199 and
# max_features from 1..3, so using all three features is a possible draw.

param_distribs = {'n_estimators': randint(1, 200),
                  'max_features': randint(1, 4)}


# Define the random forest regressor model
forest_reg = RandomForestRegressor(random_state=42)

# Note that random_state sets a seed (42) so that the fitted forests are reproducible


# Create an object of RandomizedSearchCV: 10 sampled combinations, 5-fold CV, scored with r2.
# Its own random_state makes the sampled combinations reproducible.
rnd_search = RandomizedSearchCV(forest_reg, param_distributions=param_distribs,
                                n_iter=10, cv=5, scoring='r2', random_state=42)

# Fit the data into the RandomizedSearchCV object (target flattened to a 1-D array)
rnd_search.fit(features_train, target_train.values.flatten())

# Extract the best parameters after tuning
print(rnd_search.best_estimator_)


In [0]:
# Predict the 'target' data for the test features with the tuned randomized-search model
target_opt_pred_rnd = rnd_search.predict(features_test)

# r2 score of that model on the testing data
r2_score(y_true=target_test, y_pred=target_opt_pred_rnd)


In [0]:
# Tabulate the results of the hyperparameter tuning (randomized search) as a dataframe

rnd_cv_results = rnd_search.cv_results_
pd.DataFrame(rnd_cv_results)


<font color="Red" size=4><b>Step 7 - Set Up a Machine Learning Pipeline</b></font>

In [0]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

# Define the following steps in the pipeline: deal with NaN, standardize, model

pipe = Pipeline([
    ('imputer', SimpleImputer(strategy="median")),  # replace NaN with the column median
    ('scaler', StandardScaler()),
    ('model', SVR())
])

## The pipeline can be used as any other estimator

# First, fit the pipeline to the dataset.
# The target is raveled to a 1-D array, as in the stand-alone SVR fit of Step 4.
pipe.fit(features_train, target_train.values.ravel())


# Now, we can predict the 'target' using the trained model
pred_pipe_svr = pipe.predict(features_test)


# Compute the r2 score to evaluate the performance of the model
r2_score(target_test, pred_pipe_svr)


<font color="Red" size=4><b>Step 8 - Set Up a ML Pipeline with Grid Search</b></font>

In [0]:
# Let's start by checking the list of available parameters

# Only the last expression of a cell is displayed, so print the parameter names explicitly
print(list(pipe.get_params().keys()))

# ...and then show the full mapping of parameter name -> current value
pipe.get_params()

In [0]:
# Parameters of the pipeline to test with grid search. The '<step>__<param>' keys address
# a parameter of a pipeline step; the 'scaler' key swaps the whole scaling step.
# In SVR, C is a regularisation parameter and epsilon defines a margin of tolerance (error sensitivity)

param_grid = [
    dict(
        model__C=[1, 3, 10],
        model__epsilon=[.1, .5, .8, 1],
        scaler=[StandardScaler(), MinMaxScaler()],
    ),
]

In [0]:
# Now let's create an object of GridSearchCV for the pipeline:
# 5-fold cross-validation, scored with r2, also recording the train scores

grid_search_pipe = GridSearchCV(pipe, param_grid, cv=5,
                                scoring='r2', return_train_score=True)


# Fit the data into the GridSearchCV object.
# The target is raveled to a 1-D array, as in the other fits that feed an SVR.

grid_search_pipe.fit(features_train, target_train.values.ravel())


# We can now extract the best parameters after tuning

print(grid_search_pipe.best_estimator_)


In [0]:
# Predict the 'target' for the test features with the best pipeline found by the grid search
pred_ = grid_search_pipe.predict(features_test)

# r2 score of the tuned pipeline on the testing data
r2_score(y_true=target_test, y_pred=pred_)