## Kernel `Fund-d4`

# Day 4 - Tutorial 1: Predicting Production Data

The objective of this exercise is to learn how to apply supervised learning to predict an output variable from multi-dimensional data

In [0]:
# Import required libraries

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler

import dataiku
from dataiku import pandasutils as pdu


## Step 1: Data Preparation

In [0]:
# Load the "well_data_raw" Dataiku dataset and materialize it as a DataFrame

mydataset = dataiku.Dataset("well_data_raw")
well_data = mydataset.get_dataframe()

In [0]:
# Explore the content of the well_data dataframe



In [0]:
# Map view of the wells: position is (longitude, latitude), marker size follows
# 'cum_12_gas_prod' and marker colour follows the total depth ('tvd')

well_data.plot.scatter(
    x="longitude",
    y="latitude",
    s=well_data["cum_12_gas_prod"] / 70,  # scale production down to a usable marker size
    c="tvd",
    cmap=plt.get_cmap("jet"),
    vmin=3100,
    vmax=3900,  # colour scale limited to the 3100-3900 range of 'tvd'
    alpha=.4,
    linewidth=2,
    figsize=(7, 5),
)


In [0]:
# Missing-value matrix: shows where the gaps are located in each column of 'well_data'

import missingno as msno

msno.matrix(well_data)

In [0]:
# Bar plot of 'well_data' by column, a second way to see how many values are missing in each one

msno.bar(well_data)

In [0]:
# Build 'well_data_clean' from a deep copy of 'well_data':
#
#   1. Use 'well_id' as the index
#   2. Discard the rows whose 'fluid' contains "Oil" (rows with a missing 'fluid' are discarded too)
#   3. Drop every row that still contains a missing value
#   4. Drop the 'fluid' column
#   5. Add 'target_bin': 'cum_12_gas_prod' split into 5 quantile bins labelled "a" to "e",
#      stored with the object data type

well_data_clean = (
    well_data
    .copy(deep=True)
    .set_index("well_id")
    .loc[lambda frame: ~frame["fluid"].str.contains("Oil", na=True)]
    .dropna()
    .drop(columns=["fluid"])
    .assign(
        target_bin=lambda frame: pd.qcut(
            frame["cum_12_gas_prod"], 5, labels=["a", "b", "c", "d", "e"]
        ).astype(object)
    )
)


# Let's print the new dataset



In [0]:
# Let's print the count of values in each bin ("target_bin")



## Step 2: Data Standardization

In [0]:
# Names of all numeric columns in 'well_data_clean'

numeric_cols = well_data_clean.select_dtypes(include='number').columns


# Numeric-only subset that is handed to the scaler

slice_2_scale = well_data_clean[numeric_cols]


# Overwrite the numeric columns in place with their standardized values;
# the non-numerical column ('target_bin') is left untouched

well_data_clean[numeric_cols] = StandardScaler().fit_transform(slice_2_scale)

In [0]:
# Display the standardized dataset 'well_data_clean'



## Step 3: Reshape the DataFrame

In [0]:
# Change the DataFrame format from wide to long using pd.melt():
# 'target_bin' is kept as the identifier column and every other column is stacked
# into 'variable' / 'value' pairs.
#
# The value columns are built with a comprehension because list.remove() works in place
# and returns None, so it cannot be used inline to produce the list.

value_columns = [col for col in well_data_clean.columns if col != "target_bin"]

well_data_melt = pd.melt(well_data_clean, id_vars="target_bin", value_vars=value_columns)


# Visualize the melted DataFrame. Analyze the content of rows and columns



## Step 4: Statistical Graphics Plotting

In [0]:
import seaborn as sns

# Violin plot: one violin per entry of 'variable', showing the distribution of 'value'
# in the melted DataFrame ('well_data_melt')

plt.figure(figsize=(15, 6))
plt.xticks(rotation=90)
sns.set(style="whitegrid", font_scale=1)
sns.violinplot(x="variable", y="value", data=well_data_melt)


In [0]:
# Swarm plot of 'value' per 'variable'; the point colour is taken from 'target_bin'

plt.figure(figsize=(35, 15))
plt.xticks(rotation=90)
sns.set(style="whitegrid", font_scale=1)
sns.swarmplot(x="variable", y="value", hue="target_bin", size=5, data=well_data_melt)

## Step 5:  Dimensionality Reduction

In [0]:
# Define a list of variables to be used as features

features = ['fluid_per_stimulated_length', 'proppant_fluid_ratio',
           'proppant_per_stage', 'proppant_per_stimulated_length', 'stage_length',
           'stimulated_length', 'latitude', 'longitude', 'tvd','month_on_production']


# Define the gas production as the target variable 'cum_12_gas_prod', call it 'target'
# (later cells index the DataFrames with this name, so it must exist for the notebook to run top to bottom)

target = 'cum_12_gas_prod'


In [0]:
import umap

# UMAP = Uniform Manifold Approximation and Projection

# Reducer with a fixed random seed

reducer = umap.UMAP(random_state=42)


# Standardize the feature columns, then fit the reducer on them and keep the embedding it returns

features_scaled = StandardScaler().fit_transform(well_data_clean[features])
embedding_umap = reducer.fit_transform(features_scaled)


In [0]:
# Scatter plot of the first two UMAP dimensions, coloured by the target column

plt.figure(figsize=(15, 8))
plt.subplot(121)  # first panel of a 1x2 grid; this cell draws no second panel
plt.scatter(
    embedding_umap[:, 0],
    embedding_umap[:, 1],
    c=well_data_clean[target],
    cmap="prism",
    s=100,
    marker="o",
    edgecolor="k",
    alpha=1,
)

plt.title("UMAP", fontsize="large")
plt.xlabel("UMAP Dimension 1")
plt.ylabel("UMAP Dimension 2")


In [0]:
# Below are additional tools for visualizing high-dimensional data:

#from sklearn.manifold import TSNE
#from phate import PHATE

#embedding_tsne = TSNE().fit_transform(StandardScaler().fit_transform(well_data_clean[features]))

#embedding_phate = PHATE(random_state=42).fit_transform(StandardScaler().fit_transform(well_data_clean[features]))

## Step 6:  Cross Validation for ML Models

Cross-validation is a statistical method used for assessing the effectiveness of machine learning models

In [0]:
from sklearn.model_selection import train_test_split

# Hold out 30 % of the rows of the full dataset ('well_data') for testing; train on the other 70 %
# NOTE(review): this uses the raw 'well_data', while other cells in this notebook fit on
# 'well_data_clean' - confirm that the uncleaned frame is what is wanted here

features_train, features_test, target_train, target_test = train_test_split(
    well_data[features],
    well_data[target],
    test_size=0.3,
    random_state=42,
)

In [0]:
from lightgbm import LGBMRegressor
from yellowbrick.model_selection import CVScores
from sklearn.model_selection import KFold

# Cross-validation strategy: split the data into k=5 folds

cv = KFold(n_splits=5)


# LightGBM regressor wrapped in a CVScores visualizer that scores with r2

class_model = LGBMRegressor()
visualizer = CVScores(class_model, cv=cv, scoring='r2', size=(500, 300))


# Fit the visualizer on the clean dataset and render the plot

visualizer.fit(well_data_clean[features], well_data_clean[target])
visualizer.show()

In [0]:
from yellowbrick.regressor import PredictionError

# Prediction error plot for a LightGBM regressor:
# fit on the training split, score on the test split

'LGMB : Light Gradient Boosting Machine'

visualizer = PredictionError(LGBMRegressor(), size=(500, 500))
visualizer.fit(features_train, target_train)
visualizer.score(features_test, target_test)
visualizer.show()

# Legend for the plot axes (only the last bare string is echoed as the cell output)
'y = Actual Target Value'
'ŷ = Predicted Target value'

In [0]:
from xgboost import XGBRegressor

# Prediction error plot for an XGBoost regressor:
# fit on the training split, score on the test split

'XGBoost: Extreme Gradient Boosting'

visualizer = PredictionError(XGBRegressor(), size=(500, 500))
visualizer.fit(features_train, target_train)
visualizer.score(features_test, target_test)
visualizer.show()


# Legend for the plot axes (only the last bare string is echoed as the cell output)
'y = Actual Target Value'
'ŷ = Predicted Target value'

In [0]:
from yellowbrick.regressor import ResidualsPlot

# Residuals plot for a LightGBM regressor:
# fit on the training split, score on the test split

visualizer = ResidualsPlot(LGBMRegressor(), size=(600, 400))
visualizer.fit(features_train, target_train)
visualizer.score(features_test, target_test)
visualizer.show()

In [0]:
from yellowbrick.model_selection import LearningCurve

# 50 evenly spaced values between 0.1 and 1.0, passed to the visualizer as the training sizes

sizes = np.linspace(0.1, 1.0, 50)


# Learning curve visualizer for a LightGBM regressor, scored with r2 using the 'cv' strategy

lc_viz = LearningCurve(
    LGBMRegressor(),
    train_sizes=sizes,
    scoring='r2',
    cv=cv,
    size=(500, 500),
)


# Fit the learning curve on the full well data and render it

lc_viz.fit(well_data[features], well_data[target])
lc_viz.show()


## Step 7: Feature Importances

Feature selection involves choosing the minimum set of features required to produce a valid model

In [0]:
from yellowbrick.model_selection.importances import FeatureImportances
from lightgbm import LGBMRegressor

# Title-cased copies of the feature names, used as display labels

labels = [name.title() for name in features]


# Feature importances visualizer for a LightGBM regressor (absolute, not relative, importances)

visualizer = FeatureImportances(
    LGBMRegressor(),
    labels=labels,
    relative=False,
    size=(500, 500),
    cv=cv,
)

# Fit on the training split and render the plot

visualizer.fit(features_train, target_train)

visualizer.show()

In [0]:
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold, train_test_split
from sklearn.feature_selection import RFECV
from xgboost import XGBRegressor

'RFECV: Recursive feature elimination with cross-validation '

# Split the clean dataset ('well_data_clean'): 30 % test, 70 % train

features_train, features_test, target_train, target_test = train_test_split(
    well_data_clean[features],
    well_data_clean[target],
    test_size=0.3,
    random_state=42,
)


# Estimator used inside the elimination loop: an XGBoost regressor

model = XGBRegressor()


# Cross-validation strategy with 3 folds (this replaces the earlier 'cv' object)

cv = KFold(n_splits=3)


# RFE configuration: remove one feature per step, score with r2, keep at least one feature

rfecv = RFECV(
    estimator=model,
    step=1,
    cv=cv,
    scoring='r2',
    min_features_to_select=1,
)


# Run the elimination on the training split

rfecv.fit(features_train, target_train)


# Number of features selected
print("Optimal number of features: ",  rfecv.n_features_)


# Boolean mask over the feature columns marking the selected ones
print("Selected features: ",  rfecv.support_)


In [0]:
# Examine the cross validation scores as more features are selected in the model (remember that we used step=1)
#
# 'grid_scores_' was deprecated in scikit-learn 1.0 and removed in 1.2; the mean score per
# feature count now lives in cv_results_["mean_test_score"]. The old attribute is only read
# as a fallback on releases that do not have 'cv_results_'.

rfecv_scores = (
    rfecv.cv_results_["mean_test_score"]
    if hasattr(rfecv, "cv_results_")
    else rfecv.grid_scores_
)
rfecv_scores

In [0]:
# Let's plot the number of features versus the cross-validation scores

import matplotlib.pyplot as plt

# 'grid_scores_' was deprecated in scikit-learn 1.0 and removed in 1.2; prefer the mean test
# score from 'cv_results_' and only fall back to the old attribute on older releases.
rfecv_scores = (
    rfecv.cv_results_["mean_test_score"]
    if hasattr(rfecv, "cv_results_")
    else rfecv.grid_scores_
)

plt.figure(figsize=(6, 4))
plt.title('Recursive Feature Elimination with Cross-Validation', fontsize=15, fontweight='bold', pad=20)
plt.xlabel('Number of features selected', fontsize=14, labelpad=10)
plt.ylabel('Cross validation score (r2)', fontsize=14, labelpad=10)
# x axis starts at 1 because the elimination ran with step=1 and min_features_to_select=1
plt.plot(range(1, len(rfecv_scores) + 1), rfecv_scores, linewidth=2)
plt.show()


In [0]:
# Feature importances of the features kept by the RFE model


# One row per selected feature, paired with the importance reported by the final fitted
# estimator, ordered from most to least important

dset = pd.DataFrame({
    'feature': features_train.columns[rfecv.support_],
    'importance': rfecv.estimator_.feature_importances_,
}).sort_values(by='importance', ascending=False)


# Horizontal bar chart of the importances

plt.figure(figsize=(8, 8))
plt.barh(y=dset['feature'], width=dset['importance'])
plt.title('RFECV - Feature Importances', fontsize=20, fontweight='bold', pad=20)
plt.xlabel('Importance', fontsize=16, labelpad=10)
plt.show()

## Step 8: Use SHAP Values to Explain How a ML Model Works

SHAP: SHapley Additive exPlanations

SHAP values interpret the impact of having a certain value for a given feature in comparison to the prediction we'd make if that feature took some baseline value

Positive SHAP value means positive impact on prediction and negative SHAP value means negative impact

In [0]:
import shap

# Fit an XGBoost regressor on the feature columns of 'well_data' against the target column

reg = XGBRegressor().fit(well_data[features], well_data[target])


# Build an explainer for the fitted model and evaluate it on the same feature columns

explainer = shap.Explainer(reg)
shap_values = explainer(well_data[features])


In [0]:
# Beeswarm plot of the SHAP values

shap.plots.beeswarm(shap_values)


# Note that features are also ordered by their effect on the prediction
# Each point represents a row from the dataset
# The colors represent the feature values, not to be confused with the SHAP values. If the value of a feature is high -> pink

In [0]:
# Generate a waterfall plot for the observation at positional index 200
# Uses the shap.plots namespace, like the beeswarm plot, instead of the top-level alias

i = 200
shap.plots.waterfall(shap_values[i])

# This plot shows the effect that each feature has on the prediction, for a given observation