# Model interpretation
Example use case of model interpretation:
- Clustering (find important clusters)
- Classification (build model to predict cluster membership)
- Interpretation of clusters (identify variables associated with cluster membership)

Other model interpretation applications:
- Regression models
- Other classification models

**Dataset used**
- iof_data.csv

In [None]:
# Data links
# Every dataset sits behind the same Google Drive download endpoint, so only
# the per-file id is listed and the full URL is assembled from it.
_gdrive_prefix = "https://drive.google.com/uc?id="
_gdrive_file_ids = {
    'iof_data_1min_csv': "1_jYVXj7mt8Zzpjn8WGI111G-kWRTbfjU",
    'iof_data_1min_parq': "1j5SS136UzbSPu8TqG9RRUMi6-wWF9dzq",
    'mixingTank': "1b5Qn5LIa6KAE03Tq4yRVdhTyUmZLxRjt",
    'moons': "1a9zTkPEpuHGj6LzGzuLe-JSLg_4GJef4",
    'open_iof_20min': "15lkhdBfWnjlpgpEx4T2XcRApKr-dmBb0",
    'open_iof_cleaned': "1WVbJvYsGy-iKlsW4WaDZrKy_NhK2tJLW",
}
# Maps dataset name -> direct download URL.
data_url = {
    name: _gdrive_prefix + file_id
    for name, file_id in _gdrive_file_ids.items()
}

In [None]:
# Import packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
# Clustering
from sklearn.cluster import MiniBatchKMeans
# Classification
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
# Interpretation   
from sklearn.tree import plot_tree       
from sklearn.inspection import permutation_importance, PartialDependenceDisplay   

In [None]:
# Import data set
# Read the 'iof_data_1min_csv' file; its 'date' column is parsed as
# datetimes and becomes the DataFrame index.
date_col = 'date'
fname = data_url['iof_data_1min_csv']
df = pd.read_csv(
    fname,
    parse_dates=[date_col],
    index_col=date_col,
)

In [None]:
# Downsample
# Collapse the series to 3-hour medians, then discard every row that still
# contains a missing value.
df = df.resample('3h').median().dropna()

In [None]:
# Summary statistics
# Transposed so that each variable is a row and each statistic a column.
df.describe().transpose()

In [None]:
# Variable of interest: Product silica composition
# Plot the 24-hour medians of the silica column.
df.loc[:, "plant.filters.product.silica.comp"].resample('24h').median().plot()

In [None]:
# Prepare data
# Inputs are every column of df except the two product-composition columns.
input_keys = df.columns.tolist()
for target in (
    'plant.filters.product.silica.comp',
    'plant.filters.product.iron.comp',
):
    # list.remove raises ValueError if an expected column is absent.
    input_keys.remove(target)
# Work on a copy so that later changes to X never touch df.
X = df.loc[:, input_keys].copy()

In [None]:
# Scale input data
# Learn the scaling from the inputs, then overwrite X with the scaled values.
# NOTE: with scikit-learn's default output settings X is a NumPy array from
# here on, not a DataFrame.
sx = StandardScaler().fit(X)
X = sx.transform(X)

In [None]:
# Dimension reduction of input data
# Project the scaled inputs onto two principal components and keep the
# scores as columns of df.
pca = PCA(n_components=2)
T = pca.fit_transform(X)
df['PC1'], df['PC2'] = T[:, 0], T[:, 1]

In [None]:
# Clustering
# Mini-batch k-means on the two PCA scores; the resulting label of every row
# is stored in df['cluster'].
#
# The seed is pinned because k-means numbers its clusters arbitrarily: with
# no random_state, labels 0/1/2 can swap between runs, and the class names in
# the tree plot and the `target` of the partial dependence plot are hard-coded
# against those numbers.
# NOTE: re-check that hard-coded mapping once after adopting this seed, and
# again whenever the seed, n_clusters or the input data change.
n_clusters = 3
c = MiniBatchKMeans(n_clusters=n_clusters, random_state=0)
c.fit(T)
df['cluster'] = c.labels_

In [None]:
# Cluster visualization
# Scatter of the two PCA scores, coloured by assigned cluster.
fig, ax = plt.subplots()
sns.scatterplot(
    x='PC1',
    y='PC2',
    hue='cluster',
    palette='viridis',
    data=df,
    ax=ax,
)

In [None]:
# Cluster statistics
# Per-cluster mean of every column, shown with variables as rows and
# clusters as columns.
df.groupby('cluster').mean().transpose()

In [None]:
# Cluster dataframe
# Independent copy of the cluster labels, used as the classification target.
Y = df.loc[:, 'cluster'].copy()

In [None]:
# Train logistic regression model
# fit() returns the estimator itself, so construction and training chain;
# the bare name on the last line keeps the fitted model as the cell output.
lr = LogisticRegression().fit(X, Y)
lr

In [None]:
# Train decision tree model
# Depth is capped at 3 levels; the bare name on the last line keeps the
# fitted model as the cell output.
t = DecisionTreeClassifier(max_depth=3).fit(X, Y)
t

In [None]:
# Train random forest model
# Forest of n_trees trees; the bare name on the last line keeps the fitted
# model as the cell output.
n_trees = 50
rf = RandomForestClassifier(n_estimators=n_trees).fit(X, Y)
rf

In [None]:
# Predict output variable
# Each fitted model predicts on the same inputs X it was trained on.
Ypred_lr, Ypred_t, Ypred_rf = (
    model.predict(X) for model in (lr, t, rf)
)

In [None]:
# Inspect model performance
# - Accuracy
# NOTE: the predictions were made on the same X/Y the models were fitted on,
# so these are training accuracies, not estimates of out-of-sample performance.
acc_lr = accuracy_score(Y,Ypred_lr)
acc_t = accuracy_score(Y,Ypred_t)
acc_rf = accuracy_score(Y,Ypred_rf)

# Report each model's own score; all three lines used to print acc_lr, so the
# tree and random forest results were never shown.
print(f'Accuracy of logistic regression: {acc_lr:.4f}')
print(f'Accuracy of tree: {acc_t:.4f}')
print(f'Accuracy of random forest: {acc_rf:.4f}')


In [None]:
# Logistic regression coefficients
# One row per input variable, one column per row of lr.coef_.
lr_coef = pd.DataFrame(
    lr.coef_.T,
    index=pd.Index(input_keys, name='Coefficient'),
)
lr_coef

In [None]:
# Decision tree visualization
fig, ax = plt.subplots(figsize=(12, 12))
# class_names: cluster labels, in ascending numerical order - UPDATE on each
# reclustering!
_ = plot_tree(
    t,
    ax=ax,  # the axes created above, which is also the current axes
    fontsize=6,
    filled=True,
    class_names=["low silica", "high silica", "medium silica"],
    feature_names=input_keys,
)

In [None]:
# Random forest variable importance
# Permutation importance with 10 repeats per variable; bars show the mean
# importance and error bars its standard deviation across the repeats.
rfVIresult = permutation_importance(rf, X, Y, n_repeats=10)
rfVImean = pd.Series(rfVIresult.importances_mean, index=input_keys)
fig, ax = plt.subplots()
rfVImean.plot.bar(ax=ax, yerr=rfVIresult.importances_std)
ax.set_title('Permutation variable importance for random forest')
ax.set_ylabel('Mean accuracy decrease')

In [None]:
# Random forest partial dependence
x1 = 'plant.flotation.bank01.column05.pulp.level'
x2 = 'plant.flotation.sump01.amina.flow'
fig, ax = plt.subplots(figsize=(15, 5))
PartialDependenceDisplay.from_estimator(
    rf,
    X,
    [x1, [x1, x2]],  # x1 on its own, then the (x1, x2) pair
    feature_names=input_keys,  # lets the names above be looked up in X
    target=0,  # class label to display partial dependence for - UPDATE on each reclustering!
    ax=ax,
)
fig.suptitle('Random forest: Partial Dependence')