In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import SimpleImputer
import numpy as np
import seaborn as sns
from scipy.stats import zscore

from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.dummy import DummyRegressor
from sklearn.neural_network import MLPRegressor

from sklearn.metrics import median_absolute_error, mean_squared_error
from sklearn.metrics import mean_absolute_percentage_error, r2_score
from sklearn.metrics import max_error

from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV

from scipy import stats
import time

In [2]:
#leggere dataset dal file csv
data = pd.read_csv('MiningProcess_Flotation_Plant_Database.csv')
dataset = pd.DataFrame(data)
dataset.head()

Unnamed: 0,date,% Iron Feed,% Silica Feed,Starch Flow,Amina Flow,Ore Pulp Flow,Ore Pulp pH,Ore Pulp Density,Flotation Column 01 Air Flow,Flotation Column 02 Air Flow,...,Flotation Column 07 Air Flow,Flotation Column 01 Level,Flotation Column 02 Level,Flotation Column 03 Level,Flotation Column 04 Level,Flotation Column 05 Level,Flotation Column 06 Level,Flotation Column 07 Level,% Iron Concentrate,% Silica Concentrate
0,2017-03-10 01:00:00,552,1698,301953,557434,395713,100664,174,249214,253235,...,250884,457396,432962,424954,443558,502255,44637,523344,6691,131
1,2017-03-10 01:00:00,552,1698,302441,563965,397383,100672,174,249719,250532,...,248994,451891,42956,432939,448086,496363,445922,498075,6691,131
2,2017-03-10 01:00:00,552,1698,304346,568054,399668,10068,174,249741,247874,...,248071,45124,468927,43461,449688,484411,447826,458567,6691,131
3,2017-03-10 01:00:00,552,1698,304736,568665,397939,100689,174,249917,254487,...,251147,452441,458165,442865,44621,471411,43769,427669,6691,131
4,2017-03-10 01:00:00,552,1698,303369,558167,400254,100697,174,250203,252136,...,248928,452441,4529,450523,45367,462598,443682,425679,6691,131


In [6]:
#suddivisione tra colonne numeriche e categoriche

numerical_columns = [
    'date',
    '% Iron Feed',
    '% Silica Feed',
    'Starch Flow',
    'Amina Flow',
    'Ore Pulp Flow',
    'Ore Pulp pH',
    'Ore Pulp Density',
    'Flotation Column 01 Air Flow',
    'Flotation Column 02 Air Flow',
    'Flotation Column 07 Air Flow',
    'Flotation Column 01 Level',
    'Flotation Column 02 Level',
    'Flotation Column 03 Level',
    'Flotation Column 04 Level',
    'Flotation Column 05 Level',
    'Flotation Column 06 Level',
    'Flotation Column 07 Level',
    '% Iron Concentrate',
    '% Silica Concentrate'
]

In [7]:
#Verifico se nel dataset sono presenti dei valori nulli
print(dataset.isnull().sum()) 
#non ci sono valori nulli

date                            0
% Iron Feed                     0
% Silica Feed                   0
Starch Flow                     0
Amina Flow                      0
Ore Pulp Flow                   0
Ore Pulp pH                     0
Ore Pulp Density                0
Flotation Column 01 Air Flow    0
Flotation Column 02 Air Flow    0
Flotation Column 03 Air Flow    0
Flotation Column 04 Air Flow    0
Flotation Column 05 Air Flow    0
Flotation Column 06 Air Flow    0
Flotation Column 07 Air Flow    0
Flotation Column 01 Level       0
Flotation Column 02 Level       0
Flotation Column 03 Level       0
Flotation Column 04 Level       0
Flotation Column 05 Level       0
Flotation Column 06 Level       0
Flotation Column 07 Level       0
% Iron Concentrate              0
% Silica Concentrate            0
dtype: int64


In [8]:
#Controllo i valori duplicati
dataset.duplicated().sum()

dataset = dataset.drop_duplicates()

dataset.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 736282 entries, 0 to 737452
Data columns (total 24 columns):
 #   Column                        Non-Null Count   Dtype 
---  ------                        --------------   ----- 
 0   date                          736282 non-null  object
 1   % Iron Feed                   736282 non-null  object
 2   % Silica Feed                 736282 non-null  object
 3   Starch Flow                   736282 non-null  object
 4   Amina Flow                    736282 non-null  object
 5   Ore Pulp Flow                 736282 non-null  object
 6   Ore Pulp pH                   736282 non-null  object
 7   Ore Pulp Density              736282 non-null  object
 8   Flotation Column 01 Air Flow  736282 non-null  object
 9   Flotation Column 02 Air Flow  736282 non-null  object
 10  Flotation Column 03 Air Flow  736282 non-null  object
 11  Flotation Column 04 Air Flow  736282 non-null  object
 12  Flotation Column 05 Air Flow  736282 non-null  object
 13 

In [None]:
rf = SelectFromModel(RandomForestRegressor(criterion="absolute_error", random_state=SEED), threshold = 0.03)#threshold = 0.03
rf.fit(X_train, Y_train)

In [None]:
#Restituirà una matrice di valori booleani. 
#Vero per le caratteristiche la cui importanza è maggiore dell'importanza media e Falso per il resto.

rf.get_support()
selected_feat = X_train.columns[(rf.get_support())]
len(selected_feat) #4 features selected
print(selected_feat)

In [None]:
rf.estimator_.feature_importances_
rf.threshold_

In [None]:
importances = rf.estimator_.feature_importances_
# Ordina l'importanza della funzione in ordine decrescente
sorted_indices = np.argsort(importances)[::-1]
feat_labels = X_train.columns

for f in range(X_train.shape[1]):
    print("%2d) %-*s %f" % (f + 1, 30,
                            feat_labels[sorted_indices[f]],
                            importances[sorted_indices[f]]))

In [None]:
#PLOT FEATURE IMPORTANCE
title = 'Feature Importance for Random Forest'
plot_features_importance(sorted_indices, importances, title)

In [None]:
#trasformiamo il dataset solo con le colonne scelte dalla feture selection per 
#RANDOM FOREST REGRESSION
#X_train.info()
selected_x_train = rf.transform(X_train)
X_selected_RF_train = pd.DataFrame(selected_x_train)
#X_selected_RF_train.info()

#y_train.info()
selected_x_test = rf.transform(x_test)
X_selected_RF_test = pd.DataFrame(selected_x_test)
#X_selected_RF_test.info()

In [None]:
#optimization RandomForestRegressor, iperparametri (min_samples_split, min_samples_leaf) con RandomsearchCV
#min_samples_split, max_leaf_nodes, max_depth and min_samples_leaf
# Model to optimize 
reg = RandomForestRegressor(criterion="absolute_error", random_state=SEED)
# Hyperparameters of the model
hpo_space = {
    'min_samples_split': list(range(10, 50)),
    'min_samples_leaf': list(range(10, 50)),
    'max_depth': list(range(10, 15))
}

# Random Search
rand_search = RandomizedSearchCV(
    estimator=reg,
    param_distributions=hpo_space,
    n_iter=200,
    scoring='neg_mean_absolute_error',
    refit=False,
    cv=10,
    random_state=12345)

rand_search.fit(X_selected_RF_train, Y_train)

In [None]:
# Best params
print(f"Best min_sample_split = {rand_search.best_params_['min_samples_split']:.2f}") #min_sample_split = 13
print(f"Best min_sample_leaf = {rand_search.best_params_['min_samples_leaf']:.2f}") #min_sample_leaf = 28
print(f"Best max_depth = {rand_search.best_params_['max_depth']:.2f}")#max_depth = 4
print(f"Best score = {-rand_search.best_score_:.5f}")

'''
Best min_sample_split = 22.00
Best min_sample_leaf = 11.00
Best max_depth = 11.00
Best score = 0.49564


'''

In [None]:
#testiamo il RF con parametri di default

rf = RandomForestRegressor(criterion="absolute_error", random_state=SEED)
time_start = time.time()
rf.fit(X_selected_RF_train, Y_train)
pred = list(rf.predict(X_selected_RF_test))
time_finish = time.time()
evaluate(pred, y_test, 'Random Forest Default')

print("--- %s seconds ---" % (time_finish - time_start))
'''
Random Forest Default:
	- MAE: 0.40000
	- MAPE: 0.09263
	- MSE: 0.40044
	- R2: 0.35905
	- Max error: 2.58000
'''

In [None]:
#testiamo di nuovo le metriche del modello utilizzando i valori degli iperparametri appena trovati
rf = RandomForestRegressor(criterion="absolute_error", min_samples_split=19, min_samples_leaf=44, max_depth=12, random_state=SEED)
time_start = time.time()
rf.fit(X_selected_RF_train, Y_train)
pred = list(rf.predict(X_selected_RF_test))
time_finish = time.time()
evaluate(pred, y_test, 'Random Forest with tuning iperparametri')

print("--- %s seconds ---" % (time_finish - time_start))

'''
Random Forest with tuning iperparametri:
	- MAE: 0.41301
	- MAPE: 0.09221
	- MSE: 0.39022
	- R2: 0.37541
	- Max error: 2.58572
'''