In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, classification_report

In [2]:
minData = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/Projects/MiningProcess_Flotation_Plant_Database.csv")

In [3]:
minData2 = minData.drop(columns='date', axis=1)

In [4]:
for col in minData2.columns:
  if minData2[col].apply(lambda x: isinstance(x, str)).any():
    print(f"Column '{col}' contains string values.")
    minData2[col] = pd.to_numeric(minData2[col].str.replace(',', '.').str.replace(' ', ''), errors='coerce')

Column '% Iron Feed' contains string values.
Column '% Silica Feed' contains string values.
Column 'Starch Flow' contains string values.
Column 'Amina Flow' contains string values.
Column 'Ore Pulp Flow' contains string values.
Column 'Ore Pulp pH' contains string values.
Column 'Ore Pulp Density' contains string values.
Column 'Flotation Column 01 Air Flow' contains string values.
Column 'Flotation Column 02 Air Flow' contains string values.
Column 'Flotation Column 03 Air Flow' contains string values.
Column 'Flotation Column 04 Air Flow' contains string values.
Column 'Flotation Column 05 Air Flow' contains string values.
Column 'Flotation Column 06 Air Flow' contains string values.
Column 'Flotation Column 07 Air Flow' contains string values.
Column 'Flotation Column 01 Level' contains string values.
Column 'Flotation Column 02 Level' contains string values.
Column 'Flotation Column 03 Level' contains string values.
Column 'Flotation Column 04 Level' contains string values.
Column 

In [5]:
features = minData2.drop(columns='% Silica Concentrate')
target = minData2['% Silica Concentrate']

In [6]:
minData2.columns

Index(['% Iron Feed', '% Silica Feed', 'Starch Flow', 'Amina Flow',
       'Ore Pulp Flow', 'Ore Pulp pH', 'Ore Pulp Density',
       'Flotation Column 01 Air Flow', 'Flotation Column 02 Air Flow',
       'Flotation Column 03 Air Flow', 'Flotation Column 04 Air Flow',
       'Flotation Column 05 Air Flow', 'Flotation Column 06 Air Flow',
       'Flotation Column 07 Air Flow', 'Flotation Column 01 Level',
       'Flotation Column 02 Level', 'Flotation Column 03 Level',
       'Flotation Column 04 Level', 'Flotation Column 05 Level',
       'Flotation Column 06 Level', 'Flotation Column 07 Level',
       '% Iron Concentrate', '% Silica Concentrate'],
      dtype='object')

In [7]:
xtrain, xtest, ytrain, ytest = train_test_split(features, target, test_size = 0.2, random_state=36)

In [8]:
from sklearn.preprocessing import StandardScaler

In [9]:
scaler = StandardScaler()
scaler.fit(xtrain)
xtrainStd = scaler.transform(xtrain)
scaler.fit(xtest)
xtestStd = scaler.transform(xtest)

In [10]:
from sklearn.model_selection import cross_validate, KFold

cv = cross_validate
kf = KFold(n_splits=5, shuffle=True, random_state=42)

In [11]:
def modelEval(mod, x, y):
  score = cv(mod, x, y)

  print("Average fitting-time: ", score['fit_time'].mean(),
        "\nAverage score-time: ", score['score_time'].mean(),
        "\nAverage test-score (MSE): ", score['test_score'].mean())

In [12]:
from sklearn.tree import DecisionTreeRegressor

DTR = DecisionTreeRegressor(random_state=36)
DTR.fit(xtrainStd, ytrain)
DTpred = DTR.predict(xtestStd)
mean_squared_error(ytest, DTpred)

0.028331980282483564

In [13]:
modelEval(DTR, xtrainStd, ytrain)

Average fitting-time:  25.035493469238283 
Average score-time:  0.06400494575500489 
Average test-score (MSE):  0.9931577678355511


In [14]:
from sklearn.metrics import mean_absolute_error, r2_score

In [15]:
mean_absolute_error(ytest, DTpred)

0.025823058197123754

In [16]:
r2_score(ytest, DTpred)

0.9776019117530825