In [181]:
# Import necessary libraries, tools and frameworks
import pandas as pd
import numpy as np 
from sklearn import preprocessing 
from sklearn.ensemble import RandomForestRegressor 
from sklearn.model_selection import train_test_split 
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn import metrics 

# Resolve the current working directory as an absolute path; the dataset
# file is looked up relative to it.
import os
working_directory = os.path.abspath(os.curdir)

In [182]:
# Import dataset from the working directory.
# os.path.join inserts the platform's path separator instead of relying on a
# hand-concatenated "/".
df = pd.read_csv(os.path.join(working_directory, "waterchem.csv"))

# Fill NaN values with 0 so that no missing entries remain in the frame.
# NOTE(review): this zero-fills every column, including the pH column that is
# used as the prediction target further down; a pH of 0 is presumably not a
# real measurement -- confirm whether such rows should be dropped instead.
df = df.fillna(0)

In [183]:
# Choose the predictor columns and the response variable (pH).
# Candidate columns, kept for reference:
# "SO4_minus2","NO3_minus","CL","Fluor","ANC.ueq.L","DIC","DOC","SiO2","Calcium","Mg","Sodium","Potassium","NH4_plus","AL_TD.ug.L","AL_TM.ug.L","AL_OM.ug.L","AL_IM.ug.L","pH","Color.PtCo","conduct.uS.cm"
feature_cols = ['SO4_minus2', 'NO3_minus', 'Calcium', 'Color.PtCo', 'NH4_plus']
X = df[feature_cols]
y = df["pH"]

# Echo the predictors followed by the target, one after the other
print(X, y, sep="\n")

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)



      SO4_minus2  NO3_minus  Calcium  Color.PtCo  NH4_plus
0          5.850      1.137    1.910        20.0     0.045
1          6.440      1.450    1.870        20.0     0.086
2          6.320      1.822    1.990        20.0     0.059
3          5.980      1.968    1.980        25.0     0.061
4          5.110      2.007    1.640        25.0     0.045
...          ...        ...      ...         ...       ...
6277       2.902      0.771    3.859        15.0     0.025
6278       2.942      0.317    4.135        15.0     0.020
6279       3.081      0.404    3.759        25.0    -0.004
6280       3.206      0.705    3.806        20.0    -0.005
6281       3.225      1.131    3.449        30.0     0.039

[6282 rows x 5 columns]
0       5.42
1       5.23
2       5.20
3       5.28
4       5.12
        ... 
6277    7.43
6278    7.33
6279    7.26
6280    7.31
6281    7.10
Name: pH, Length: 6282, dtype: float64


In [184]:
# Build and train the Random Forest regressor (trees capped at depth 2, fixed
# seed); fit() returns the estimator itself, so it can be chained.
rf = RandomForestRegressor(random_state=42, max_depth=2).fit(X_train, y_train)

# Predict on both partitions so training and test error can be compared
y_rf_train_pred, y_rf_test_pred = rf.predict(X_train), rf.predict(X_test)

In [185]:
# Model performance on the training and test partitions

# Mean Squared Error
rf_train_mse = mean_squared_error(y_train, y_rf_train_pred)
rf_test_mse = mean_squared_error(y_test, y_rf_test_pred)

# Root MSE, taken as the square root of the MSE values above.
# The `squared=False` flag of mean_squared_error was deprecated in
# scikit-learn 1.4 and removed in 1.6, so it is avoided here to keep the cell
# runnable on both old and new releases.
rf_train_rmse = np.sqrt(rf_train_mse)
rf_test_rmse = np.sqrt(rf_test_mse)

# Mean Absolute Error
rf_train_mae = mean_absolute_error(y_train, y_rf_train_pred)
rf_test_mae = mean_absolute_error(y_test, y_rf_test_pred)

# Collect the results in a one-row table.
# Building the frame from a record (instead of transposing a mixed-type list)
# keeps the metric columns as floats rather than generic objects.
rf_results = pd.DataFrame([{
    'Method': 'Random Forest',
    'Training MSE': rf_train_mse,
    'Test MSE': rf_test_mse,
    'Training RMSE': rf_train_rmse,
    'Test RMSE': rf_test_rmse,
    'Training MAE': rf_train_mae,
    'Test MAE': rf_test_mae,
}])

print(rf_results)

          Method Training MSE  Test MSE Training RMSE Test RMSE Training MAE  \
0  Random Forest     0.303613  0.316744      0.551011    0.5628     0.451843   

  Test MAE  
0  0.46098  


In [186]:
# Use the model on a single new observation

# Define data instance: one sample whose values follow the order of feature_cols
Xnew = [[3.225, 1.131, 3.449, 30.0, 0.039]]

# Make a prediction.
# The model was fitted on a DataFrame, so the sample is wrapped in a DataFrame
# carrying the same column names; passing the bare list makes recent
# scikit-learn versions warn that X has no valid feature names.
ynew = rf.predict(pd.DataFrame(Xnew, columns=feature_cols))

# Show the inputs and predicted outputs
print("X=%s, Predicted=%s" % (Xnew[0], ynew[0]))



X=[3.225, 1.131, 3.449, 30.0, 0.039], Predicted=6.820043056045936
