In [193]:
# Import necessary libraries, tools and frameworks
import pandas as pd
import numpy as np 
from sklearn import preprocessing 
from sklearn.ensemble import RandomForestRegressor 
from sklearn.model_selection import train_test_split 
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn import metrics 

# Resolve the directory the kernel is currently running in; the dataset
# (waterchem.csv) is loaded from this directory.
import os 
working_directory = os.getcwd() 

In [194]:
# Import dataset
# The path is assembled with os.path.join instead of concatenating a
# hard-coded "/" so the separator is always correct for the host platform.
df = pd.read_csv(os.path.join(working_directory, "waterchem.csv"))

# Fill NaN values
# NOTE(review): every missing value in every column becomes 0, including pH
# and the NO3_minus target. Confirm that 0 is a meaningful stand-in for these
# measurements; otherwise drop or impute the affected rows instead.
df = df.fillna(0)

In [195]:
# Choose the predictor columns and the response variable.
# Columns referenced for this dataset:
#   "SO4_minus2","NO3_minus","CL","Fluor","ANC.ueq.L","DIC","DOC","SiO2","Calcium","Mg","Sodium","Potassium","NH4_plus","AL_TD.ug.L","AL_TM.ug.L","AL_OM.ug.L","AL_IM.ug.L","pH","Color.PtCo","conduct.uS.cm"
# Alternative predictor set kept for reference (note that it contains the
# target column NO3_minus, so it must not be re-enabled unchanged):
#   ['SO4_minus2','NO3_minus','Calcium', 'Color.PtCo', 'NH4_plus']
feature_cols = ['Color.PtCo', 'pH']
X = df[feature_cols]
y = df['NO3_minus']

print(X)
print(y)

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)



      Color.PtCo    pH
0           20.0  5.42
1           20.0  5.23
2           20.0  5.20
3           25.0  5.28
4           25.0  5.12
...          ...   ...
6277        15.0  7.43
6278        15.0  7.33
6279        25.0  7.26
6280        20.0  7.31
6281        30.0  7.10

[6282 rows x 2 columns]
0       1.137
1       1.450
2       1.822
3       1.968
4       2.007
        ...  
6277    0.771
6278    0.317
6279    0.404
6280    0.705
6281    1.131
Name: NO3_minus, Length: 6282, dtype: float64


In [196]:
# Build and train the Random Forest regressor in one step (fit returns the
# estimator itself). Trees are capped at depth 2 and the seed is fixed.
rf = RandomForestRegressor(max_depth=2, random_state=42).fit(X_train, y_train)

# Predict on the training split first, then on the held-out test split.
y_rf_train_pred, y_rf_test_pred = (
    rf.predict(split) for split in (X_train, X_test)
)

In [197]:
# Model Performance
# Each metric is computed on both the training and the test split so the two
# can be compared side by side.

# Mean Squared Error
rf_train_mse = mean_squared_error(y_train, y_rf_train_pred)
rf_test_mse = mean_squared_error(y_test, y_rf_test_pred)

# Root MSE
# Taken as the square root of the MSE values above rather than calling
# mean_squared_error(..., squared=False): the `squared` keyword was deprecated
# in scikit-learn 1.4 and is rejected by later releases. For a single target
# the two forms give the same number.
rf_train_rmse = np.sqrt(rf_train_mse)
rf_test_rmse = np.sqrt(rf_test_mse)

# Mean Absolute Error
rf_train_mae = mean_absolute_error(y_train, y_rf_train_pred)
rf_test_mae = mean_absolute_error(y_test, y_rf_test_pred)

# Print out results
# The frame is built from a single row with named columns. Building it from a
# flat mixed list and transposing would force every column to object dtype.
rf_results = pd.DataFrame(
    [[
        'Random Forest',
        rf_train_mse, rf_test_mse,
        rf_train_rmse, rf_test_rmse,
        rf_train_mae, rf_test_mae,
    ]],
    columns=[
        'Method',
        'Training MSE', 'Test MSE',
        'Training RMSE', 'Test RMSE',
        'Training MAE', 'Test MAE',
    ],
)

print(rf_results)

          Method Training MSE  Test MSE Training RMSE Test RMSE Training MAE  \
0  Random Forest     0.613225  0.634356      0.783087  0.796465     0.572574   

   Test MAE  
0  0.579216  


In [198]:
# Use the model

# Define data instance
# Values are positional and must follow feature_cols, i.e. [Color.PtCo, pH].
# NOTE(review): as written this is Color.PtCo=5.0 and pH=14.0, while the pH
# values shown in the printed training sample are about 5.1-7.4. Confirm the
# two values are not swapped.
Xnew = [[5.0, 14.0]]

# Wrap the instance in a DataFrame that carries the training column names.
# The model was fitted on a DataFrame, so predicting on a bare list makes
# scikit-learn warn that X has no valid feature names.
Xnew_frame = pd.DataFrame(Xnew, columns=feature_cols)

# Make a prediction
ynew = rf.predict(Xnew_frame)

# Show the inputs and predicted outputs
print("X=%s, Predicted=%s" % (Xnew[0], ynew[0]))



X=[5.0, 14.0], Predicted=0.8447801190377809
