# Import Libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt


# Load Data
# NOTE(review): this is an absolute, machine-specific Windows path; the cell
# only runs where the workbook exists at exactly this location.
DATASET_PATH = r"C:\Users\moses\projects\capstone_app\Group15_Soil_dataset.xlsx"
df = pd.read_excel(DATASET_PATH)

In [2]:
# Rename Columns
# Translate the spreadsheet headers (per-metal "(mg/kg)" columns and the
# pollution label) into the short identifiers used by the later cells.
column_mapping = dict([
    ("Cd (mg/kg)", "Cd_value"),
    ("Cr (mg/kg)", "Cr_value"),
    ("Ni (mg/kg)", "Ni_value"),
    ("Pb (mg/kg)", "Pb_value"),
    ("Zn (mg/kg)", "Zn_value"),
    ("Cu (mg/kg)", "Cu_value"),
    ("Co (mg/kg)", "Co_value"),
    ("Pollution Level", "Contamination"),
])
# Headers that are not keys of the mapping are left untouched by rename().
df = df.rename(column_mapping, axis="columns")

In [3]:
# Clean DataFrame
# Keep only the columns whose name does not start with "Unnamed".
df = df.loc[:, ~df.columns.str.contains('^Unnamed')]
# Fill missing values in numeric columns with that column's mean.
# The result is reassigned rather than filled with inplace=True: `df` was just
# produced by a .loc selection, and mutating such a derived frame in place is
# discouraged by pandas (it can emit copy warnings on some versions) and makes
# the cell harder to re-run safely.
df = df.fillna(df.mean(numeric_only=True))

# Define Drop Columns
# Columns excluded from the feature matrix: the contamination-factor columns,
# pH, mCdeg, and the target label ('Contamination') itself.
drop_columns = ['Contamination Factor (Cd)', 'pH (H2O)', 'Contamination Factor (Cr)', 'Contamination Factor (Ni)',
                'Contamination Factor (Pb)', 'Contamination Factor (Zn)', 'Contamination Factor (Cu)',
                'Contamination Factor (Co)', 'Contamination', 'mCdeg']


In [4]:
# Define Features and Labels
# X holds every column that is not listed in drop_columns; y is the label column.
X = df.drop(columns=drop_columns)
y = df['Contamination']

# Encode Labels
# Convert the class labels to integer codes. The fitted encoder is kept so
# predictions can be mapped back to the original labels later.
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

# Split Data
# 80 % train / 20 % test, with a fixed seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the Model
# The forest is seeded as well: without random_state the bootstrap samples and
# feature sub-sampling differ on every run, so the reported accuracy (and any
# pickled model) would not be reproducible even though the split is fixed.
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)

# Test the Model
# Accuracy on the held-out 20 % of the rows.
y_pred = rf_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print(f'Accuracy: {accuracy}')

Accuracy: 0.9852941176470589


In [5]:
# Location of the sample to classify
latitude_value, longitude_value = -30.7128, 26.0060
# The same value is used for each of the seven metal columns
Cd_value = Cr_value = Ni_value = Pb_value = Zn_value = Cu_value = Co_value = 5.0

# Build a one-row DataFrame (not a numpy array) keyed by feature name.
# NOTE(review): presumably these are the same columns, in the same order, as
# the training matrix X -- confirm against X.columns.
new_sample = {
    'Latitude': latitude_value,
    'Longitude': longitude_value,
    'Cd_value': Cd_value,
    'Cr_value': Cr_value,
    'Ni_value': Ni_value,
    'Pb_value': Pb_value,
    'Zn_value': Zn_value,
    'Cu_value': Cu_value,
    'Co_value': Co_value,
}
X_new = pd.DataFrame([new_sample])

# Predict the encoded class for the sample
y_pred_new = rf_model.predict(X_new)

# Map the integer code back to the original label text
predicted_label = label_encoder.inverse_transform(y_pred_new)

print(f'Predicted Contamination Level: {predicted_label[0]}')

Predicted Contamination Level: very low contamination


In [6]:
import pickle

# Save the model using pickle
# The fitted classifier and the label encoder are written to separate files,
# both relative to the current working directory.
for pickle_name, fitted_obj in (('rf_model.pkl', rf_model),
                                ('label_encoder.pkl', label_encoder)):
    with open(pickle_name, 'wb') as out_file:
        pickle.dump(fitted_obj, out_file)



In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.preprocessing import LabelEncoder
import geopandas as gpd
import matplotlib.pyplot as plt
import pickle

# Load Data
# NOTE(review): absolute, machine-specific path, and a different folder from
# the one used in the first cell of this notebook -- confirm which copy of the
# workbook is intended.
DATASET_PATH = r"C:\Users\moses\Downloads\Group15_Soil_dataset.xlsx"
df = pd.read_excel(DATASET_PATH)

# Rename Columns
# Translate the spreadsheet headers into the short identifiers used below.
column_mapping = dict([
    ("Cd (mg/kg)", "Cd_value"),
    ("Cr (mg/kg)", "Cr_value"),
    ("Ni (mg/kg)", "Ni_value"),
    ("Pb (mg/kg)", "Pb_value"),
    ("Zn (mg/kg)", "Zn_value"),
    ("Cu (mg/kg)", "Cu_value"),
    ("Co (mg/kg)", "Co_value"),
    ("Pollution Level", "Contamination"),
])
df = df.rename(column_mapping, axis="columns")

# Clean DataFrame
# Keep only the columns whose name does not start with "Unnamed".
df = df.loc[:, ~df.columns.str.contains('^Unnamed')]
# Mean-impute missing numeric values. Reassigning the result avoids an
# inplace=True mutation of a frame that was just derived via .loc, which pandas
# discourages and which can emit copy warnings on some versions.
df = df.fillna(df.mean(numeric_only=True))

# Define Drop Columns
# Everything listed here is removed from the feature matrix, including the
# target label ('Contamination').
drop_columns = ['Contamination Factor (Cd)', 'pH (H2O)', 'Contamination Factor (Cr)', 'Contamination Factor (Ni)',
                'Contamination Factor (Pb)', 'Contamination Factor (Zn)', 'Contamination Factor (Cu)',
                'Contamination Factor (Co)', 'Contamination', 'mCdeg']

# Define Features and Labels
# X holds every column not listed in drop_columns; y is the label column.
X = df.drop(columns=drop_columns)
y = df['Contamination']

# Encode Labels
# Integer-encode the class labels; the encoder is reused to decode predictions.
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

# Split Data
# 80/20 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the Model
# random_state is set so the forest itself is deterministic; an unseeded forest
# gives a different model and accuracy on each run despite the fixed split.
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)

# Test the Model
# Accuracy on the held-out rows.
y_pred = rf_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print(f'Accuracy: {accuracy}')

# Location of the sample to classify
latitude_value, longitude_value = -30.7128, 26.0060
# One shared value for all seven metal columns
Cd_value = Cr_value = Ni_value = Pb_value = Zn_value = Cu_value = Co_value = 5.0

# One-row DataFrame keyed by feature name (a DataFrame, not a numpy array).
# NOTE(review): column names and order are assumed to match the training
# matrix X -- verify against X.columns.
new_sample = {
    'Latitude': latitude_value,
    'Longitude': longitude_value,
    'Cd_value': Cd_value,
    'Cr_value': Cr_value,
    'Ni_value': Ni_value,
    'Pb_value': Pb_value,
    'Zn_value': Zn_value,
    'Cu_value': Cu_value,
    'Co_value': Co_value,
}
X_new = pd.DataFrame([new_sample])

# Predict the encoded class for the sample
y_pred_new = rf_model.predict(X_new)

# Decode the integer class back to its original label
predicted_label = label_encoder.inverse_transform(y_pred_new)

print(f'Predicted Contamination Level: {predicted_label[0]}')

# Save the model using pickle
# Classifier and label encoder go to separate files in the working directory.
for pickle_name, fitted_obj in (('rf_model.pkl', rf_model),
                                ('label_encoder.pkl', label_encoder)):
    with open(pickle_name, 'wb') as out_file:
        pickle.dump(fitted_obj, out_file)

# Load the model using pickle
# Reads back the file written just above. pickle.load can execute arbitrary
# code, so only use it on files from a trusted source.
with open('rf_model.pkl', 'rb') as model_file:
    rf_model = pickle.load(model_file)

: 