In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
import pickle

In [2]:
# Load the cleaned dataset. Forward slashes are used because they resolve on
# Windows, Linux and macOS alike; the previous '.\\datasets\\...' spelling is
# only understood as a directory path on Windows.
df = pd.read_csv('./datasets/cleaned_output_data_2.csv')

# Display the first few rows to check the data
print(df.head())

   Dissolved_Oxygen  salinities  \
0              14.0        38.5   
1              16.5         2.9   
2               4.4        18.0   
3               7.8         2.9   
4              14.0        41.5   

                                 Aquatic_Environment  
0  Very High Oxygen, High Salinity Sea - Fish: Tu...  
1  Very High Oxygen, Low Salinity Wetland - Fish:...  
2  Low Oxygen, Medium Salinity Tropical Lake - Fi...  
3  Moderate Oxygen, Freshwater River - Fish: Trou...  
4  Very High Oxygen, High Salinity Sea - Fish: Tu...  


In [3]:
# Discard every row that has a missing value in any column.
df = df.dropna(axis=0, how='any')

In [4]:
# Render the frame as the cell output to inspect it after the dropna() step.
df

Unnamed: 0,Dissolved_Oxygen,salinities,Aquatic_Environment
0,14.0,38.5,"Very High Oxygen, High Salinity Sea - Fish: Tu..."
1,16.5,2.9,"Very High Oxygen, Low Salinity Wetland - Fish:..."
2,4.4,18.0,"Low Oxygen, Medium Salinity Tropical Lake - Fi..."
3,7.8,2.9,"Moderate Oxygen, Freshwater River - Fish: Trou..."
4,14.0,41.5,"Very High Oxygen, High Salinity Sea - Fish: Tu..."
...,...,...,...
42812,0.0,2.9,"Very Low Oxygen, Freshwater Pond - Fish: Carp,..."
42813,17.0,3.3,"Very High Oxygen, Low Salinity Wetland - Fish:..."
42814,5.4,4.2,"Moderate Oxygen, Low Salinity Freshwater Lake ..."
42815,15.4,3.1,"Very High Oxygen, Low Salinity Wetland - Fish:..."


In [5]:
# Model inputs: the two numeric measurements.
X = df.loc[:, ['Dissolved_Oxygen', 'salinities']]
# Prediction target: the textual environment description.
y = df.loc[:, 'Aquatic_Environment']
# Per-column count of missing values
print(df.isna().sum())

Dissolved_Oxygen       0
salinities             0
Aquatic_Environment    0
dtype: int64


In [6]:
# Learn the string-label -> integer-code mapping, then apply it to the target.
le = LabelEncoder()
le.fit(y)
y_encoded = le.transform(y)

In [7]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_encoded,
    test_size=0.3,
    random_state=42,
)

In [8]:
# Random forest of 100 trees with a fixed seed, trained on the training split.
model = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
)
model.fit(X_train, y_train)

In [9]:
# Predict on the training data
y_train_pred = model.predict(X_train)
train_accuracy = accuracy_score(y_train, y_train_pred)
print("Training Accuracy Score:", train_accuracy)

Training Accuracy Score: 1.0


In [10]:
# Predict on the held-out test split
y_pred = model.predict(X_test)

# Evaluate the performance
print("Accuracy Score:", accuracy_score(y_test, y_pred))

# Pass the full set of encoded labels explicitly. Without `labels`,
# classification_report infers them from y_test / y_pred and raises a
# ValueError when a rare class is absent from the split, because the inferred
# label count then no longer matches len(target_names).
print(
    "\nClassification Report:\n",
    classification_report(
        y_test,
        y_pred,
        labels=np.arange(len(le.classes_)),
        target_names=le.classes_,
    ),
)

Accuracy Score: 1.0

Classification Report:
                                                                                  precision    recall  f1-score   support

                  High Oxygen, Brackish Estuary - Fish: Sea Trout, Striped Bass       1.00      1.00      1.00       349
                  High Oxygen, Low Salinity Stream - Fish: Rainbow Trout, Perch       1.00      1.00      1.00      3514
               High Oxygen, Medium Salinity Oceanic Zone - Fish: Tuna, Sardines       1.00      1.00      1.00       296
                   Low Oxygen, Brackish Water Lagoon - Fish: Flounder, Sea Bass       1.00      1.00      1.00       156
                       Low Oxygen, Coastal Salt Marsh - Fish: Mullets, Flatfish       1.00      1.00      1.00        29
              Low Oxygen, High Salinity Coastal Waters - Fish: Haddock, Halibut       1.00      1.00      1.00       565
      Low Oxygen, Hypersaline Salt Lake - Fish: Artemia (Brine Shrimp), Tilapia       1.00      1.00      1

In [11]:
from sklearn.model_selection import cross_val_score

# 5-fold cross-validation over the full feature matrix and encoded target
cv_scores = cross_val_score(estimator=model, X=X, y=y_encoded, cv=5)

print("Cross-Validation Scores:", cv_scores)
print("Mean Cross-Validation Score:", np.mean(cv_scores))

Cross-Validation Scores: [0.99953293 0.99976646 1.         0.99988322 0.99953287]
Mean Cross-Validation Score: 0.9997430970596097


In [12]:
from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score
from sklearn.model_selection import cross_val_predict

# Out-of-fold predictions from 5-fold cross-validation
y_pred_cv = cross_val_predict(model, X, y_encoded, cv=5)

# Confusion matrix plus support-weighted precision / recall / F1
conf_matrix = confusion_matrix(y_encoded, y_pred_cv)
precision, recall, f1 = (
    metric(y_encoded, y_pred_cv, average='weighted')
    for metric in (precision_score, recall_score, f1_score)
)

print("Confusion Matrix:\n", conf_matrix)
print("Precision Score:", precision)
print("Recall Score:", recall)
print("F1 Score:", f1)

Confusion Matrix:
 [[ 1146     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0]
 [    0 11482     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0]
 [    0     0  1093     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0]
 [    0     0     0   547     0     0     0     2     0     0     0     1
      0     0     0     0     0     0     0]
 [    0     0     0     2   106     0     0     0     0     0     0     0
      0     0     0     0     0     0     0]
 [    0     0     0     0     0  1820     0     0     0     1     0     0
      0     0     0     0     0     0     0]
 [    0     0     0     0     0     0   707     0     0     0     0     0
      0     0     0     0     0     1     0]
 [    0     0     0     0     0     0     0   309     0     0     0     0
      0     0     0     0     0     0     0]
 [    0     0     0     0    

In [13]:
def predict_plant_type(Dissolved_Oxygen, Sanity):
    """Predict the encoded environment class for a single water sample.

    Parameters
    ----------
    Dissolved_Oxygen : float
        Dissolved-oxygen reading; must lie in the closed range 0 to 20.
    Sanity : float
        Salinity reading (parameter name kept as-is for compatibility);
        must lie in the closed range 0 to 60.

    Returns
    -------
    The class predicted by the module-level ``model`` (an integer code, since
    the model was trained on label-encoded targets), or one of the strings
    'Invalid Dissolved Oxygen value' / 'Invalid Sanity value' when an input
    is out of range or NaN.
    """
    # `not (lo <= v <= hi)` rejects NaN as well as out-of-range values; the
    # earlier `v < lo or v > hi` form let NaN through to the model.
    if not (0 <= Dissolved_Oxygen <= 20):
        return 'Invalid Dissolved Oxygen value'
    if not (0 <= Sanity <= 60):
        return 'Invalid Sanity value'

    # The model was fitted on a DataFrame, so predict with the same column
    # names to avoid scikit-learn's "X does not have valid feature names" warning.
    sample = pd.DataFrame(
        [[Dissolved_Oxygen, Sanity]],
        columns=['Dissolved_Oxygen', 'salinities'],
    )
    prediction = model.predict(sample)
    return prediction[0]

# Try the function on one sample reading
Dissolved_Oxygen, Sanity = 8, 50
print(f"Predicted plant type: {predict_plant_type(Dissolved_Oxygen, Sanity)}")


Predicted plant type: 9




In [14]:
def predict_plant_type(Dissolved_Oxygen, Sanity):
    """Predict the environment label for a single water sample.

    Parameters
    ----------
    Dissolved_Oxygen : float
        Dissolved-oxygen reading; must lie in the closed range 0 to 20.
    Sanity : float
        Salinity reading (parameter name kept as-is for compatibility);
        must lie in the closed range 0 to 60.

    Returns
    -------
    The label obtained by decoding the module-level ``model`` prediction with
    the module-level encoder ``le``, or one of the strings
    'Invalid Dissolved Oxygen value' / 'Invalid Sanity value' when an input
    is out of range or NaN.
    """
    # `not (lo <= v <= hi)` rejects NaN as well as out-of-range values; the
    # earlier `v < lo or v > hi` form let NaN through to the model.
    if not (0 <= Dissolved_Oxygen <= 20):
        return 'Invalid Dissolved Oxygen value'
    if not (0 <= Sanity <= 60):
        return 'Invalid Sanity value'

    # The model was fitted on a DataFrame, so predict with the same column
    # names to avoid scikit-learn's "X does not have valid feature names" warning.
    sample = pd.DataFrame(
        [[Dissolved_Oxygen, Sanity]],
        columns=['Dissolved_Oxygen', 'salinities'],
    )
    prediction_encoded = model.predict(sample)

    # Convert the numeric prediction back to its original string label
    predicted_label = le.inverse_transform(prediction_encoded)

    return predicted_label[0]

# Try the function on one sample reading
Dissolved_Oxygen, Sanity = 8, 50
print(f"Predicted plant type: {predict_plant_type(Dissolved_Oxygen, Sanity)}")


Predicted plant type: Moderate Oxygen, High Salinity Coral Reef - Fish: Clownfish, Parrotfish




In [15]:
# Serialize the trained model to disk in binary mode.
with open('hydrosphere_model.pickle', mode='wb') as model_file:
    pickle.dump(model, model_file)

In [16]:

from sklearn.preprocessing import LabelEncoder

# Fit on the original string labels `y`. `y_train` holds the integer codes
# produced by `le`, so fitting on it would yield an encoder whose classes are
# integers and which cannot map predictions back to environment names.
label_encoder = LabelEncoder()
label_encoder.fit(y)


In [19]:
import json

# Persist the lower-cased feature column names.
columns = {'data_columns': list(map(str.lower, X.columns))}
with open("columns.json", "w") as columns_file:
    json.dump(columns, columns_file)