In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler
from keras.models import Sequential
from keras.layers import LSTM, Dense, Dropout
import joblib 

In [None]:
# Read the water quality CSV into the working DataFrame
dataset_path = 'water_quality_dataset.csv'
df = pd.read_csv(dataset_path)

In [None]:
# Preview the top five records to sanity-check the load
df.head(n=5)

In [None]:
# Count the null entries in each column and print the per-column totals
missing_per_column = df.isna().sum()
print(missing_per_column)

In [None]:
# Show how many samples fall into each class of the target column 'Result'.
# 'Result' is a class label (it is label-encoded further down), so a count plot
# is used instead of a binned histogram with a KDE curve.
print(df['Result'].value_counts())
plt.figure(figsize=(6, 4))
sns.countplot(x='Result', data=df)
plt.title('Distribution of Water Quality Result')
plt.xlabel('Result')
plt.ylabel('Count')
plt.show()

In [None]:
# Keep only the numeric columns; correlation is undefined for the others
numeric_df = df.select_dtypes(include=['number'])

# Pairwise correlation of the numeric columns, printed as a plain table
correlation_matrix = numeric_df.corr()
print("\nCorrelation Matrix:")
print(correlation_matrix)

# Render the same matrix as an annotated heatmap
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(
    correlation_matrix,
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
    linewidths=0.5,
    ax=ax,
)
ax.set_title('Correlation Matrix Heatmap', fontsize=14)
plt.show()


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Plot a histogram (with KDE overlay) of every feature, side by side in one row
columns = ['TDS', 'Turbidity', 'pH', 'Temperature', 'Humidity']
plt.figure(figsize=(15, 6))

for i, column in enumerate(columns, start=1):
    # One row with one panel per feature; the panel count follows the list length
    plt.subplot(1, len(columns), i)
    sns.histplot(df[column], bins=30, kde=True)
    plt.title(column)
    plt.xlabel(column)  # label the axis with the feature actually plotted
    plt.ylabel('Count')

plt.tight_layout()
plt.show()


In [None]:
# Learn the class labels of the target column, then overwrite the column with
# their integer codes.
# NOTE: this replaces df['Result'] in place, so re-running the cell would re-fit
# the encoder on the already-encoded integers.
label_encoder = LabelEncoder()
label_encoder.fit(df['Result'])
df['Result'] = label_encoder.transform(df['Result'])


In [None]:
import joblib
# Persist the label encoder to 'label_encoder.pkl' so it can be reloaded later
joblib.dump(label_encoder, 'label_encoder.pkl')

In [None]:
# Separate the target column from the predictor columns
y = df['Result']
X = df.drop('Result', axis=1)


In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts


In [None]:
# Fit the min-max scaler on the training split only, then apply it to both splits
scaler = MinMaxScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Fit a 100-tree random forest on the scaled training features
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train_scaled, y_train)

# Predict the held-out rows
y_pred_rf = rf_model.predict(X_test_scaled)

# Report accuracy as a percentage, followed by the per-class metrics
rf_accuracy = 100 * accuracy_score(y_test, y_pred_rf)
rf_report = classification_report(y_test, y_pred_rf)
print(f"Random Forest Accuracy: {rf_accuracy:.2f}%")
print(rf_report)


In [None]:
# Save the trained Random Forest model and the scaler to disk
joblib.dump(rf_model, 'rf_model.pkl')
joblib.dump(scaler, 'scaler.pkl')


In [None]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report

# Fit a gradient boosting classifier (100 stages, learning rate 0.1) on the scaled training features
gb_model = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, random_state=42)
gb_model.fit(X_train_scaled, y_train)

# Predict the held-out rows
y_pred_gb = gb_model.predict(X_test_scaled)

# Report accuracy as a percentage, followed by the per-class metrics
gb_accuracy = 100 * accuracy_score(y_test, y_pred_gb)
gb_report = classification_report(y_test, y_pred_gb)
print(f"Gradient Boosting Accuracy: {gb_accuracy:.2f}%")
print(gb_report)


In [None]:
import pandas as pd
import joblib  # For loading .pkl files

# Restore the persisted Random Forest model, scaler and label encoder, in that order.
# NOTE: joblib.load unpickles arbitrary objects; only load files you created or trust.
artifact_files = ('rf_model.pkl', 'scaler.pkl', 'label_encoder.pkl')
rf_model, scaler, label_encoder = map(joblib.load, artifact_files)

def disaster(TDS, Turbidity, pH, Temperature, Humidity,
             model=None, feature_scaler=None, encoder=None):
    """Predict the 'Result' label for one set of readings.

    The five values are placed in a one-row DataFrame, scaled, passed to the
    classifier, and the predicted class code is mapped back to its original
    label.

    Args:
        TDS: value for the 'TDS' feature.
        Turbidity: value for the 'Turbidity' feature.
        pH: value for the 'pH' feature.
        Temperature: value for the 'Temperature' feature.
        Humidity: value for the 'Humidity' feature.
        model: object with ``predict``; defaults to the module-level ``rf_model``.
        feature_scaler: object with ``transform``; defaults to the module-level
            ``scaler``.
        encoder: object with ``inverse_transform``; defaults to the module-level
            ``label_encoder``.

    Returns:
        The first element of ``encoder.inverse_transform(prediction)``.
    """
    # Fall back to the notebook-level artifacts when nothing is injected
    model = rf_model if model is None else model
    feature_scaler = scaler if feature_scaler is None else feature_scaler
    encoder = label_encoder if encoder is None else encoder

    # Build a single-row frame keyed by feature name
    new_data = pd.DataFrame({
        'TDS': [TDS],
        'Turbidity': [Turbidity],
        'pH': [pH],
        'Temperature': [Temperature],
        'Humidity': [Humidity]
    })

    # If the scaler recorded the column names it was fitted with, present the
    # columns in that same order. Only reorder when the name sets match, so any
    # genuine mismatch is still reported by the scaler itself.
    fitted_order = getattr(feature_scaler, 'feature_names_in_', None)
    if fitted_order is not None and set(fitted_order) == set(new_data.columns):
        new_data = new_data[list(fitted_order)]

    # Scale, predict, then translate the class code back to its label
    new_data_scaled = feature_scaler.transform(new_data)
    prediction = model.predict(new_data_scaled)
    result = encoder.inverse_transform(prediction)

    return result[0]

# Interactive entry point: read the five readings from the user and print the predicted label
if __name__ == "__main__":
    print("Enter sensor values for prediction:")

    try:
        # Prompt for each reading in turn; float() raises on non-numeric input
        TDS, Turbidity, pH, Temperature, Humidity = (
            float(input(prompt))
            for prompt in ("TDS: ", "Turbidity: ", "pH: ", "Temperature: ", "Humidity: ")
        )

        # Run the readings through the prediction helper and show the label
        predicted_result = disaster(TDS, Turbidity, pH, Temperature, Humidity)
        print("\nPredicted Result:", predicted_result)
    except Exception as err:
        # Report any failure (bad input, scaling or prediction errors) instead of raising
        print("\nError:", err)
