In [None]:
import matplotlib.pyplot as plt
import missingno as msno
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.stats import zscore
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR

In [None]:
# Load the raw power-plant records; the relative path is resolved against the
# notebook's current working directory.
df = pd.read_csv(filepath_or_buffer="power_plant_dataset.csv")

In [None]:
# --- Step 1: Visualize missing values using missingno ---
# msno.matrix() opens its own figure, so the size is handed to it directly.
# Calling plt.figure() beforehand only leaves an extra, blank figure behind
# and the requested size is never applied to the matrix.
missing_ax = msno.matrix(df, figsize=(10, 6))

# Title the returned matrix axes explicitly instead of going through
# plt.title(), which targets whichever axes pyplot considers "current"
# (after msno.matrix that may be the sparkline rather than the matrix).
missing_ax.set_title("Missing Values Visualization")
plt.show()

In [None]:
# --- Step 2: Visualize the dataset with a pairplot ---
# Names of the columns stored as 64-bit floats or integers. This list is
# reused by the later imputation, outlier, smoothing and scaling cells.
numerical_cols = df.select_dtypes(include=['float64', 'int64']).columns.tolist()

# sns.pairplot() is a figure-level function that always builds its own
# figure, so a preceding plt.figure() call would only emit a blank figure.
sns.pairplot(df[numerical_cols])

# The pairplot figure is the current figure at this point; y > 1 lifts the
# title above the top row of panels instead of drawing over them.
plt.suptitle("Scatter Plot Matrix of Numerical Features", size=16, y=1.02)
plt.show()

In [None]:
# --- Step 3: Handle missing values ---
# Learn the per-column mean of the numerical columns, then overwrite those
# columns with the imputed values (each gap is filled with its column mean).
imputer = SimpleImputer(strategy='mean').fit(df[numerical_cols])
df[numerical_cols] = imputer.transform(df[numerical_cols])

In [None]:
# --- Step 4: Handle Duplicates ---
# Keep the first occurrence of every fully identical row.
df = df.drop_duplicates()

# --- Step 5: Handle Outliers ---
# Absolute z-score of every numerical value, computed per column.
z_scores = np.abs(zscore(df[numerical_cols]))

# A row is an outlier when any of its values lies 3 or more standard
# deviations from its column mean. Testing ">= 3 for any column" instead of
# "< 3 for all columns" matters for constant columns: their z-scores are NaN,
# NaN fails every comparison, and the "< 3" form would therefore discard
# every row of the dataset.
is_outlier_row = (z_scores >= 3).any(axis=1)

# .copy() makes the filtered result an independent frame, so the column
# assignments in the following cells do not raise SettingWithCopyWarning.
df_no_outliers = df.loc[~is_outlier_row].copy()

In [None]:
# --- Step 6: Handle Noise ---
# Work on an explicit copy: the frame comes from boolean filtering, and
# assigning columns on such a result triggers pandas' SettingWithCopyWarning.
df_no_outliers = df_no_outliers.copy()

# Smooth every numerical column with a trailing 3-row rolling median.
# min_periods=1 lets the first rows use whatever values are available, so the
# smoothing itself introduces no NaNs. One DataFrame-level call replaces the
# per-column loop and yields the same values.
# NOTE(review): the window follows row order and covers every column in
# numerical_cols (including a numeric target, if present) - confirm the rows
# are ordered meaningfully before relying on this smoothing.
df_no_outliers[numerical_cols] = (
    df_no_outliers[numerical_cols].rolling(window=3, min_periods=1).median()
)

In [None]:
# --- Step 7: Scaling ---
# Fit a StandardScaler on the numerical columns and replace them with their
# standardized values.
# NOTE(review): the scaler is fitted on all rows before the train/test split
# further down, and it also rescales any numeric target column contained in
# numerical_cols - confirm this is intended.
scaler = StandardScaler().fit(df_no_outliers[numerical_cols])
df_no_outliers[numerical_cols] = scaler.transform(df_no_outliers[numerical_cols])


In [None]:
# --- Step 8: Final Preprocessed Dataset ---
# Write the preprocessed frame to CSV without the index column, then confirm
# the output file name on stdout.
df_no_outliers.to_csv(path_or_buf="power_plant_preprocessed_dataset.csv", index=False)
print("Preprocessed dataset saved as 'power_plant_preprocessed_dataset.csv'")

In [None]:
# --- Step 9: Visualize Missing Values After Preprocessing ---
# msno.matrix() opens its own figure, so the size is handed to it directly.
# Calling plt.figure() beforehand only leaves an extra, blank figure behind
# and the requested size is never applied to the matrix.
missing_ax_after = msno.matrix(df_no_outliers, figsize=(10, 6))

# Title the returned matrix axes explicitly instead of going through
# plt.title(), which targets whichever axes pyplot considers "current"
# (after msno.matrix that may be the sparkline rather than the matrix).
missing_ax_after.set_title("Missing Values After Preprocessing")
plt.show()

In [None]:
# --- Step 10: Model Training ---
# Separate features and target.
target_col = "Efficiency_percent"
# Identifier / non-numeric columns that are not used as model inputs.
non_feature_cols = ["Power_Plant_ID", "Fuel_Type", "Plant_Location"]

# The target must be removed from the feature matrix as well: leaving it in X
# lets the model read the answer straight from its inputs, which makes the
# reported MSE / R^2 meaningless.
X = df_no_outliers.drop(columns=non_feature_cols + [target_col])
y = df_no_outliers[target_col]  # Target variable

In [None]:
# Split into training and test sets: 20% of the rows are held out for
# evaluation, and the fixed random_state makes the split reproducible.
# (train_test_split is imported in the notebook's top import cell.)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


In [None]:
# Train the SVR model (Support Vector Regression) with a linear kernel on the
# training split. (SVR is imported in the notebook's top import cell.)
svr_model = SVR(kernel='linear')
svr_model.fit(X_train, y_train)

In [None]:
# Predict on the held-out test split and score the predictions against the
# true targets. (The metric functions are imported in the notebook's top
# import cell.)
y_pred = svr_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)  # mean squared error
r2 = r2_score(y_test, y_pred)             # coefficient of determination

In [None]:
# Report both evaluation metrics, one per line, rounded to four decimals.
print(
    f"Mean Squared Error: {mse:.4f}",
    f"R^2 Score: {r2:.4f}",
    sep="\n",
)