In [None]:
#Import necessary libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.model_selection import train_test_split

In [2]:


# ----------------------------
# 1. Load Dataset
# ----------------------------
def load_iris_dataset():
    """Load the iris data through scikit-learn and return it as a DataFrame.

    The frame attached to the ``load_iris(as_frame=True)`` result is
    relabelled with five snake_case column names; the last column is
    named ``species``.

    Returns:
        The relabelled DataFrame.
    """
    frame = load_iris(as_frame=True).frame
    # Positional rename: the five labels replace the loader's own column names.
    frame.columns = [
        "sepal_length",
        "sepal_width",
        "petal_length",
        "petal_width",
        "species",
    ]
    return frame

# ----------------------------
# 2. Preprocess Data
# ----------------------------
def preprocess_data(df):
    """Drop incomplete rows, min-max scale the measurements and one-hot encode species.

    Prints the per-column missing-value counts, removes rows containing
    missing values, scales the four measurement columns with
    ``MinMaxScaler`` and replaces the ``species`` column with one-hot
    columns named by ``OneHotEncoder.get_feature_names_out(["species"])``.

    Args:
        df: DataFrame holding ``sepal_length``, ``sepal_width``,
            ``petal_length``, ``petal_width`` and ``species`` columns.

    Returns:
        A new DataFrame with the scaled measurement columns followed by
        the one-hot species columns. Rows keep the index labels they had
        in ``df``.
    """
    # Handle missing values (check and drop if any)
    print("Missing values per column:")
    print(df.isnull().sum())

    # Work on an explicit copy so the column assignment below targets an
    # independent frame rather than something derived from the caller's data.
    df = df.dropna().copy()

    # Normalize numeric features
    numeric_cols = ["sepal_length", "sepal_width", "petal_length", "petal_width"]
    scaler = MinMaxScaler()
    df[numeric_cols] = scaler.fit_transform(df[numeric_cols])

    # Encode species (One-Hot)
    encoder = OneHotEncoder(sparse_output=False)
    encoded_species = encoder.fit_transform(df[["species"]])
    encoded_df = pd.DataFrame(
        encoded_species,
        columns=encoder.get_feature_names_out(["species"]),
        # Reuse df's index: concat(axis=1) aligns on index labels, so a fresh
        # RangeIndex would misalign rows (and add NaN rows) whenever dropna()
        # removed something and left gaps in the labels.
        index=df.index,
    )

    return pd.concat([df.drop(columns=["species"]), encoded_df], axis=1)

# ----------------------------
# 3. Exploration
# ----------------------------
def explore_data(original_df):
    """Print summary statistics and write three exploratory plots to PNG files.

    Files written to the current working directory:
        - ``pairplot_iris.png``: seaborn pairplot with ``species`` as hue
        - ``heatmap_iris.png``: annotated correlation heatmap of every
          column except the last one
        - ``boxplots_iris.png``: boxplots of every column except ``species``

    Args:
        original_df: DataFrame containing a ``species`` column; the heatmap
            additionally relies on the column to exclude being the last one.
    """
    print("\nSummary statistics:")
    summary = original_df.describe()
    print(summary)

    # Pairwise relationships, coloured by species
    sns.pairplot(original_df, hue="species")
    plt.savefig("pairplot_iris.png")
    plt.close()

    # Correlations between all columns but the last
    feature_corr = original_df.iloc[:, :-1].corr()
    plt.figure(figsize=(8, 6))
    sns.heatmap(feature_corr, annot=True, cmap="coolwarm")
    plt.title("Feature Correlation Heatmap")
    plt.savefig("heatmap_iris.png")
    plt.close()

    # Per-column boxplots to eyeball outliers
    measurements = original_df.drop(columns=["species"])
    plt.figure(figsize=(10, 6))
    measurements.boxplot()
    plt.title("Boxplots for Outlier Detection")
    plt.savefig("boxplots_iris.png")
    plt.close()

# ----------------------------
# 4. Train/Test Split
# ----------------------------
def split_data(df, *, n_target_cols=3, test_size=0.2, random_state=42):
    """Split a processed frame into train/test features and targets.

    The trailing ``n_target_cols`` columns are treated as the targets (the
    species one-hot columns by default); everything else is a feature.

    Args:
        df: DataFrame whose last ``n_target_cols`` columns are the targets.
        n_target_cols: Number of trailing target columns. Defaults to 3.
        test_size: Forwarded to ``train_test_split``. Defaults to 0.2.
        random_state: Forwarded to ``train_test_split``. Defaults to 42.

    Returns:
        The ``train_test_split`` result for ``(X, y)``, i.e.
        ``X_train, X_test, y_train, y_test``.

    Raises:
        ValueError: If ``n_target_cols`` is not between 1 and the number of
            columns minus one, which would leave no features or no targets.
    """
    n_columns = df.shape[1]
    # Guard explicitly: columns[-0:] would silently select *every* column.
    if not 0 < n_target_cols < n_columns:
        raise ValueError(
            f"n_target_cols must be between 1 and {n_columns - 1}, "
            f"got {n_target_cols}"
        )

    target_cols = df.columns[-n_target_cols:]
    X = df.drop(columns=target_cols)  # All except the target columns
    y = df[target_cols]  # Target (species one-hot) columns
    return train_test_split(X, y, test_size=test_size, random_state=random_state)

# ----------------------------
# Main Execution
# ----------------------------
if __name__ == "__main__":
    # Load the iris data as a DataFrame with the renamed columns
    original_df = load_iris_dataset()

    # Explore the raw (unscaled) data: prints summary statistics and writes
    # the pairplot, heatmap and boxplot PNG files
    explore_data(original_df)

    # Preprocess: drop rows with missing values, scale the measurement
    # columns and one-hot encode species
    processed_df = preprocess_data(original_df)

    # Split into train/test features (X) and one-hot targets (y)
    X_train, X_test, y_train, y_test = split_data(processed_df)

    print(f"Train shape: {X_train.shape}, Test shape: {X_test.shape}")

    # Save the processed data; index=False leaves the row index out of the CSV
    processed_df.to_csv("iris_processed.csv", index=False)
    print("Processed dataset saved to iris_processed.csv")



Summary statistics:
       sepal_length  sepal_width  petal_length  petal_width     species
count    150.000000   150.000000    150.000000   150.000000  150.000000
mean       5.843333     3.057333      3.758000     1.199333    1.000000
std        0.828066     0.435866      1.765298     0.762238    0.819232
min        4.300000     2.000000      1.000000     0.100000    0.000000
25%        5.100000     2.800000      1.600000     0.300000    0.000000
50%        5.800000     3.000000      4.350000     1.300000    1.000000
75%        6.400000     3.300000      5.100000     1.800000    2.000000
max        7.900000     4.400000      6.900000     2.500000    2.000000
Missing values per column:
sepal_length    0
sepal_width     0
petal_length    0
petal_width     0
species         0
dtype: int64
Train shape: (120, 4), Test shape: (30, 4)
Processed dataset saved to iris_processed.csv
