### Task 1: Handling Missing Values - Simple Imputation
**Description**: Given a dataset with missing values, impute the missing values using the mean for numerical features and the mode for categorical features.

In [None]:
# write your code from here

### Task 2: Feature Scaling - Min-Max Normalization
**Description**: Normalize a numerical feature using Min-Max scaling to a range [0, 1].

In [None]:
# write your code from here

### Task 3: Handling Missing Values - Drop Missing Values
**Description**: Remove rows with missing values from a dataset.

In [None]:
# write your code from here

### Task 4: Feature Scaling - Standardization
**Description**: Standardize a numerical feature to have zero mean and unit variance.

In [None]:
# write your code from here

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import re
import nltk
from scipy.stats import kstest # Import kstest for Kolmogorov-Smirnov test
from sklearn.metrics import roc_auc_score, accuracy_score # Import metrics for adversarial validation
import shap # Import SHAP library for explainability
from sklearn.ensemble import RandomForestClassifier # Using RandomForest for SHAP demonstration
from scipy.stats import chi2_contingency # Import for Chi-squared test


# Download necessary NLTK data (if not already downloaded).
# nltk.data.find() signals a missing resource by raising LookupError, so that is
# the exception that must trigger the download.
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords')

# PorterStemmer is a pure algorithm and needs no downloaded data. The 'punkt'
# package installs under 'tokenizers/punkt', so probe that path to decide whether
# the download is required.
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt')


print("Simulating concept drift and preparing visualization...")

# --- Parameters for Concept Drift Simulation ---
num_time_periods = 50     # how many consecutive periods are simulated
samples_per_period = 100  # Bernoulli draws generated in every period
base_probability = 0.3    # P(Y=1) while no drift is applied
drift_magnitude = 0.01    # increase of P(Y=1) per period inside the drift window
drift_start_period = 15   # first period affected by the drift
drift_end_period = 40     # period from which the probability stops changing

# --- Simulate Concept Drift ---
true_probabilities = []    # P(Y=1) used to generate each period
observed_frequencies = []  # share of 1s actually drawn in each period

for i in range(num_time_periods):
    if i < drift_start_period:
        # Before the drift window the target keeps its baseline rate.
        current_probability = base_probability
    else:
        # Inside the window the rate climbs linearly with the number of elapsed
        # drift periods; capping the period at drift_end_period freezes the rate
        # at its peak for every later period.
        drift_steps = min(i, drift_end_period) - drift_start_period
        current_probability = np.clip(
            base_probability + drift_steps * drift_magnitude, 0.0, 1.0
        )

    true_probabilities.append(current_probability)

    # One Bernoulli trial (n=1) per sample, samples_per_period samples per period.
    simulated_data = np.random.binomial(1, current_probability, samples_per_period)

    # The mean of a 0/1 array is the observed proportion of 1s.
    observed_frequency = np.mean(simulated_data)
    observed_frequencies.append(observed_frequency)

# --- Visualize the Concept Drift ---
# Each entry pairs a series with the keyword arguments used to draw it. The true
# probability comes first so the observed frequencies are drawn on top of it.
drift_series_styles = [
    (
        true_probabilities,
        dict(label='True Probability (Simulated Drift)', color='blue',
             linestyle='--', marker='o', markersize=4),
    ),
    (
        observed_frequencies,
        dict(label='Observed Frequency', color='red', alpha=0.7,
             marker='x', markersize=4),
    ),
]

plt.figure(figsize=(12, 6))
for series_values, line_style in drift_series_styles:
    plt.plot(range(num_time_periods), series_values, **line_style)

plt.title('Simulated Concept Drift in Binary Target Variable Over Time', fontsize=16)
plt.xlabel('Time Period', fontsize=12)
plt.ylabel('Probability / Frequency of Target (Y=1)', fontsize=12)
plt.grid(True, linestyle=':', alpha=0.7)
plt.legend(fontsize=10)
plt.ylim(0, 1)  # both series are probabilities, so pin the axis to [0, 1]
plt.xticks(np.arange(0, num_time_periods + 1, 5))  # one tick every 5 periods
plt.tight_layout()
plt.show()

print("\nConcept drift simulation and visualization complete. The plot shows how the distribution of the binary target variable changes over time.")


# --- New Section: Detecting & Handling Imbalanced Data ---
print("\n\n--- Detecting & Handling Imbalanced Data: Visualizing Class Imbalance ---")

# CSV location of the Credit Card Fraud Detection data. The code below relies on a
# 'Class' column coded 0 (non-fraud) / 1 (fraud).
credit_card_fraud_url = "https://raw.githubusercontent.com/mlg-ulb/fpg/master/datasets/creditcard.csv"


def _plot_class_counts(frame, title):
    """Show a bar chart with the number of rows per 'Class' value in *frame*."""
    plt.figure(figsize=(8, 6))
    sns.countplot(x='Class', data=frame, palette='viridis')
    plt.title(title, fontsize=14)
    plt.xlabel('Class (0: Non-Fraud, 1: Fraud)', fontsize=12)
    plt.ylabel('Number of Transactions', fontsize=12)
    plt.xticks(ticks=[0, 1], labels=['Non-Fraud (0)', 'Fraud (1)'])
    plt.grid(axis='y', linestyle=':', alpha=0.7)
    plt.tight_layout()
    plt.show()


print("Loading the Credit Card Fraud Detection dataset...")
try:
    # Comma-separated file with a header row, read straight from the URL.
    df_fraud = pd.read_csv(credit_card_fraud_url)
    print("Credit Card Fraud Detection dataset loaded successfully.")
    print("\nFirst 5 rows of the fraud detection dataset:", df_fraud.head(), sep="\n")

    print("\n--- Initial Class Distribution ---")
    class_distribution = df_fraud['Class'].value_counts()
    print("Absolute counts of each class:", class_distribution, sep="\n")

    class_percentage = df_fraud['Class'].value_counts(normalize=True) * 100
    print("\nPercentage of each class:", class_percentage.round(2), sep="\n")

    # --- Visualize Class Imbalance ---
    _plot_class_counts(df_fraud, 'Class Distribution in Original Credit Card Fraud Dataset')
    print("Class imbalance visualized. The plot clearly shows the disproportionate number of non-fraudulent vs. fraudulent transactions.")

    print("\n--- Applying Random Undersampling to Balance the Dataset ---")

    # Features / target split (kept available for later use; not needed by the
    # undersampling itself, which works on row labels).
    X = df_fraud.drop(columns='Class')
    y = df_fraud['Class']

    # Row labels of each class: 1 is treated as the minority, 0 as the majority.
    fraud_indices = np.array(df_fraud.loc[df_fraud['Class'] == 1].index)
    non_fraud_indices = np.array(df_fraud.loc[df_fraud['Class'] == 0].index)

    number_of_fraud_samples = len(fraud_indices)
    print(f"Number of fraudulent transactions (minority class): {number_of_fraud_samples}")

    # Draw as many majority rows as there are minority rows; replace=False
    # guarantees no majority row is picked twice.
    random_non_fraud_indices = np.random.choice(
        non_fraud_indices, number_of_fraud_samples, replace=False
    )
    print(f"Randomly selected {len(random_non_fraud_indices)} non-fraudulent samples.")

    # All minority rows followed by the sampled majority rows.
    undersampled_indices = np.concatenate([fraud_indices, random_non_fraud_indices])
    df_undersampled = df_fraud.loc[undersampled_indices]

    print("\n--- Class Distribution After Random Undersampling ---")
    undersampled_class_distribution = df_undersampled['Class'].value_counts()
    print("Absolute counts of each class after undersampling:", undersampled_class_distribution, sep="\n")

    undersampled_class_percentage = df_undersampled['Class'].value_counts(normalize=True) * 100
    print("\nPercentage of each class after undersampling:", undersampled_class_percentage.round(2), sep="\n")

    # --- Visualize Class Distribution After Undersampling ---
    _plot_class_counts(df_undersampled, 'Class Distribution After Random Undersampling')
    print("Class distribution after random undersampling visualized. The classes are now balanced.")

except Exception as e:
    # Section-level boundary: report the failure and let the remaining sections run.
    print(f"An error occurred while processing the Credit Card Fraud Detection dataset: {e}")
    print("Please ensure the dataset URL is correct and accessible, and that necessary libraries are installed.")


# --- New Section: Ensuring Consistency Across Training & Inference Datasets: Feature Scaling ---
print("\n\n--- Ensuring Consistency Across Training & Inference Datasets: Feature Scaling ---")

print("Generating a synthetic dataset for demonstrating feature scaling...")

# Two normally distributed features on very different scales. The seed is fixed
# and Feature1 is drawn before Feature2, so the data is reproducible.
np.random.seed(42)
num_samples = 200
data = {}
data['Feature1'] = np.random.normal(loc=50, scale=10, size=num_samples)
data['Feature2'] = np.random.normal(loc=1000, scale=200, size=num_samples)
df_scaling = pd.DataFrame(data)

print("\nOriginal synthetic dataset (first 5 rows):", df_scaling.head(), sep="\n")
print("\nOriginal synthetic dataset descriptive statistics:", df_scaling.describe().round(2), sep="\n")

# Hold out 30% of the rows to play the role of data arriving at inference time.
X_train, X_new_inference = train_test_split(df_scaling, test_size=0.3, random_state=42)

print(f"\nTraining data shape: {X_train.shape}")
print(f"New inference data shape: {X_new_inference.shape}")

print("\n--- Applying StandardScaler to Training Data ---")
# The scaler learns each column's mean and standard deviation from the training
# rows only, then uses those statistics to rescale the training rows.
scaler = StandardScaler()
X_train_scaled = scaler.fit(X_train).transform(X_train)

# transform() returns a plain array; wrap it with the column names for display.
X_train_scaled_df = pd.DataFrame(X_train_scaled, columns=X_train.columns)

print("\nScaled training data (first 5 rows):", X_train_scaled_df.head(), sep="\n")
print(
    "\nScaled training data descriptive statistics (should have mean ~0, std ~1):",
    X_train_scaled_df.describe().round(2),
    sep="\n",
)


print("\n--- Applying the SAME Scaler to New Inference Data ---")
# Only transform() is called here: the inference rows are rescaled with the
# statistics learned from the training rows and the scaler is NOT refitted.
X_new_inference_scaled = scaler.transform(X_new_inference)
X_new_inference_scaled_df = pd.DataFrame(X_new_inference_scaled, columns=X_new_inference.columns)

print("\nScaled new inference data (first 5 rows):", X_new_inference_scaled_df.head(), sep="\n")
print("\nScaled new inference data descriptive statistics:", X_new_inference_scaled_df.describe().round(2), sep="\n")

print("\nFeature scaling demonstration complete. Notice that the same scaler object was used for both training and new inference data to maintain consistency.")
print("This ensures that your model, trained on scaled data, receives new data in the same expected format.")


# --- New Section: Bias & Fairness in Data: Bias Mitigation Techniques (Reweighing) ---
print("\n\n--- Bias & Fairness in Data: Bias Mitigation Techniques (Reweighing) ---")

# Define the URL for the Adult Income dataset
adult_income_url_reweigh = "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data"

# The file has no header row, so the column names are supplied explicitly.
column_names_reweigh = [
    'age', 'workclass', 'fnlwgt', 'education', 'education-num',
    'marital-status', 'occupation', 'relationship', 'race', 'sex',
    'capital-gain', 'capital-loss', 'hours-per-week', 'native-country', 'income'
]


def compute_reweighing_weights(frame, sensitive_col, label_col):
    """Return one reweighing weight per row of *frame*.

    The weight of a row with sensitive value s and label y is
    P(Y=y) / P(Y=y | S=s), with both probabilities estimated from *frame*.
    Summing the weights per (s, y) cell gives P(S=s) * P(Y=y) * len(frame),
    i.e. the sensitive attribute and the label are independent in the weighted data.

    Args:
        frame: DataFrame holding both columns; assumed to have no missing
            values in them (the caller drops missing rows first).
        sensitive_col: name of the sensitive-attribute column.
        label_col: name of the label column.

    Returns:
        A float Series aligned with ``frame.index``.
    """
    # P(Y=y), looked up for every row's own label.
    label_prior = frame[label_col].map(frame[label_col].value_counts(normalize=True))
    # Rows per (s, y) cell and rows per s group, broadcast back to row level.
    cell_size = frame.groupby([sensitive_col, label_col])[label_col].transform('size')
    group_size = frame.groupby(sensitive_col)[label_col].transform('size')
    # Every row belongs to a non-empty cell, so P(Y=y | S=s) is never zero here.
    label_given_group = cell_size / group_size
    return label_prior / label_given_group


print("Loading the Adult Income dataset for bias mitigation...")
try:
    df_adult = pd.read_csv(adult_income_url_reweigh, sep=", ", header=None, names=column_names_reweigh, na_values=["?"], engine='python')
    print("Adult Income dataset loaded successfully.")

    # Clean the 'sex' and 'income' columns by stripping whitespace
    for col in ['sex', 'income']:
        if col in df_adult.columns and df_adult[col].dtype == 'object':
            df_adult[col] = df_adult[col].str.strip()

    # Drop rows with any missing values for a clean demonstration
    df_adult.dropna(inplace=True)
    print(f"Dataset shape after dropping missing values: {df_adult.shape}")

    print("\n--- Original Distribution of Income by Gender ---")
    original_distribution = pd.crosstab(df_adult['sex'], df_adult['income'], margins=True, normalize=False)
    print("Absolute counts:")
    print(original_distribution)

    original_percentage = pd.crosstab(df_adult['sex'], df_adult['income'], margins=True, normalize='index') * 100
    print("\nRow-wise percentages (Income distribution within each Gender):")
    print(original_percentage.round(2))

    # --- Apply Reweighing Technique ---
    # Goal: make the proportion of income classes (especially '>50K') the same
    # across genders. All weights are computed in one vectorized pass instead of
    # re-filtering the frame for every (sex, income) combination.
    df_adult['sample_weight'] = compute_reweighing_weights(df_adult, 'sex', 'income')

    print("\n--- Distribution of Income by Gender After Reweighing (Weighted Counts) ---")
    # Sum of weights per (sex, income) cell, laid out as a sex x income table.
    weighted_distribution = df_adult.groupby(['sex', 'income'])['sample_weight'].sum().unstack(fill_value=0)
    print("Weighted counts:")
    print(weighted_distribution)

    # Normalise every row by its own total so each gender's shares add up to 100.
    # This works for whatever labels the 'sex' column contains.
    weighted_percentage_df = weighted_distribution.div(weighted_distribution.sum(axis=1), axis=0) * 100
    print("\nRow-wise weighted percentages (Income distribution within each Gender after reweighing):")
    print(weighted_percentage_df.round(2))

    print("\n--- Visualizing the Impact of Reweighing ---")

    fig, axes = plt.subplots(1, 2, figsize=(16, 6), sharey=True)

    # Left panel: raw row counts per gender and income class.
    sns.countplot(x='sex', hue='income', data=df_adult, ax=axes[0], palette='coolwarm')
    axes[0].set_title('Original Income Distribution by Gender', fontsize=14)
    axes[0].set_xlabel('Gender', fontsize=12)
    axes[0].set_ylabel('Number of Samples', fontsize=12)
    axes[0].legend(title='Income')
    axes[0].grid(axis='y', linestyle=':', alpha=0.7)

    # Right panel: the same cells measured by summed sample weight.
    weighted_distribution.plot(kind='bar', ax=axes[1], color=['#66c2a5', '#fc8d62'])
    axes[1].set_title('Weighted Income Distribution by Gender', fontsize=14)
    axes[1].set_xlabel('Gender', fontsize=12)
    axes[1].set_ylabel('Weighted Number of Samples', fontsize=12)
    axes[1].legend(title='Income')
    axes[1].tick_params(axis='x', rotation=0)  # keep the gender labels horizontal
    axes[1].grid(axis='y', linestyle=':', alpha=0.7)

    plt.tight_layout()
    plt.show()

    print("\nReweighing has been applied. The weighted distributions should show a more balanced representation of income levels across different genders compared to the original distribution.")
    print("This technique aims to reduce bias by giving more importance to underrepresented groups during model training.")

except Exception as e:
    # Section-level boundary: report the failure and let the remaining sections run.
    print(f"An error occurred while processing the Adult Income dataset for bias mitigation: {e}")
    print("Please ensure the dataset URL is correct and accessible, and that necessary libraries are installed.")


# --- New Section: Ensuring Consistency Across Training & Inference Datasets: Pipeline Integration ---
print("\n\n--- Ensuring Consistency Across Training & Inference Datasets: Pipeline Integration ---")

print("Generating a synthetic dataset for demonstrating pipeline integration...")

# Two normally distributed features; the seed is fixed and Feature_A is drawn
# before Feature_B so the data is reproducible.
np.random.seed(0)
n_samples = 300
feature_a_values = np.random.normal(loc=100, scale=20, size=n_samples)
feature_b_values = np.random.normal(loc=5, scale=1.5, size=n_samples)
X_pipeline = pd.DataFrame({'Feature_A': feature_a_values, 'Feature_B': feature_b_values})

# Binary target: 1 when Feature_A + 10 * Feature_B exceeds 150, else 0.
linear_score = X_pipeline['Feature_A'] + X_pipeline['Feature_B'] * 10
y_pipeline = (linear_score > 150).astype(int)

print("\nOriginal synthetic dataset for pipeline (first 5 rows):", X_pipeline.head(), sep="\n")
print("\nOriginal synthetic dataset target distribution:", y_pipeline.value_counts(), sep="\n")

# 70/30 train/test split with a fixed random_state.
X_train_pipeline, X_test_pipeline, y_train_pipeline, y_test_pipeline = train_test_split(
    X_pipeline, y_pipeline, test_size=0.3, random_state=42
)

print(f"\nTraining data shape for pipeline: {X_train_pipeline.shape}")
print(f"Testing data shape for pipeline: {X_test_pipeline.shape}")

print("\n--- Creating and Training a Scikit-learn Pipeline ---")

# Scaling and classification are bundled into one estimator so the two steps
# can never be applied inconsistently.
pipeline_steps = [
    ('scaler', StandardScaler()),          # step 1: feature scaling
    ('classifier', LogisticRegression()),  # step 2: classification model
]
pipeline = Pipeline(pipeline_steps)

print("\nPipeline created with steps:", pipeline, sep="\n")

# fit() fits and applies each transformer in order, then fits the final estimator
# on the transformed training data.
print("\nTraining the pipeline...")
pipeline.fit(X_train_pipeline, y_train_pipeline)
print("Pipeline training complete.")

print("\n--- Demonstrating Inference with the Trained Pipeline ---")

# predict() pushes the test rows through the already-fitted scaler before the
# classifier sees them, so the training-time scaling parameters are reused.
y_pred_pipeline = pipeline.predict(X_test_pipeline)

print(f"\nFirst 10 actual labels from test set: {y_test_pipeline.head(10).tolist()}")
print(f"First 10 predicted labels from test set: {y_pred_pipeline[:10].tolist()}")

# Share of test rows whose predicted label matches the true label.
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(y_test_pipeline, y_pred_pipeline)
print(f"\nAccuracy of the pipeline on the test set: {accuracy:.4f}")

print("\nPipeline integration demonstration complete.")
print("Using a scikit-learn Pipeline ensures that all preprocessing steps (like scaling) are consistently applied to both training and new inference data.")
print("This prevents data leakage and ensures that your model receives data in the exact same format it was trained on.")

# --- New Section: Feature Engineering Best Practices: Handling Text Data ---
print("\n\n--- Feature Engineering Best Practices: Handling Text Data ---")

# Location of the SMS Spam Collection archive; each record is read below as a
# label ('ham' or 'spam') followed by the message text.
sms_spam_url = "https://raw.githubusercontent.com/ageron/handson-ml/master/datasets/spam/smsspamcollection.zip"

print("Loading the SMS Spam Collection dataset...")
try:
    # The archive is expected to hold a tab-separated text file without a header
    # row, which is read directly from the zip.
    df_sms = pd.read_csv(
        sms_spam_url,
        sep='\t',
        header=None,
        names=['label', 'message'],
        encoding='latin-1',
    )
    print("SMS Spam Collection dataset loaded successfully.")
    print("\nFirst 5 rows of the SMS dataset:", df_sms.head(), sep="\n")

    print("\n--- Initial Class Distribution (Spam vs. Ham) ---")
    print(df_sms['label'].value_counts())

    print("\n--- Text Preprocessing ---")

    # Created once and reused for every message.
    stemmer = PorterStemmer()
    stop_words = set(stopwords.words('english'))

    def preprocess_text(text):
        """Return *text* lowercased, reduced to a-z words, without stop words, stemmed."""
        # Everything except lowercase letters and whitespace is removed, which
        # drops punctuation and digits.
        letters_only = re.sub(r'[^a-z\s]', '', text.lower())
        stems = []
        for token in letters_only.split():
            if token in stop_words:
                continue
            stems.append(stemmer.stem(token))
        return ' '.join(stems)

    print("Applying text preprocessing (lowercasing, removing punctuation/numbers, stop words, stemming)...")
    df_sms['cleaned_message'] = df_sms['message'].apply(preprocess_text)
    print("\nOriginal message vs. Cleaned message (first 5 examples):")
    raw_messages = df_sms['message']
    cleaned_messages = df_sms['cleaned_message']
    for i in range(5):
        print(f"Original: {raw_messages.iloc[i]}")
        print(f"Cleaned:  {cleaned_messages.iloc[i]}\n")

    print("\n--- Feature Extraction using TF-IDF ---")

    # max_features caps the vocabulary size; min_df drops terms found in fewer
    # than 5 messages; max_df drops terms found in more than 80% of messages.
    tfidf_vectorizer = TfidfVectorizer(max_features=5000, min_df=5, max_df=0.8)

    # Learn the vocabulary from the cleaned messages and encode them in one step.
    X_tfidf = tfidf_vectorizer.fit_transform(df_sms['cleaned_message'])

    # Dense copy for inspection only; for large corpora keep the sparse matrix.
    tfidf_terms = tfidf_vectorizer.get_feature_names_out()
    X_tfidf_df = pd.DataFrame(X_tfidf.toarray(), columns=tfidf_terms)

    print(f"\nShape of TF-IDF features: {X_tfidf.shape}")
    print("\nFirst 5 rows of TF-IDF features (showing a subset of columns):")
    print(X_tfidf_df.iloc[:5, :10])  # first 5 rows, first 10 columns

    print("\nTF-IDF feature extraction complete.")
    print("These numerical features can now be used to train machine learning models for tasks like spam detection.")

except Exception as e:
    # Section-level boundary: report the failure and let the remaining sections run.
    print(f"An error occurred while processing the SMS Spam Collection dataset: {e}")
    print("Please ensure the dataset URL is correct and accessible, and that NLTK data (stopwords, punkt) is downloaded.")


# --- New Section: Data Drift: Detection Using Statistical Tests (Kolmogorov-Smirnov) ---
print("\n\n--- Data Drift: Detection Using Statistical Tests (Kolmogorov-Smirnov) ---")

print("Simulating two datasets with a distribution shift for drift detection...")

# Both samples come from NumPy's global RNG after a single seeding, reference
# first, so the pair is reproducible.
np.random.seed(10)
reference_params = {'loc': 0, 'scale': 1}    # baseline: standard normal
current_params = {'loc': 0.5, 'scale': 1.2}  # drifted: shifted mean, wider spread

dataset1 = np.random.normal(size=500, **reference_params)
dataset2 = np.random.normal(size=500, **current_params)

# Summary line per sample; only the first line is preceded by a blank line.
for lead, sample_label, sample in (
    ("\n", "Dataset 1 (Reference)", dataset1),
    ("", "Dataset 2 (Current)", dataset2),
):
    print(f"{lead}{sample_label} - Mean: {np.mean(sample):.2f}, Std Dev: {np.std(sample):.2f}")

# --- Visualize the Distributions ---
# Overlay a density histogram plus KDE curve for each sample on a single axis.
plt.figure(figsize=(10, 6))
for sample, sample_color, sample_label in (
    (dataset1, 'blue', 'Dataset 1 (Reference)'),
    (dataset2, 'red', 'Dataset 2 (Current)'),
):
    sns.histplot(sample, color=sample_color, label=sample_label, kde=True, stat='density', alpha=0.6)
plt.title('Distribution of Simulated Datasets', fontsize=14)
plt.xlabel('Value', fontsize=12)
plt.ylabel('Density', fontsize=12)
plt.legend()
plt.grid(axis='y', linestyle=':', alpha=0.7)
plt.tight_layout()
plt.show()

print("\n--- Applying Kolmogorov-Smirnov (K-S) Test ---")
# The K-S test is non-parametric and compares the empirical cumulative
# distribution functions of the two samples.
#   H0: both samples are drawn from the same continuous distribution.
#   H1: the samples are drawn from different distributions.
# Passing two arrays of observations makes kstest run the two-sample test.
statistic, p_value = kstest(dataset1, dataset2)

print(f"K-S Test Statistic: {statistic:.4f}")
print(f"P-value: {p_value:.4f}")

# Significance level used to decide between H0 and H1.
alpha = 0.05
print(f"\nSignificance Level (alpha): {alpha}")

# Choose the three-line verdict first, then print it.
if p_value < alpha:
    verdict_lines = (
        f"Since p-value ({p_value:.4f}) < alpha ({alpha}), we reject the null hypothesis.",
        "Conclusion: There is a statistically significant difference between the distributions of Dataset 1 and Dataset 2.",
        "This indicates that data drift has likely occurred.",
    )
else:
    verdict_lines = (
        f"Since p-value ({p_value:.4f}) >= alpha ({alpha}), we fail to reject the null hypothesis.",
        "Conclusion: There is no statistically significant difference between the distributions of Dataset 1 and Dataset 2.",
        "This suggests that data drift has not occurred (or is not detectable at this significance level).",
    )
for verdict_line in verdict_lines:
    print(verdict_line)

print("\nData drift detection using the Kolmogorov-Smirnov test complete.")
print("This statistical test helps quantify the difference between two data distributions, indicating potential data drift.")


# --- New Section: Implementing Adversarial Validation for Data Drift ---
print("\n\n--- Implementing Adversarial Validation for Data Drift ---")

print("Generating two synthetic datasets (train and test) with a subtle difference for adversarial validation...")


def _simulate_adversarial_frame(seed, feature_a_mean):
    """Seed NumPy's global RNG, then draw 500 rows of feature_A and feature_B."""
    np.random.seed(seed)
    # feature_A is drawn before feature_B; only its mean differs between frames.
    feature_a = np.random.normal(loc=feature_a_mean, scale=2, size=500)
    feature_b = np.random.normal(loc=50, scale=5, size=500)
    return pd.DataFrame({'feature_A': feature_a, 'feature_B': feature_b})


# 'Training' data: feature_A centred on 10.
X_train_adv = _simulate_adversarial_frame(seed=1, feature_a_mean=10)

# 'Test' data: feature_A centred on 10.5, feature_B unchanged.
X_test_adv = _simulate_adversarial_frame(seed=2, feature_a_mean=10.5)

print("\nFirst 5 rows of simulated training data:", X_train_adv.head(), sep="\n")
print("\nFirst 5 rows of simulated test data:", X_test_adv.head(), sep="\n")

print("\n--- Preparing Data for Adversarial Classifier ---")

# Origin labels: 0 marks a row from the training frame, 1 a row from the test frame.
y_train_adv = pd.Series(0, index=X_train_adv.index)
y_test_adv = pd.Series(1, index=X_test_adv.index)

# Stack features and origin labels in the same order, with a fresh 0..n-1 index.
X_combined = pd.concat([X_train_adv, X_test_adv], ignore_index=True)
y_combined = pd.concat([y_train_adv, y_test_adv], ignore_index=True)

# Hold-out split for the adversarial classifier itself (unrelated to any split of
# the original modelling task); stratified so both origins keep equal shares.
X_adv_train, X_adv_test, y_adv_train, y_adv_test = train_test_split(
    X_combined,
    y_combined,
    test_size=0.3,
    random_state=42,
    stratify=y_combined,
)

print(
    f"\nCombined data shape: {X_combined.shape}",
    f"Adversarial classifier training data shape: {X_adv_train.shape}",
    f"Adversarial classifier test data shape: {X_adv_test.shape}",
    f"Adversarial classifier target distribution in training set:\n{y_adv_train.value_counts()}",
    sep="\n",
)


print("\n--- Training Adversarial Classifier ---")

# Scale the features, then fit a logistic regression that tries to tell the two
# origins apart.
adversarial_steps = [
    ('scaler', StandardScaler()),
    ('classifier', LogisticRegression(random_state=42)),
]
adversarial_pipeline = Pipeline(adversarial_steps)

print("\nAdversarial pipeline created:", adversarial_pipeline, sep="\n")

print("\nTraining the adversarial classifier...")
adversarial_pipeline.fit(X_adv_train, y_adv_train)
print("Adversarial classifier training complete.")

print("\n--- Evaluating Adversarial Classifier Performance ---")

# Column 1 of predict_proba is the predicted probability of origin label 1.
y_adv_pred_proba = adversarial_pipeline.predict_proba(X_adv_test)[:, 1]
y_adv_pred = adversarial_pipeline.predict(X_adv_test)

# AUC-ROC is computed from the probabilities, accuracy from the hard labels.
auc_score = roc_auc_score(y_adv_test, y_adv_pred_proba)
accuracy = accuracy_score(y_adv_test, y_adv_pred)

print(f"\nAdversarial Classifier Accuracy: {accuracy:.4f}")
print(f"Adversarial Classifier AUC-ROC Score: {auc_score:.4f}")

print("\n--- Interpreting Adversarial Validation Results ---")
interpretation_lines = (
    "If the AUC-ROC score is significantly higher than 0.5 (e.g., closer to 1.0),",
    "it means the classifier can effectively distinguish between the training and test datasets.",
    "A high AUC-ROC score indicates that there is a detectable data drift between the datasets.",
    "Conversely, an AUC-ROC score close to 0.5 suggests no significant drift, as the classifier performs no better than random guessing.",
)
print("\n".join(interpretation_lines))

# 0.7 is the heuristic cut-off used here for "strong evidence of drift".
if auc_score > 0.7:
    conclusion_lines = (
        f"\nConclusion: With an AUC-ROC score of {auc_score:.4f}, there is strong evidence of data drift.",
        "This suggests that the distribution of features in the 'test' dataset is different from the 'training' dataset.",
    )
else:
    conclusion_lines = (
        f"Since an AUC-ROC score of {auc_score:.4f} is not significantly higher than 0.5, there is no strong evidence of data drift.",
        "The distributions of features in the 'test' and 'training' datasets appear similar.",
    )
for conclusion_line in conclusion_lines:
    print(conclusion_line)

print("\nAdversarial validation for data drift detection complete.")
print("This technique provides a powerful way to detect subtle shifts in data distributions that might impact model performance.")


# --- New Section: Using SHAP for Feature Drift Analysis ---
print("\n\n--- Using SHAP for Feature Drift Analysis ---")

print("Simulating two datasets with a feature drift for SHAP analysis...")

# Simulate Dataset 1 (Time Period 1)
np.random.seed(100) # for reproducibility
n_samples_shap = 500
X_time1 = pd.DataFrame({
    'Feature_A': np.random.normal(loc=10, scale=2, size=n_samples_shap),
    'Feature_B': np.random.normal(loc=50, scale=5, size=n_samples_shap),
    'Feature_C': np.random.normal(loc=100, scale=10, size=n_samples_shap)
})
# Linear score for Time 1: Feature_A contributes most of the variance.
score_time1 = X_time1['Feature_A'] * 0.8 + X_time1['Feature_B'] * 0.1 + X_time1['Feature_C'] * 0.05
# Threshold at the median so both classes are present (balanced 50/50).
# A fixed cut-off of 10 lies far below the score range (mean around 18), which
# labels every sample as 1 and leaves the classifier nothing to learn.
y_time1 = (score_time1 > score_time1.median()).astype(int)

# Simulate Dataset 2 (Time Period 2) - Introduce feature drift in Feature_C's influence
np.random.seed(101) # different seed for time 2
X_time2 = pd.DataFrame({
    'Feature_A': np.random.normal(loc=10, scale=2, size=n_samples_shap),
    'Feature_B': np.random.normal(loc=50, scale=5, size=n_samples_shap),
    'Feature_C': np.random.normal(loc=105, scale=12, size=n_samples_shap) # Slight distribution shift
})
# Linear score for Time 2: the coefficient of Feature_C is increased (0.05 -> 0.2),
# so Feature_C now drives most of the variance of the score.
score_time2 = X_time2['Feature_A'] * 0.7 + X_time2['Feature_B'] * 0.1 + X_time2['Feature_C'] * 0.2
# Same median thresholding as for Time 1, for the same reason.
y_time2 = (score_time2 > score_time2.median()).astype(int)


print("\nDataset 1 (Time Period 1) - First 5 rows:")
print(X_time1.head())
print("\nDataset 2 (Time Period 2) - First 5 rows:")
print(X_time2.head())

print("\n--- Training Models for Each Time Period ---")

# Train a RandomForestClassifier on Dataset 1
model_time1 = RandomForestClassifier(random_state=42, n_estimators=50)
model_time1.fit(X_time1, y_time1)
print("Model trained on Dataset 1 (Time Period 1).")

# Train a RandomForestClassifier on Dataset 2
model_time2 = RandomForestClassifier(random_state=42, n_estimators=50)
model_time2.fit(X_time2, y_time2)
print("Model trained on Dataset 2 (Time Period 2).")

print("\n--- Calculating SHAP Values for Each Model ---")


def _positive_class_shap_values(raw_values):
    """Return a 2-D (n_samples, n_features) array of SHAP values for class 1.

    ``shap_values`` has used different return formats for classifiers across
    SHAP releases: older versions return a list with one 2-D array per class,
    newer versions return a single 3-D array whose last axis indexes the
    class. Both are reduced to the positive-class slice here; an input that is
    already 2-D is returned unchanged. If only one class/output is present,
    that single slice is used instead of failing on index 1.
    """
    if isinstance(raw_values, list):
        return np.asarray(raw_values[min(1, len(raw_values) - 1)])
    raw_values = np.asarray(raw_values)
    if raw_values.ndim == 3:
        return raw_values[:, :, min(1, raw_values.shape[2] - 1)]
    return raw_values


# Create SHAP explainers for each model
# For tree-based models, shap.TreeExplainer is efficient
explainer_time1 = shap.TreeExplainer(model_time1)
explainer_time2 = shap.TreeExplainer(model_time2)

# Explain each model on the dataset it was trained on. The full datasets are
# used because they are small (n_samples_shap rows each).
shap_values_time1 = _positive_class_shap_values(explainer_time1.shap_values(X_time1))
shap_values_time2 = _positive_class_shap_values(explainer_time2.shap_values(X_time2))

print(f"SHAP values calculated for Dataset 1 (shape: {shap_values_time1.shape}).")
print(f"SHAP values calculated for Dataset 2 (shape: {shap_values_time2.shape}).")

print("\n--- Analyzing Feature Importance Changes (Feature Drift) using SHAP ---")

# Average magnitude of each feature's SHAP contribution, per time period.
mean_abs_shap_time1 = np.mean(np.abs(shap_values_time1), axis=0)
mean_abs_shap_time2 = np.mean(np.abs(shap_values_time2), axis=0)

# Side-by-side importance table with absolute and relative change.
feature_importance_df = pd.DataFrame({
    'Feature': X_time1.columns,
    'SHAP_Importance_Time1': mean_abs_shap_time1,
    'SHAP_Importance_Time2': mean_abs_shap_time2,
})
importance_delta = (
    feature_importance_df['SHAP_Importance_Time2']
    - feature_importance_df['SHAP_Importance_Time1']
)
feature_importance_df['Change'] = importance_delta
feature_importance_df['Percentage_Change'] = (
    importance_delta / feature_importance_df['SHAP_Importance_Time1']
) * 100
# Most important feature in Time Period 1 first.
feature_importance_df = feature_importance_df.sort_values(by='SHAP_Importance_Time1', ascending=False)

print("\nComparison of Feature Importances (Mean Absolute SHAP Values):")
print(feature_importance_df.round(4))

# --- Visualize Feature Importance Changes ---
# Grouped bars: one pair (Time 1, Time 2) per feature, in the sorted table order.
plt.figure(figsize=(12, 7))
importance_ax = plt.gca()
bar_width = 0.35
index = np.arange(len(feature_importance_df['Feature']))

importance_ax.bar(
    index,
    feature_importance_df['SHAP_Importance_Time1'],
    width=bar_width,
    label='Time Period 1',
    color='skyblue',
)
importance_ax.bar(
    index + bar_width,
    feature_importance_df['SHAP_Importance_Time2'],
    width=bar_width,
    label='Time Period 2',
    color='lightcoral',
)

importance_ax.set_xlabel('Feature', fontsize=12)
importance_ax.set_ylabel('Mean Absolute SHAP Value', fontsize=12)
importance_ax.set_title('Feature Importance Comparison Across Time Periods (Indicating Feature Drift)', fontsize=16)
# Centre each tick between the two bars of its group.
importance_ax.set_xticks(index + bar_width / 2)
importance_ax.set_xticklabels(feature_importance_df['Feature'], rotation=45, ha='right', fontsize=10)
importance_ax.legend(fontsize=10)
importance_ax.grid(axis='y', linestyle=':', alpha=0.7)
plt.tight_layout()
plt.show()

print(
    "\nSHAP-based feature drift analysis complete.",
    "The comparison of mean absolute SHAP values and the visualization highlight how the importance of 'Feature_C' has changed between Time Period 1 and Time Period 2, indicating feature drift.",
    "This technique is valuable for understanding which features are contributing differently to model predictions over time, signaling a change in data characteristics.",
    sep="\n",
)


# --- New Section: Detect Schema Mismatches in Data Pipelines ---
print("\n\n--- Detect Schema Mismatches in Data Pipelines ---")

print("Simulating source and target DataFrames with potential schema mismatches...")

# 1. Source side: columns id / name / age.
source_data = dict(
    id=[1, 2, 3, 4],
    name=['Alice', 'Bob', 'Charlie', 'David'],
    age=[24, 30, 22, 35],
)
source_df = pd.DataFrame(source_data)
print("\nSource DataFrame:", source_df, sep="\n")

# 2. Target side: same ids and ages, but the name column is called 'fullname'.
target_data = dict(
    id=[1, 2, 3, 4],
    fullname=['Alice Smith', 'Bob Johnson', 'Charlie Brown', 'David Lee'],
    age=[24, 30, 22, 35],
)
target_df = pd.DataFrame(target_data)
print("\nTarget DataFrame (before resolving mismatch):", target_df, sep="\n")

# 3. Use a simple function to detect mismatches in column names
def detect_column_mismatch(df1, df2, source_label="Source", target_label="Target"):
    """Report column-name differences between two DataFrames.

    Prints which columns exist only in ``df1``, only in ``df2``, and in both.
    Columns are listed in the order they appear in the frames, so the output
    is the same on every run (plain set arithmetic gives an arbitrary order).

    Args:
        df1: First frame; anything exposing a ``columns`` iterable.
        df2: Second frame; anything exposing a ``columns`` iterable.
        source_label: Name used for ``df1`` in the printed messages.
        target_label: Name used for ``df2`` in the printed messages.

    Returns:
        True if at least one column name is present in only one of the
        frames, False if both frames have the same set of column names.
    """
    # dict.fromkeys de-duplicates while keeping first-seen order.
    columns1 = list(dict.fromkeys(df1.columns))
    columns2 = list(dict.fromkeys(df2.columns))
    lookup1, lookup2 = set(columns1), set(columns2)

    mismatched_in_df1 = [col for col in columns1 if col not in lookup2]
    mismatched_in_df2 = [col for col in columns2 if col not in lookup1]
    common_columns = [col for col in columns1 if col in lookup2]

    if not (mismatched_in_df1 or mismatched_in_df2):
        print("\nNo column name mismatches detected.")
        return False

    print("\n--- Schema Mismatch Detected! ---")
    if mismatched_in_df1:
        print(f"Columns in {source_label} but not in {target_label}: {mismatched_in_df1}")
    if mismatched_in_df2:
        print(f"Columns in {target_label} but not in {source_label}: {mismatched_in_df2}")
    print(f"Common columns: {common_columns}")
    return True

# Compare the two schemas; the result decides whether a fix-up is attempted.
mismatch_found = detect_column_mismatch(source_df, target_df)

# 4. Resolve the mismatch by renaming the `fullname` column in the target DataFrame to `name`
if not mismatch_found:
    print("\nNo mismatch to resolve.")
else:
    print("\n--- Resolving Mismatch: Renaming 'fullname' to 'name' in Target DataFrame ---")
    fullname_to_name = {'fullname': 'name'}
    if 'fullname' not in target_df.columns:
        print("Column 'fullname' not found in Target DataFrame to rename.")
    else:
        # In-place rename so every existing reference to target_df sees the new schema.
        target_df.rename(columns=fullname_to_name, inplace=True)
        print("Column 'fullname' in Target DataFrame renamed to 'name'.")

    print("\nTarget DataFrame (after resolving mismatch):", target_df, sep="\n")

    # Re-run the check to confirm the schemas now agree.
    print("\n--- Verifying Schema After Resolution ---")
    detect_column_mismatch(source_df, target_df)

print("\nSchema mismatch detection and resolution complete. This helps ensure data consistency across pipelines.")


# --- New Section: Detect Data Drift in ML Models: Categorical Feature Drift ---
print("\n\n--- Detect Data Drift in ML Models: Categorical Feature Drift ---")

print("Simulating baseline and current datasets with a categorical feature for drift detection...")

# Category labels shared by both simulated samples.
gender_levels = ['Male', 'Female', 'Other']

# 1. Baseline sample: 'Male' is the most frequent category.
np.random.seed(20) # for reproducibility
baseline_probabilities = [0.6, 0.35, 0.05]
baseline_gender_data = np.random.choice(gender_levels, size=1000, p=baseline_probabilities)
df_baseline = pd.DataFrame({'gender': baseline_gender_data})

baseline_shares = df_baseline['gender'].value_counts(normalize=True).round(2)
print("\nBaseline Gender Distribution:", baseline_shares, sep="\n")

# 2. Current (production) sample: the share of 'Female' has increased.
np.random.seed(21) # different seed for current data
current_probabilities = [0.45, 0.5, 0.05]
current_gender_data = np.random.choice(gender_levels, size=1000, p=current_probabilities)
df_current = pd.DataFrame({'gender': current_gender_data})

current_shares = df_current['gender'].value_counts(normalize=True).round(2)
print("\nCurrent Gender Distribution:", current_shares, sep="\n")

print("\n--- Visualizing Categorical Feature Distributions ---")
fig, axes = plt.subplots(1, 2, figsize=(14, 6), sharey=True)

# Use one explicit category order for both panels. Without it each panel
# orders the bars by first appearance in its own data, so the same x position
# can mean different genders left and right, which defeats the comparison.
gender_plot_order = ['Male', 'Female', 'Other']
gender_panels = (
    (axes[0], df_baseline, 'Baseline Gender Distribution'),
    (axes[1], df_current, 'Current Gender Distribution'),
)
for panel_ax, panel_df, panel_title in gender_panels:
    sns.countplot(x='gender', data=panel_df, order=gender_plot_order, ax=panel_ax, palette='pastel')
    panel_ax.set_title(panel_title, fontsize=14)
    panel_ax.set_xlabel('Gender', fontsize=12)
    panel_ax.set_ylabel('Count', fontsize=12)
    panel_ax.grid(axis='y', linestyle=':', alpha=0.7)

plt.tight_layout()
plt.show()

print("\n--- Using Chi-squared Test to Compare Distributions ---")
# Observed counts: one row per gender category, one column per dataset.
# Reindexing keeps the row order fixed and fills an absent category with 0.
gender_categories = ['Male', 'Female', 'Other']
baseline_counts = df_baseline['gender'].value_counts().reindex(gender_categories, fill_value=0)
current_counts = df_current['gender'].value_counts().reindex(gender_categories, fill_value=0)
contingency_table = pd.DataFrame({'Baseline': baseline_counts, 'Current': current_counts})

print("\nContingency Table:", contingency_table, sep="\n")

# Chi-squared test on the contingency table.
# H0 (Null Hypothesis): the category proportions are the same in both datasets
#     (category is independent of which dataset a row came from), i.e. no drift.
# H1 (Alternative Hypothesis): the category proportions differ between the
#     datasets, i.e. drift.
# `expected` holds the counts expected under H0; it is not printed here.
chi2_stat, p_value, dof, expected = chi2_contingency(contingency_table)

print(f"\nChi-squared Statistic: {chi2_stat:.4f}")
print(f"P-value: {p_value:.4f}")
print(f"Degrees of Freedom: {dof}")

# Significance level for the decision below.
alpha = 0.05
print(f"\nSignificance Level (alpha): {alpha}")

# Decision: reject H0 only when the p-value is strictly below alpha.
reject_null = p_value < alpha
if not reject_null:
    print(f"Since p-value ({p_value:.4f}) >= alpha ({alpha}), we fail to reject the null hypothesis.")
    print("Conclusion: There is no statistically significant difference between the baseline and current categorical feature distributions.")
    print("This suggests that data drift has not occurred in the 'gender' feature (or is not detectable at this significance level).")
else:
    print(f"Since p-value ({p_value:.4f}) < alpha ({alpha}), we reject the null hypothesis.")
    print("Conclusion: There is a statistically significant difference between the baseline and current categorical feature distributions.")
    print("This indicates that data drift has likely occurred in the 'gender' feature.")

print(
    "\nCategorical feature drift detection using the Chi-squared test complete.",
    "This method helps identify shifts in the distribution of categorical variables over time.",
    sep="\n",
)


# --- New Section: Detect Data Drift in ML Models: Feature Correlation Drift ---
print("\n\n--- Detect Data Drift in ML Models: Feature Correlation Drift ---")

print("Simulating baseline and current datasets with changing feature correlations...")

# 1. Simulate the baseline dataset (training data)
np.random.seed(30) # for reproducibility
n_samples_corr = 500
baseline_data_corr = pd.DataFrame({
    'Feature_X': np.random.normal(loc=0, scale=1, size=n_samples_corr),
    'Feature_Y': np.random.normal(loc=0, scale=1, size=n_samples_corr),
    'Feature_Z': np.random.normal(loc=0, scale=1, size=n_samples_corr)
})
# Overwrite Feature_Y so it depends strongly on Feature_X in the baseline
# (slope 0.7 plus noise with standard deviation 0.5).
baseline_data_corr['Feature_Y'] = baseline_data_corr['Feature_X'] * 0.7 + np.random.normal(loc=0, scale=0.5, size=n_samples_corr)

# 2. Simulate the current dataset (production data)
np.random.seed(31) # different seed for current data
current_data_corr = pd.DataFrame({
    'Feature_X': np.random.normal(loc=0, scale=1, size=n_samples_corr),
    'Feature_Y': np.random.normal(loc=0, scale=1, size=n_samples_corr),
    'Feature_Z': np.random.normal(loc=0, scale=1, size=n_samples_corr)
})
# Overwrite Feature_Y with a much weaker dependence on Feature_X (slope 0.2,
# noise standard deviation 0.8). This must use the CURRENT dataset's
# Feature_X: deriving it from the baseline's Feature_X would leave the current
# X and Y unrelated to each other.
current_data_corr['Feature_Y'] = current_data_corr['Feature_X'] * 0.2 + np.random.normal(loc=0, scale=0.8, size=n_samples_corr)


print("\nBaseline Data (first 5 rows):")
print(baseline_data_corr.head())
print("\nCurrent Data (first 5 rows):")
print(current_data_corr.head())

print("\n--- Computing Correlation Matrices ---")

# Pairwise correlations of the baseline features.
correlation_matrix_baseline = baseline_data_corr.corr()
print("\nCorrelation Matrix - Baseline Data:", correlation_matrix_baseline.round(2), sep="\n")

# Pairwise correlations of the current features.
correlation_matrix_current = current_data_corr.corr()
print("\nCorrelation Matrix - Current Data:", correlation_matrix_current.round(2), sep="\n")

print("\n--- Assessing Changes in Correlation Matrix ---")

# Element-wise magnitude of the change between the two matrices.
correlation_difference = (correlation_matrix_baseline - correlation_matrix_current).abs()
print(
    "\nAbsolute Difference in Correlation Matrices (Current - Baseline):",
    correlation_difference.round(2),
    sep="\n",
)

# Visualize the correlation matrices as heatmaps
fig, axes = plt.subplots(1, 3, figsize=(18, 6))

# Pin both correlation heatmaps to the full [-1, 1] range so that the same
# colour means the same correlation in the baseline and current panels.
sns.heatmap(correlation_matrix_baseline, annot=True, cmap='coolwarm', fmt=".2f", linewidths=.5, vmin=-1, vmax=1, ax=axes[0])
axes[0].set_title('Baseline Correlation Matrix', fontsize=14)

sns.heatmap(correlation_matrix_current, annot=True, cmap='coolwarm', fmt=".2f", linewidths=.5, vmin=-1, vmax=1, ax=axes[1])
axes[1].set_title('Current Correlation Matrix', fontsize=14)

# Absolute differences are non-negative, so a sequential colour map is used.
sns.heatmap(correlation_difference, annot=True, cmap='Reds', fmt=".2f", linewidths=.5, ax=axes[2])
axes[2].set_title('Absolute Difference in Correlations', fontsize=14)

plt.tight_layout()
plt.show()

# Report the measured X-Y correlations instead of hard-coded figures, so the
# narrative always matches the matrices printed and plotted above.
xy_corr_baseline = correlation_matrix_baseline.loc['Feature_X', 'Feature_Y']
xy_corr_current = correlation_matrix_current.loc['Feature_X', 'Feature_Y']
xy_corr_change = correlation_difference.loc['Feature_X', 'Feature_Y']

print("\n--- Interpreting Feature Correlation Drift ---")
print("Significant changes in the correlation matrix, especially large values in the 'Absolute Difference' heatmap, indicate feature correlation drift.")
print(f"For example, observe the correlation between 'Feature_X' and 'Feature_Y'. In the baseline, it is {xy_corr_baseline:.2f}, while in the current data, it is {xy_corr_current:.2f}.")
print(f"This change is visible in the 'Absolute Difference' matrix, which shows a value of {xy_corr_change:.2f} for this pair.")
print("Such shifts can impact models that rely on feature relationships (e.g., linear models, PCA).")
print("Investigating these changes is crucial to understand if there are issues in data collection, new underlying patterns, or if the model assumptions are no longer valid.")

print("\nFeature correlation drift detection complete.")


# --- New Section: Task 1: Handling Missing Values - Simple Imputation ---
print("\n\n--- Task 1: Handling Missing Values - Simple Imputation ---")

print("Simulating a DataFrame with missing values for imputation demonstration...")

# Create a DataFrame with two numerical columns and one categorical column
np.random.seed(40)
data_imputation = {
    'numerical_feature_1': np.random.normal(loc=100, scale=10, size=20),
    'numerical_feature_2': np.random.randint(1, 100, size=20).astype(float),
    'categorical_feature': np.random.choice(['A', 'B', 'C'], size=20)
}
df_imputation = pd.DataFrame(data_imputation)

# Introduce some missing values
df_imputation.loc[[2, 5, 10], 'numerical_feature_1'] = np.nan
df_imputation.loc[[7, 15], 'numerical_feature_2'] = np.nan
df_imputation.loc[[3, 12, 18], 'categorical_feature'] = np.nan

print("\nOriginal DataFrame with missing values:")
print(df_imputation)
print("\nMissing values count per column:")
print(df_imputation.isnull().sum())

print("\n--- Performing Simple Imputation ---")

# Impute numerical features with the mean (computed over non-missing values).
# The filled column is assigned back to the frame: calling
# `df[col].fillna(..., inplace=True)` acts on an intermediate object, which
# pandas deprecates and which does not update the frame under Copy-on-Write.
numerical_cols = df_imputation.select_dtypes(include=np.number).columns
for col in numerical_cols:
    mean_val = df_imputation[col].mean()
    df_imputation[col] = df_imputation[col].fillna(mean_val)
    print(f"Imputed numerical column '{col}' with its mean: {mean_val:.2f}")

# Impute categorical features with the mode. Every non-numeric column is
# treated as categorical, which works whether pandas stores the text as
# `object` or as a dedicated string dtype.
categorical_cols = df_imputation.select_dtypes(exclude=np.number).columns
for col in categorical_cols:
    modes = df_imputation[col].mode()
    if modes.empty:
        # A column with no observed values has no mode to impute with.
        print(f"Skipped categorical column '{col}': no non-missing values to derive a mode from.")
        continue
    # .mode() can return several values when frequencies tie; take the first.
    mode_val = modes.iloc[0]
    df_imputation[col] = df_imputation[col].fillna(mode_val)
    print(f"Imputed categorical column '{col}' with its mode: {mode_val}")

print("\nDataFrame after simple imputation:")
print(df_imputation)
print("\nMissing values count after imputation:")
print(df_imputation.isnull().sum())

print("\nSimple imputation (mean for numerical, mode for categorical) complete.")


# --- New Section: Task 2: Feature Scaling - Min-Max Normalization ---
print("\n\n--- Task 2: Feature Scaling - Min-Max Normalization ---")

print("Simulating a numerical feature for Min-Max Normalization...")

# Thirty random integers in [50, 500), shaped as a single column.
np.random.seed(41)
raw_minmax_values = np.random.randint(50, 500, size=30)
feature_to_normalize = raw_minmax_values.reshape(-1, 1)
df_minmax = pd.DataFrame(feature_to_normalize, columns=['Original_Feature'])

print("\nOriginal Feature (first 5 rows):", df_minmax.head(), sep="\n")
original_min = df_minmax['Original_Feature'].min()
original_max = df_minmax['Original_Feature'].max()
print(f"Original Min: {original_min}, Max: {original_max}")

print("\n--- Applying Min-Max Normalization ---")

# Scaler configured for the target range [0, 1].
min_max_scaler = MinMaxScaler(feature_range=(0, 1))

# Learn min/max from the column and rescale it in one step.
original_column_frame = df_minmax[['Original_Feature']]
normalized_feature = min_max_scaler.fit_transform(original_column_frame)
df_minmax['Normalized_Feature'] = normalized_feature

print("\nFeature after Min-Max Normalization (first 5 rows):", df_minmax.head(), sep="\n")
normalized_min = df_minmax['Normalized_Feature'].min()
normalized_max = df_minmax['Normalized_Feature'].max()
print(f"Normalized Min: {normalized_min:.2f}, Max: {normalized_max:.2f}")

print("\nMin-Max Normalization complete. The feature is now scaled to the range [0, 1].")


# --- New Section: Task 3: Handling Missing Values - Drop Missing Values ---
print("\n\n--- Task 3: Handling Missing Values - Drop Missing Values ---")

print("Simulating a DataFrame with missing values for dropping demonstration...")

# Small fixed frame with one NaN in each column, on three different rows.
np.random.seed(42)  # seeds the global RNG; no random values are drawn in this section
data_drop = dict(
    col_A=[1, 2, np.nan, 4, 5],
    col_B=[6, np.nan, 8, 9, 10],
    col_C=[11, 12, 13, np.nan, 15],
)
df_drop = pd.DataFrame(data_drop)

print("\nOriginal DataFrame with missing values:", df_drop, sep="\n")
missing_before_drop = df_drop.isnull().sum()
print("\nMissing values count per column:", missing_before_drop, sep="\n")
print(f"Original DataFrame shape: {df_drop.shape}")

print("\n--- Removing Rows with Missing Values ---")

# Keep only the rows in which every column has a value.
df_dropped = df_drop.dropna(axis=0, how='any')

print("\nDataFrame after dropping rows with missing values:", df_dropped, sep="\n")
print(f"DataFrame shape after dropping missing values: {df_dropped.shape}")
missing_after_drop = df_dropped.isnull().sum()
print("\nMissing values count after dropping:", missing_after_drop, sep="\n")

print("\nDropping missing values complete. Rows containing any NaN values have been removed.")


# --- New Section: Task 4: Feature Scaling - Standardization ---
print("\n\n--- Task 4: Feature Scaling - Standardization ---")

print("Simulating a numerical feature for Standardization...")

# Simulate a numerical feature
np.random.seed(43)
feature_to_standardize = np.random.normal(loc=150, scale=25, size=30).reshape(-1, 1) # Reshape for StandardScaler
df_standard = pd.DataFrame(feature_to_standardize, columns=['Original_Feature'])

print("\nOriginal Feature (first 5 rows):")
print(df_standard.head())
print(f"Original Mean: {df_standard['Original_Feature'].mean():.2f}, Std Dev: {df_standard['Original_Feature'].std():.2f}")

print("\n--- Applying Standardization ---")

# Initialize StandardScaler
standard_scaler = StandardScaler()

# Fit and transform the feature
standardized_feature = standard_scaler.fit_transform(df_standard[['Original_Feature']])
df_standard['Standardized_Feature'] = standardized_feature

print("\nFeature after Standardization (first 5 rows):")
print(df_standard.head())
# StandardScaler divides by the population standard deviation (ddof=0),
# whereas pandas' .std() defaults to the sample estimate (ddof=1). Checking
# with ddof=0 uses the same definition as the scaler, so the result is 1.00
# rather than sqrt(n / (n - 1)), about 1.02 for n = 30.
standardized_mean = df_standard['Standardized_Feature'].mean()
standardized_std = df_standard['Standardized_Feature'].std(ddof=0)
print(f"Standardized Mean: {standardized_mean:.2f}, Std Dev: {standardized_std:.2f}")

print("\nStandardization complete. The feature now has a mean of approximately 0 and a standard deviation of approximately 1.")


# Notebook cell output (execution error): ModuleNotFoundError: No module named 'nltk'