In [1]:
import os
import pandas as pd
import numpy as np
import datetime
import json
import sys
import joblib
import mlflow
import mlflow.sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [2]:
# Set the git author identity (name and e-mail) for the commits made by this notebook.
# NOTE: --global writes to the user-level git configuration, not only to one repository.
!git config --global user.name "Somsubhra"
!git config --global user.email "somsubhrade.04@gmail.com"

In [3]:
GITHUB_USERNAME = "somsubhra04"

# Read the personal access token from the environment so it is never typed into
# (or saved with) the notebook. Export GITHUB_PAT before starting Jupyter.
GITHUB_PAT = os.environ.get("GITHUB_PAT", "")
if not GITHUB_PAT:
    print("Warning: GITHUB_PAT is not set; authenticated git operations will fail.")

# Authenticated HTTPS remote used for the explicit-URL pushes later in the notebook.
GIT_URL_WITH_PAT = f"https://{GITHUB_USERNAME}:{GITHUB_PAT}@github.com/{GITHUB_USERNAME}/iris-data-poisoning-wk8.git"

In [5]:
!git clone https://{GITHUB_PAT}@github.com/somsubhra04/iris-data-poisoning-wk8.git
os.chdir('iris-data-poisoning-wk8')
print(f"Current working directory: {os.getcwd()}")

Cloning into 'iris-data-poisoning-wk8'...
Current working directory: /home/jupyter/iris-data-poisoning-wk8


In [7]:
DATA_VERSION = "v1"
# Relative path: it is resolved against the kernel's current working directory.
DATA_FILE_NAME = f'../week1/data/iris{DATA_VERSION}.csv'

# Load the versioned Iris CSV; print a readable hint, then re-raise so the
# notebook stops instead of continuing without data.
try:
    df = pd.read_csv(DATA_FILE_NAME)
except FileNotFoundError:
    print(f"Error: Data file not found at {DATA_FILE_NAME}. Please check the path.")
    raise

# The file carries a header-less first column (a saved index) that pandas names
# "Unnamed: 0". It is not a feature, so drop any such column; this is a no-op
# for files that do not contain one.
df = df.loc[:, ~df.columns.str.startswith("Unnamed")]

In [8]:
# Display the loaded DataFrame as a quick visual check of its columns and values.
df

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.8,4.0,1.2,0.2,setosa
1,5.7,4.4,1.5,0.4,setosa
2,5.4,3.9,1.3,0.4,setosa
3,5.1,3.5,1.4,0.3,setosa
4,5.7,3.8,1.7,0.3,setosa
...,...,...,...,...,...
96,6.7,3.0,5.2,2.3,virginica
97,6.3,2.5,5.0,1.9,virginica
98,6.5,3.0,5.2,2.0,virginica
99,6.2,3.4,5.4,2.3,virginica


In [9]:
# Feature matrix (the four measurements) and the raw string target
X = df.loc[:, ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']]
y_species = df['species']

# Map each species name to an integer class id
le = LabelEncoder()
encoded_labels = le.fit_transform(y_species)
y = pd.Series(encoded_labels)

# Per-feature bounds and class count, used later to synthesise poison samples
X_min = X.min().to_numpy()
X_max = X.max().to_numpy()
n_classes = y.nunique()

# Stratified 80/20 split: only the training part gets poisoned, the test part
# stays clean so it can serve as the validation set
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_clean_train, X_test, y_clean_train, y_test = train_test_split(X, y, **split_options)

print(f"Data successfully loaded from: {DATA_FILE_NAME}")
print(f"Clean Training Set Size: {len(X_clean_train)}")
print(f"Clean Test Set Size: {len(X_test)}")

Data successfully loaded from: ../week1/data/irisv1.csv
Clean Training Set Size: 80
Clean Test Set Size: 21


In [10]:
# --- Data Poisoning Function ---
def generate_poisoned_data(X_base, y_base, poisoning_level,
                           feature_min=None, feature_max=None,
                           num_classes=None, random_state=None):
    """
    Generates non-targeted, availability-style poisoned data.

    Synthetic samples are appended to the clean data: each feature is drawn
    uniformly between its lower and upper bound and each label is drawn
    uniformly from ``0 .. num_classes - 1``. The number of synthetic samples
    is chosen so that they make up (at most) ``poisoning_level`` of the
    returned dataset.

    Parameters
    ----------
    X_base : pandas.DataFrame
        Clean feature rows.
    y_base : pandas.Series
        Clean labels aligned with ``X_base``.
    poisoning_level : float
        Target fraction of poisoned rows in the result; must be in ``[0, 1)``.
    feature_min, feature_max : sequence of float, optional
        Per-column lower/upper bounds, in ``X_base`` column order. Default to
        the notebook-level ``X_min`` / ``X_max``.
    num_classes : int, optional
        Number of label classes. Defaults to the notebook-level ``n_classes``.
    random_state : int, optional
        Seed for a private ``numpy.random.RandomState``. When omitted, the
        global ``numpy.random`` state is used, as before.

    Returns
    -------
    (X_poisoned, y_poisoned, actual_fraction)
        Combined features, combined labels, and the fraction of rows that are
        synthetic. When no synthetic row is needed, the inputs are returned
        unchanged together with ``0.0``.

    Raises
    ------
    ValueError
        If ``poisoning_level`` is outside ``[0, 1)`` or the bounds do not have
        one entry per feature column.
    """
    # level == 1 would divide by zero below; values outside [0, 1) are meaningless.
    if not 0 <= poisoning_level < 1:
        raise ValueError(
            f"poisoning_level must satisfy 0 <= level < 1, got {poisoning_level!r}"
        )

    N_base = len(X_base)
    # Solve N_poison / (N_base + N_poison) = level for N_poison, truncated to whole rows.
    N_poison = int(N_base * poisoning_level / (1 - poisoning_level))

    if N_poison == 0:
        return X_base, y_base, 0.0  # Return clean data if poisoning size is zero

    # Fall back to the notebook-level statistics only when the caller gave none.
    lows = X_min if feature_min is None else feature_min
    highs = X_max if feature_max is None else feature_max
    label_count = n_classes if num_classes is None else num_classes

    n_features = len(X_base.columns)
    if len(lows) != n_features or len(highs) != n_features:
        raise ValueError(
            f"feature bounds must have {n_features} entries, "
            f"got {len(lows)} lower and {len(highs)} upper"
        )

    # The numpy.random module and a RandomState expose the same uniform/randint API.
    rng = np.random if random_state is None else np.random.RandomState(random_state)

    # Generating synthetic features: one uniform draw per column, in column order
    X_poison = pd.DataFrame(
        np.column_stack([rng.uniform(lo, hi, N_poison) for lo, hi in zip(lows, highs)]),
        columns=X_base.columns,
    )

    # Generating random labels for maximum confusion (non-targeted attack)
    y_poison = pd.Series(rng.randint(0, label_count, N_poison))

    # Concatenating clean training data and poisoned data
    X_poisoned = pd.concat([X_base, X_poison], ignore_index=True)
    y_poisoned = pd.concat([y_base, y_poison], ignore_index=True)

    # Fraction actually achieved (can be below the target because of truncation)
    actual_poisoning_percentage = N_poison / len(X_poisoned)

    return X_poisoned, y_poisoned, actual_poisoning_percentage

In [11]:
# --- MLflow Experiment Loop ---
mlflow.set_experiment("IRIS-Data-Poisoning-Attack")

# Poisoning levels to test (Target percentage of the final training dataset)
POISONING_LEVELS = [0.0, 0.05, 0.10, 0.50]

# Seed for the global NumPy RNG that the poison generator draws from, so that
# every run can be reproduced on a fresh kernel.
POISON_SEED = 42

for level in POISONING_LEVELS:
    # round(), not int(): a float product such as 0.29 * 100 is 28.999... and
    # would be truncated to 28.
    run_name = f"{round(level * 100)}Pct_Poisoning"

    with mlflow.start_run(run_name=run_name):

        # 1. Generating Poisoned Data (re-seeded per run so runs are independent)
        np.random.seed(POISON_SEED)
        X_train_p, y_train_p, actual_p = generate_poisoned_data(
            X_clean_train, y_clean_train, level
        )

        # 2. Logging Parameters
        mlflow.log_param("poisoning_level_target", level)
        mlflow.log_param("actual_poisoning_ratio", f"{actual_p:.4f}")
        mlflow.log_param("training_set_size", len(X_train_p))
        mlflow.log_param("model_type", "LogisticRegression")
        mlflow.log_param("poison_seed", POISON_SEED)

        # 3. Training Model
        # multi_class is omitted: 'auto' is its default and the parameter is
        # deprecated since scikit-learn 1.5.
        model = LogisticRegression(solver='lbfgs', max_iter=200, random_state=42)
        model.fit(X_train_p, y_train_p)

        # 4. Evaluating on Clean Test Data
        y_pred = model.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)

        # 5. Logging Metrics and Model
        mlflow.log_metric("validation_accuracy", accuracy)

        # Logging the model (artifact)
        mlflow.sklearn.log_model(model, "model")

        print(f"Run '{run_name}' completed. Validation Accuracy: {accuracy:.4f}")

print("\nAll experiments completed. Check the 'mlruns' directory for data.")

2025/11/10 09:24:42 INFO mlflow.tracking.fluent: Experiment with name 'IRIS-Data-Poisoning-Attack' does not exist. Creating a new experiment.


Run '0Pct_Poisoning' completed. Validation Accuracy: 0.9524




Run '5Pct_Poisoning' completed. Validation Accuracy: 0.9524




Run '10Pct_Poisoning' completed. Validation Accuracy: 0.9524




Run '50Pct_Poisoning' completed. Validation Accuracy: 0.9048

All experiments completed. Check the 'mlruns' directory for data.


In [13]:
print("\n--- Pushing to GitHub ---")
!git add .
!git commit -m "Logged four MLflow runs for 0, 5, 10, and 50 percent data poisoning."
!git branch -M main
!git push -u {GIT_URL_WITH_PAT} main


--- Pushing to GitHub ---
On branch master
Your branch is based on 'origin/master', but the upstream is gone.
  (use "git branch --unset-upstream" to fixup)

nothing to commit, working tree clean
Enumerating objects: 96, done.
Counting objects: 100% (96/96), done.
Delta compression using up to 2 threads
Compressing objects: 100% (63/63), done.
Writing objects: 100% (96/96), 10.13 KiB | 211.00 KiB/s, done.
Total 96 (delta 14), reused 0 (delta 0), pack-reused 0
remote: Resolving deltas: 100% (14/14), done.[K
To https://github.com/somsubhra04/iris-data-poisoning-wk8.git
 * [new branch]      main -> main
Branch 'main' set up to track remote branch 'main' from 'https://somsubhra04:<REDACTED_TOKEN>@github.com/somsubhra04/iris-data-poisoning-wk8.git'.


In [14]:
# Step out of the cloned repository into its parent directory
parent_dir = os.pardir
os.chdir(parent_dir)
print(f"Current working directory: {os.getcwd()}")

Current working directory: /home/jupyter


In [None]:
print("\n--- Pushing to GitHub ---")
!git add wk8.ipynb
!git commit -m "pushed the notebook"
!git branch -M main
!git push -u {GIT_URL_WITH_PAT} main