In [9]:
import pandas as pd
import numpy as np
import re
import joblib
from datasets import load_dataset
from sklearn.metrics import accuracy_score

In [10]:
# Load the previously saved model and the vectorizer that turns text into its input features.
# NOTE: joblib.load unpickles the files, which can execute arbitrary code —
# only load artifacts that come from a trusted source.
model = joblib.load("../models/baseline_model.pkl")
vectorizer = joblib.load("../models/vectorizer.pkl")

In [11]:
# Fetch the IMDB dataset and keep a reproducible 1000-row sample of its test split.
# Building and sampling in one expression avoids reassigning test_data from itself.
dataset = load_dataset("imdb")
test_data = pd.DataFrame(dataset["test"]).sample(n=1000, random_state=42)

In [37]:
def clean_text(text):
    """Lowercase ``text`` and delete every character that is not a letter or whitespace.

    Removed characters are dropped, not replaced by a space, so tokens separated
    only by punctuation are joined (e.g. ``"good.bad"`` becomes ``"goodbad"``).

    Args:
        text: The raw string to normalise.

    Returns:
        The lowercased string containing only ASCII letters and whitespace.
    """
    return re.sub(r'[^a-zA-Z\s]', '', text.lower())

In [51]:
# Baseline: score the saved model on the cleaned test sample.
# y_true is reused by every later experiment as the reference labels.
y_true = test_data["label"]
x_test_clean = test_data["text"].map(clean_text)
x_test_vec = vectorizer.transform(x_test_clean)
y_pred = model.predict(x_test_vec)
baseline_accuracy = accuracy_score(y_true=y_true, y_pred=y_pred)
print("Baseline Accuracy:", baseline_accuracy)

Baseline Accuracy: 0.857


In [39]:
# Accumulates one {"Experiment", "Accuracy"} record per experiment cell below.
# Re-running this cell clears every record collected so far.
results = []

In [40]:
# Experiment: feed the raw review text to the vectorizer, skipping clean_text.
x_test_raw = test_data["text"]
x_test_vec_raw = vectorizer.transform(x_test_raw)
y_pred_raw = model.predict(x_test_vec_raw)
accuracy_raw = accuracy_score(y_true, y_pred_raw)
print("Accuracy without preprocessing:", accuracy_raw)

# Drop any record left by an earlier run of this cell so that re-executing it
# replaces the row instead of appending a duplicate.
results = [row for row in results if row["Experiment"] != "No preprocessing"]
results.append({
    "Experiment": "No preprocessing",
    "Accuracy": accuracy_raw,
})

Accuracy without preprocessing: 0.858


In [41]:
def introduce_noise(text):
    """Simulate vocabulary drift by misspelling three common words.

    Replacement is by substring, so longer words containing a target are
    affected too (e.g. ``"movies"`` becomes ``"movis"``).

    Args:
        text: The string to corrupt.

    Returns:
        ``text`` with "movie", "good" and "film" replaced by "movi", "gud" and "flim".
    """
    misspellings = (("movie", "movi"), ("good", "gud"), ("film", "flim"))
    for correct, typo in misspellings:
        text = text.replace(correct, typo)
    return text

# Experiment: apply the synthetic misspellings to the cleaned text and re-score.
x_test_drift = x_test_clean.apply(introduce_noise)
x_test_vec_drift = vectorizer.transform(x_test_drift)
y_pred_drift = model.predict(x_test_vec_drift)
accuracy_drift = accuracy_score(y_true, y_pred_drift)
print("Accuracy with drift:", accuracy_drift)

# Drop any record left by an earlier run of this cell so that re-executing it
# replaces the row instead of appending a duplicate.
results = [row for row in results if row["Experiment"] != "Data drift"]
results.append({
    "Experiment": "Data drift",
    "Accuracy": accuracy_drift
})


Accuracy with drift: 0.86


In [42]:
# Experiment: blank out the first 50 cleaned reviews to simulate missing inputs.
# Working on a copy keeps x_test_clean intact for the other experiments.
x_test_missing = x_test_clean.copy()
x_test_missing.iloc[0:50] = ""
x_test_vec_missing = vectorizer.transform(x_test_missing)
y_pred_missing = model.predict(x_test_vec_missing)
accuracy_missing = accuracy_score(y_true, y_pred_missing)
print("Accuracy with missing values:", accuracy_missing)

# Drop any record left by an earlier run of this cell so that re-executing it
# replaces the row instead of appending a duplicate.
results = [row for row in results if row["Experiment"] != "Missing values"]
results.append({
    "Experiment": "Missing values",
    "Accuracy": accuracy_missing
})


Accuracy with missing values: 0.838


In [44]:
# Turn the collected experiment records into a table and display it.
results_df = pd.DataFrame(data=results)
results_df

Unnamed: 0,Experiment,Accuracy
0,No preprocessing,0.858
1,Data drift,0.86
2,Missing values,0.838


In [45]:
# Export the accuracy table; index=False keeps the row index out of the CSV.
# The same path is written again later, once the Severity column has been added.
results_df.to_csv("../results/experiment_logs.csv", index=False)

In [46]:
def severity(baseline, new_acc, minor_threshold=0.05, moderate_threshold=0.15):
    """Classify how much accuracy was lost relative to the baseline.

    The drop is ``baseline - new_acc``. A negative drop (the experiment scored
    higher than the baseline) is below every threshold and is reported as "Minor".

    Args:
        baseline: Accuracy of the reference run.
        new_acc: Accuracy of the experiment being graded.
        minor_threshold: Drops strictly below this value are "Minor".
        moderate_threshold: Drops at or above ``minor_threshold`` and strictly
            below this value are "Moderate"; anything larger is "Critical".

    Returns:
        One of "Minor", "Moderate" or "Critical".

    Raises:
        ValueError: If ``minor_threshold`` is greater than ``moderate_threshold``,
            which would make the "Moderate" band unreachable.
    """
    if minor_threshold > moderate_threshold:
        raise ValueError("minor_threshold must not exceed moderate_threshold")

    drop = baseline - new_acc

    if drop < minor_threshold:
        return "Minor"
    if drop < moderate_threshold:
        return "Moderate"
    return "Critical"

In [48]:
# Grade every experiment's accuracy against the baseline and show the updated table.
results_df["Severity"] = results_df["Accuracy"].map(
    lambda acc: severity(baseline_accuracy, acc)
)
results_df

Unnamed: 0,Experiment,Accuracy,Severity
0,No preprocessing,0.858,Minor
1,Data drift,0.86,Minor
2,Missing values,0.838,Minor


In [50]:
# Save again so the CSV also carries the Severity column; this overwrites the
# earlier export written to the same path.
results_df.to_csv("../results/experiment_logs.csv", index=False)

ROOT CAUSE ANALYSIS

Explain each failure:

No preprocessing

- Vocabulary mismatch

- Increased unseen tokens

- Sparse vectors degrade predictions

Data drift

- Words outside training distribution

- TF-IDF cannot represent new patterns well

Missing values

- Empty vectors lead to weak or default predictions

MITIGATION STRATEGIES
| Failure          | Mitigation                     |
| ---------------- | ------------------------------ |
| No preprocessing | Enforce preprocessing pipeline |
| Data drift       | Monitor distribution           |
| Missing values   | Input validation               |

In [49]:
def validate_input(text, placeholder="neutral"):
    """Return ``text`` unchanged, or a filler string when it is unusable.

    A value is unusable when it is not a ``str`` (for example ``None`` or NaN)
    or when it is empty or whitespace-only.

    Args:
        text: The candidate input.
        placeholder: The string substituted for unusable input. Defaults to
            "neutral".

    Returns:
        ``text`` itself when it is a non-blank string, otherwise ``placeholder``.
    """
    if not isinstance(text, str) or not text.strip():
        return placeholder
    return text

# Mitigation check: replace the blanked reviews with the validation filler and re-score.
x_test_validated = x_test_missing.map(validate_input)
x_test_vec_validated = vectorizer.transform(x_test_validated)
y_pred_validated = model.predict(x_test_vec_validated)
accuracy_validated = accuracy_score(y_true=y_true, y_pred=y_pred_validated)
print("Accuracy with validation:", accuracy_validated)

Accuracy with validation: 0.838
