### Dependencies

In [65]:
pip install datasets scikit-learn jenga pandas numpy setuptools -q


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [None]:
import pandas as pd
import numpy as np
from datasets import load_dataset
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.metrics import accuracy_score, classification_report
from jenga.corruptions.generic import MissingValues, SwappedValues
from jenga.corruptions.text import BrokenCharacters
import os

### Load the data

In [None]:
import pandas as pd
from pathlib import Path
print("Loading Amazon Reviews 2023 (All Beauty)...")

# All dataset artefacts live under a "datasets" folder next to the current
# working directory's parent (assumes the notebook runs from a sub-folder of
# the project - TODO confirm).
BASE_DIR = Path.cwd().parent / "datasets"
AMAZON_DIR = os.path.join(BASE_DIR, "amazon_reviews_2023_all_beauty")
os.makedirs(AMAZON_DIR, exist_ok=True)

# Load dataset with streaming=False (local download)
try:
    dataset = load_dataset(
        "McAuley-Lab/Amazon-Reviews-2023",
        "raw_review_All_Beauty",
        split="full",
        streaming=False
    )
except Exception as e:
    print(f"Error loading from HF: {e}")
    # Everything below needs `df`. Swallowing the error would either fail with
    # a confusing NameError or silently reuse a stale `df` left in the kernel,
    # so stop here with the real cause.
    raise

df = dataset.to_pandas()

print(f"‚úÖ Loaded {len(df)} reviews")
print(f"Columns: {df.columns.tolist()}")
print(df.head())

# Sample 50k for faster local processing (or every row if fewer are available)
df_sample = df.sample(n=min(50000, len(df)), random_state=42)
sample_path = os.path.join(AMAZON_DIR, "sample_50k.csv")
df_sample.to_csv(sample_path, index=False)
print(f"‚úÖ Saved sample to: {sample_path}")

Loading Amazon Reviews 2023 (All Beauty)...


You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


‚úÖ Loaded 701528 reviews
Columns: ['rating', 'title', 'text', 'images', 'asin', 'parent_asin', 'user_id', 'timestamp', 'helpful_vote', 'verified_purchase']
   rating                                      title  \
0     5.0  Such a lovely scent but not overpowering.   
1     4.0     Works great but smells a little weird.   
2     5.0                                       Yes!   
3     1.0                          Synthetic feeling   
4     5.0                                         A+   

                                                text images        asin  \
0  This spray is really nice. It smells really go...     []  B00YQ6X8EO   
1  This product does what I need it to do, I just...     []  B081TJ8YS3   
2                          Smells good, feels great!     []  B07PNNCSP9   
3                                     Felt synthetic     []  B09JS339BZ   
4                                            Love it     []  B08BZ63GMJ   

  parent_asin                       user_id      timest

In [44]:
import pandas as pd
import numpy as np
from datasets import load_dataset
from sklearn.model_selection import train_test_split

In [56]:
# Output locations: corrupted CSV batches sit inside the dataset folder,
# other artefacts sit next to it under BASE_DIR.
CORRUPT_DIR = os.path.join(AMAZON_DIR, "corrupted_batches")
ARTIFACTS_DIR = os.path.join(BASE_DIR, "artifacts")

# Create every folder up front; exist_ok makes re-running the cell harmless.
for output_dir in (AMAZON_DIR, CORRUPT_DIR, ARTIFACTS_DIR):
    os.makedirs(output_dir, exist_ok=True)

In [None]:

# Load the full review table and turn it into a pandas DataFrame.
dataset = load_dataset(
    "McAuley-Lab/Amazon-Reviews-2023",
    "raw_review_All_Beauty",
    split="full",
    streaming=False
)
df = dataset.to_pandas()
print(f"‚úÖ Loaded {len(df)} reviews")

# Sample 15k rows for faster processing. Note that the 80/20 split below uses
# ALL sampled rows, so no rows are held back for a separate corruption set.
df = df.sample(n=min(15000, len(df)), random_state=42).reset_index(drop=True)

if "text" not in df.columns:
    # Fall back to "title" + "body". A missing column is replaced by an empty
    # string Series: df.get(col, "") would hand back a plain str, which has no
    # .fillna and would raise AttributeError.
    empty_column = pd.Series("", index=df.index)
    title_part = df["title"] if "title" in df.columns else empty_column
    body_part = df["body"] if "body" in df.columns else empty_column
    df["text"] = (title_part.fillna("").astype(str) + " " +
                  body_part.fillna("").astype(str)).str.strip()

# Clean data: a usable row needs both a rating and a non-empty review text.
df = df.dropna(subset=["rating", "text"])
# .copy() so the label assignment below writes to an independent frame rather
# than to a filtered view (avoids SettingWithCopyWarning).
df = df[df["text"].str.len() > 0].copy()

# The rating is used directly as the class label.
df["label"] = df["rating"].astype(int)

print(f"\nüìä Data Summary:")
print(f"   Total samples: {len(df)}")
print(f"   Label distribution: {df['label'].value_counts().to_dict()}")

X = df["text"].values
y = df["label"].values

# Stratify so both splits keep the (heavily imbalanced) label distribution.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, 
    test_size=0.2,  
    random_state=42,
    stratify=y
)

print(f"\n‚úÖ Train/Test Split:")
print(f"   Train: {len(X_train)} samples")
print(f"   Test: {len(X_test)} samples")

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


‚úÖ Loaded 701528 reviews

üìä Data Summary:
   Total samples: 14998
   Label distribution: {5: 9035, 1: 2183, 4: 1727, 3: 1132, 2: 921}

‚úÖ Train/Test Split:
   Train: 11998 samples
   Test: 3000 samples


### Feature extraction and model

In [58]:
def build_model():
    """Return a fresh, unfitted text-classification pipeline.

    The pipeline has two named steps:

    * ``"tfidf"`` - a ``TfidfVectorizer`` over lower-cased uni- and bi-grams,
      with English stop words removed, ``min_df=2`` and at most 5000 features.
    * ``"clf"`` - a ``LogisticRegression`` with balanced class weights,
      ``C=1.0``, up to 2000 iterations and a fixed ``random_state``.
    """
    vectorizer = TfidfVectorizer(
        lowercase=True,
        stop_words="english",
        ngram_range=(1, 2),
        min_df=2,
        max_features=5000,
    )
    classifier = LogisticRegression(
        max_iter=2000,
        C=1.0,
        class_weight="balanced",
        random_state=42,
    )
    return Pipeline([("tfidf", vectorizer), ("clf", classifier)])

# Fit the reference pipeline on the training split.
print("\nüîß Training baseline model...")
baseline_model = build_model().fit(X_train, y_train)

# Score the fitted pipeline on the held-out test split.
y_pred_baseline = baseline_model.predict(X_test)
baseline_acc = accuracy_score(y_test, y_pred_baseline)

# Report overall accuracy followed by the per-class breakdown.
print(
    f"\n‚úÖ Baseline Model Trained",
    f"üìä Accuracy: {baseline_acc:.4f}",
    f"\nClassification Report:",
    classification_report(y_test, y_pred_baseline),
    sep="\n",
)

# Keep the baseline numbers around so later cells can compare against them.
baseline_metrics = dict(accuracy=baseline_acc, predictions=y_pred_baseline)


üîß Training baseline model...

‚úÖ Baseline Model Trained
üìä Accuracy: 0.5633

Classification Report:
              precision    recall  f1-score   support

           1       0.53      0.62      0.57       437
           2       0.16      0.22      0.18       184
           3       0.14      0.21      0.17       226
           4       0.24      0.35      0.29       346
           5       0.87      0.67      0.76      1807

    accuracy                           0.56      3000
   macro avg       0.39      0.41      0.39      3000
weighted avg       0.65      0.56      0.60      3000



### Data Corruption

In [59]:
"""
Define individual corruption methods
"""

def apply_missing_values(df, fraction=0.30):
    """Batch 01: apply jenga ``MissingValues`` (MCAR) to the ``text`` column.

    Parameters
    ----------
    df : pd.DataFrame
        Input frame; a copy is corrupted, the argument itself is not passed on.
    fraction : float
        Fraction handed to the corruption (default 0.30).

    Returns
    -------
    The result of the corruption's ``transform``.
    """
    corruption = MissingValues(column="text", fraction=fraction, missingness="MCAR")
    return corruption.transform(df.copy())

def apply_broken_characters(df, fraction=0.25):
    """Batch 02: apply jenga ``BrokenCharacters`` to the ``text`` column.

    Parameters
    ----------
    df : pd.DataFrame
        Input frame; a copy is corrupted, the argument itself is not passed on.
    fraction : float
        Fraction handed to the corruption (default 0.25).

    Returns
    -------
    The result of the corruption's ``transform``.
    """
    corruption = BrokenCharacters(column="text", fraction=fraction)
    return corruption.transform(df.copy())

def apply_swapped_text(df, fraction=0.20):
    """Batch 03: apply jenga ``SwappedValues`` to the ``text`` column.

    No swap partner is specified here, so the choice of the other column is
    left to jenga's default.
    NOTE(review): confirm which column ``text`` ends up being swapped with.

    Parameters
    ----------
    df : pd.DataFrame
        Input frame; a copy is corrupted, the argument itself is not passed on.
    fraction : float
        Fraction handed to the corruption (default 0.20).

    Returns
    -------
    The result of the corruption's ``transform``.
    """
    corruption = SwappedValues(column="text", fraction=fraction)
    return corruption.transform(df.copy())

def apply_missing_labels(df, fraction=0.15):
    """Batch 04: apply jenga ``MissingValues`` (MCAR) to the ``label`` column.

    Parameters
    ----------
    df : pd.DataFrame
        Input frame; a copy is corrupted, the argument itself is not passed on.
    fraction : float
        Fraction handed to the corruption (default 0.15).

    Returns
    -------
    The result of the corruption's ``transform``.
    """
    corruption = MissingValues(column="label", fraction=fraction, missingness="MCAR")
    return corruption.transform(df.copy())

def apply_swapped_labels(df, fraction=0.12):
    """Batch 05: apply jenga ``SwappedValues`` to the ``label`` column.

    No swap partner is specified here, so the choice of the other column is
    left to jenga's default.
    NOTE(review): confirm which column ``label`` ends up being swapped with.

    Parameters
    ----------
    df : pd.DataFrame
        Input frame; a copy is corrupted, the argument itself is not passed on.
    fraction : float
        Fraction handed to the corruption (default 0.12).

    Returns
    -------
    The result of the corruption's ``transform``.
    """
    corruption = SwappedValues(column="label", fraction=fraction)
    return corruption.transform(df.copy())

def apply_combined_text_corruption(df):
    """Batch 06: two text corruptions applied one after the other.

    First ``BrokenCharacters`` on ``text`` with fraction 0.10, then
    ``MissingValues`` (MCAR) on ``text`` with fraction 0.08. The second step
    runs on the output of the first; the caller's frame is copied up front.
    """
    garbled = BrokenCharacters(column="text", fraction=0.10).transform(df.copy())
    blanker = MissingValues(column="text", fraction=0.08, missingness="MCAR")
    return blanker.transform(garbled)

def apply_combined_text_labels(df):
    """Batch 07: swap corruption on text, then on labels.

    First ``SwappedValues`` on ``text`` with fraction 0.15, then
    ``SwappedValues`` on ``label`` with fraction 0.08. The second step runs on
    the output of the first; the caller's frame is copied up front.
    """
    text_swapped = SwappedValues(column="text", fraction=0.15).transform(df.copy())
    label_swapper = SwappedValues(column="label", fraction=0.08)
    return label_swapper.transform(text_swapped)

def apply_heavy_missing(df):
    """Batch 08: missing-value corruption on both columns.

    First ``MissingValues`` (MCAR) on ``text`` with fraction 0.25, then
    ``MissingValues`` (MCAR) on ``label`` with fraction 0.10. The second step
    runs on the output of the first; the caller's frame is copied up front.
    """
    text_blanked = MissingValues(
        column="text", fraction=0.25, missingness="MCAR"
    ).transform(df.copy())
    label_blanker = MissingValues(column="label", fraction=0.10, missingness="MCAR")
    return label_blanker.transform(text_blanked)

def apply_all_corruptions(df):
    """Batch 09: every corruption type chained in a fixed order.

    Steps, each applied to the output of the previous one:

    1. ``BrokenCharacters`` on ``text``, fraction 0.08
    2. ``SwappedValues`` on ``text``, fraction 0.10
    3. ``MissingValues`` (MCAR) on ``text``, fraction 0.05
    4. ``SwappedValues`` on ``label``, fraction 0.05

    The caller's frame is copied before the first step.
    """
    steps = (
        BrokenCharacters(column="text", fraction=0.08),
        SwappedValues(column="text", fraction=0.10),
        MissingValues(column="text", fraction=0.05, missingness="MCAR"),
        SwappedValues(column="label", fraction=0.05),
    )
    corrupted = df.copy()
    for step in steps:
        corrupted = step.transform(corrupted)
    return corrupted

print("‚úÖ Corruption functions defined")

‚úÖ Corruption functions defined


In [60]:

import random

# Number of rows each corrupted batch starts from.
CORRUPT_SAMPLE_SIZE = 5000

# Prefer rows positioned after the train/test rows. When too few are left
# (the split above consumes every row of df, so this slice is normally empty)
# fall back to sampling from the whole frame.
corrupt_pool = df.iloc[len(X_train) + len(X_test):]
if len(corrupt_pool) < CORRUPT_SAMPLE_SIZE:
    corrupt_pool = df

# Restrict to the two modelling columns in BOTH cases. Previously the fallback
# branch kept every raw column, so the batches (and their CSVs) carried
# unrelated columns into the corruption step. The sample size is capped so a
# frame smaller than CORRUPT_SAMPLE_SIZE does not raise.
df_corrupt_source = (
    corrupt_pool[["text", "label"]]
    .sample(n=min(CORRUPT_SAMPLE_SIZE, len(corrupt_pool)), random_state=42)
    .reset_index(drop=True)
)

# (batch name, corruption function, extra keyword arguments)
batches_config = [
    ("01_missing", apply_missing_values, {}),
    ("02_broken_chars", apply_broken_characters, {}),
    ("03_swapped", apply_swapped_text, {}),
    ("04_missing_labels", apply_missing_labels, {}),
    ("05_swapped_labels", apply_swapped_labels, {}),
    ("06_combined_text", apply_combined_text_corruption, {}),
    ("07_combined_both", apply_combined_text_labels, {}),
    ("08_heavy_missing", apply_heavy_missing, {}),
    ("09_all_corruptions", apply_all_corruptions, {}),
]

# Seed the global RNGs so re-running the cell regenerates the same batches.
# NOTE(review): assumes the jenga corruptions draw from numpy's and/or the
# stdlib's global random state - confirm against the jenga source.
np.random.seed(42)
random.seed(42)

corrupted_batches = {}

for batch_name, corruption_fn, kwargs in batches_config:
    print(f"\nüîß Batch {batch_name.split('_')[0]}: {batch_name.replace('_', ' ').title()}")
    df_batch = corruption_fn(df_corrupt_source, **kwargs)
    # Persist each batch as CSV and keep it in memory for the evaluation cell.
    batch_path = os.path.join(CORRUPT_DIR, f"batch_{batch_name}.csv")
    df_batch.to_csv(batch_path, index=False)
    corrupted_batches[batch_name] = df_batch
    print(f"‚úÖ {batch_name} generated")

print(f"\n‚úÖ All 9 batches generated in {CORRUPT_DIR}")


üîß Batch 01: 01 Missing
‚úÖ 01_missing generated

üîß Batch 02: 02 Broken Chars
‚úÖ 02_broken_chars generated

üîß Batch 03: 03 Swapped
‚úÖ 03_swapped generated

üîß Batch 04: 04 Missing Labels


 'I love everything about it'
 'Good basic sheet mask that feels cooling and hydrating. Feels nice when my rosacea has made my face feel warm (I have visible veins/redness from rosacea; no other issues). Excellent for use in the bathtub, or for a mini nap, or simply to help relaxation. Might even be perfect post skin treatment sessions that may leave skin feeling irritated.<br /><br />Stocking stuffer!! I love how budget friendly this pack is. I love making little self-care packages to hand out, and while I will buy and use masks in the $5-35 range, I prefer the $1-3 masks for the token give away (think: facial mask, hand cream, lip balm, etc.. tiny cheap self care packages). I also prefer the cheapies only because I am not sure if the recipient will definitely use it, and the higher end masks tend to be more discerning in what it attends to. The more basic masks tend to be good for nearly everyone, and the resulting feeling is at least a fresh, hydrated face. So these are a repurchase

‚úÖ 04_missing_labels generated

üîß Batch 05: 05 Swapped Labels
‚úÖ 05_swapped_labels generated

üîß Batch 06: 06 Combined Text
‚úÖ 06_combined_text generated

üîß Batch 07: 07 Combined Both
‚úÖ 07_combined_both generated

üîß Batch 08: 08 Heavy Missing
‚úÖ 08_heavy_missing generated

üîß Batch 09: 09 All Corruptions
‚úÖ 09_all_corruptions generated

‚úÖ All 9 batches generated in /Users/HP/Data_Preparation_Project_G17_Theme1/datasets/amazon_reviews_2023_all_beauty/corrupted_batches


 'Perfect for sock bun for your hair!'
 'Liked easy removal of hair from brush'
 'When I peel off this mask, I just see the lines of my face in it- no impurities or blackheads as it claims to remove.'
 'I am extremely happy with this Hook Ya sunscreen. I gave this a solid test on a recent three week dive trip to Indonesia and Hawaii. I was in the sun a lot, in and out of the water, but this sunscreen did a perfect job. I burn very easily and I am glad to find a reef-safe product that REALLY works!'
 'Arrived broke and leaking. Useless'
 "G·ªõ·ªõd q√∫√°lity, st√∫rdy, d·ªõ√©sn't l·ªõ·ªõk ch√©√°p.. h√°ppy c√∫st·ªõm√©r!!"
 'The colors Easily break when rolled up'
 'This is √° gr√©√°t id√©√° b√∫t it w·ªõ√∫ld l·ªõ·ªõk b√©tt√©r if th√© sc√°rf p√°rt w√°s √° littl√© l·ªõw√©r ·ªõr th√© h√°ir w√°s √° littl√© l·ªõng√©r'
 'Great product at a great price.'
 'Very beautifully crafted.  It will be used for my daughter‚Äôs bridal shower.'
 'Very nice masks.. Bought it for my wife.. She really liked it.

### Data cleaning

In [61]:
def clean_corrupted_data(df):
    """
    Clean corrupted dataframe for model training

    Rows are dropped in stages and a summary of each stage is printed:

    1. rows with a missing ``text`` or ``label``
    2. rows whose text is empty or the literal string ``"nan"``
    3. labels are converted to numbers (non-numeric labels become NaN)
    4. rows whose label could not be converted
    5. rows whose label is not a whole number in the range 1..5

    The input frame is not modified.

    Parameters:
    -----------
    df : pd.DataFrame
        Corrupted dataframe with 'text' and 'label' columns

    Returns:
    --------
    pd.DataFrame
        Cleaned dataframe ready for training: ``text`` as strings and
        ``label`` as integers between 1 and 5. The original index values of
        the surviving rows are kept.
    """
    original_size = len(df)

    # Step 1: Remove NaN rows (.copy() so later column writes never touch `df`)
    df_clean = df.dropna(subset=["text", "label"]).copy()
    after_nan_removal = len(df_clean)

    # Step 2: Convert text to string and remove "nan" / empty strings
    df_clean["text"] = df_clean["text"].astype(str)
    usable_text = (df_clean["text"] != "nan") & (df_clean["text"].str.len() > 0)
    df_clean = df_clean[usable_text].copy()
    after_text_clean = len(df_clean)

    # Step 3: Convert labels to numeric (invalid labels -> NaN)
    df_clean["label"] = pd.to_numeric(df_clean["label"], errors="coerce")
    after_numeric = len(df_clean)

    # Step 4: Remove rows with NaN labels
    df_clean = df_clean.dropna(subset=["label"])
    after_label_removal = len(df_clean)

    # Step 5: Validate BEFORE casting to int. Casting first would silently
    # truncate fractional labels (2.5 -> 2) into the valid range and would
    # raise on infinite values; both are rejected here instead.
    numeric_labels = df_clean["label"]
    valid_label = numeric_labels.between(1, 5) & (numeric_labels == numeric_labels.round())
    df_clean = df_clean[valid_label].copy()
    df_clean["label"] = df_clean["label"].astype(int)
    final_size = len(df_clean)

    # Guard the percentage against an empty input frame.
    total_removed = original_size - final_size
    removed_pct = total_removed / original_size * 100 if original_size else 0.0

    # Print cleaning summary
    print(f"\n   üßπ Data Cleaning Summary:")
    print(f"      Original samples: {original_size}")
    print(f"      After NaN removal: {after_nan_removal} (-{original_size - after_nan_removal})")
    print(f"      After text cleaning: {after_text_clean} (-{after_nan_removal - after_text_clean})")
    print(f"      After numeric conversion: {after_numeric}")
    print(f"      After bad label removal: {after_label_removal} (-{after_numeric - after_label_removal} bad labels)")
    print(f"      After range validation: {final_size} (-{after_label_removal - final_size})")
    print(f"      ‚ö†Ô∏è  Total removed: {total_removed} ({removed_pct:.1f}%)")

    return df_clean

print("‚úÖ Data cleaning function defined")

‚úÖ Data cleaning function defined


### Retraining and evaluation

In [62]:
"""
Refit baseline model on each corrupted batch and evaluate accuracy
"""
print("\n" + "="*70)
print("CELL 4: EVALUATE MODEL ON CORRUPTED BATCHES")
print("="*70)

# Store results, keyed by batch name
corruption_results = {}

for batch_name, df_batch in corrupted_batches.items():
    # Label the section so the per-batch output can be told apart.
    print(f"\n--- Batch: {batch_name} ---")

    df_batch_clean = clean_corrupted_data(df_batch)

    # A classifier needs at least two classes; skip batches where cleaning
    # left too little to train on instead of crashing the whole loop.
    class_counts = df_batch_clean["label"].value_counts()
    if len(class_counts) < 2:
        print(f"   Skipping {batch_name}: fewer than two label classes remain after cleaning")
        continue

    X_corrupt = df_batch_clean["text"].values
    y_corrupt = df_batch_clean["label"].values

    # Stratified splitting requires at least two rows per class; fall back to
    # a plain random split when some class is a singleton.
    stratify_labels = y_corrupt if class_counts.min() >= 2 else None

    # Split corrupted data (80% train, 20% test)
    X_corrupt_train, X_corrupt_test, y_corrupt_train, y_corrupt_test = train_test_split(
        X_corrupt, y_corrupt,
        test_size=0.2,
        random_state=42,
        stratify=stratify_labels
    )

    print(f"   Train samples: {len(X_corrupt_train)}")
    print(f"   Test samples: {len(X_corrupt_test)}")

    # Train new model on corrupted data
    corrupted_model = build_model()
    corrupted_model.fit(X_corrupt_train, y_corrupt_train)

    # Evaluate on corrupted test set
    y_pred_corrupt = corrupted_model.predict(X_corrupt_test)
    corrupt_acc = accuracy_score(y_corrupt_test, y_pred_corrupt)

    print(f"\n   üìä Model Trained on Corrupted Data")
    print(f"   Accuracy: {corrupt_acc:.4f}")

    # Store results
    corruption_results[batch_name] = {
        "accuracy": corrupt_acc,
        "train_size": len(X_corrupt_train),
        "test_size": len(X_corrupt_test),
        "predictions": y_pred_corrupt,
        "true_labels": y_corrupt_test,
        "model": corrupted_model
    }

    # Compare to baseline.
    # NOTE: this accuracy is measured on the batch's own test split, not on
    # the split behind baseline_acc, so the drop is only a rough comparison.
    accuracy_drop = baseline_acc - corrupt_acc
    drop_percentage = (accuracy_drop / baseline_acc) * 100 if baseline_acc else 0.0
    print(f"\n   üìâ Comparison to Baseline:")
    print(f"      Baseline Accuracy: {baseline_acc:.4f}")
    print(f"      Drop: {accuracy_drop:.4f} ({drop_percentage:.2f}%)")

    if drop_percentage > 10:
        print(f"      ‚ö†Ô∏è  SIGNIFICANT DROP - Corruption heavily impacts model")
    elif drop_percentage > 5:
        print(f"      ‚ö° MODERATE DROP - Corruption has noticeable impact")
    else:
        print(f"      ‚úÖ MINIMAL DROP - Model is robust to this corruption")

print(f"\n{'='*70}")
print(f"‚úÖ All corrupted batches evaluated")
print(f"{'='*70}")


CELL 4: EVALUATE MODEL ON CORRUPTED BATCHES

   üßπ Data Cleaning Summary:
      Original samples: 5000
      After NaN removal: 3500 (-1500)
      After text cleaning: 3500 (-0)
      After numeric conversion: 3500
      After bad label removal: 3500 (-0 bad labels)
      After range validation: 3500 (-0)
      ‚ö†Ô∏è  Total removed: 1500 (30.0%)
   Train samples: 2800
   Test samples: 700

   üìä Model Trained on Corrupted Data
   Accuracy: 0.5629

   üìâ Comparison to Baseline:
      Baseline Accuracy: 0.5633
      Drop: 0.0005 (0.08%)
      ‚úÖ MINIMAL DROP - Model is robust to this corruption

   üßπ Data Cleaning Summary:
      Original samples: 5000
      After NaN removal: 5000 (-0)
      After text cleaning: 5000 (-0)
      After numeric conversion: 5000
      After bad label removal: 5000 (-0 bad labels)
      After range validation: 5000 (-0)
      ‚ö†Ô∏è  Total removed: 0 (0.0%)
   Train samples: 4000
   Test samples: 1000

   üìä Model Trained on Corrupted Data
   Ac