In [1]:
# ============================
# Install dependencies
# ============================
!pip install -U -q torch torchvision transformers datasets scikit-learn pandas fairlearn evaluate openpyxl torch transformers joblib scikit-learn numpy


In [2]:
# BERT-based prediction combining text (feedback) and numeric data
# Features:
# 1) Creates pseudo-labels from 'feedback/text' column using a zero-shot classifier or keyword sentiment heuristics
# 2) Builds a PyTorch + Hugging Face BERT classifier that concatenates BERT text embedding with numeric features
# 3) Implements a simple adversarial debiasing (gradient reversal + adversary predicting a sensitive attribute)
# 4) Monitors fairness metrics using fairlearn and offers a simple reweighting mitigation
# 5) Produces CSV output files with predictions and probabilities


# --------- 2. Imports and config ----------
import os
import random
from pathlib import Path
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from transformers import AutoModel, AutoTokenizer, get_linear_schedule_with_warmup

# For zero-shot / sentiment pseudo-labeling
from transformers import pipeline

# Fairness
from fairlearn.metrics import MetricFrame
from fairlearn.metrics import selection_rate, demographic_parity_difference, equalized_odds_difference



In [3]:
# Set seed for reproducibility
SEED = 42
try:
  random.seed(SEED)
  np.random.seed(SEED)
  torch.manual_seed(SEED)
  DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

except:
  DEVICE = torch.device("cpu")

print('Device:', DEVICE)

Device: cuda


In [4]:
import os
import zipfile
import pandas as pd
from io import BytesIO
from IPython.display import display
from google.colab import files

# Upload files
uploaded = files.upload()

# Collect file paths
file_paths = []

for fname, file_obj in uploaded.items():
    if fname.lower().endswith('.zip'):
        # Extract ZIP
        zip_bytes = BytesIO(file_obj)
        with zipfile.ZipFile(zip_bytes, 'r') as zip_ref:
            extract_dir = "extracted_files"
            os.makedirs(extract_dir, exist_ok=True)
            zip_ref.extractall(extract_dir)
            for root, _, files_in_dir in os.walk(extract_dir):
                for f in files_in_dir:
                    if f.lower().endswith(('.csv', '.xlsx')):
                        file_paths.append(os.path.join(root, f))
    else:
        file_paths.append(fname)

print("\nDetected files:")
for p in file_paths:
    print(" -", p)

# Ask join keys for each file
join_info = []
for path in file_paths:
    df_sample = pd.read_csv(path) if path.lower().endswith('.csv') else pd.read_excel(path)
    print(f"\nPreview of {os.path.basename(path)}:")
    display(df_sample.head(3))
    key = input(f"Enter join key column for '{os.path.basename(path)}': ").strip()
    join_info.append((path, key))

# Ask join type
join_type = input("\nEnter join type (inner, left, right, outer): ").strip().lower()
if join_type not in ['inner', 'left', 'right', 'outer']:
    print("Invalid join type, defaulting to 'inner'")
    join_type = 'inner'

# Merge all DataFrames
merged_df = None
for i, (path, key) in enumerate(join_info):
    df = pd.read_csv(path) if path.lower().endswith('.csv') else pd.read_excel(path)
    if merged_df is None:
        merged_df = df
        main_key = key
    else:
        merged_df = pd.merge(merged_df, df, left_on=main_key, right_on=key, how=join_type)

print("\nMerged DataFrame shape:", merged_df.shape)
display(merged_df.head())


Saving customer_feedback.zip to customer_feedback.zip

Detected files:
 - extracted_files/customer_demographics.xlsx
 - extracted_files/customer_feedback.xlsx

Preview of customer_demographics.xlsx:


Unnamed: 0,CustomerID,FirstName,LastName,Gender,DOB,Age,Email,Phone,City,StateProvince,PostalCode,Country,RegistrationDate,LastPurchaseDate,LifetimeValue,NumPurchases,PreferredChannel,MaritalStatus,Occupation,IncomeBracket
0,CUST00001,Shawn,Wong,Male,1962-09-13,62,shawn.wong229@example.com,+66-33756669,Kuala Lumpur,West,65302,Singapore,2020-10-04,2023-03-18,347.38,6,Email,Partnered,Sales,40-60k
1,CUST00002,Robin,Muller,Male,1973-06-29,52,robin.muller285@example.com,+65-41429110,Ho Chi Minh City,South,30379,Indonesia,2025-05-23,2025-05-31,223.02,4,Email,Divorced,Student,40-60k
2,CUST00003,Noah,Tan,Female,1978-11-19,46,noah.tan997@example.com,+86-30576383,Tokyo,South,85674,Indonesia,2024-02-17,,0.0,0,SMS,Divorced,Teacher,60-100k


Enter join key column for 'customer_demographics.xlsx': CustomerID

Preview of customer_feedback.xlsx:


Unnamed: 0,CustomerID,Feedback
0,CUST00001,I would like to see a wider variety of payment...
1,CUST00002,My experience with the mobile app has been pos...
2,CUST00003,I value the brand’s commitment to sustainabili...


Enter join key column for 'customer_feedback.xlsx': CustomerID

Enter join type (inner, left, right, outer): inner

Merged DataFrame shape: (1000, 21)


Unnamed: 0,CustomerID,FirstName,LastName,Gender,DOB,Age,Email,Phone,City,StateProvince,...,Country,RegistrationDate,LastPurchaseDate,LifetimeValue,NumPurchases,PreferredChannel,MaritalStatus,Occupation,IncomeBracket,Feedback
0,CUST00001,Shawn,Wong,Male,1962-09-13,62,shawn.wong229@example.com,+66-33756669,Kuala Lumpur,West,...,Singapore,2020-10-04,2023-03-18,347.38,6,Email,Partnered,Sales,40-60k,I would like to see a wider variety of payment...
1,CUST00002,Robin,Muller,Male,1973-06-29,52,robin.muller285@example.com,+65-41429110,Ho Chi Minh City,South,...,Indonesia,2025-05-23,2025-05-31,223.02,4,Email,Divorced,Student,40-60k,My experience with the mobile app has been pos...
2,CUST00003,Noah,Tan,Female,1978-11-19,46,noah.tan997@example.com,+86-30576383,Tokyo,South,...,Indonesia,2024-02-17,,0.0,0,SMS,Divorced,Teacher,60-100k,I value the brand’s commitment to sustainabili...
3,CUST00004,Mia,Wong,Male,1973-10-02,51,mia.wong855@example.com,+84-41831063,Jakarta,Metro,...,Malaysia,2023-08-27,2025-01-28,148.25,4,SMS,Widowed,Consultant,20-40k,I value the brand’s commitment to sustainabili...
4,CUST00005,Emma,Muller,Male,1965-10-02,59,emma.muller787@example.com,+65-50742311,Ho Chi Minh City,East,...,Malaysia,2021-08-31,2025-02-13,233.13,5,Phone,Married,Technician,20-40k,The packaging is secure and environmentally fr...


In [5]:
# Ask user to select feedback/text columns
SENSITIVE_COLS = []
print("\nAvailable columns in merged_df:")
for idx, col in enumerate(merged_df.columns):
    print(f"{idx}: {col}")

text_col_indices = input("\nEnter column numbers for feedback/text (comma-separated, e.g., 2 or 2,5): ").strip()
text_col_indices = [int(i.strip()) for i in text_col_indices.split(",") if i.strip().isdigit()]
TEXT_COLS = [merged_df.columns[i] for i in text_col_indices]
print("Using text columns:", TEXT_COLS)

# Ask user to select sensitive attribute columns for fairness
sensitive_col_indices = input("\nEnter column numbers for sensitive attributes (comma-separated, or press Enter to skip): ").strip()
if sensitive_col_indices:
    sensitive_col_indices = [int(i.strip()) for i in sensitive_col_indices.split(",") if i.strip().isdigit()]
    SENSITIVE_COLS = [merged_df.columns[i] for i in sensitive_col_indices]
    print("Using sensitive attribute columns:", SENSITIVE_COLS)
else:
    SENSITIVE_COLS = []
    print("No sensitive attributes selected. Adversarial debiasing will be skipped.")



Available columns in merged_df:
0: CustomerID
1: FirstName
2: LastName
3: Gender
4: DOB
5: Age
6: Email
7: Phone
8: City
9: StateProvince
10: PostalCode
11: Country
12: RegistrationDate
13: LastPurchaseDate
14: LifetimeValue
15: NumPurchases
16: PreferredChannel
17: MaritalStatus
18: Occupation
19: IncomeBracket
20: Feedback

Enter column numbers for feedback/text (comma-separated, e.g., 2 or 2,5): 20
Using text columns: ['Feedback']

Enter column numbers for sensitive attributes (comma-separated, or press Enter to skip): 0,3,4,5,6,7,8,10,11,17,18,19
Using sensitive attribute columns: ['CustomerID', 'Gender', 'DOB', 'Age', 'Email', 'Phone', 'City', 'PostalCode', 'Country', 'MaritalStatus', 'Occupation', 'IncomeBracket']


In [6]:
# Ask user to select target/label column or create one
print("\nAvailable columns in merged_df:")
for idx, col in enumerate(merged_df.columns):
    print(f"{idx}: {col}")

label_choice = input("\nEnter column number for final target column (or press Enter if no column exists): ").strip()

if label_choice:
    LABEL_COL = merged_df.columns[int(label_choice)]
    print("Using existing column as target column:", LABEL_COL)
else:
    LABEL_COL = 'pseudo_target'
    print("No target column selected. Creating target column as ", LABEL_COL)

    # Strategy: try zero-shot classification ("will stay" vs "will leave")
    try:
        from transformers import pipeline
        print('Initializing zero-shot pipeline (this may download a model) ...')
        zsp = pipeline('zero-shot-classification', model='facebook/bart-large-mnli')
        candidate_labels = ['will churn/leave the service', 'will stay/continue using the service']

        def zs_label(text):
            if not isinstance(text, str) or text.strip() == '':
                return 0  # default: stay
            out = zsp(text, candidate_labels)
            top = out['labels'][0]
            return 1 if 'leave' in top.lower() or 'churn' in top.lower() else 0

        print('Applying zero-shot classifier to feedback (this may take a while).')
        merged_df[LABEL_COL] = merged_df[TEXT_COLS[0]].fillna('').astype(str).apply(zs_label)

    except Exception as e:
        print('Zero-shot pipeline failed (', e, '). Falling back to heuristic keyword rules.')
        negative_keywords = ['cancel', 'churn', 'leave', 'switch', 'stop', 'uninstall',
                              'refund', 'complain', 'close account', 'sue', 'angry']

        def heuristic_label(text):
            t = '' if not isinstance(text, str) else text.lower()
            if any(k in t for k in negative_keywords):
                return 1
            return 0

        merged_df[LABEL_COL] = merged_df[TEXT_COLS[0]].fillna('').astype(str).apply(heuristic_label)

print("\nLabel distribution:")
print(merged_df[LABEL_COL].value_counts(dropna=False))



Available columns in merged_df:
0: CustomerID
1: FirstName
2: LastName
3: Gender
4: DOB
5: Age
6: Email
7: Phone
8: City
9: StateProvince
10: PostalCode
11: Country
12: RegistrationDate
13: LastPurchaseDate
14: LifetimeValue
15: NumPurchases
16: PreferredChannel
17: MaritalStatus
18: Occupation
19: IncomeBracket
20: Feedback

Enter column number for final target column (or press Enter if no column exists): 
No target column selected. Creating target column as  pseudo_target
Initializing zero-shot pipeline (this may download a model) ...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

Device set to use cuda:0


Applying zero-shot classifier to feedback (this may take a while).


You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset



Label distribution:
pseudo_target
0    1000
Name: count, dtype: int64


In [7]:
from sklearn.preprocessing import StandardScaler

# --------- Prepare features: numerical cols + encoding ----------
# Build exclude list from text, label, and sensitive columns
exclude = set(TEXT_COLS)  # unpack text columns
exclude.add(LABEL_COL)    # add label column
if SENSITIVE_COLS:
    exclude.update(SENSITIVE_COLS)  # unpack sensitive columns

# Select numeric columns (excluding ID, text, label, sensitive)
numeric_cols = [c for c in merged_df.columns if c not in exclude and pd.api.types.is_numeric_dtype(merged_df[c])]
print('Numeric columns used:', numeric_cols)

# Categorical columns (non-text, non-label, non-numeric)
categorical_cols = [c for c in merged_df.columns
                    if c not in exclude
                    and c not in numeric_cols
                    and merged_df[c].dtype == object]
print('Categorical columns:', categorical_cols)

# Preprocessing pipeline
proc_df = merged_df.copy()

# Fill missing numeric
for c in numeric_cols:
    proc_df[c] = proc_df[c].fillna(proc_df[c].median())

# Encode categorical
for c in categorical_cols:
    proc_df[c] = proc_df[c].fillna('missing').astype(str)
    proc_df[c] = proc_df[c].astype('category').cat.codes

FEATURE_COLS = numeric_cols + categorical_cols
print('Final feature columns to be used as numeric inputs:', FEATURE_COLS)

# Scale features
scaler = StandardScaler()
if len(FEATURE_COLS) > 0:
    proc_df[FEATURE_COLS] = scaler.fit_transform(proc_df[FEATURE_COLS])

# Sensitive attribute processing (if present)
if SENSITIVE_COLS:
    for col in SENSITIVE_COLS:
        proc_df[col] = proc_df[col].fillna('missing').astype(str).astype('category').cat.codes

# Drop rows with missing text if needed
for col in TEXT_COLS:
    proc_df[col] = proc_df[col].fillna('')


Numeric columns used: ['LifetimeValue', 'NumPurchases']
Categorical columns: ['FirstName', 'LastName', 'StateProvince', 'RegistrationDate', 'LastPurchaseDate', 'PreferredChannel']
Final feature columns to be used as numeric inputs: ['LifetimeValue', 'NumPurchases', 'FirstName', 'LastName', 'StateProvince', 'RegistrationDate', 'LastPurchaseDate', 'PreferredChannel']


In [8]:

# --------- Train / Val split ----------
train_df, val_df = train_test_split(proc_df, test_size=0.15, random_state=SEED, stratify=proc_df[LABEL_COL] if proc_df[LABEL_COL].nunique()>1 else None)
print('Train size:', len(train_df), 'Val size:', len(val_df))


Train size: 850 Val size: 150


In [9]:
from torch.utils.data import Dataset, DataLoader
import numpy as np
import torch
from transformers import AutoTokenizer

# --------- Dataset and DataLoader ----------
MODEL_NAME = 'bert-base-uncased'  # Change if needed
TOKENIZER = AutoTokenizer.from_pretrained(MODEL_NAME)
num_epochs=3
MAX_LEN = 256
BATCH_SIZE = 16

class ChurnDataset(Dataset):
    def __init__(self, df):
        # Combine multiple text columns into one string
        if len(TEXT_COLS) > 1:
            self.texts = df[TEXT_COLS].astype(str).agg(" ".join, axis=1).tolist()
        else:
            self.texts = df[TEXT_COLS[0]].astype(str).tolist()

        # Numeric feature array
        self.X = df[FEATURE_COLS].values.astype(np.float32) if len(FEATURE_COLS) > 0 else np.zeros((len(df), 0), dtype=np.float32)

        # Labels
        self.y = df[LABEL_COL].astype(int).values

        # Sensitive attributes (combined if multiple)
        if SENSITIVE_COLS:
            self.sens = df[SENSITIVE_COLS].astype(str).agg("-".join, axis=1).astype("category").cat.codes.values
        else:
            self.sens = None

    def __len__(self):
        return len(self.y)

    def __getitem__(self, idx):
        txt = self.texts[idx]
        enc = TOKENIZER(txt, truncation=True, padding='max_length', max_length=MAX_LEN, return_tensors='pt')
        item = {k: v.squeeze(0) for k, v in enc.items()}
        item['numeric'] = torch.tensor(self.X[idx], dtype=torch.float32)
        item['labels'] = torch.tensor(int(self.y[idx]), dtype=torch.long)
        if self.sens is not None:
            item['sens'] = torch.tensor(int(self.sens[idx]), dtype=torch.long)
        return item

# Example usage:
train_ds = ChurnDataset(train_df)
val_ds = ChurnDataset(val_df)
train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_ds, batch_size=BATCH_SIZE)


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [10]:
# ---------Model: BERT + numeric MLP + adversary ----------

class GradientReversalFunction(torch.autograd.Function):
    @staticmethod
    def forward(ctx, x, alpha):
        ctx.alpha = alpha
        return x.view_as(x)

    @staticmethod
    def backward(ctx, grad_output):
        return grad_output.neg() * ctx.alpha, None


class GradientReversalLayer(nn.Module):
    def __init__(self, alpha=1.0):
        super().__init__()
        self.alpha = alpha

    def forward(self, x):
        return GradientReversalFunction.apply(x, self.alpha)


class BertWithNumericAdversary(nn.Module):
    def __init__(self, model_name, numeric_dim, num_sensitive_classes=None):
        super().__init__()
        self.bert = AutoModel.from_pretrained(model_name)
        hidden_size = self.bert.config.hidden_size

        # Classifier head for main task
        self.classifier = nn.Sequential(
            nn.Linear(hidden_size + numeric_dim, max(hidden_size // 2, 32)),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(max(hidden_size // 2, 32), 2)
        )

        # Adversary head for sensitive attribute prediction
        self.num_sensitive_classes = num_sensitive_classes
        if num_sensitive_classes:
            self.grl = GradientReversalLayer(alpha=1.0)
            self.adversary = nn.Sequential(
                nn.Linear(hidden_size, max(hidden_size // 2, 32)),
                nn.ReLU(),
                nn.Linear(max(hidden_size // 2, 32), num_sensitive_classes)
            )

    def forward(self, input_ids, attention_mask, numeric_tensor=None, return_repr=False):
        out = self.bert(input_ids=input_ids, attention_mask=attention_mask)

        # pooled output from BERT (CLS token)
        if hasattr(out, 'pooler_output') and out.pooler_output is not None:
            pooled = out.pooler_output
        else:
            # mean pooling as fallback
            last = out.last_hidden_state
            pooled = (last * attention_mask.unsqueeze(-1)).sum(1) / attention_mask.sum(1, keepdim=True)

        # Adversary prediction
        adv_logits = None
        if self.num_sensitive_classes:
            adv_in = self.grl(pooled)
            adv_logits = self.adversary(adv_in)

        # Concatenate numeric features
        if numeric_tensor is None or numeric_tensor.shape[1] == 0:
            combined = pooled
        else:
            combined = torch.cat([pooled, numeric_tensor], dim=1)

        logits = self.classifier(combined)

        if return_repr:
            return logits, adv_logits, pooled
        return logits, adv_logits


# Create model
numeric_dim = len(FEATURE_COLS)

if SENSITIVE_COLS:
    if isinstance(SENSITIVE_COLS, (list, tuple)):
        num_sensitive = proc_df[SENSITIVE_COLS[0]].nunique()
    else:
        num_sensitive = proc_df[SENSITIVE_COLS].nunique()
    num_sensitive = int(num_sensitive)
else:
    num_sensitive = None

model = BertWithNumericAdversary(
    MODEL_NAME,
    numeric_dim=numeric_dim,
    num_sensitive_classes=num_sensitive
).to(DEVICE)

# Losses and optimizer
criterion = nn.CrossEntropyLoss()
adv_criterion = nn.CrossEntropyLoss() if SENSITIVE_COLS else None
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)

# Scheduler


total_steps = len(train_loader) * num_epochs
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=int(0.1 * total_steps),
    num_training_steps=total_steps
)


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [11]:

# ---------Training loop (with adversarial debiasing) ----------
print('Starting training...')
for epoch in range(num_epochs):
    model.train()
    total_loss = 0.0
    for batch in train_loader:
        input_ids = batch['input_ids'].to(DEVICE)
        attention_mask = batch['attention_mask'].to(DEVICE)
        numeric = batch['numeric'].to(DEVICE)
        labels = batch['labels'].to(DEVICE)
        optimizer.zero_grad()
        logits, adv_logits = model(input_ids=input_ids, attention_mask=attention_mask, numeric_tensor=numeric)
        loss = criterion(logits, labels)
        # adversarial loss: we want adversary to be bad at predicting sensitive attribute -> maximize adversary loss
        if SENSITIVE_COLS:
            sens = batch['sens'].to(DEVICE)
            adv_loss = adv_criterion(adv_logits, sens)
            # combine: minimize classification loss + lambda * (-adv_loss) i.e., maximize adv_loss
            lambda_adv = 0.5  # tunable
            combined_loss = loss - lambda_adv * adv_loss
            combined_loss.backward()
            total_loss += combined_loss.item()
        else:
            loss.backward()
            total_loss += loss.item()
        optimizer.step()
        scheduler.step()
    avg_loss = total_loss / len(train_loader)
    print(f'Epoch {epoch+1}/{num_epochs}, avg loss: {avg_loss:.4f}')

    # Validation
    model.eval()
    ys, preds, probs = [], [], []
    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch['input_ids'].to(DEVICE)
            attention_mask = batch['attention_mask'].to(DEVICE)
            numeric = batch['numeric'].to(DEVICE)
            labels = batch['labels'].to(DEVICE)
            logits, adv_logits = model(input_ids=input_ids, attention_mask=attention_mask, numeric_tensor=numeric)
            p = torch.softmax(logits, dim=1)[:,1].detach().cpu().numpy()
            predicted = (p >= 0.5).astype(int)
            ys.extend(labels.detach().cpu().numpy().tolist())
            preds.extend(predicted.tolist())
            probs.extend(p.tolist())
    print('Val Acc:', accuracy_score(ys, preds), 'F1:', f1_score(ys, preds, zero_division=0))


Starting training...
Epoch 1/3, avg loss: -3.1951
Val Acc: 1.0 F1: 0.0
Epoch 2/3, avg loss: -3.4325
Val Acc: 1.0 F1: 0.0
Epoch 3/3, avg loss: -3.4456
Val Acc: 1.0 F1: 0.0


In [12]:
# --------- Fairness evaluation & simple mitigation ----------
import numpy as np
import pandas as pd
from fairlearn.metrics import MetricFrame, selection_rate, demographic_parity_difference, equalized_odds_difference
from sklearn.metrics import accuracy_score, precision_score, recall_score

# Ensure SENSITIVE_COLS has no duplicates
if SENSITIVE_COLS:
    SENSITIVE_COLS = list(dict.fromkeys(SENSITIVE_COLS))

# Remove duplicate columns in DataFrames
val_df_copy = val_df.copy()
val_df_copy = val_df_copy.loc[:, ~val_df_copy.columns.duplicated()]
train_df = train_df.loc[:, ~train_df.columns.duplicated()]

# Add predictions to validation DataFrame
val_df_copy['pred_prob'] = probs
val_df_copy['pred_label'] = preds

if SENSITIVE_COLS:
    # Handle single vs multi sensitive columns
    if len(SENSITIVE_COLS) == 1:
        group = val_df_copy[SENSITIVE_COLS[0]]
        train_group = train_df[SENSITIVE_COLS[0]]
    else:
        # Combine multiple sensitive attributes into a single tuple key
        group = val_df_copy[SENSITIVE_COLS].astype(str).agg('-'.join, axis=1)
        train_group = train_df[SENSITIVE_COLS].astype(str).agg('-'.join, axis=1)

    # Compute metrics
    metric_frame = MetricFrame(
        metrics={
            'accuracy': accuracy_score,
            'precision': precision_score,
            'recall': recall_score,
            'selection_rate': selection_rate
        },
        y_true=val_df_copy[LABEL_COL],
        y_pred=val_df_copy['pred_label'],
        sensitive_features=group
    )

    print('Per-group metrics:')
    print(metric_frame.by_group)

    # Fairness differences
    dp_diff = demographic_parity_difference(
        y_true=val_df_copy[LABEL_COL],
        y_pred=val_df_copy['pred_label'],
        sensitive_features=group
    )
    eqod = equalized_odds_difference(
        y_true=val_df_copy[LABEL_COL],
        y_pred=val_df_copy['pred_label'],
        sensitive_features=group
    )
    print('Demographic parity difference:', dp_diff)
    print('Equalized odds difference:', eqod)

    # Simple reweighting mitigation
    weights = np.ones(len(train_df))
    groups = np.unique(train_group)
    target = 1.0 / len(groups)  # uniform target distribution

    for gi in groups:
        idxs = (train_group == gi).values
        cur = idxs.sum() / len(train_group)
        if cur > 0:
            weights[idxs] = target / cur

    print('Example reweighting achieved. You may re-train using sample weights for fairness mitigation.')

else:
    print('No sensitive attribute available. Fairness monitoring skipped. '
          'You can set SENSITIVE_COL manually to a column name and re-run.')


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])

Per-group metrics:
                                    accuracy  precision  recall  \
sensitive_feature_0                                               
10-0-8-61-796-338-1-735-1-2-1-0          1.0        0.0     0.0   
101-0-546-28-1-286-5-94-5-2-5-0          1.0        0.0     0.0   
107-1-909-4-588-463-9-718-6-0-7-1        1.0        0.0     0.0   
120-1-61-58-587-654-7-654-4-1-1-3        1.0        0.0     0.0   
136-1-221-49-487-817-9-809-0-1-8-1       1.0        0.0     0.0   
...                                      ...        ...     ...   
978-0-660-21-315-398-0-831-2-3-2-2       1.0        0.0     0.0   
985-0-854-7-482-4-9-846-9-2-6-3          1.0        0.0     0.0   
986-0-583-26-460-32-8-878-3-1-8-0        1.0        0.0     0.0   
995-0-138-53-64-282-4-793-5-1-3-2        1.0        0.0     0.0   
998-1-790-12-62-29-9-261-9-4-1-0         1.0        0.0     0.0   

                                    selection_rate  
sensitive_feature_0                                 
10-

In [13]:

# --------- Produce deliverable prediction CSVs ----------
# Use model to predict on the full dataframe and save outputs
full_ds = ChurnDataset(proc_df)
full_loader = DataLoader(full_ds, batch_size=BATCH_SIZE)
all_probs = []
all_preds = []
model.eval()
with torch.no_grad():
    for batch in full_loader:
        input_ids = batch['input_ids'].to(DEVICE)
        attention_mask = batch['attention_mask'].to(DEVICE)
        numeric = batch['numeric'].to(DEVICE)
        logits, adv_logits = model(input_ids=input_ids, attention_mask=attention_mask, numeric_tensor=numeric)
        p = torch.softmax(logits, dim=1)[:,1].detach().cpu().numpy()
        predicted = (p >= 0.5).astype(int)
        all_probs.extend(p.tolist())
        all_preds.extend(predicted.tolist())

out_df = df.copy()
out_df['pred_churn_prob'] = all_probs
#out_df['pred_churn_label'] = all_preds

PRED_CSV = '/content/churn_predictions.csv'
out_df.to_csv(PRED_CSV, index=False)
print('Full Predictions Data Saved To', PRED_CSV)


Full Predictions Data Saved To /content/churn_predictions.csv


In [20]:
import joblib
from transformers import AutoTokenizer
import os
import torch


# ===========================
# Define where to save model
# ===========================
save_dir = input("Enter directory path to save model: ").strip()
if not save_dir:
    save_dir = "./trained_model"
os.makedirs(save_dir, exist_ok=True)


# ===========================
# Update MODEL_NAME to point to saved model
# ===========================
MODEL_PATH_NAME = f"{save_dir}/model.pkl" # Changed extension to .pth
TOKENISER_PATH_NAME =  f"{save_dir}/tokenizer" # Saved as directory
SCALER_PATH_NAME = f"{save_dir}/numeric_scaler.pkl"
# ===========================
# Save model
# ===========================
if 'model' not in globals():
    raise ValueError("Trained model object 'model' not found in memory. Please run training first.")
# Use torch.save to save the model's state_dict
torch.save(model, MODEL_PATH_NAME)

# ===========================
# Ensure tokenizer exists (prefer the one used in training)
# ===========================
if 'TOKENIZER' in globals():
    tokenizer = TOKENIZER
elif 'tokenizer' in globals():
    tokenizer = tokenizer
else:
    # If no tokenizer object in memory, try loading from MODEL_NAME if defined
    if 'MODEL_NAME' in globals():
        tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    else:
        raise ValueError("No tokenizer found. Ensure you have trained or loaded a model before saving.")

# Save tokenizer from training
# Use save_pretrained for Hugging Face tokenizers
tokenizer.save_pretrained(TOKENISER_PATH_NAME)

# ===========================
# Ensure scaler exists
# ===========================
if 'scaler' not in globals():
    from sklearn.preprocessing import StandardScaler
    scaler = StandardScaler()
    if 'proc_df' in globals() and 'FEATURE_COLS' in globals() and len(FEATURE_COLS) > 0:
        scaler.fit(proc_df[FEATURE_COLS])  # Fit if data is available
    else:
        print("Warning: No numeric features found. Creating an unfitted scaler.")

# Save numeric scaler
joblib.dump(scaler, SCALER_PATH_NAME)


print(f" Model, tokenizer, and scaler saved to: {save_dir}")
print(f" You can now load them with: AutoTokenizer.from_pretrained('{save_dir}')") # Note: Loading the model requires instantiating the class and loading the state_dict

Enter directory path to save model: /content/trained_outputs
 Model, tokenizer, and scaler saved to: /content/trained_outputs
 You can now load them with: AutoTokenizer.from_pretrained('/content/trained_outputs')
