In [7]:
# ============================
# Install dependencies
# ============================
! pip install -U -qq torch torchvision transformers datasets scikit-learn pandas fairlearn evaluate openpyxl ipywidgets jupyterlab_widgets notebook

In [2]:
# BERT-based prediction combining text (feedback) and numeric data
# Features:
# 1) Creates pseudo-labels from 'feedback/text' column using a zero-shot classifier or keyword sentiment heuristics
# 2) Builds a PyTorch + Hugging Face BERT classifier that concatenates BERT text embedding with numeric features
# 3) Implements a simple adversarial debiasing (gradient reversal + adversary predicting a sensitive attribute)
# 4) Monitors fairness metrics using fairlearn and offers a simple reweighting mitigation
# 5) Produces CSV output files with predictions and probabilities


# --------- 2. Imports and config ----------
import os
import random
from pathlib import Path
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from transformers import AutoModel, AutoTokenizer, get_linear_schedule_with_warmup

# For zero-shot / sentiment pseudo-labeling
from transformers import pipeline

# Fairness
from fairlearn.metrics import MetricFrame
from fairlearn.metrics import selection_rate, demographic_parity_difference, equalized_odds_difference

import ipywidgets as widgets
from IPython.display import display


In [3]:
# Set seed for reproducibility
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)


<torch._C.Generator at 0x18d62bea4f0>

In [4]:
import os
import zipfile
import pandas as pd
import ipywidgets as widgets
from IPython.display import display, clear_output

# Directory for extracted/processed files
extract_dir = "extracted_files"
os.makedirs(extract_dir, exist_ok=True)

print(" Please upload your files (.zip, .csv, .xlsx).")
print(" If ZIP: It will be extracted automatically.")
print(" You can select multiple files at once.")

# Widgets
uploader = widgets.FileUpload(accept='.zip,.csv,.xlsx', multiple=True)
ui_area = widgets.Output()        # For join key dropdowns & merge button
results_area = widgets.Output()   # For showing merge results
selection_area = widgets.Output() # For selecting text/sensitive columns

# Global storage
dfs = {}
file_paths = {}
merged_df = None
TEXT_COLS = []
SENSITIVE_COLS = []

def auto_process(change):
    with ui_area:
        clear_output()
    with results_area:
        clear_output()
    with selection_area:
        clear_output()

    global dfs, file_paths
    dfs = {}
    file_paths = {}

    # Clear old extracted files
    for root, _, files in os.walk(extract_dir):
        for f in files:
            os.remove(os.path.join(root, f))

    # Save uploaded files
    for fileinfo in uploader.value:
        filename = fileinfo['name']
        content = fileinfo['content']
        local_path = os.path.join(extract_dir, filename)
        with open(local_path, 'wb') as f:
            f.write(content)
        print(f" File {filename} uploaded successfully!")

        # If ZIP, extract and remove original
        if filename.lower().endswith('.zip'):
            with zipfile.ZipFile(local_path, 'r') as zip_ref:
                zip_ref.extractall(extract_dir)
            os.remove(local_path)

    # Read CSV/XLSX files
    for root, _, files_in_dir in os.walk(extract_dir):
        for f in files_in_dir:
            if f.lower().endswith(('.csv', '.xlsx')):
                path = os.path.join(root, f)
                file_paths[path] = f
                if path.lower().endswith('.csv'):
                    dfs[path] = pd.read_csv(path)
                else:
                    dfs[path] = pd.read_excel(path)

    if not file_paths:
        with results_area:
            print(" No CSV/XLSX files found.")
        return

    # Join key selectors
    join_key_widgets = {}
    for path in file_paths:
        cols = dfs[path].columns.tolist()
        dropdown = widgets.Dropdown(
            options=cols,
            description=os.path.basename(path),
            style={'description_width': 'initial'},
            layout=widgets.Layout(width='50%')
        )
        join_key_widgets[path] = dropdown

    join_type_dropdown = widgets.Dropdown(
        options=['inner', 'left', 'right', 'outer'],
        value='inner',
        description='Join Type:',
        style={'description_width': 'initial'}
    )

    merge_button = widgets.Button(description=" Merge Files", button_style='success')

    def merge_action(b):
        with results_area:
            clear_output()
            global merged_df
            join_info = [(path, dropdown.value) for path, dropdown in join_key_widgets.items()]
            join_type = join_type_dropdown.value

            merged_df = None
            for i, (path, key) in enumerate(join_info):
                df = dfs[path]
                if merged_df is None:
                    merged_df = df
                    main_key = key
                else:
                    merged_df = pd.merge(merged_df, df, left_on=main_key, right_on=key, how=join_type)

            print("\n Merged DataFrame shape:", merged_df.shape)
            display(merged_df.head())

        with selection_area:
            clear_output()

            if merged_df is None or merged_df.empty:
                print(" No merged data to select from.")
                return

            col_options = list(merged_df.columns)

            text_col_selector = widgets.SelectMultiple(
                options=col_options,
                description="Text Columns",
                style={'description_width': 'initial'},
                layout=widgets.Layout(width='50%')
            )

            sensitive_col_selector = widgets.SelectMultiple(
                options=col_options,
                description="Sensitive Columns",
                style={'description_width': 'initial'},
                layout=widgets.Layout(width='50%')
            )

            confirm_button = widgets.Button(description="Confirm Selections", button_style='info')

            def confirm_action(_):
                global TEXT_COLS, SENSITIVE_COLS
                TEXT_COLS = list(text_col_selector.value)
                SENSITIVE_COLS = list(sensitive_col_selector.value)

                with selection_area:
                    clear_output()
                    print("Using text columns:", TEXT_COLS)
                    if SENSITIVE_COLS:
                        print("Using sensitive attribute columns:", SENSITIVE_COLS)
                    else:
                        print("No sensitive attributes selected. Adversarial debiasing will be skipped.")

            confirm_button.on_click(confirm_action)

            display(widgets.VBox([text_col_selector, sensitive_col_selector, confirm_button]))

    merge_button.on_click(merge_action)

    with ui_area:
        display(widgets.VBox(list(join_key_widgets.values()) + [join_type_dropdown, merge_button]))

uploader.observe(auto_process, names='value')

display(uploader, ui_area, results_area, selection_area)


 Please upload your files (.zip, .csv, .xlsx).
 If ZIP: It will be extracted automatically.
 You can select multiple files at once.


FileUpload(value=(), accept='.zip,.csv,.xlsx', description='Upload', multiple=True)

Output()

Output()

Output()

In [5]:
'''
# Step X: Ask user to select feedback/text columns
SENSITIVE_COLS = []
print("\nAvailable columns in merged_df:")
for idx, col in enumerate(merged_df.columns):
    print(f"{idx}: {col}")

text_col_indices = input("\nEnter column numbers for feedback/text (comma-separated, e.g., 2 or 2,5): ").strip()
text_col_indices = [int(i.strip()) for i in text_col_indices.split(",") if i.strip().isdigit()]
TEXT_COLS = [merged_df.columns[i] for i in text_col_indices]
print("Using text columns:", TEXT_COLS)

# Step X: Ask user to select sensitive attribute columns for fairness
sensitive_col_indices = input("\nEnter column numbers for sensitive attributes (comma-separated, or press Enter to skip): ").strip()
if sensitive_col_indices:
    sensitive_col_indices = [int(i.strip()) for i in sensitive_col_indices.split(",") if i.strip().isdigit()]
    SENSITIVE_COLS = [merged_df.columns[i] for i in sensitive_col_indices]
    print("Using sensitive attribute columns:", SENSITIVE_COLS)
else:
    SENSITIVE_COLS = []
    print("No sensitive attributes selected. Adversarial debiasing will be skipped.")
'''

'\n# Step X: Ask user to select feedback/text columns\nSENSITIVE_COLS = []\nprint("\nAvailable columns in merged_df:")\nfor idx, col in enumerate(merged_df.columns):\n    print(f"{idx}: {col}")\n\ntext_col_indices = input("\nEnter column numbers for feedback/text (comma-separated, e.g., 2 or 2,5): ").strip()\ntext_col_indices = [int(i.strip()) for i in text_col_indices.split(",") if i.strip().isdigit()]\nTEXT_COLS = [merged_df.columns[i] for i in text_col_indices]\nprint("Using text columns:", TEXT_COLS)\n\n# Step X: Ask user to select sensitive attribute columns for fairness\nsensitive_col_indices = input("\nEnter column numbers for sensitive attributes (comma-separated, or press Enter to skip): ").strip()\nif sensitive_col_indices:\n    sensitive_col_indices = [int(i.strip()) for i in sensitive_col_indices.split(",") if i.strip().isdigit()]\n    SENSITIVE_COLS = [merged_df.columns[i] for i in sensitive_col_indices]\n    print("Using sensitive attribute columns:", SENSITIVE_COLS)\nels

In [6]:
# Ask user to select target/label column or create one
print("\nAvailable columns in merged_df:")
for idx, col in enumerate(merged_df.columns):
    print(f"{idx}: {col}")

label_choice = input("\nEnter column number for final target column (or press Enter if no column exists): ").strip()

if label_choice:
    LABEL_COL = merged_df.columns[int(label_choice)]
    print("Using existing column as target column:", LABEL_COL)
else:
    LABEL_COL = 'pseudo_target'
    print("No target column selected. Creating target column as ", LABEL_COL)

    # Strategy: try zero-shot classification ("will stay" vs "will leave")
    try:
        from transformers import pipeline
        print('Initializing zero-shot pipeline (this may download a model) ...')
        zsp = pipeline('zero-shot-classification', model='facebook/bart-large-mnli')
        candidate_labels = ['will churn/leave the service', 'will stay/continue using the service']

        def zs_label(text):
            if not isinstance(text, str) or text.strip() == '':
                return 0  # default: stay
            out = zsp(text, candidate_labels)
            top = out['labels'][0]
            return 1 if 'leave' in top.lower() or 'churn' in top.lower() else 0

        print('Applying zero-shot classifier to feedback (this may take a while).')
        merged_df[LABEL_COL] = merged_df[TEXT_COLS[0]].fillna('').astype(str).apply(zs_label)

    except Exception as e:
        print('Zero-shot pipeline failed (', e, '). Falling back to heuristic keyword rules.')
        negative_keywords = ['cancel', 'churn', 'leave', 'switch', 'stop', 'uninstall',
                              'refund', 'complain', 'close account', 'sue', 'angry']

        def heuristic_label(text):
            t = '' if not isinstance(text, str) else text.lower()
            if any(k in t for k in negative_keywords):
                return 1
            return 0

        merged_df[LABEL_COL] = merged_df[TEXT_COLS[0]].fillna('').astype(str).apply(heuristic_label)

print("\nLabel distribution:")
print(merged_df[LABEL_COL].value_counts(dropna=False))



Available columns in merged_df:


AttributeError: 'NoneType' object has no attribute 'columns'

In [None]:
from sklearn.preprocessing import StandardScaler

# --------- Prepare features: numerical cols + encoding ----------
# Build exclude list from text, label, and sensitive columns
exclude = set(TEXT_COLS)  # unpack text columns
exclude.add(LABEL_COL)    # add label column
if SENSITIVE_COLS:
    exclude.update(SENSITIVE_COLS)  # unpack sensitive columns

# Select numeric columns (excluding ID, text, label, sensitive)
numeric_cols = [c for c in merged_df.columns if c not in exclude and pd.api.types.is_numeric_dtype(merged_df[c])]
print('Numeric columns used:', numeric_cols)

# Categorical columns (non-text, non-label, non-numeric)
categorical_cols = [c for c in merged_df.columns
                    if c not in exclude
                    and c not in numeric_cols
                    and merged_df[c].dtype == object]
print('Categorical columns:', categorical_cols)

# Preprocessing pipeline
proc_df = merged_df.copy()

# Fill missing numeric
for c in numeric_cols:
    proc_df[c] = proc_df[c].fillna(proc_df[c].median())

# Encode categorical
for c in categorical_cols:
    proc_df[c] = proc_df[c].fillna('missing').astype(str)
    proc_df[c] = proc_df[c].astype('category').cat.codes

FEATURE_COLS = numeric_cols + categorical_cols
print('Final feature columns to be used as numeric inputs:', FEATURE_COLS)

# Scale features
scaler = StandardScaler()
if len(FEATURE_COLS) > 0:
    proc_df[FEATURE_COLS] = scaler.fit_transform(proc_df[FEATURE_COLS])

# Sensitive attribute processing (if present)
if SENSITIVE_COLS:
    for col in SENSITIVE_COLS:
        proc_df[col] = proc_df[col].fillna('missing').astype(str).astype('category').cat.codes

# Drop rows with missing text if needed
for col in TEXT_COLS:
    proc_df[col] = proc_df[col].fillna('')


In [None]:

# --------- Train / Val split ----------
train_df, val_df = train_test_split(proc_df, test_size=0.15, random_state=SEED, stratify=proc_df[LABEL_COL] if proc_df[LABEL_COL].nunique()>1 else None)
print('Train size:', len(train_df), 'Val size:', len(val_df))


In [None]:
from torch.utils.data import Dataset, DataLoader
import numpy as np
import torch
from transformers import AutoTokenizer

# --------- Dataset and DataLoader ----------
MODEL_NAME = 'bert-base-uncased'  # Change if needed
TOKENIZER = AutoTokenizer.from_pretrained(MODEL_NAME)
num_epochs=3
MAX_LEN = 256
BATCH_SIZE = 16

class ChurnDataset(Dataset):
    def __init__(self, df):
        # Combine multiple text columns into one string
        if len(TEXT_COLS) > 1:
            self.texts = df[TEXT_COLS].astype(str).agg(" ".join, axis=1).tolist()
        else:
            self.texts = df[TEXT_COLS[0]].astype(str).tolist()

        # Numeric feature array
        self.X = df[FEATURE_COLS].values.astype(np.float32) if len(FEATURE_COLS) > 0 else np.zeros((len(df), 0), dtype=np.float32)

        # Labels
        self.y = df[LABEL_COL].astype(int).values

        # Sensitive attributes (combined if multiple)
        if SENSITIVE_COLS:
            self.sens = df[SENSITIVE_COLS].astype(str).agg("-".join, axis=1).astype("category").cat.codes.values
        else:
            self.sens = None

    def __len__(self):
        return len(self.y)

    def __getitem__(self, idx):
        txt = self.texts[idx]
        enc = TOKENIZER(txt, truncation=True, padding='max_length', max_length=MAX_LEN, return_tensors='pt')
        item = {k: v.squeeze(0) for k, v in enc.items()}
        item['numeric'] = torch.tensor(self.X[idx], dtype=torch.float32)
        item['labels'] = torch.tensor(int(self.y[idx]), dtype=torch.long)
        if self.sens is not None:
            item['sens'] = torch.tensor(int(self.sens[idx]), dtype=torch.long)
        return item

# Example usage:
train_ds = ChurnDataset(train_df)
val_ds = ChurnDataset(val_df)
train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_ds, batch_size=BATCH_SIZE)


In [None]:
# ---------Model: BERT + numeric MLP + adversary ----------

class GradientReversalFunction(torch.autograd.Function):
    @staticmethod
    def forward(ctx, x, alpha):
        ctx.alpha = alpha
        return x.view_as(x)

    @staticmethod
    def backward(ctx, grad_output):
        return grad_output.neg() * ctx.alpha, None


class GradientReversalLayer(nn.Module):
    def __init__(self, alpha=1.0):
        super().__init__()
        self.alpha = alpha

    def forward(self, x):
        return GradientReversalFunction.apply(x, self.alpha)


class BertWithNumericAdversary(nn.Module):
    def __init__(self, model_name, numeric_dim, num_sensitive_classes=None):
        super().__init__()
        self.bert = AutoModel.from_pretrained(model_name)
        hidden_size = self.bert.config.hidden_size

        # Classifier head for main task
        self.classifier = nn.Sequential(
            nn.Linear(hidden_size + numeric_dim, max(hidden_size // 2, 32)),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(max(hidden_size // 2, 32), 2)
        )

        # Adversary head for sensitive attribute prediction
        self.num_sensitive_classes = num_sensitive_classes
        if num_sensitive_classes:
            self.grl = GradientReversalLayer(alpha=1.0)
            self.adversary = nn.Sequential(
                nn.Linear(hidden_size, max(hidden_size // 2, 32)),
                nn.ReLU(),
                nn.Linear(max(hidden_size // 2, 32), num_sensitive_classes)
            )

    def forward(self, input_ids, attention_mask, numeric_tensor=None, return_repr=False):
        out = self.bert(input_ids=input_ids, attention_mask=attention_mask)

        # pooled output from BERT (CLS token)
        if hasattr(out, 'pooler_output') and out.pooler_output is not None:
            pooled = out.pooler_output
        else:
            # mean pooling as fallback
            last = out.last_hidden_state
            pooled = (last * attention_mask.unsqueeze(-1)).sum(1) / attention_mask.sum(1, keepdim=True)

        # Adversary prediction
        adv_logits = None
        if self.num_sensitive_classes:
            adv_in = self.grl(pooled)
            adv_logits = self.adversary(adv_in)

        # Concatenate numeric features
        if numeric_tensor is None or numeric_tensor.shape[1] == 0:
            combined = pooled
        else:
            combined = torch.cat([pooled, numeric_tensor], dim=1)

        logits = self.classifier(combined)

        if return_repr:
            return logits, adv_logits, pooled
        return logits, adv_logits


# Create model
numeric_dim = len(FEATURE_COLS)

if SENSITIVE_COLS:
    if isinstance(SENSITIVE_COLS, (list, tuple)):
        num_sensitive = proc_df[SENSITIVE_COLS[0]].nunique()
    else:
        num_sensitive = proc_df[SENSITIVE_COLS].nunique()
    num_sensitive = int(num_sensitive)
else:
    num_sensitive = None

model = BertWithNumericAdversary(
    MODEL_NAME,
    numeric_dim=numeric_dim,
    num_sensitive_classes=num_sensitive
).to(DEVICE)

# Losses and optimizer
criterion = nn.CrossEntropyLoss()
adv_criterion = nn.CrossEntropyLoss() if SENSITIVE_COLS else None
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)

# Scheduler


total_steps = len(train_loader) * num_epochs
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=int(0.1 * total_steps),
    num_training_steps=total_steps
)


In [None]:

# ---------Training loop (with adversarial debiasing) ----------
print('Starting training...')
for epoch in range(num_epochs):
    model.train()
    total_loss = 0.0
    for batch in train_loader:
        input_ids = batch['input_ids'].to(DEVICE)
        attention_mask = batch['attention_mask'].to(DEVICE)
        numeric = batch['numeric'].to(DEVICE)
        labels = batch['labels'].to(DEVICE)
        optimizer.zero_grad()
        logits, adv_logits = model(input_ids=input_ids, attention_mask=attention_mask, numeric_tensor=numeric)
        loss = criterion(logits, labels)
        # adversarial loss: we want adversary to be bad at predicting sensitive attribute -> maximize adversary loss
        if SENSITIVE_COLS:
            sens = batch['sens'].to(DEVICE)
            adv_loss = adv_criterion(adv_logits, sens)
            # combine: minimize classification loss + lambda * (-adv_loss) i.e., maximize adv_loss
            lambda_adv = 0.5  # tunable
            combined_loss = loss - lambda_adv * adv_loss
            combined_loss.backward()
            total_loss += combined_loss.item()
        else:
            loss.backward()
            total_loss += loss.item()
        optimizer.step()
        scheduler.step()
    avg_loss = total_loss / len(train_loader)
    print(f'Epoch {epoch+1}/{num_epochs}, avg loss: {avg_loss:.4f}')

    # Validation
    model.eval()
    ys, preds, probs = [], [], []
    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch['input_ids'].to(DEVICE)
            attention_mask = batch['attention_mask'].to(DEVICE)
            numeric = batch['numeric'].to(DEVICE)
            labels = batch['labels'].to(DEVICE)
            logits, adv_logits = model(input_ids=input_ids, attention_mask=attention_mask, numeric_tensor=numeric)
            p = torch.softmax(logits, dim=1)[:,1].detach().cpu().numpy()
            predicted = (p >= 0.5).astype(int)
            ys.extend(labels.detach().cpu().numpy().tolist())
            preds.extend(predicted.tolist())
            probs.extend(p.tolist())
    print('Val Acc:', accuracy_score(ys, preds), 'F1:', f1_score(ys, preds, zero_division=0))


In [None]:
# --------- Fairness evaluation & simple mitigation ----------
import numpy as np
import pandas as pd
from fairlearn.metrics import MetricFrame, selection_rate, demographic_parity_difference, equalized_odds_difference
from sklearn.metrics import accuracy_score, precision_score, recall_score

# Ensure SENSITIVE_COLS has no duplicates
if SENSITIVE_COLS:
    SENSITIVE_COLS = list(dict.fromkeys(SENSITIVE_COLS))

# Remove duplicate columns in DataFrames
val_df_copy = val_df.copy()
val_df_copy = val_df_copy.loc[:, ~val_df_copy.columns.duplicated()]
train_df = train_df.loc[:, ~train_df.columns.duplicated()]

# Add predictions to validation DataFrame
val_df_copy['pred_prob'] = probs
val_df_copy['pred_label'] = preds

if SENSITIVE_COLS:
    # Handle single vs multi sensitive columns
    if len(SENSITIVE_COLS) == 1:
        group = val_df_copy[SENSITIVE_COLS[0]]
        train_group = train_df[SENSITIVE_COLS[0]]
    else:
        # Combine multiple sensitive attributes into a single tuple key
        group = val_df_copy[SENSITIVE_COLS].astype(str).agg('-'.join, axis=1)
        train_group = train_df[SENSITIVE_COLS].astype(str).agg('-'.join, axis=1)

    # Compute metrics
    metric_frame = MetricFrame(
        metrics={
            'accuracy': accuracy_score,
            'precision': precision_score,
            'recall': recall_score,
            'selection_rate': selection_rate
        },
        y_true=val_df_copy[LABEL_COL],
        y_pred=val_df_copy['pred_label'],
        sensitive_features=group
    )

    print('Per-group metrics:')
    print(metric_frame.by_group)

    # Fairness differences
    dp_diff = demographic_parity_difference(
        y_true=val_df_copy[LABEL_COL],
        y_pred=val_df_copy['pred_label'],
        sensitive_features=group
    )
    eqod = equalized_odds_difference(
        y_true=val_df_copy[LABEL_COL],
        y_pred=val_df_copy['pred_label'],
        sensitive_features=group
    )
    print('Demographic parity difference:', dp_diff)
    print('Equalized odds difference:', eqod)

    # Simple reweighting mitigation
    weights = np.ones(len(train_df))
    groups = np.unique(train_group)
    target = 1.0 / len(groups)  # uniform target distribution

    for gi in groups:
        idxs = (train_group == gi).values
        cur = idxs.sum() / len(train_group)
        if cur > 0:
            weights[idxs] = target / cur

    print('Example reweighting achieved. You may re-train using sample weights for fairness mitigation.')

else:
    print('No sensitive attribute available. Fairness monitoring skipped. '
          'You can set SENSITIVE_COL manually to a column name and re-run.')


In [None]:

# --------- Produce deliverable prediction CSVs ----------
# Use model to predict on the full dataframe and save outputs
full_ds = ChurnDataset(proc_df)
full_loader = DataLoader(full_ds, batch_size=BATCH_SIZE)
all_probs = []
all_preds = []
model.eval()
with torch.no_grad():
    for batch in full_loader:
        input_ids = batch['input_ids'].to(DEVICE)
        attention_mask = batch['attention_mask'].to(DEVICE)
        numeric = batch['numeric'].to(DEVICE)
        logits, adv_logits = model(input_ids=input_ids, attention_mask=attention_mask, numeric_tensor=numeric)
        p = torch.softmax(logits, dim=1)[:,1].detach().cpu().numpy()
        predicted = (p >= 0.5).astype(int)
        all_probs.extend(p.tolist())
        all_preds.extend(predicted.tolist())

out_df = df.copy()
out_df['pred_churn_prob'] = all_probs
#out_df['pred_churn_label'] = all_preds

PRED_CSV = '/content/churn_predictions.csv'
out_df.to_csv(PRED_CSV, index=False)
print('Full Predictions Data Saved To', PRED_CSV)


In [None]:
import joblib
from transformers import AutoTokenizer
import os
import torch


# ===========================
# Define where to save model
# ===========================
save_dir = input("Enter directory path to save model: ").strip()
if not save_dir:
    save_dir = "./trained_model"
os.makedirs(save_dir, exist_ok=True)


# ===========================
# Update MODEL_NAME to point to saved model
# ===========================
MODEL_PATH_NAME = f"{save_dir}/model.pkl" # Changed extension to .pth
TOKENISER_PATH_NAME =  f"{save_dir}/tokenizer" # Saved as directory
SCALER_PATH_NAME = f"{save_dir}/numeric_scaler.pkl"
# ===========================
# Save model
# ===========================
if 'model' not in globals():
    raise ValueError("Trained model object 'model' not found in memory. Please run training first.")
# Use torch.save to save the model's state_dict
torch.save(model, MODEL_PATH_NAME)

# ===========================
# Ensure tokenizer exists (prefer the one used in training)
# ===========================
if 'TOKENIZER' in globals():
    tokenizer = TOKENIZER
elif 'tokenizer' in globals():
    tokenizer = tokenizer
else:
    # If no tokenizer object in memory, try loading from MODEL_NAME if defined
    if 'MODEL_NAME' in globals():
        tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    else:
        raise ValueError("No tokenizer found. Ensure you have trained or loaded a model before saving.")

# Save tokenizer from training
# Use save_pretrained for Hugging Face tokenizers
tokenizer.save_pretrained(TOKENISER_PATH_NAME)

# ===========================
# Ensure scaler exists
# ===========================
if 'scaler' not in globals():
    from sklearn.preprocessing import StandardScaler
    scaler = StandardScaler()
    if 'proc_df' in globals() and 'FEATURE_COLS' in globals() and len(FEATURE_COLS) > 0:
        scaler.fit(proc_df[FEATURE_COLS])  # Fit if data is available
    else:
        print("Warning: No numeric features found. Creating an unfitted scaler.")

# Save numeric scaler
joblib.dump(scaler, SCALER_PATH_NAME)


print(f" Model, tokenizer, and scaler saved to: {save_dir}")
print(f" You can now load them with: AutoTokenizer.from_pretrained('{save_dir}')") # Note: Loading the model requires instantiating the class and loading the state_dict