In [1]:
# fix split 
import os
from pathlib import Path

import numpy as np
import pandas as pd

# Load the Fashion-MNIST train/test CSVs.
# The folder can be overridden through the FASHION_MNIST_DIR environment
# variable; the fallback is the original local folder so existing runs keep working.
DATA_DIR = Path(os.environ.get(
    'FASHION_MNIST_DIR',
    '/Users/ioannaioannidou/Desktop/Uppsala University/Year 2/Semester 1/Project in DS/data_copy/raw',
))
fashion_train_df = pd.read_csv(DATA_DIR / 'fashion-mnist-train.csv')
fashion_test_df = pd.read_csv(DATA_DIR / 'fashion-mnist-test.csv')

# Fixed seed so the shuffled row order (and therefore the split) is reproducible.
np.random.seed(0)

# Shuffle the row positions of the training CSV.
N = len(fashion_train_df)
indices = np.random.permutation(N)

# Split fractions; the combination split receives whatever is left over.
TRAIN_FRAC = 0.50
VAL_FRAC = 0.10
n_train = int(TRAIN_FRAC * N)
n_val = int(VAL_FRAC * N)
n_comb = N - (n_train + n_val)

# Consecutive, non-overlapping slices of the shuffled positions.
idx_train = indices[:n_train]
idx_val = indices[n_train:n_train + n_val]
idx_comb = indices[n_train + n_val:]

# Positional row selection for each split.
train_df = fashion_train_df.iloc[idx_train]
val_df = fashion_train_df.iloc[idx_val]
comb_df = fashion_train_df.iloc[idx_comb]

# Sanity check: the three splits together cover every training row exactly once.
assert len(train_df) + len(val_df) + len(comb_df) == N
assert len(comb_df) == n_comb

print('train frame: ',len(train_df))
print('validation frame: ', len(val_df))
print('combination frame: ',len(comb_df))
print('test frame: ', len(fashion_test_df))


train frame:  30000
validation frame:  6000
combination frame:  24000
test frame:  10000


In [2]:
def hide_continuous(df, hide=(0.25, 0.75), hidden_value=0):
    """
    Mask a fraction of the pixel columns of a dataframe at two nested levels.

    The first column of ``df`` is treated as the label; every other column is
    a pixel. The same pixel columns are masked for every row. The columns
    masked at the lower level are a subset of those masked at the higher level.

    Parameters
    ----------
    df : pandas.DataFrame
        Label in column 0, pixel values in the remaining columns.
    hide : sequence of two floats
        ``(lower_ratio, higher_ratio)`` of pixel columns to mask, with
        ``0 <= lower_ratio <= higher_ratio <= 1``.
    hidden_value : scalar
        Value written into the masked pixel columns.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        Copies of ``df`` masked at the lower and at the higher ratio. Both
        keep ``df``'s index and pixel column names; the label column is
        named ``'label'``. ``df`` itself is not modified.

    Raises
    ------
    ValueError
        If the ratios are not ordered or fall outside ``[0, 1]``.
    """
    first_ratio, second_ratio = hide[0], hide[1]
    if not 0 <= first_ratio <= second_ratio <= 1:
        # Without this check an out-of-order pair silently masks fewer
        # pixels than requested for the first frame.
        raise ValueError(
            "hide must satisfy 0 <= hide[0] <= hide[1] <= 1, got "
            f"({first_ratio}, {second_ratio})"
        )

    labels = df.iloc[:, 0].to_numpy()
    original_pixels = df.iloc[:, 1:].to_numpy()
    pixel_columns = df.columns[1:]
    n_pixels = original_pixels.shape[1]

    n_hide_first = int(n_pixels * first_ratio)
    n_hide_second = int(n_pixels * second_ratio)

    # A private legacy generator seeded with 42 draws the same columns as
    # `np.random.seed(42); np.random.choice(...)` without resetting the
    # global NumPy random state used elsewhere in the notebook.
    rng = np.random.RandomState(42)
    all_drop_indices = rng.choice(n_pixels, size=n_hide_second, replace=False)
    drop_indices_first = all_drop_indices[:n_hide_first]

    def _masked_frame(drop_indices):
        # Work on a copy so the caller's data and the other mask level stay intact.
        pixels = original_pixels.copy()
        pixels[:, drop_indices] = hidden_value
        frame = pd.DataFrame(pixels, columns=pixel_columns, index=df.index)
        frame.insert(0, 'label', labels)
        return frame

    df_first = _masked_frame(drop_indices_first)
    df_second = _masked_frame(all_drop_indices)

    return df_first, df_second


In [3]:
# Unmasked copies of the train/test frames: the "full image" baseline inputs.
train_full = train_df.copy()
test_full = fashion_test_df.copy()

# Shared masking settings. Each call returns a pair of frames: the first has
# 15% of the pixel columns overwritten with 0, the second has 85% overwritten.
mask_settings = dict(hide=[0.15, 0.85], hidden_value=0)
train_drop15, train_drop85 = hide_continuous(train_df, **mask_settings)
test_drop15, test_drop85 = hide_continuous(fashion_test_df, **mask_settings)


# Bayesian Logistic Regression

In [None]:
import pymc as pm
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.decomposition import PCA

def run_blr(train_df, test_df, pca_components=100, random_state=42,
            n_iter=3000, learning_rate=5e-3, n_posterior_samples=300,
            prior_sigma=0.3):
    """
    Fit a Bayesian multinomial logistic regression with ADVI and evaluate it.

    Pipeline: standardise the pixel columns, reduce them with PCA, clip the
    resulting features to [-10, 10], then fit a softmax regression with
    Normal priors on the weights and biases via ``pm.fit(method="advi")``.
    Test predictions use the posterior means of the weights and biases.
    Accuracy, a classification report and a confusion matrix are printed.

    Parameters
    ----------
    train_df, test_df : pandas.DataFrame
        Must contain a ``"label"`` column; every column from position 1
        onwards is used as a feature.
    pca_components : int
        Number of PCA components kept.
    random_state : int
        Seed passed to PCA, ``pm.fit`` and ``approx.sample``.
    n_iter : int
        Maximum number of ADVI iterations (default matches the original 3000).
    learning_rate : float
        Adamax learning rate for ADVI.
    n_posterior_samples : int
        Number of draws taken from the fitted approximation.
    prior_sigma : float
        Standard deviation of the zero-mean Normal priors on ``W`` and ``b``.

    Returns
    -------
    tuple
        ``(scaler, pca, W_mean, b_mean, classes)``: the fitted StandardScaler
        and PCA, the posterior-mean weight matrix and bias vector, and the
        sorted array of training labels (column ``i`` of ``W_mean`` belongs
        to ``classes[i]``).

    Raises
    ------
    ValueError
        If ``test_df`` contains a label that never occurs in ``train_df``.
    """
    # Prepare data
    X_tr = train_df.iloc[:, 1:].to_numpy(dtype=np.float32)
    X_te = test_df.iloc[:, 1:].to_numpy(dtype=np.float32)
    y_tr = train_df["label"].to_numpy()
    y_te = test_df["label"].to_numpy()

    # Encode labels as 0..n_classes-1 using the sorted training labels.
    classes, y_tr_enc = np.unique(y_tr, return_inverse=True)
    n_classes = len(classes)
    unseen = np.setdiff1d(y_te, classes)
    if unseen.size:
        # Fail early and clearly; the model has no output column for these.
        raise ValueError(
            f"test_df contains labels not present in train_df: {unseen.tolist()}"
        )
    # classes is sorted and every test label is present, so searchsorted
    # returns exactly the index of each label in `classes`.
    y_te_enc = np.searchsorted(classes, y_te)

    # Standardize (statistics come from the training set only)
    scaler = StandardScaler()
    X_tr_s = scaler.fit_transform(X_tr)
    X_te_s = scaler.transform(X_te)

    # PCA (fitted on the training set only)
    pca = PCA(n_components=pca_components, random_state=random_state, svd_solver='randomized')
    X_tr_s = pca.fit_transform(X_tr_s)
    X_te_s = pca.transform(X_te_s)

    n_features = X_tr_s.shape[1]

    # Guard the optimiser against non-finite or extreme feature values.
    X_tr_s = np.clip(np.nan_to_num(X_tr_s), -10.0, 10.0)
    X_te_s = np.clip(np.nan_to_num(X_te_s), -10.0, 10.0)

    print(f"Running BLR: n_features={n_features}, n_classes={n_classes}")

    # Bayesian Logistic Regression
    with pm.Model() as blr_model:
        X_data = pm.Data("X_tr", X_tr_s.astype("float32"))
        y_data = pm.Data("y_tr", y_tr_enc.astype("int32"))

        W = pm.Normal("W", mu=0.0, sigma=prior_sigma, shape=(n_features, n_classes))
        b = pm.Normal("b", mu=0.0, sigma=prior_sigma, shape=(n_classes,))

        logits = pm.math.dot(X_data, W) + b
        # Bound the logits before the softmax.
        logits = pm.math.clip(logits, -20.0, 20.0)
        p = pm.math.softmax(logits, axis=1)

        y_obs = pm.Categorical("y_obs", p=p, observed=y_data)

        approx = pm.fit(
            n=n_iter,
            method="advi",
            obj_optimizer=pm.adamax(learning_rate=learning_rate),
            callbacks=[pm.callbacks.CheckParametersConvergence(tolerance=1e-3)],
            random_seed=random_state,
            progressbar=True
        )
        trace = approx.sample(n_posterior_samples, random_seed=random_state)

    # Posterior means
    W_mean = trace.posterior["W"].mean(dim=("chain", "draw")).values
    b_mean = trace.posterior["b"].mean(dim=("chain", "draw")).values

    # Predict with the same logit clipping used inside the model.
    logits_te = X_te_s.astype("float32") @ W_mean + b_mean
    logits_te = np.clip(logits_te, -20.0, 20.0)
    # Softmax with the row maximum subtracted for numerical stability.
    p_te = np.exp(logits_te - logits_te.max(axis=1, keepdims=True))
    p_te /= p_te.sum(axis=1, keepdims=True)
    y_pred = p_te.argmax(axis=1)

    print("Accuracy:", accuracy_score(y_te_enc, y_pred))
    print(classification_report(y_te_enc, y_pred))
    print(confusion_matrix(y_te_enc, y_pred))

    return scaler, pca, W_mean, b_mean, classes


In [5]:
# Fit the baseline model on the unmasked images and keep its fitted pieces for reuse.
scaler_full, pca_full, W_full, b_full, classes_full = run_blr(
    train_df=train_full,
    test_df=test_full,
)


Running BLR: n_features=100, n_classes=10


Output()

Finished [100%]: Average Loss = 34,419


Accuracy: 0.6345
              precision    recall  f1-score   support

           0       0.50      0.98      0.66      1000
           1       0.90      0.96      0.93      1000
           2       0.38      0.88      0.53      1000
           3       0.77      0.63      0.70      1000
           4       0.33      0.03      0.06      1000
           5       0.64      0.92      0.75      1000
           6       0.12      0.01      0.01      1000
           7       0.70      0.74      0.72      1000
           8       0.96      0.67      0.79      1000
           9       0.98      0.53      0.69      1000

    accuracy                           0.63     10000
   macro avg       0.63      0.63      0.58     10000
weighted avg       0.63      0.63      0.58     10000

[[976   3   8   4   0   3   3   0   3   0]
 [ 28 962   1   9   0   0   0   0   0   0]
 [ 99   9 875   7   4   1   4   0   1   0]
 [280  57  28 634   0   1   0   0   0   0]
 [ 33  19 791 121  33   0   0   1   1   1]
 [ 21   6

In [6]:
# Fit the second model on the 15%-masked images (train and test are both masked).
scaler_drop, pca_drop, W_drop, b_drop, classes_drop = run_blr(
    train_df=train_drop15,
    test_df=test_drop15,
)


Running BLR: n_features=100, n_classes=10


Output()

Finished [100%]: Average Loss = 34,158


Accuracy: 0.646
              precision    recall  f1-score   support

           0       0.52      0.97      0.68      1000
           1       0.91      0.96      0.94      1000
           2       0.39      0.88      0.54      1000
           3       0.75      0.65      0.70      1000
           4       0.37      0.04      0.07      1000
           5       0.65      0.93      0.76      1000
           6       0.24      0.01      0.02      1000
           7       0.72      0.76      0.74      1000
           8       0.96      0.68      0.80      1000
           9       0.98      0.57      0.72      1000

    accuracy                           0.65     10000
   macro avg       0.65      0.65      0.60     10000
weighted avg       0.65      0.65      0.60     10000

[[974   1   8   6   0   4   3   0   4   0]
 [ 27 962   1  10   0   0   0   0   0   0]
 [ 90   9 880  11   2   3   3   0   2   0]
 [266  52  27 653   1   1   0   0   0   0]
 [ 29  12 784 133  39   0   0   1   1   1]
 [ 16   6 

In [7]:
from sklearn.model_selection import train_test_split

# Pixel matrix and label vector of the combination split.
comb_pixels = comb_df.drop(columns="label")
X_comb = comb_pixels.values
y_comb = comb_df["label"].values

# Stratified 80/20 split of the combination set; random_state fixes the shuffle.
comb_parts = train_test_split(
    X_comb,
    y_comb,
    test_size=0.20,
    stratify=y_comb,
    random_state=0,
)
X_comb_train, X_comb_test, y_comb_train, y_comb_test = comb_parts


In [9]:
def predictions_combined_dataset(X_data, y_data, scaler, pca, W, b, classes, prefix="blr",
                                 feature_clip=10.0):
    """
    Turn raw inputs into a dataframe of class probabilities plus the label.

    Applies the fitted ``scaler`` and ``pca``, then the same feature clean-up
    that ``run_blr`` applies before training and evaluation (``nan_to_num``
    followed by clipping to ``[-feature_clip, feature_clip]``), and finally a
    softmax over ``X @ W + b`` with logits clipped to [-20, 20].

    Parameters
    ----------
    X_data : array-like
        Raw feature matrix accepted by ``scaler.transform``.
    y_data : array-like
        One label per row of ``X_data``; stored positionally in the
        ``"label"`` column (any pandas index it carries is ignored).
    scaler, pca : objects exposing ``transform``
        Fitted preprocessing steps, applied in that order.
    W, b : numpy.ndarray
        Weight matrix and bias vector of the softmax regression.
    classes : array-like
        Kept for interface compatibility; not used. Probability columns are
        numbered by column position of ``W``.
    prefix : str
        Prefix of the probability column names (``{prefix}_class_{i}``).
    feature_clip : float
        Absolute bound applied to the PCA features; the default matches the
        value hard-coded in ``run_blr``.

    Returns
    -------
    pandas.DataFrame
        One probability column per class followed by a ``"label"`` column.
    """
    X_scaled = scaler.transform(X_data)
    X_pca = pca.transform(X_scaled)

    # Mirror the training-time feature clean-up so the weights see inputs
    # in the same range they were fitted on.
    X_pca = np.nan_to_num(X_pca)
    X_pca = np.clip(X_pca, -feature_clip, feature_clip)

    logits = X_pca @ W + b
    logits = np.clip(logits, -20.0, 20.0)

    # Softmax with the row maximum subtracted for numerical stability.
    probs = np.exp(logits - logits.max(axis=1, keepdims=True))
    probs /= probs.sum(axis=1, keepdims=True)

    df = pd.DataFrame(probs, columns=[f"{prefix}_class_{i}" for i in range(probs.shape[1])])
    # np.asarray drops any pandas index so labels are attached by position,
    # not by index alignment (which would yield NaN for non-default indexes).
    df["label"] = np.asarray(y_data)
    return df


In [10]:
# Bundle each fitted model's pieces so the paired train/test calls stay in sync.
full_model = (scaler_full, pca_full, W_full, b_full, classes_full)
drop_model = (scaler_drop, pca_drop, W_drop, b_drop, classes_drop)

# Class probabilities from the model fitted on unmasked images.
blr_comb_train_df = predictions_combined_dataset(
    X_comb_train, y_comb_train, *full_model, prefix="blr_full")
blr_comb_test_df = predictions_combined_dataset(
    X_comb_test, y_comb_test, *full_model, prefix="blr_full")

blr_comb_train_df.to_csv("blr_full_comb_train_predictions.csv", index=False)
blr_comb_test_df.to_csv("blr_full_comb_test_predictions.csv", index=False)

# Class probabilities from the model fitted on the 15%-masked images.
# NOTE(review): X_comb_train / X_comb_test are passed unmasked here, while this
# model was fitted on train_drop15 — confirm whether the combination images
# should be masked with hide_continuous first.
blr_dropout_comb_train_df = predictions_combined_dataset(
    X_comb_train, y_comb_train, *drop_model, prefix="blr_drop")
blr_dropout_comb_test_df = predictions_combined_dataset(
    X_comb_test, y_comb_test, *drop_model, prefix="blr_drop")

blr_dropout_comb_train_df.to_csv("blr_dropout_comb_train_predictions.csv", index=False)
blr_dropout_comb_test_df.to_csv("blr_dropout_comb_test_predictions.csv", index=False)
