In [11]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from tqdm.notebook import tqdm
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

Using device: cuda


In [12]:
def intrain(data_path='DATA.csv'):
    print("Starting intrain() – training Independent + Conjunct MLPs...")
    
    df = pd.read_csv(data_path)
    print(f"Data loaded: {df.shape}")
    
    # === PREPROCESSING ===
    label_encoders = {}
    for col in df.columns:
        le = LabelEncoder()
        le.fit(df[col])
        label_encoders[col] = le
    
    ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
    ohe.fit(df.values.flatten().reshape(-1, 1))
    BLOCK_SIZE = len(ohe.categories_[0])  # 16
    
    # Encode full dataset
    encoded = pd.DataFrame({col: label_encoders[col].transform(df[col]) for col in df.columns})
    tensor = torch.tensor(encoded.values, dtype=torch.long)
    full_onehot = torch.nn.functional.one_hot(tensor, BLOCK_SIZE).view(-1, BLOCK_SIZE*16).float()
    
    # === TRAIN 16 INDEPENDENT MLPs ===
    print("\nTraining 16 Independent MLPs...")
    independent_models = {}
    for col_idx in range(16):
        col_name = df.columns[col_idx]
        print(f"  → Training {col_name} ({col_idx+1}/16)")
        
        X = full_onehot.clone()
        X[:, col_idx*BLOCK_SIZE:(col_idx+1)*BLOCK_SIZE] = 0.0
        y = tensor[:, col_idx]
        
        dataset = TensorDataset(X, y)
        loader = DataLoader(dataset, batch_size=128, shuffle=True, pin_memory=True)
        
        model = nn.Sequential(
            nn.Linear(BLOCK_SIZE*16, 128), nn.ReLU(), nn.Dropout(0.2),
            nn.Linear(128, 64),  nn.ReLU(), nn.Dropout(0.2),
            nn.Linear(64, BLOCK_SIZE)
        ).to(device)
        
        opt = optim.Adam(model.parameters(), lr=0.001)
        crit = nn.CrossEntropyLoss()
        
        model.train()
        for epoch in tqdm(range(40), desc=col_name, leave=False):
            for bx, by in loader:
                bx, by = bx.to(device), by.to(device)
                opt.zero_grad()
                loss = crit(model(bx), by)
                loss.backward()
                opt.step()
        
        independent_models[col_name] = model.cpu()
    
    # === TRAIN CONJUNCT MLP ===
    print("\nTraining Conjunct MLP (autoencoder)...")
    conjunct_model = nn.Sequential(
        nn.Linear(BLOCK_SIZE*16, 256), nn.ReLU(), nn.Dropout(0.3),
        nn.Linear(256, 256), nn.ReLU(), nn.Dropout(0.3),
        nn.Linear(256, BLOCK_SIZE*16)
    ).to(device)
    
    opt = optim.Adam(conjunct_model.parameters(), lr=0.001)
    crit = nn.MSELoss()
    
    conjunct_model.train()
    for epoch in tqdm(range(80), desc="Conjunct", leave=False):
        for i in range(0, len(full_onehot), 256):
            batch = full_onehot[i:i+256].to(device)
            opt.zero_grad()
            loss = crit(conjunct_model(batch), batch)
            loss.backward()
            opt.step()
    
    conjunct_model = conjunct_model.cpu()
    
    # === SAVE EVERYTHING ===
    torch.save({
        'independent_models': independent_models,
        'conjunct_model': conjunct_model,
        'label_encoders': label_encoders,
        'onehot_encoder': ohe,
        'block_size': BLOCK_SIZE
    }, 'models_complete.pth')
    
    print("\nINTRAIN COMPLETE → 'models_complete.pth' saved")
    return 'models_complete.pth'

# RUN ONCE
model_file = intrain()

Starting intrain() – training Independent + Conjunct MLPs...
Data loaded: (19900, 16)

Training 16 Independent MLPs...
  → Training f1 (1/16)


f1:   0%|          | 0/40 [00:00<?, ?it/s]

  → Training f2 (2/16)


f2:   0%|          | 0/40 [00:00<?, ?it/s]

  → Training f3 (3/16)


f3:   0%|          | 0/40 [00:00<?, ?it/s]

  → Training f4 (4/16)


f4:   0%|          | 0/40 [00:00<?, ?it/s]

  → Training f5 (5/16)


f5:   0%|          | 0/40 [00:00<?, ?it/s]

  → Training f6 (6/16)


f6:   0%|          | 0/40 [00:00<?, ?it/s]

  → Training f7 (7/16)


f7:   0%|          | 0/40 [00:00<?, ?it/s]

  → Training f8 (8/16)


f8:   0%|          | 0/40 [00:00<?, ?it/s]

  → Training f9 (9/16)


f9:   0%|          | 0/40 [00:00<?, ?it/s]

  → Training f10 (10/16)


f10:   0%|          | 0/40 [00:00<?, ?it/s]

  → Training f11 (11/16)


f11:   0%|          | 0/40 [00:00<?, ?it/s]

  → Training f12 (12/16)


f12:   0%|          | 0/40 [00:00<?, ?it/s]

  → Training f13 (13/16)


f13:   0%|          | 0/40 [00:00<?, ?it/s]

  → Training f14 (14/16)


f14:   0%|          | 0/40 [00:00<?, ?it/s]

  → Training f15 (15/16)


f15:   0%|          | 0/40 [00:00<?, ?it/s]

  → Training f16 (16/16)


f16:   0%|          | 0/40 [00:00<?, ?it/s]


Training Conjunct MLP (autoencoder)...


Conjunct:   0%|          | 0/80 [00:00<?, ?it/s]


INTRAIN COMPLETE → 'models_complete.pth' saved


In [16]:
def test_and_compare(model_path='models_complete.pth'):
    print("Testing Independent vs Conjunct MLP...")
    
    # FIXED: Load with weights_only=False (you trained this file)
    data = torch.load(model_path, weights_only=False)
    indep_models = data['independent_models']
    conj_model = data['conjunct_model']
    le = data['label_encoders']
    BLOCK_SIZE = data['block_size']
    
    df = pd.read_csv('DATA.csv')
    _, test_df = train_test_split(df, test_size=0.2, random_state=42)
    
    # Encode test
    encoded = pd.DataFrame({col: le[col].transform(test_df[col]) for col in test_df.columns})
    tensor = torch.tensor(encoded.values, dtype=torch.long)
    test_onehot = torch.nn.functional.one_hot(tensor, BLOCK_SIZE).view(-1, BLOCK_SIZE*16).float()
    
    # === GLOBAL IMPUTER ===
    def impute_independent(X_miss, col_names):
        X = X_miss.clone()
        with torch.no_grad():
            for col_idx in reversed(range(16)):
                col_name = col_names[col_idx]
                start = col_idx * BLOCK_SIZE
                end = start + BLOCK_SIZE
                missing = X[:, start:end].sum(dim=1) == 0
                if missing.any():
                    inp = X.to(device)
                    logits = indep_models[col_name].to(device)(inp)[missing]
                    probs = torch.softmax(logits / 0.5, dim=1)
                    pred = torch.argmax(probs, dim=1)
                    onehot = torch.nn.functional.one_hot(pred, BLOCK_SIZE).float()
                    X[missing, start:end] = onehot.cpu()
            for col_idx in range(16):
                col_name = col_names[col_idx]
                start = col_idx * BLOCK_SIZE
                end = start + BLOCK_SIZE
                missing = X[:, start:end].sum(dim=1) == 0
                if missing.any():
                    inp = X.to(device)
                    logits = indep_models[col_name].to(device)(inp)[missing]
                    pred = torch.argmax(logits, dim=1)
                    onehot = torch.nn.functional.one_hot(pred, BLOCK_SIZE).float()
                    X[missing, start:end] = onehot.cpu()
        return X
    
    def impute_conjunct(X_miss, iters=7):
        X = X_miss.clone()
        conj_model.to(device)
        conj_model.eval()
        with torch.no_grad():
            for _ in range(iters):
                X = X.to(device)
                X = conj_model(X)
                X = torch.clamp(X, 0, 1)
                X = X / (X.sum(dim=1, keepdim=True) + 1e-8)
                X = X.cpu()
        return X
    
    def create_missing(rate=0.1, seed=42):
        torch.manual_seed(seed)
        mask = torch.rand(test_onehot.shape) < rate
        X_miss = test_onehot.clone()
        X_miss[mask] = 0.0
        return X_miss, mask
    
    def accuracy(imputed, mask):
        feature_mask = mask.view(-1, 16, BLOCK_SIZE).sum(dim=2) > 0
        pred = torch.argmax(imputed.view(-1, 16, BLOCK_SIZE), dim=2)
        true = torch.argmax(test_onehot.view(-1, 16, BLOCK_SIZE), dim=2)
        return (pred[feature_mask] == true[feature_mask]).float().mean().item()
    
    # Run
    rates = [0.05, 0.1, 0.2]
    results = []
    col_names = test_df.columns.tolist()
    
    for rate in rates:
        X_miss, mask = create_missing(rate)
        X_indep = impute_independent(X_miss, col_names)
        X_conj = impute_conjunct(X_miss)
        acc_indep = accuracy(X_indep, mask)
        acc_conj = accuracy(X_conj, mask)
        results.append({
            'missing_rate': rate,
            'independent_best': round(acc_indep, 4),
            'conjunct_7iters': round(acc_conj, 4)
        })
    
    df_results = pd.DataFrame(results)
    print("\nFINAL COMPARISON – INDEPENDENT WINS:")
    print(df_results)
    df_results.to_csv('final_accuracy_comparison.csv', index=False)
    print("final_accuracy_comparison.csv saved")
    
    return df_results, impute_independent

# RUN IT
results_df, global_imputer = test_and_compare()

Testing Independent vs Conjunct MLP...

FINAL COMPARISON – INDEPENDENT WINS:
   missing_rate  independent_best  conjunct_7iters
0          0.05            0.9765           0.2241
1          0.10            0.9640           0.2239
2          0.20            0.9238           0.2238
final_accuracy_comparison.csv saved


In [None]:
def query(input_path='In?ut.csv', output_path='Imputed_Output.csv'):
    print(f"QUERY: Imputing {input_path} → {output_path}")
    
    
    data = torch.load('models_complete.pth', weights_only=False)
    indep_models = data['independent_models']
    le = data['label_encoders']
    BLOCK_SIZE = data['block_size']
    
    input_df = pd.read_csv(input_path)
    print(f"Input shape: {input_df.shape}")
    
    encoded = pd.DataFrame()
    for col in input_df.columns:
        col_data = input_df[col].fillna('A')
        col_data = col_data.astype(str)
        col_data = col_data.apply(lambda x: x if x in le[col].classes_ else le[col].classes_[0])
        encoded[col] = le[col].transform(col_data)
    
    tensor = torch.tensor(encoded.values, dtype=torch.long)
    X_miss = torch.nn.functional.one_hot(tensor, BLOCK_SIZE).view(-1, BLOCK_SIZE*16).float()
    
    for col_idx, col in enumerate(input_df.columns):
        missing = input_df[col].isna() | (input_df[col].astype(str) == '') | (input_df[col].astype(str) == '?')
        if missing.any():
            start = col_idx * BLOCK_SIZE
            end = start + BLOCK_SIZE
            X_miss[missing, start:end] = 0.0
    
    X_filled = global_imputer(X_miss, input_df.columns.tolist())
    
    labels = torch.argmax(X_filled.view(-1, 16, BLOCK_SIZE), dim=2)
    result = pd.DataFrame()
    for i, col in enumerate(input_df.columns):
        result[col] = le[col].inverse_transform(labels[:, i].cpu().numpy())
    
    result.to_csv(output_path, index=False)
    print(f"IMPUTED FILE SAVED → {output_path}")
    display(result.head())
    return result

# query('In?ut.csv', 'MY_SUBMISSION.csv')