This dataset generation code is written using the hints given in the project scope document, and the proof from the reference paper attached along.

In [None]:
import numpy as np
import pandas as pd
import itertools
from scipy.optimize import linprog

'''
This function is used to compute m-height and generate the dataset
The code here just checks if it is working fine for each combination of (n, k, m)
As this optimization takes GPU operations, to generate data for 1M, a modified version of this code was run on HPRC
'''

def m_height_linprog_exact(P, n, k, m):

    # Constructing generator matrix G = [I_k | P]
    G = np.hstack((np.eye(k), P))
    indices = list(range(n))

    # All the possible psi vectors of length m with their entries ±1
    psi_set = list(itertools.product([-1, 1], repeat=m))

    # This is the max m-height value, initially 0
    h_m_C = 0


    '''
    This nested function is used to solve the LP optimization of m-height for
    a fixed (a, b, X, psi)
    '''

    def solve_LP(a, b, X, psi):

        # Y is the set of remaining coordinates which are not in X, a, or b
        Y = list(set(indices) - set(X) - {a, b})

        # For psi index mapping, order of coordinates is: a, X, Y, b
        thou = [a] + sorted(list(X)) + Y + [b]

        # Objective funcion is: maximize psi[0] * <G[:,a], v> where v is a vector in R^k
        # Using -c here to reverse the objective, as linprog minimizes
        c = np.array([psi[0] * G[i, a] for i in range(k)])

        # The inequality constraints
        A_ub, b_ub = [], []

        # Following are the constraints for j in X
        for j in X:
            # first
            A_ub.append([psi[thou.index(j)] * G[i, j] - psi[0] * G[i, a] for i in range(k)])
            b_ub.append(0)
            # second
            A_ub.append([-psi[thou.index(j)] * G[i, j] for i in range(k)])
            b_ub.append(-1)

        # Following are the constraints for j in Y
        for j in Y:
            #The bound is: |<G[:,j], v>| <=1
            A_ub.append([G[i, j] for i in range(k)])
            b_ub.append(1)
            A_ub.append([-G[i, j] for i in range(k)])
            b_ub.append(1)

        # Equality constraint: <G[:,b], v> =1
        A_eq = [[G[i, b] for i in range(k)]]
        b_eq = [1]

        bounds = [(None, None)] * k
        res = linprog(-c, A_ub=A_ub, b_ub=b_ub, A_eq=A_eq, b_eq=b_eq, bounds=bounds, method='highs')

        # If success then return the optimal value, else return 0
        return -res.fun if res.success else 0

    # Going over all the valid pairs (a, b)
    for a, b in itertools.permutations(indices, 2):
        # Going over all (m-1)-size subsets X excluding a, b
        for X in itertools.combinations(set(indices) - {a, b}, m - 1):
            # Going over all ±1 vectors of length m
            for psi in psi_set:
                z_val = solve_LP(a, b, X, psi)
                # Always storing the maximum height so far.
                h_m_C = max(h_m_C, z_val)
    return h_m_C

# List of 21 (n, k, m) combinations in the project scope
combinations = [
    (9, 4, 2), (9, 4, 3), (9, 4, 4), (9, 4, 5), (9, 5, 2), (9, 5, 3), (9, 5, 4),
    (9, 6, 2), (9, 6, 3), (10, 4, 2), (10, 4, 3), (10, 4, 4), (10, 4, 5), (10, 4, 6),
    (10, 5, 2), (10, 5, 3), (10, 5, 4), (10, 5, 5), (10, 6, 2), (10, 6, 3), (10, 6, 4)
]

# Generating and storing data
data = []
for i, (n, k, m) in enumerate(combinations, 1):
    # P part of the G matrix is generated and value of P ranges between [-100,100]
    P = np.random.uniform(-100, 100, size=(k, n - k))
    h_m_C = m_height_linprog_exact(P, n, k, m)
    data.append((n, k, m, h_m_C, P))
    print(f"[{i}/{len(combinations)}] Computed h_m(C) for (n={n}, k={k}, m={m}) -> {h_m_C}")

# Saving as a DataFrame
df = pd.DataFrame(data, columns=['n', 'k', 'm', 'h_m_C', 'P'])
df.to_pickle("m_height_dataset.pkl")
print("Dataset saved successfully!")


[1/21] Computed h_m(C) for (n=9, k=4, m=2) -> 144.90346620425922
[2/21] Computed h_m(C) for (n=9, k=4, m=3) -> 242.89075886905553
[3/21] Computed h_m(C) for (n=9, k=4, m=4) -> 1330.2166227995049
[4/21] Computed h_m(C) for (n=9, k=4, m=5) -> 4086.4253934219873
[5/21] Computed h_m(C) for (n=9, k=5, m=2) -> 321.2006862406292
[6/21] Computed h_m(C) for (n=9, k=5, m=3) -> 884.4806452981419
[7/21] Computed h_m(C) for (n=9, k=5, m=4) -> 10349.177498422843
[8/21] Computed h_m(C) for (n=9, k=6, m=2) -> 898.8751718194876
[9/21] Computed h_m(C) for (n=9, k=6, m=3) -> 39566.99255978265
[10/21] Computed h_m(C) for (n=10, k=4, m=2) -> 142.99082974010506
[11/21] Computed h_m(C) for (n=10, k=4, m=3) -> 158.20899105501928
[12/21] Computed h_m(C) for (n=10, k=4, m=4) -> 325.48136135587765
[13/21] Computed h_m(C) for (n=10, k=4, m=5) -> 3390.269358512461
[14/21] Computed h_m(C) for (n=10, k=4, m=6) -> 64123.66800799458
[15/21] Computed h_m(C) for (n=10, k=5, m=2) -> 178.21142924308538
[16/21] Computed h_

In [None]:
import numpy as np
import pandas as pd
import itertools
from scipy.optimize import linprog
import os
import joblib
from random import choice

'''
This function is used to compute m-height and generate the dataset
The code runs 1M time across combinations to generate the dataset.
This was run on HPRC using a SLURM job.
'''

def m_height_linprog_exact(P, n, k, m):

    # Constructing generator matrix G = [I_k | P]
    G = np.hstack((np.eye(k), P))
    indices = list(range(n))

    # All the possible psi vectors of length m with their entries ±1
    psi_set = list(itertools.product([-1, 1], repeat=m))

    # This is the max m-height value, initially 0
    h_m_C = 0


    '''
    This nested function is used to solve the LP optimization of m-height for
    a fixed (a, b, X, psi)
    '''

    def solve_LP(a, b, X, psi):

        # Y is the set of remaining coordinates which are not in X, a, or b
        Y = list(set(indices) - set(X) - {a, b})

        # For psi index mapping, order of coordinates is: a, X, Y, b
        thou = [a] + sorted(list(X)) + Y + [b]

        # Objective funcion is: maximize psi[0] * <G[:,a], v> where v is a vector in R^k
        # Using -c here to reverse the objective, as linprog minimizes
        c = np.array([psi[0] * G[i, a] for i in range(k)])

        # The inequality constraints
        A_ub, b_ub = [], []

        # Following are the constraints for j in X
        for j in X:
            # first
            A_ub.append([psi[thou.index(j)] * G[i, j] - psi[0] * G[i, a] for i in range(k)])
            b_ub.append(0)
            # second
            A_ub.append([-psi[thou.index(j)] * G[i, j] for i in range(k)])
            b_ub.append(-1)

        # Following are the constraints for j in Y
        for j in Y:
            #The bound is: |<G[:,j], v>| <=1
            A_ub.append([G[i, j] for i in range(k)])
            b_ub.append(1)
            A_ub.append([-G[i, j] for i in range(k)])
            b_ub.append(1)

        # Equality constraint: <G[:,b], v> =1
        A_eq = [[G[i, b] for i in range(k)]]
        b_eq = [1]

        bounds = [(None, None)] * k
        res = linprog(-c, A_ub=A_ub, b_ub=b_ub, A_eq=A_eq, b_eq=b_eq, bounds=bounds, method='highs')

        # If success then return the optimal value, else return
        return -res.fun if res.success else 0

    # Going over all the valid pairs (a, b)
    for a, b in itertools.permutations(indices, 2):
        # Going over all (m-1)-size subsets X excluding a, b
        for X in itertools.combinations(set(indices) - {a, b}, m - 1):
            # Going over all ±1 vectors of length m
            for psi in psi_set:
                z_val = solve_LP(a, b, X, psi)
                # Always storing the maximum height so far.
                h_m_C = max(h_m_C, z_val)
    return h_m_C

# List of 21 (n, k, m) combinations in the project scope
combinations = [
    (9, 4, 2), (9, 4, 3), (9, 4, 4), (9, 4, 5), (9, 5, 2), (9, 5, 3), (9, 5, 4),
    (9, 6, 2), (9, 6, 3), (10, 4, 2), (10, 4, 3), (10, 4, 4), (10, 4, 5), (10, 4, 6),
    (10, 5, 2), (10, 5, 3), (10, 5, 4), (10, 5, 5), (10, 6, 2), (10, 6, 3), (10, 6, 4)
]

# Generating and storing data
data = []

# Generating total 1M samples randomly across all combinations
total_samples = 1_000_000

for i in enumerate(total_samples):
    # Randomly choosing a combination
    n, k, m = choice(combinations)
    # P part of the G matrix is generated and value of P ranges between [-100,100]
    P = np.random.uniform(-100, 100, size=(k, n - k))
    h_m_C = m_height_linprog_exact(P, n, k, m)
    data.append((n, k, m, h_m_C, P))
    # print(f"[{i}/{len(combinations)}] Computed h_m(C) for (n={n}, k={k}, m={m}) -> {h_m_C}")

# Saving as a DataFrame in my directory
df = pd.DataFrame(data, columns=['n', 'k', 'm', 'h_m_C', 'P'])
output_dir = os.environ.get("SCRATCH") + "/dipanwita22rano/dlproject"
os.makedirs(output_dir, exist_ok=True)
output_path = os.path.join(output_dir, "m_height_dataset.pkl")
with open(output_path, 'wb') as f:
    joblib.dump(df, f)
print("Dataset saved successfully!")


In [None]:
import joblib
from google.colab import drive
import pandas as pd

drive.mount('/content/drive')
file_path = '/content/drive/My Drive/m_height_dataset.pkl'
with open(file_path, 'rb') as f:
    df = joblib.load(f)
output_path = '/content/drive/My Drive/m_height_dataset_.pkl'
with open(output_path, 'wb') as f:
    joblib.dump(df, f)
print(df.head(5))

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
    n  k  m         h_m_C                                                  P
0  10  4  3    180.681884  [[0.41332842078512044, 70.03763016732799, 92.9...
1   9  6  3  29425.835225  [[-22.39439891822049, -45.59568035361643, 84.5...
2  10  4  4    590.143407  [[-98.04865191290197, 55.176558968450024, -47....
3   9  6  3   6838.797140  [[-72.01756032528372, -96.84222090558315, -97....
4   9  6  3   5219.964621  [[86.98714113704173, 62.11282763418615, -49.67...


Better Architecture Experiment - Using the same Transformer model as in previous projects, but using more regularization here, and epochs has been increased.

* It was observed that loss values increased with larger m, making higher-m groups harder to predict accurately.

* Motivation: introduce stronger regularization to models corresponding to larger m values to stabilize training and prevent overfitting on noisy or more complex data.

* L2 regularization was added to the Dense layers to encourage smoother predictions especially where learning became more unstable for high m.

* Although regularization improved stability, it sometimes caused slightly worse predictive performance (higher RMSLE) due to the reduced model flexibility.

* Overall, this trade-off aimed to prioritize more consistent behavior across varying m values rather than overfitting the easier (small m) cases only.

But finally even this experiment failed, with slightly poorer results compared to the previous submitted model.

In [None]:
import os
import joblib
import numpy as np
import pandas as pd
from google.colab import drive
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_log_error
from tensorflow import keras
from keras import layers, regularizers
import tensorflow as tf

drive.mount('/content/drive')
file_path = '/content/drive/My Drive/m_height_dataset.pkl'
with open(file_path, 'rb') as f:
    df = joblib.load(f)

df['k_val'] = df['P'].apply(lambda x: x.shape[0])
max_k = df['k_val'].max()
print(f"Maximum k (rows in P): {max_k}")

'''
  Dataset Split
  Test = 15% of df
  Train = 85% * (85%) of df
  Validation = 15% * (85%) of df
'''
train_val_df, test_df = train_test_split(df, test_size=0.15, random_state=42)
train_df, val_df = train_test_split(train_val_df, test_size=0.15, random_state=42)

'''
  Introducing a weightles loss in the preprocessing function
'''
def preprocess_group_with_weight(group_df):
    X_list, y_list, weights_list = [], [], []
    for _, row in group_df.iterrows():
        P = np.array(row['P']).astype(np.float32)
        n, k, m = row['n'], row['k'], row['m']
        meta = np.array([n, k, m], dtype=np.float32)

        row_features = []
        for i in range(P.shape[0]):
            pos_encoding = np.zeros(max_k)
            pos_encoding[i] = 1.0
            features = np.concatenate([P[i], meta, pos_encoding])
            row_features.append(features)

        X = np.array(row_features)
        X_list.append(X)
        y_list.append(np.log(row['h_m_C']))
        # Optional weight scaling here
        weights_list.append(1.0 / (m + 1))

    return np.array(X_list), np.array(y_list), np.array(weights_list)

# Same old Transformer model
def build_transformer_model(input_shape):
    inputs = keras.Input(shape=input_shape)
    x = layers.LayerNormalization()(inputs)
    x = layers.MultiHeadAttention(num_heads=2, key_dim=8)(x, x)
    x = layers.GlobalAveragePooling1D()(x)
    x = layers.Dense(64, activation='relu', kernel_regularizer=regularizers.l2(1e-4))(x)
    x = layers.Dense(32, activation='relu', kernel_regularizer=regularizers.l2(1e-4))(x)
    outputs = layers.Dense(1)(x)
    model = keras.Model(inputs, outputs)
    model.compile(optimizer='adam', loss='mse')
    return model

results = []
global_y_true = []
global_y_pred = []

unique_groups = sorted(train_df.groupby(['n', 'k', 'm']).groups.keys())
os.makedirs('/content/models', exist_ok=True)

for (n, k, m) in unique_groups:
    print(f"\nTraining model for (n={n}, k={k}, m={m})")

    train_group = train_df[(train_df['n'] == n) & (train_df['k'] == k) & (train_df['m'] == m)]
    val_group = val_df[(val_df['n'] == n) & (val_df['k'] == k) & (val_df['m'] == m)]
    test_group = test_df[(test_df['n'] == n) & (test_df['k'] == k) & (test_df['m'] == m)]

    X_train, y_train, weights_train = preprocess_group_with_weight(train_group)
    X_val, y_val, weights_val = preprocess_group_with_weight(val_group)
    X_test, y_test_log, weights_test = preprocess_group_with_weight(test_group)

    model = build_transformer_model(input_shape=X_train.shape[1:])
    model.fit(
        X_train, y_train,
        sample_weight=weights_train,
        validation_data=(X_val, y_val),
        epochs=80,
        batch_size=32,
        verbose=0,
        callbacks=[keras.callbacks.EarlyStopping(patience=5, restore_best_weights=True)]
    )

    model_path = f'/content/models/model_n{n}_k{k}_m{m}.h5'
    model.save(model_path)
    print(f"Model saved to {model_path}")

    y_pred_log = model.predict(X_test).flatten()
    y_pred = np.exp(y_pred_log)
    y_true = np.exp(y_test_log)

    y_pred = np.maximum(y_pred, 1.0)
    y_true = np.maximum(y_true, 1.0)

    global_y_pred.extend(y_pred)
    global_y_true.extend(y_true)

    y_pred_log2 = np.log2(y_pred)
    y_true_log2 = np.log2(y_true)
    rmsle = np.sqrt(mean_squared_log_error(y_true_log2, y_pred_log2))
    results.append(((n, k, m), rmsle))
    print(f"RMSLE (log base 2): {rmsle:.4f}")

# Evaluation metrics globally
global_y_pred = np.array(global_y_pred)
global_y_true = np.array(global_y_true)

global_y_pred_log2 = np.log2(global_y_pred)
global_y_true_log2 = np.log2(global_y_true)

global_cost = ((global_y_pred_log2 - global_y_true_log2) ** 2).mean()
global_rmsle = np.sqrt(mean_squared_log_error(global_y_true_log2, global_y_pred_log2))

print("\n=== Summary of Group-wise RMSLE ===")
for (n, k, m), rmsle in results:
    print(f"Group (n={n}, k={k}, m={m}): RMSLE (log base 2) = {rmsle:.4f}")

print("\n=== Global Evaluation ===")
print(f"Global Cost (σ) = {global_cost:.4f}")
print(f"Global RMSLE (log base 2) = {global_rmsle:.4f}")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Maximum k (rows in P): 6

Training model for (n=9, k=4, m=2)




Model saved to /content/models/model_n9_k4_m2.h5
[1m195/195[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step
RMSLE (log base 2): 0.0538

Training model for (n=9, k=4, m=3)




Model saved to /content/models/model_n9_k4_m3.h5
[1m196/196[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step
RMSLE (log base 2): 0.0520

Training model for (n=9, k=4, m=4)




Model saved to /content/models/model_n9_k4_m4.h5
[1m195/195[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step
RMSLE (log base 2): 0.0829

Training model for (n=9, k=4, m=5)




Model saved to /content/models/model_n9_k4_m5.h5
[1m196/196[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step
RMSLE (log base 2): 0.1156

Training model for (n=9, k=5, m=2)




Model saved to /content/models/model_n9_k5_m2.h5
[1m265/265[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step
RMSLE (log base 2): 0.0508

Training model for (n=9, k=5, m=3)




Model saved to /content/models/model_n9_k5_m3.h5
[1m260/260[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step
RMSLE (log base 2): 0.0802

Training model for (n=9, k=5, m=4)




Model saved to /content/models/model_n9_k5_m4.h5
[1m258/258[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step
RMSLE (log base 2): 0.1101

Training model for (n=9, k=6, m=2)




Model saved to /content/models/model_n9_k6_m2.h5
[1m394/394[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step
RMSLE (log base 2): 0.0741

Training model for (n=9, k=6, m=3)




Model saved to /content/models/model_n9_k6_m3.h5
[1m390/390[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step
RMSLE (log base 2): 0.1151

Training model for (n=10, k=4, m=2)




Model saved to /content/models/model_n10_k4_m2.h5
[1m158/158[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step
RMSLE (log base 2): 0.1384

Training model for (n=10, k=4, m=3)




Model saved to /content/models/model_n10_k4_m3.h5
[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step
RMSLE (log base 2): 0.0370

Training model for (n=10, k=4, m=4)




Model saved to /content/models/model_n10_k4_m4.h5
[1m154/154[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step
RMSLE (log base 2): 0.0550

Training model for (n=10, k=4, m=5)




Model saved to /content/models/model_n10_k4_m5.h5
[1m156/156[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step
RMSLE (log base 2): 0.0825

Training model for (n=10, k=4, m=6)




Model saved to /content/models/model_n10_k4_m6.h5
[1m156/156[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step
RMSLE (log base 2): 0.1110

Training model for (n=10, k=5, m=2)




Model saved to /content/models/model_n10_k5_m2.h5
[1m195/195[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step
RMSLE (log base 2): 0.0387

Training model for (n=10, k=5, m=3)




Model saved to /content/models/model_n10_k5_m3.h5
[1m193/193[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step
RMSLE (log base 2): 0.0584

Training model for (n=10, k=5, m=4)




Model saved to /content/models/model_n10_k5_m4.h5
[1m201/201[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step
RMSLE (log base 2): 0.0792

Training model for (n=10, k=5, m=5)




Model saved to /content/models/model_n10_k5_m5.h5
[1m195/195[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step
RMSLE (log base 2): 0.1049

Training model for (n=10, k=6, m=2)




Model saved to /content/models/model_n10_k6_m2.h5
[1m262/262[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step
RMSLE (log base 2): 0.0502

Training model for (n=10, k=6, m=3)




Model saved to /content/models/model_n10_k6_m3.h5
[1m262/262[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step
RMSLE (log base 2): 0.0766

Training model for (n=10, k=6, m=4)


Meta Learner Model - Experimental

This is an ensemble approach. It includes:
* A Transformer model was trained to predict log(h_m_C) from the processed input.
* A Ridge regression meta-learner was added on top of Transformer predictions.
* The Ridge model combines predictions (if multiple seeds/models are used) and acts as a second-stage smoother to correct Transformer biases or overfitting, especially where m-value is high.

Key Changes: Instead of using only one neural network, predictions are regularized and stabilized by training a simple linear model (Ridge) over Transformer outputs.


Even thiug it shows comparative results, still it fails to beat the transformer model used below.

In [None]:
import os
import joblib
import numpy as np
import pandas as pd
from google.colab import drive
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_log_error
from sklearn.model_selection import train_test_split
from tensorflow import keras
from keras import layers, regularizers

drive.mount('/content/drive')
file_path = '/content/drive/My Drive/m_height_dataset.pkl'
with open(file_path, 'rb') as f:
    df = joblib.load(f)

df['k_val'] = df['P'].apply(lambda x: x.shape[0])
max_k = df['k_val'].max()
print(f"Maximum k (rows in P): {max_k}")

'''
  Splitting the dataset here:
  Test = 15% of df
  Train = 85% * (85%) of df
  Validation = 15% * (85%) of df
'''
train_val_df, test_df = train_test_split(df, test_size=0.15, random_state=42)
train_df, val_df = train_test_split(train_val_df, test_size=0.15, random_state=42)


def preprocess_group(group_df):
    X_list, y_list = [], []
    for _, row in group_df.iterrows():
        P = np.array(row['P']).astype(np.float32)
        n, k, m = row['n'], row['k'], row['m']
        meta = np.array([n, k, m], dtype=np.float32)

        row_features = []
        for i in range(P.shape[0]):
            pos_encoding = np.zeros(max_k)
            pos_encoding[i] = 1.0
            features = np.concatenate([P[i], meta, pos_encoding])
            row_features.append(features)

        X = np.array(row_features)
        X_list.append(X)
        y_list.append(np.log(row['h_m_C']))

    return np.array(X_list), np.array(y_list)

# Building the initial transformer model
def build_transformer_model(input_shape):
    inputs = keras.Input(shape=input_shape)
    x = layers.LayerNormalization()(inputs)
    x = layers.MultiHeadAttention(num_heads=2, key_dim=8)(x, x)
    x = layers.GlobalAveragePooling1D()(x)
    x = layers.Dense(64, activation='relu', kernel_regularizer=regularizers.l2(1e-4))(x)
    x = layers.Dense(32, activation='relu', kernel_regularizer=regularizers.l2(1e-4))(x)
    outputs = layers.Dense(1)(x)
    model = keras.Model(inputs, outputs)
    model.compile(optimizer='adam', loss='mse')
    return model

unique_groups = sorted(train_df.groupby(['n', 'k', 'm']).groups.keys())
os.makedirs('/content/models', exist_ok=True)

# Storing one Ridge per group here
ridge_models = {}
global_true, global_pred = [], []

for (n, k, m) in unique_groups:
    print(f"\nTraining group (n={n}, k={k}, m={m})")

    train_group = train_df[(train_df['n'] == n) & (train_df['k'] == k) & (train_df['m'] == m)]
    val_group = val_df[(val_df['n'] == n) & (val_df['k'] == k) & (val_df['m'] == m)]
    test_group = test_df[(test_df['n'] == n) & (test_df['k'] == k) & (test_df['m'] == m)]

    if len(train_group) == 0 or len(val_group) == 0 or len(test_group) == 0:
        print("Skipping due to insufficient data.")
        continue

    X_train, y_train = preprocess_group(train_group)
    X_val, y_val = preprocess_group(val_group)
    X_test, y_test = preprocess_group(test_group)

    models = []
    meta_train, meta_val, meta_test = [], [], []

    # keeping seeds to 1
    for seed in range(1):
        model = build_transformer_model(X_train.shape[1:])
        model.fit(
            X_train, y_train,
            validation_data=(X_val, y_val),
            epochs=50,
            batch_size=32,
            verbose=0,
            callbacks=[keras.callbacks.EarlyStopping(patience=5, restore_best_weights=True)]
        )
        models.append(model)

        model_save_path = f"/content/models/transformer_n{n}_k{k}_m{m}_seed{seed}.keras"
        model.save(model_save_path)
        print(f"Saved model: {model_save_path}")

        meta_train.append(model.predict(X_train).flatten())
        meta_val.append(model.predict(X_val).flatten())
        meta_test.append(model.predict(X_test).flatten())

    # Stacking predictions for each group
    meta_train = np.vstack(meta_train).T
    meta_val = np.vstack(meta_val).T
    meta_test = np.vstack(meta_test).T

    # Training the Ridge meta-learner on each group
    ridge = Ridge(alpha=1.0)
    ridge.fit(meta_train, y_train)

    ridge_save_path = f"/content/models/ridge_n{n}_k{k}_m{m}.joblib"
    joblib.dump(ridge, ridge_save_path)
    print(f"Saved Ridge model: {ridge_save_path}")

    pred_test = ridge.predict(meta_test)

    # Storing each group predictions
    global_true.append(np.exp(y_test))
    global_pred.append(np.exp(pred_test))
    ridge_models[(n, k, m)] = ridge


if len(global_true) > 0 and len(global_pred) > 0:
    global_true = np.concatenate(global_true)
    global_pred = np.concatenate(global_pred)

    global_pred = np.maximum(global_pred, 1.0)
    global_true = np.maximum(global_true, 1.0)

    global_rmsle = np.sqrt(mean_squared_log_error(np.log2(global_true), np.log2(global_pred)))
    global_cost = np.mean((np.log2(global_true) - np.log2(global_pred))**2)

    print(f"\nGlobal Ensemble RMSLE (log base 2): {global_rmsle:.4f}")
    print(f"Global Test Cost σ: {global_cost:.4f}")
else:
    print("\nNo models were trained due to insufficient data!")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Maximum k (rows in P): 6

Training group (n=9, k=4, m=2)
Saved model: /content/models/transformer_n9_k4_m2_seed0.keras
[1m945/945[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step
[1m164/164[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
[1m195/195[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step
Saved Ridge model: /content/models/ridge_n9_k4_m2.joblib

Training group (n=9, k=4, m=3)
Saved model: /content/models/transformer_n9_k4_m3_seed0.keras
[1m945/945[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step
[1m165/165[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
[1m196/196[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
Saved Ridge model: /content/models/ridge_n9_k4_m3.joblib

Training group (n=9, k=4, m=4)
Saved model: /content/models/transformer_n9_k4_m4_seed0.keras


This is the final model I came up with, with the lowest RMSLE amongst all other methods that i tried. This set of 21 models use transformer models, with one-hot encoding of row indices.

Other models that I had tried were:
* Single general model of feed forward architecture. It consisted of 4 layers, regularization, and an extensive hyperparameter tuning was done on it. Test RMSLE was way higher than validation.
* 21 models for each combination of (n, k, m) but all feed forward. Even though this reduced the RMSLE, but it predicted same values for a combination of (n, k, m). Basically this model only learnt a value for each combination, which would minimize the loss.
* As P is a matrix, I thought of using CNN, to have a 2D input. But this proved to be the most adverserial, having RMSLE even as high as 57. Thus, it proved that for m-height calculation, there is no spatial relationship between the matrix cell values.

In [None]:
import os
import joblib
import numpy as np
import pandas as pd
from google.colab import drive
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_log_error
from tensorflow import keras
from keras import layers, regularizers
import tensorflow as tf

'''
  The dataset was saved to my google drive, so that I do not need to repeatedly
  upload it to my Colab runtime.
'''
drive.mount('/content/drive')
file_path = '/content/drive/My Drive/m_height_dataset.pkl'
with open(file_path, 'rb') as f:
    df = joblib.load(f)

'''
  Computing the maximum number of rows in P for Positional Encoding.
  It is used to fix the size of one-hot encoding
  This gets the maximum value seen across all samples,
  so it must be 6 according to the project scope, else error in dataset generation.
'''
df['k_val'] = df['P'].apply(lambda x: x.shape[0])
max_k = df['k_val'].max()
print(f"Maximum k (rows in P): {max_k}")

'''
  Splitting the dataset here:
  Test = 15% of df
  Train = 85% * (85%) of df
  Validation = 15% * (85%) of df
'''
train_val_df, test_df = train_test_split(df, test_size=0.15, random_state=42)
train_df, val_df = train_test_split(train_val_df, test_size=0.15, random_state=42)

'''
  The following preprocessing function:
  Adds metadata (n, k, m) and positional encoding to each row of P
  I tested without adding metadata, as anyways making individual models for each combination
  But the results were adversarial with poor RMSLE values

  Output shape: (k, feature_dim) where feature_dim = (columns in P + 3 metadata + max_k one-hot)
  Target: log(h_m_C)
  I tested the target without its log value, that is, the actual m-height.
  But the RMSLE is better when the model is trained on log(m-height)
'''
def preprocess_group(group_df):
    X_list, y_list = [], []
    for _, row in group_df.iterrows():
        # P has the shape (k, n-k)
        P = np.array(row['P']).astype(np.float32)
        n, k, m = row['n'], row['k'], row['m']
        meta = np.array([n, k, m], dtype=np.float32)

        row_features = []
        for i in range(P.shape[0]):
            pos_encoding = np.zeros(max_k)
            # One-hot encoding the row index here
            pos_encoding[i] = 1.0
            features = np.concatenate([P[i], meta, pos_encoding])
            row_features.append(features)

        X = np.array(row_features)
        X_list.append(X)
        # Log scaling the target here
        y_list.append(np.log(row['h_m_C']))
    return np.array(X_list), np.array(y_list)

'''
  After much experimentation, developed a transformer based model.
  This stems from the idea that m-height may depend on interactions between different rows.
  Thus a transformer can model pairwise and gobal dependencies via self-attention.

  Using transformer is better than fully connected layers alone because,
  MLPs treat all rows flatly and independently,
  CNNs assume local patterns, which is not present in a mathematical concept,
  Thus transformers can learn attention weights dynamically across all rows.
'''
def build_transformer_model(input_shape):
    inputs = keras.Input(shape=input_shape)

    # Normalization
    x = layers.LayerNormalization()(inputs)
    # Self-attention
    x = layers.MultiHeadAttention(num_heads=2, key_dim=8)(x, x)
    # For variable-length inputs
    x = layers.GlobalAveragePooling1D()(x)

    # Dense layer with L2 regularization
    x = layers.Dense(64, activation='relu', kernel_regularizer=regularizers.l2(1e-4))(x)
    x = layers.Dense(32, activation='relu', kernel_regularizer=regularizers.l2(1e-4))(x)
    # Finally predict a single log(h_m_C) value
    outputs = layers.Dense(1)(x)

    model = keras.Model(inputs, outputs)
    # MSE loss is being used in log-scale
    model.compile(optimizer='adam', loss='mse')
    return model

# Training, saving and evaluating model for each combination of (n, k, m) groups
results = []
unique_groups = sorted(train_df.groupby(['n', 'k', 'm']).groups.keys())

os.makedirs('/content/models', exist_ok=True)

for (n, k, m) in unique_groups:
    print(f"\nTraining model for (n={n}, k={k}, m={m})")

    # Filtering group-specific data from each dataset split
    train_group = train_df[(train_df['n'] == n) & (train_df['k'] == k) & (train_df['m'] == m)]
    val_group = val_df[(val_df['n'] == n) & (val_df['k'] == k) & (val_df['m'] == m)]
    test_group = test_df[(test_df['n'] == n) & (test_df['k'] == k) & (test_df['m'] == m)]

    # Skipping groups which do not have even a single data, though this condition should never be reached
    if len(train_group) == 0 or len(val_group) == 0 or len(test_group) == 0:
        print("Skipping due to insufficient data.")
        continue

    # Applying preprocessing on each split
    X_train, y_train = preprocess_group(train_group)
    X_val, y_val = preprocess_group(val_group)
    X_test, y_test_log = preprocess_group(test_group)

    # Training and model building
    # Increasing epochs from 30 to 80, as validation accuracy performs little better again after 50 epochs as seen while experimenting
    model = build_transformer_model(input_shape=X_train.shape[1:])
    model.fit(
        X_train, y_train,
        validation_data=(X_val, y_val),
        epochs=80,
        batch_size=32,
        verbose=0,
        callbacks=[keras.callbacks.EarlyStopping(patience=5, restore_best_weights=True)]
    )

    model_path = f'/content/models/model_n{n}_k{k}_m{m}.h5'
    model.save(model_path)
    print(f"Model saved to {model_path}")

    # Prediction on the test set
    # Log base e value is predicted, which is converted to actual value
    y_pred_log = model.predict(X_test).flatten()
    y_pred = np.exp(y_pred_log)
    y_true = np.exp(y_test_log)

    # Clipping predictions to ensure valid domain for log2
    y_pred = np.maximum(y_pred, 1.0)
    y_true = np.maximum(y_true, 1.0)

    # Computing RMSLE in log base 2 space
    # This is done as the test cost function in the project scope suggests so
    y_pred_log2 = np.log2(y_pred)
    y_true_log2 = np.log2(y_true)
    rmsle = np.sqrt(mean_squared_log_error(y_true_log2, y_pred_log2))
    results.append(((n, k, m), rmsle))
    print(f"RMSLE (log base 2): {rmsle:.4f}")

print("\n=== Summary of Results ===")
for (n, k, m), rmsle in results:
    print(f"Group (n={n}, k={k}, m={m}): RMSLE (log base 2) = {rmsle:.4f}")


Mounted at /content/drive
Maximum k (rows in P): 6

Training model for (n=9, k=4, m=2)




Model saved to /content/models/model_n9_k4_m2.h5
[1m195/195[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step
RMSLE (log base 2): 0.0526

Training model for (n=9, k=4, m=3)




Model saved to /content/models/model_n9_k4_m3.h5
[1m196/196[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step
RMSLE (log base 2): 0.0524

Training model for (n=9, k=4, m=4)




Model saved to /content/models/model_n9_k4_m4.h5
[1m195/195[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step
RMSLE (log base 2): 0.0829

Training model for (n=9, k=4, m=5)




Model saved to /content/models/model_n9_k4_m5.h5
[1m196/196[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step
RMSLE (log base 2): 0.1155

Training model for (n=9, k=5, m=2)




Model saved to /content/models/model_n9_k5_m2.h5
[1m265/265[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step
RMSLE (log base 2): 0.0506

Training model for (n=9, k=5, m=3)




Model saved to /content/models/model_n9_k5_m3.h5
[1m260/260[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step
RMSLE (log base 2): 0.0795

Training model for (n=9, k=5, m=4)




Model saved to /content/models/model_n9_k5_m4.h5
[1m258/258[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step
RMSLE (log base 2): 0.1094

Training model for (n=9, k=6, m=2)




Model saved to /content/models/model_n9_k6_m2.h5
[1m394/394[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step
RMSLE (log base 2): 0.0716

Training model for (n=9, k=6, m=3)




Model saved to /content/models/model_n9_k6_m3.h5
[1m390/390[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step
RMSLE (log base 2): 0.1152

Training model for (n=10, k=4, m=2)




Model saved to /content/models/model_n10_k4_m2.h5
[1m158/158[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step
RMSLE (log base 2): 0.1386

Training model for (n=10, k=4, m=3)




Model saved to /content/models/model_n10_k4_m3.h5
[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step
RMSLE (log base 2): 0.0374

Training model for (n=10, k=4, m=4)




Model saved to /content/models/model_n10_k4_m4.h5
[1m154/154[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step
RMSLE (log base 2): 0.0552

Training model for (n=10, k=4, m=5)




Model saved to /content/models/model_n10_k4_m5.h5
[1m156/156[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step
RMSLE (log base 2): 0.0824

Training model for (n=10, k=4, m=6)




Model saved to /content/models/model_n10_k4_m6.h5
[1m156/156[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step
RMSLE (log base 2): 0.1110

Training model for (n=10, k=5, m=2)




Model saved to /content/models/model_n10_k5_m2.h5
[1m195/195[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step
RMSLE (log base 2): 0.0382

Training model for (n=10, k=5, m=3)




Model saved to /content/models/model_n10_k5_m3.h5
[1m193/193[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step
RMSLE (log base 2): 0.0581

Training model for (n=10, k=5, m=4)




Model saved to /content/models/model_n10_k5_m4.h5
[1m201/201[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step
RMSLE (log base 2): 0.0786

Training model for (n=10, k=5, m=5)




Model saved to /content/models/model_n10_k5_m5.h5
[1m195/195[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step
RMSLE (log base 2): 0.1044

Training model for (n=10, k=6, m=2)




Model saved to /content/models/model_n10_k6_m2.h5
[1m262/262[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step
RMSLE (log base 2): 0.0505

Training model for (n=10, k=6, m=3)




Model saved to /content/models/model_n10_k6_m3.h5
[1m262/262[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step
RMSLE (log base 2): 0.0770

Training model for (n=10, k=6, m=4)




Model saved to /content/models/model_n10_k6_m4.h5
[1m259/259[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step
RMSLE (log base 2): 0.1030

=== Summary of Results ===
Group (n=9, k=4, m=2): RMSLE (log base 2) = 0.0526
Group (n=9, k=4, m=3): RMSLE (log base 2) = 0.0524
Group (n=9, k=4, m=4): RMSLE (log base 2) = 0.0829
Group (n=9, k=4, m=5): RMSLE (log base 2) = 0.1155
Group (n=9, k=5, m=2): RMSLE (log base 2) = 0.0506
Group (n=9, k=5, m=3): RMSLE (log base 2) = 0.0795
Group (n=9, k=5, m=4): RMSLE (log base 2) = 0.1094
Group (n=9, k=6, m=2): RMSLE (log base 2) = 0.0716
Group (n=9, k=6, m=3): RMSLE (log base 2) = 0.1152
Group (n=10, k=4, m=2): RMSLE (log base 2) = 0.1386
Group (n=10, k=4, m=3): RMSLE (log base 2) = 0.0374
Group (n=10, k=4, m=4): RMSLE (log base 2) = 0.0552
Group (n=10, k=4, m=5): RMSLE (log base 2) = 0.0824
Group (n=10, k=4, m=6): RMSLE (log base 2) = 0.1110
Group (n=10, k=5, m=2): RMSLE (log base 2) = 0.0382
Group (n=10, k=5, m=3): RMSLE (log base 2) = 0.05

This model is trained over 80 epochs rather than 30 epochs as in earlier submissions. Earlier I had only experimented till 50 epochs, and saw that validation loss was not decreasing after approximtely 30. But further experimentation showed that loss was again decreasing after 65 and converging near 80. Thus I finally came up with the above model.

Next I generated another dataset of 100k rows, to further check how the model is performing on completely unseen data, and data that has been generated in a different session. This dataset was saved in my drive with the name 'm_height_dataset_1k.pkl'

In [1]:
import os
import joblib
import numpy as np
from zipfile import ZipFile
from google.colab import drive
from tensorflow import keras
from sklearn.metrics import mean_squared_log_error

'''
  Usual preprocessing done before too while training
'''
drive.mount('/content/drive')

test_file_path = '/content/drive/My Drive/m_height_100k.pkl'
with open(test_file_path, 'rb') as f:
    test_df = joblib.load(f)


# Fixing max_k based on analysis during training
max_k = 6


def preprocess_group(group_df):
    X_list, y_list = [], []
    for _, row in group_df.iterrows():
        P = np.array(row['P']).astype(np.float32)
        n, k, m = row['n'], row['k'], row['m']
        meta = np.array([n, k, m], dtype=np.float32)

        row_features = []
        for i in range(P.shape[0]):
            pos_encoding = np.zeros(max_k)
            pos_encoding[i] = 1.0
            features = np.concatenate([P[i], meta, pos_encoding])
            row_features.append(features)

        X = np.array(row_features)
        X_list.append(X)
        y_list.append(np.log(row['h_m_C']))
    return np.array(X_list), np.array(y_list)

'''
  The final model was saved with the name '436000715_models_3.zip'.
  That is now uploaded to runtime and unzipped.
  The test data is first grouped into bins of each combinations
  of (n, k, m), and then the pertaining model is invoked.
'''
model_zip_path = '436000715_models_3.zip'
model_dir = 'models'
os.makedirs(model_dir, exist_ok=True)

with ZipFile(model_zip_path, 'r') as zip_ref:
    zip_ref.extractall(model_dir)

# These variables keep track of the evaluation metrics, globally across all models
all_preds_log2 = []
all_trues_log2 = []

custom_objects = {'mse': keras.losses.MeanSquaredError()}
# Splitting data into groups
unique_groups = sorted(test_df.groupby(['n', 'k', 'm']).groups.keys())

for (n, k, m) in unique_groups:
    model_path = os.path.join(model_dir, f"model_n{n}_k{k}_m{m}.h5")
    #This condition should not be reached, but an error check
    if not os.path.exists(model_path):
        continue

    group_df = test_df[(test_df['n'] == n) & (test_df['k'] == k) & (test_df['m'] == m)]
    if len(group_df) == 0:
        continue

    X_test, y_test_log = preprocess_group(group_df)
    model = keras.models.load_model(model_path, custom_objects=custom_objects)

    y_pred_log = model.predict(X_test).flatten()
    y_pred = np.exp(y_pred_log)
    y_true = np.exp(y_test_log)

    y_pred = np.maximum(y_pred, 1.0)
    y_true = np.maximum(y_true, 1.0)

    all_preds_log2.extend(np.log2(y_pred))
    all_trues_log2.extend(np.log2(y_true))

    # To check the results, printing some actual vs predicted m-heights
    print(f"\n--- Predictions for Group (n={n}, k={k}, m={m}) ---")
    num_samples = min(len(y_true), 5)
    for i in range(num_samples):
        print(f"Sample {i+1}: Actual = {y_true[i]:.2f}, Predicted = {y_pred[i]:.2f}")

# Calculating all the metrics globally
if all_preds_log2:
    all_preds_log2 = np.array(all_preds_log2)
    all_trues_log2 = np.array(all_trues_log2)

    global_rmsle = np.sqrt(mean_squared_log_error(all_trues_log2, all_preds_log2))
    avg_cost = np.mean((all_trues_log2 - all_preds_log2) ** 2)

    print("\n=== Global Test Evaluation ===")
    print(f"Total Samples: {len(all_preds_log2)}")
    print(f"Global Average Cost σ: {avg_cost:.4f}")
    print(f"Global RMSLE (log base 2): {global_rmsle:.4f}")
else:
    print("No valid predictions found.")


Mounted at /content/drive




[1m128/128[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step

--- Predictions for Group (n=9, k=4, m=2) ---
Sample 1: Actual = 194.03, Predicted = 167.92
Sample 2: Actual = 175.25, Predicted = 123.55
Sample 3: Actual = 152.03, Predicted = 162.49
Sample 4: Actual = 176.14, Predicted = 162.44
Sample 5: Actual = 183.21, Predicted = 164.12




[1m131/131[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step

--- Predictions for Group (n=9, k=4, m=3) ---
Sample 1: Actual = 210.28, Predicted = 262.85
Sample 2: Actual = 177.91, Predicted = 240.23
Sample 3: Actual = 220.43, Predicted = 278.38
Sample 4: Actual = 252.63, Predicted = 266.35
Sample 5: Actual = 215.02, Predicted = 276.86




[1m128/128[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step

--- Predictions for Group (n=9, k=4, m=4) ---
Sample 1: Actual = 3134.83, Predicted = 906.64
Sample 2: Actual = 1434.17, Predicted = 1001.30
Sample 3: Actual = 1550.49, Predicted = 913.07
Sample 4: Actual = 1109.48, Predicted = 945.98
Sample 5: Actual = 684.32, Predicted = 931.54




[1m131/131[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step

--- Predictions for Group (n=9, k=4, m=5) ---
Sample 1: Actual = 152304.11, Predicted = 23641.25
Sample 2: Actual = 17305.33, Predicted = 23438.11
Sample 3: Actual = 8202.88, Predicted = 23877.78
Sample 4: Actual = 32600.17, Predicted = 27123.62
Sample 5: Actual = 20170.64, Predicted = 24692.74




[1m178/178[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step

--- Predictions for Group (n=9, k=5, m=2) ---
Sample 1: Actual = 309.63, Predicted = 305.62
Sample 2: Actual = 469.47, Predicted = 316.68
Sample 3: Actual = 403.88, Predicted = 315.38
Sample 4: Actual = 386.85, Predicted = 298.82
Sample 5: Actual = 388.77, Predicted = 322.59




[1m175/175[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step

--- Predictions for Group (n=9, k=5, m=3) ---
Sample 1: Actual = 978.70, Predicted = 1019.35
Sample 2: Actual = 960.01, Predicted = 1272.76
Sample 3: Actual = 1099.08, Predicted = 1112.95
Sample 4: Actual = 868.14, Predicted = 1555.80
Sample 5: Actual = 511.28, Predicted = 1199.97




[1m170/170[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step

--- Predictions for Group (n=9, k=5, m=4) ---
Sample 1: Actual = 2564955.57, Predicted = 30582.79
Sample 2: Actual = 93582.35, Predicted = 33000.75
Sample 3: Actual = 167199.64, Predicted = 32582.73
Sample 4: Actual = 20684.01, Predicted = 30662.72
Sample 5: Actual = 28919.13, Predicted = 24961.13




[1m256/256[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step

--- Predictions for Group (n=9, k=6, m=2) ---
Sample 1: Actual = 525.49, Predicted = 695.66
Sample 2: Actual = 471.29, Predicted = 791.74
Sample 3: Actual = 434.34, Predicted = 622.97
Sample 4: Actual = 587.23, Predicted = 629.42
Sample 5: Actual = 911.90, Predicted = 759.77




[1m267/267[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step

--- Predictions for Group (n=9, k=6, m=3) ---
Sample 1: Actual = 29425.84, Predicted = 19425.82
Sample 2: Actual = 6838.80, Predicted = 19857.34
Sample 3: Actual = 5219.96, Predicted = 18310.61
Sample 4: Actual = 28336.84, Predicted = 18667.55
Sample 5: Actual = 71185.12, Predicted = 19519.19




[1m105/105[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step

--- Predictions for Group (n=10, k=4, m=2) ---
Sample 1: Actual = 14.81, Predicted = 55.82
Sample 2: Actual = 103.84, Predicted = 57.40
Sample 3: Actual = 126.22, Predicted = 56.13
Sample 4: Actual = 120.04, Predicted = 64.31
Sample 5: Actual = 112.29, Predicted = 62.02




[1m104/104[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step

--- Predictions for Group (n=10, k=4, m=3) ---
Sample 1: Actual = 180.68, Predicted = 176.29
Sample 2: Actual = 103.32, Predicted = 141.31
Sample 3: Actual = 151.41, Predicted = 171.91
Sample 4: Actual = 181.67, Predicted = 173.15
Sample 5: Actual = 194.50, Predicted = 174.59




[1m106/106[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step

--- Predictions for Group (n=10, k=4, m=4) ---
Sample 1: Actual = 590.14, Predicted = 328.57
Sample 2: Actual = 402.80, Predicted = 339.41
Sample 3: Actual = 233.70, Predicted = 345.02
Sample 4: Actual = 207.79, Predicted = 346.45
Sample 5: Actual = 228.45, Predicted = 336.85




[1m102/102[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step

--- Predictions for Group (n=10, k=4, m=5) ---
Sample 1: Actual = 2226.66, Predicted = 1333.21
Sample 2: Actual = 868.86, Predicted = 1368.02
Sample 3: Actual = 615.25, Predicted = 1394.03
Sample 4: Actual = 6058.14, Predicted = 1352.46
Sample 5: Actual = 833.53, Predicted = 1307.23




[1m106/106[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step

--- Predictions for Group (n=10, k=4, m=6) ---
Sample 1: Actual = 26225.24, Predicted = 45911.00
Sample 2: Actual = 61018.48, Predicted = 38873.52
Sample 3: Actual = 16677.29, Predicted = 46147.51
Sample 4: Actual = 34827.39, Predicted = 34512.10
Sample 5: Actual = 103531.57, Predicted = 39448.57




[1m130/130[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step

--- Predictions for Group (n=10, k=5, m=2) ---
Sample 1: Actual = 175.19, Predicted = 248.47
Sample 2: Actual = 226.14, Predicted = 242.13
Sample 3: Actual = 221.82, Predicted = 227.76
Sample 4: Actual = 311.82, Predicted = 241.39
Sample 5: Actual = 165.30, Predicted = 186.37




[1m132/132[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step

--- Predictions for Group (n=10, k=5, m=3) ---
Sample 1: Actual = 1502.48, Predicted = 443.31
Sample 2: Actual = 460.90, Predicted = 425.93
Sample 3: Actual = 370.51, Predicted = 460.68
Sample 4: Actual = 440.30, Predicted = 453.29
Sample 5: Actual = 633.06, Predicted = 461.51




[1m133/133[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step

--- Predictions for Group (n=10, k=5, m=4) ---
Sample 1: Actual = 3283.15, Predicted = 2038.50
Sample 2: Actual = 1513.29, Predicted = 1961.28
Sample 3: Actual = 3004.40, Predicted = 2022.57
Sample 4: Actual = 5171.45, Predicted = 1970.75
Sample 5: Actual = 1494.52, Predicted = 1986.07




[1m132/132[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step

--- Predictions for Group (n=10, k=5, m=5) ---
Sample 1: Actual = 177047.48, Predicted = 73448.66
Sample 2: Actual = 246821.81, Predicted = 62059.19
Sample 3: Actual = 23983.40, Predicted = 73319.95
Sample 4: Actual = 29059.81, Predicted = 61583.29
Sample 5: Actual = 41024.03, Predicted = 70926.96




[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step

--- Predictions for Group (n=10, k=6, m=2) ---
Sample 1: Actual = 738.55, Predicted = 429.65
Sample 2: Actual = 413.91, Predicted = 465.36
Sample 3: Actual = 285.07, Predicted = 435.67
Sample 4: Actual = 1027.39, Predicted = 412.63
Sample 5: Actual = 874.60, Predicted = 454.37




[1m171/171[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step

--- Predictions for Group (n=10, k=6, m=3) ---
Sample 1: Actual = 1619.49, Predicted = 1733.02
Sample 2: Actual = 1884.39, Predicted = 1730.80
Sample 3: Actual = 1319.60, Predicted = 1745.96
Sample 4: Actual = 1010.55, Predicted = 1873.62
Sample 5: Actual = 4016.91, Predicted = 1856.90




[1m175/175[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step

--- Predictions for Group (n=10, k=6, m=4) ---
Sample 1: Actual = 137656.18, Predicted = 61240.79
Sample 2: Actual = 40116.68, Predicted = 65626.51
Sample 3: Actual = 31498.34, Predicted = 59551.20
Sample 4: Actual = 485405.97, Predicted = 67428.75
Sample 5: Actual = 246626.37, Predicted = 64087.20

=== Global Test Evaluation ===
Total Samples: 100000
Global Average Cost σ: 1.4455
Global RMSLE (log base 2): 0.0851


The next cell runs the evaluation on the test set. As said in the project scope, upload the models directory shared, named, '436000715_models.zip'.
The function call has been done. Just update the dictionary 'inputs', to your evaluation dictionary. The format has been followed as in the template. Next, the cell returns a dictionary as output, and prints it.

After inserting your input in the placeholder, and uploading the zip file, run the following cell.

In [None]:
import os
import joblib
import numpy as np
from zipfile import ZipFile
from google.colab import drive
from tensorflow import keras
from sklearn.metrics import mean_squared_log_error

def predict_m_heights(inputs, model_dir, max_k=6):
    outputs = {}
    custom_objects = {'mse': keras.losses.MeanSquaredError()}

    for key, matrices in inputs.items():
        # Extracting n, k, m from the dictionary
        n, k, m = eval(key)
        model_path = os.path.join(model_dir, f"model_n{n}_k{k}_m{m}.h5")

        # Skipping, in case model is not found
        if not os.path.exists(model_path):
            print(f"Model for group (n={n}, k={k}, m={m}) not found.")
            continue

        # Loading the model here
        model = keras.models.load_model(model_path, custom_objects=custom_objects)
        preds = []

        # Preprocessing the P_list
        for P in matrices:
            P = np.array(P).astype(np.float32)
            meta = np.array([n, k, m], dtype=np.float32)

            row_features = []
            for i in range(P.shape[0]):
                pos_encoding = np.zeros(max_k)
                pos_encoding[i] = 1.0
                features = np.concatenate([P[i], meta, pos_encoding])
                row_features.append(features)

            X = np.array([row_features])
            y_pred_log = model.predict(X, verbose=0).flatten()[0]
            y_pred = np.exp(y_pred_log)
            y_pred = max(1.0, y_pred)
            preds.append(y_pred)

        outputs[key] = preds

    return outputs


# Sample input is shown, modify the input with test set here
'''
inputs = {
    '[9,6,3]': [
        np.array([
            [ 0.4759809,  0.9938236, 0.819425 ],
            [-0.8960798, -0.7442706, 0.3345122],
            [ 0.4759809,  0.9938236, 0.819425 ],
            [-0.8960798, -0.7442706, 0.3345122],
            [ 0.4759809,  0.9938236, 0.819425 ],
            [-0.8960798, -0.7442706, 0.3345122],
        ]),
        # Add more P matrices here if needed

    ],
}
'''
# Insert here
inputs = {

}

model_zip_path = '436000715_models_3.zip'
model_dir = 'models'
os.makedirs(model_dir, exist_ok=True)

with ZipFile(model_zip_path, 'r') as zip_ref:
    zip_ref.extractall(model_dir)

# Calling the required function here
outputs = predict_m_heights(inputs, model_dir, max_k=6)

print("\n=== Outputs ===")
for key in outputs:
    preds = [float(x) for x in outputs[key]]
    print(f"{key} => {preds}")




=== Outputs ===
[9,6,3] => [4725.53173828125]
