In [None]:
import warnings

import numpy as np
import pandas as pd
from scipy.stats import pearsonr
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.metrics.pairwise import cosine_distances
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputRegressor

# === 1. Load input tables === #
# Challenge tables read with the default encoding.
components_df = pd.read_csv("TASK2_Component_definition.csv")
stimuli_df = pd.read_csv("TASK2_Stimulus_definition.csv")
train_mixtures_df = pd.read_csv("TASK2_Train_mixture_Dataset.csv")
cid_df = pd.read_csv("CID.csv")

# Descriptor tables; these three files are decoded as latin1.
mordred_df = pd.read_csv("Mordred_Descriptors.csv", encoding='latin1')
morgan_df = pd.read_csv("Morgan_fingerprints.csv", encoding='latin1')
openpom_df = pd.read_csv("OpenPOM_Dream_RATA.csv", encoding='latin1')

# === 2. Build component feature table === #
# Mordred and Morgan are inner-joined on "molecule"; SMILES is then attached
# from cid_df so the OpenPOM table can be left-joined on "SMILES".
features_df = (
    mordred_df
    .merge(morgan_df, on="molecule")
    .merge(cid_df[["molecule", "SMILES"]], on="molecule", how="left")
    .merge(openpom_df, on="SMILES", how="left")
)

# Attach the features to the component definitions (component "CID" matched
# against feature-table "molecule"); every component row is kept.
components_features = pd.merge(
    components_df,
    features_df,
    left_on="CID",
    right_on="molecule",
    how="left",
)

# === 3. Aggregate features per mixture === #
# "components" holds a ';'-separated list; split it once and keep a
# stimulus id -> list of component-id strings lookup.
stimuli_df["component_list"] = pd.Series(
    [entry.split(";") for entry in stimuli_df["components"]],
    index=stimuli_df.index,
)
stimulus_to_components = {
    stim_id: parts
    for stim_id, parts in zip(stimuli_df["id"], stimuli_df["component_list"])
}

def get_mixture_features(stimulus_id):
    """Return the column-wise mean of a mixture's component feature rows.

    The component ids registered for ``stimulus_id`` are read from
    ``stimulus_to_components``. For each one, the first row of
    ``components_features`` whose ``id`` equals ``int(component_id)`` is
    taken, restricted to its numeric columns, and the selected rows are
    averaged per column while ignoring NaNs.

    Args:
        stimulus_id: Key into ``stimulus_to_components``. Unknown keys are
            treated as a mixture with no components.

    Returns:
        A 1-D array with one entry per numeric column of
        ``components_features``. A column that is NaN for every matched
        component stays NaN. If no component matched at all, the whole
        vector is NaN (same length as the regular result).
    """
    comp_ids = stimulus_to_components.get(stimulus_id, [])

    # Column dtypes do not depend on the row, so select the numeric columns
    # once instead of once per component.
    # NOTE(review): this numeric selection also keeps the "id" column (and
    # any other numeric key columns) as model features -- confirm intended.
    numeric_features = components_features.select_dtypes(include=[np.number])
    component_ids = components_features["id"]

    comp_feats = []
    for cid in comp_ids:
        matches = numeric_features[component_ids == int(cid)]
        if not matches.empty:
            comp_feats.append(matches.iloc[0].values)

    if not comp_feats:
        # The placeholder has the same width as a real feature vector so
        # callers can stack or compare both kinds of result.
        return np.full((numeric_features.shape[1],), np.nan)

    # A feature missing for every component legitimately yields NaN; silence
    # the "Mean of empty slice" RuntimeWarning numpy emits for that case.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", category=RuntimeWarning)
        return np.nanmean(comp_feats, axis=0)

# === 4. Prepare training set === #
# One feature vector and one target vector per training row; rows whose
# mixture features are entirely NaN are left out.
feature_rows = []
target_rows = []

for _, record in train_mixtures_df.iterrows():
    mixture_vector = get_mixture_features(record["stimulus"])
    if not np.isnan(mixture_vector).all():
        feature_rows.append(mixture_vector)
        # Targets are every column from position 3 onward (51 descriptors).
        target_rows.append(record.iloc[3:].to_numpy().astype(float))

X = np.array(feature_rows)
y = np.array(target_rows)

# === 5. Train model === #
# Hold out 10% of the rows for validation; the split and the forests are
# seeded so runs are repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.1, random_state=42
)

base_forest = RandomForestRegressor(n_estimators=100, random_state=42)
model = MultiOutputRegressor(base_forest)
model.fit(X_train, y_train)

y_pred = model.predict(X_val)
rmse = np.sqrt(mean_squared_error(y_val, y_pred))
print("Validation RMSE:", rmse)

# === 6. Challenge Metrics === #
# Both metrics are computed per validation sample (true vs. predicted
# descriptor vector) and then averaged over samples.
per_sample_cosine = []
per_sample_pearson = []
for truth, guess in zip(y_val, y_pred):
    per_sample_cosine.append(cosine_distances([truth], [guess])[0][0])
    per_sample_pearson.append(pearsonr(truth, guess)[0])

cosine = np.mean(per_sample_cosine)
pearson = np.mean(per_sample_pearson)
print("Mean Cosine Distance:", cosine)
print("Mean Pearson Correlation:", pearson)

# === 7. Predict Leaderboard === #
leaderboard_df = pd.read_csv("TASK2_Leaderboard_set_Submission_form.csv")

# Build one feature vector per leaderboard stimulus.
leaderboard_features = []
for stimulus_id in leaderboard_df["stimulus"]:
    feats = get_mixture_features(stimulus_id)
    # Mirror the training-set rule: a vector counts as missing only when
    # *every* feature is NaN. Partially-NaN vectors were kept for training,
    # so they are passed to the model unchanged here instead of being
    # replaced wholesale by zeros.
    if np.all(np.isnan(feats)):
        # No usable features at all: fall back to an all-zero vector of the
        # training feature width so a prediction can still be produced.
        feats = np.zeros_like(X[0])
    leaderboard_features.append(feats)

# Single batched call; the result is a 2-D array (stimuli x descriptors).
leaderboard_predictions = model.predict(np.vstack(leaderboard_features))

submission = leaderboard_df.copy()
# Column 0 is left untouched; every remaining column receives predictions.
submission.iloc[:, 1:] = leaderboard_predictions

# Write to a separate file so the input submission form is not overwritten,
# and report the path that was actually written.
submission_path = "TASK2_Leaderboard_Submission.csv"
submission.to_csv(submission_path, index=False)
print(f"✅ Submission file saved as {submission_path}")


  return np.nanmean(comp_feats, axis=0)


Validation RMSE: 0.22830453278079704
Mean Cosine Distance: 0.1634914978477015
Mean Pearson Correlation: 0.7406923744729683


  return np.nanmean(comp_feats, axis=0)
  return np.nanmean(comp_feats, axis=0)
  return np.nanmean(comp_feats, axis=0)
  return np.nanmean(comp_feats, axis=0)
  return np.nanmean(comp_feats, axis=0)
  return np.nanmean(comp_feats, axis=0)
  return np.nanmean(comp_feats, axis=0)
  return np.nanmean(comp_feats, axis=0)
  return np.nanmean(comp_feats, axis=0)
  return np.nanmean(comp_feats, axis=0)
  return np.nanmean(comp_feats, axis=0)
  return np.nanmean(comp_feats, axis=0)
  return np.nanmean(comp_feats, axis=0)
  return np.nanmean(comp_feats, axis=0)
  return np.nanmean(comp_feats, axis=0)
  return np.nanmean(comp_feats, axis=0)
  return np.nanmean(comp_feats, axis=0)
  return np.nanmean(comp_feats, axis=0)
  return np.nanmean(comp_feats, axis=0)
  return np.nanmean(comp_feats, axis=0)
  return np.nanmean(comp_feats, axis=0)
  return np.nanmean(comp_feats, axis=0)
  return np.nanmean(comp_feats, axis=0)
  return np.nanmean(comp_feats, axis=0)
  return np.nanmean(comp_feats, axis=0)


✅ Submission file saved as TASK2_Leaderboard_Submission.csv
