In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.multioutput import MultiOutputRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
import warnings
warnings.filterwarnings("ignore", category=RuntimeWarning)

# Load data
mordred_df = pd.read_csv("Mordred_Descriptors.csv", encoding="ISO-8859-1")
training_df = pd.read_csv("TASK1_training.csv", encoding="ISO-8859-1")
stimulus_df = pd.read_csv("TASK1_Stimulus_definition.csv", encoding="ISO-8859-1")
leaderboard_df = pd.read_csv("TASK1_leaderboard_set_Submission_form.csv", encoding="ISO-8859-1")
test_df = pd.read_csv("TASK1_test_set_Submission_form.csv", encoding="ISO-8859-1")

merged_features = pd.merge(stimulus_df, mordred_df, on="molecule")
train_full = pd.merge(training_df, merged_features, on="stimulus")

to_drop = [
    "stimulus", "molecule", "dilution", "solvent", "Intensity_label",
    "Intensity", "Pleasantness"
]
drop_cols = [col for col in to_drop if col in train_full.columns]
train_features = train_full.drop(columns=drop_cols)

# Drop non-numeric columns
object_cols = train_features.select_dtypes(include='object').columns
train_features = train_features.drop(columns=object_cols)

# Extract targets
target_cols = training_df.columns.tolist()[2:]  # descriptors start from column 3
train_targets = train_full[target_cols]

# Scale features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(train_features)
X = pd.DataFrame(X_scaled, columns=train_features.columns)

# Drop constant (zero-variance) columns
X = X.loc[:, X.std() > 0]

X.replace([np.inf, -np.inf], np.nan, inplace=True)
X.fillna(0, inplace=True)  # Impute missing values safely



# Align targets
y = train_targets.loc[X.index]

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
model = MultiOutputRegressor(RandomForestRegressor(n_estimators=100, random_state=42))
model.fit(X_train, y_train)

y_pred = model.predict(X_val)
rmse = np.sqrt(mean_squared_error(y_val, y_pred))
print(f"Validation RMSE: {rmse:.4f}")

# ---- Prediction Function ----
def prepare_and_predict(submission_df, features_df):
    merged = pd.merge(submission_df, features_df, on="stimulus")

    # Drop any unwanted metadata or identifier columns
    drop_candidates = [
        "stimulus", "CID_main", "dilution", "solvent",
        "Intensity_label", "CID_mordred", "Dilution", "Solvent", "IntensityLevel", "molecule"
    ]
    drop_cols = [col for col in drop_candidates if col in merged.columns]

    # Drop object/string columns
    X_submit = merged.drop(columns=drop_cols)
    X_submit = X_submit.drop(columns=X_submit.select_dtypes(include='object').columns)

    # Scale and align with training columns
    X_submit_scaled = scaler.transform(X_submit)
    X_submit_scaled = pd.DataFrame(X_submit_scaled, columns=X_submit.columns)
    # Replace infs with NaNs
    X_submit_scaled.replace([np.inf, -np.inf], np.nan, inplace=True)
    # Fill NaNs with 0 (safe imputation) or drop them
    X_submit_scaled.fillna(0, inplace=True)  # or use a trained imputer
    X_submit_scaled = X_submit_scaled[X.columns]  # Keep only training-time columns

    # Predict
    y_submit_pred = model.predict(X_submit_scaled)

    # Format output
    result_df = pd.DataFrame(y_submit_pred, columns=train_targets.columns)
    result_df.insert(0, "stimulus", submission_df["stimulus"].values)
    return result_df

# ---- Predictions ----

# Predict on leaderboard
leaderboard_predictions = prepare_and_predict(leaderboard_df, merged_features)
leaderboard_predictions.to_csv("TASK1_leaderboard_set_Submission_form.csv", index=False)

# Predict on test set
test_predictions = prepare_and_predict(test_df, merged_features)
test_predictions.to_csv("TASK1_Test_Predictions.csv", index=False)

print("Predictions saved to 'TASK1_leaderboard_set_Submission_form.csv' and 'TASK1_Test_Predictions.csv'")


Validation RMSE: 0.0959
Predictions saved to 'TASK1_Leaderboard_Predictions.csv' and 'TASK1_Test_Predictions.csv'


In [None]:
from scipy.spatial.distance import cosine
from scipy.stats import pearsonr
import numpy as np


def average_metrics(y_true, y_pred):
    pearson_scores = []
    cosine_distances = []
    for i in range(y_true.shape[0]):
        true_vec = y_true.iloc[i].values
        pred_vec = y_pred[i]

        # Pearson correlation
        corr, _ = pearsonr(true_vec, pred_vec)
        pearson_scores.append(corr)

        # Cosine distance
        cos_dist = cosine(true_vec, pred_vec)
        cosine_distances.append(cos_dist)

    avg_pearson = np.nanmean(pearson_scores)
    avg_cosine_dist = np.nanmean(cosine_distances)

    print(f"Average Pearson Correlation: {avg_pearson:.4f}")
    print(f"Average Cosine Distance: {avg_cosine_dist:.4f}")
    return avg_pearson, avg_cosine_dist

y_val_pred=model.predict(X_val)
average_metrics(y_val, y_val_pred)

Average Pearson Correlation: 0.9951
Average Cosine Distance: 0.0043


(np.float64(0.9951466272046495), np.float64(0.0043111152356154646))

Average Pearson Correlation is close to 1 which means the model's predictions are very strongly correlated with the true perceptual profiles.

Average Cosine Distance is close to 0. So the model's prediction profiles are closely aligned to the direction with the real descriptor profiles.