# Resultados de Métricas del Modelo
Este notebook calcula las métricas clave para:

- Modelo log-exp (número de coeficientes)
- Modelo lineal (número de qubits)

Incluye R², RMSE, MAE, error relativo, desviación estándar y validación cruzada.

In [None]:
import json
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error
import statsmodels.api as sm
from scipy import stats

# === Load data ===
# JSON text is UTF-8 by specification; passing the encoding explicitly keeps the
# read independent of the platform's locale default.
with open("AllMolecules_QMProt_Format.json", "r", encoding="utf-8") as f:
    data = json.load(f)

def extract_data(mols):
    """Collect (n_electrons, n_qubits, n_coefficients) tuples from molecule records.

    Args:
        mols: iterable of mappings that each provide the keys "n_electrons",
            "n_qubits" and "n_coefficients".

    Returns:
        A list with one tuple per record whose "n_qubits" and "n_coefficients"
        are both strictly positive, in the input order.
    """
    rows = []
    for mol in mols:
        # Records without a positive qubit or coefficient count are skipped.
        if mol["n_qubits"] > 0 and mol["n_coefficients"] > 0:
            rows.append((mol["n_electrons"], mol["n_qubits"], mol["n_coefficients"]))
    return rows

# Keep only entries with a positive electron count; extract_data then applies
# its own filter on the qubit and coefficient counts.
original_data = extract_data(
    [entry for entry in data["amino_acids"] if entry["n_electrons"] > 0]
)

# Transpose the (electrons, qubits, coefficients) triples into three float64 vectors.
X_qm, y_qubits_qm, y_coef_qm = (
    np.asarray(column, dtype=np.float64) for column in zip(*original_data)
)


In [None]:
# === Log-exp model for the coefficient count ===
# A straight line is fitted to log(n_coefficients) against the electron count;
# exponentiating its predictions gives values on the natural scale.
X_log = X_qm.reshape(-1, 1)
y_log = np.log(y_coef_qm)

# 80/20 hold-out split with a fixed seed; the model is fitted on the training part only.
X_train_log, X_test_log, y_train_log, y_test_log = train_test_split(
    X_log, y_log, test_size=0.2, random_state=42
)
model_logexp = LinearRegression()
model_logexp.fit(X_train_log, y_train_log)

y_train_pred_log = model_logexp.predict(X_train_log)
y_test_pred_log = model_logexp.predict(X_test_log)

# R² of the train-fitted model on all samples, on the training split and on the held-out split.
r2_total_logexp = model_logexp.score(X_log, y_log)
r2_train_log = model_logexp.score(X_train_log, y_train_log)
r2_test_log = model_logexp.score(X_test_log, y_test_log)

# Held-out errors on the log scale.
rmse_log = np.sqrt(mean_squared_error(y_test_log, y_test_pred_log))
mae_log = mean_absolute_error(y_test_log, y_test_pred_log)

# Held-out errors after mapping targets and predictions back with exp().
y_test_exp = np.exp(y_test_log)
y_pred_exp = np.exp(y_test_pred_log)
rmse_nat = np.sqrt(mean_squared_error(y_test_exp, y_pred_exp))
mae_nat = mean_absolute_error(y_test_exp, y_pred_exp)
rel_error = 100 * np.mean(np.abs((y_test_exp - y_pred_exp) / y_test_exp))

# Spread of the raw coefficient counts: standard deviation, and the same as a percentage of the mean.
std_coef = y_coef_qm.std()
std_coef_pct = 100 * (std_coef / y_coef_qm.mean())

# Shuffled 5-fold splitter with a fixed seed; a fresh LinearRegression is scored on each fold.
cv = KFold(n_splits=5, shuffle=True, random_state=42)
cv_r2_log = cross_val_score(LinearRegression(), X_log, y_log, cv=cv, scoring='r2')


In [None]:
# === Linear model for the qubit count ===
# The qubit count is regressed directly (natural scale) on the electron count.
X_q = X_qm.reshape(-1, 1)
y_q = y_qubits_qm

# 80/20 hold-out split with a fixed seed; the model is fitted on the training part only.
X_train_q, X_test_q, y_train_q, y_test_q = train_test_split(
    X_q, y_q, test_size=0.2, random_state=42
)
model_q = LinearRegression()
model_q.fit(X_train_q, y_train_q)

y_train_pred_q = model_q.predict(X_train_q)
y_test_pred_q = model_q.predict(X_test_q)

# R² of the train-fitted model on all samples, on the training split and on the held-out split.
r2_total_q = model_q.score(X_q, y_q)
r2_train_q = model_q.score(X_train_q, y_train_q)
r2_test_q = model_q.score(X_test_q, y_test_q)

# Held-out errors.
rmse_q = np.sqrt(mean_squared_error(y_test_q, y_test_pred_q))
mae_q = mean_absolute_error(y_test_q, y_test_pred_q)

# Spread of the raw qubit counts: standard deviation, and the same as a percentage of the mean.
std_q = y_qubits_qm.std()
std_q_pct = 100 * (std_q / y_qubits_qm.mean())

# 5-fold cross-validated R²; `cv` is the KFold splitter created in the log-exp model cell.
cv_r2_q = cross_val_score(LinearRegression(), X_q, y_q, cv=cv, scoring='r2')


In [None]:
# === Print the results summary ===
# The train/test R² values come from a single hold-out split, not from
# cross-validation, so they are labelled "R2 (train)" / "R2 (test)"; only the
# 5-fold figure carries the "CV" label.
print("Modelo log-exp (coeficientes):")
print(f"  R2 total: {r2_total_logexp:.3f}")
print(f"  R2 (train): {r2_train_log:.3f} | R2 (test): {r2_test_log:.3f}")
print(f"  R2 CV 5-fold (mean): {np.mean(cv_r2_log):.3f}")
print(f"  RMSE (log): {rmse_log:.3f} | MAE (log): {mae_log:.3f}")
print(f"  RMSE (natural): {rmse_nat:.2e} | MAE (natural): {mae_nat:.2e}")
print(f"  Mean relative error: {rel_error:.2f}%")
print(f"  Std (natural): {std_coef:.2e} | Std %: {std_coef_pct:.2f}%\n")

print("Modelo lineal (qubits):")
print(f"  R2 total: {r2_total_q:.3f}")
print(f"  R2 (train): {r2_train_q:.3f} | R2 (test): {r2_test_q:.3f}")
print(f"  R2 CV 5-fold (mean): {np.mean(cv_r2_q):.3f}")
print(f"  RMSE: {rmse_q:.3f} | MAE: {mae_q:.3f}")
print(f"  Std: {std_q:.2f} | Std %: {std_q_pct:.2f}%")


In [None]:
# === Fit RANSAC regressors so their predictions are available to later cells ===
from sklearn.linear_model import RANSACRegressor

# Coefficient count, modelled on the log scale (threshold of 1.0 is therefore in log units).
X_log = X_qm.reshape(-1, 1)
y_log = np.log(y_coef_qm)
ransac_coef = RANSACRegressor(
    LinearRegression(), residual_threshold=1.0, random_state=42
).fit(X_log, y_log)
y_ransac_pred_log = ransac_coef.predict(X_log)

# Qubit count, modelled on the natural scale (threshold of 5.0 is in qubits).
X_q = X_qm.reshape(-1, 1)
y_q = y_qubits_qm
ransac_q = RANSACRegressor(
    LinearRegression(), residual_threshold=5.0, random_state=42
).fit(X_q, y_q)
y_ransac_pred_q = ransac_q.predict(X_q)


## Full Comparison of OLS vs RANSAC Metrics

| Metric                       | OLS (coefficients)   | RANSAC (coefficients)   | OLS (qubits)   | RANSAC (qubits)   |
|------------------------------|----------------------|-------------------------|----------------|-------------------|
| R² (total)                   | 0.973                | -                       | 0.973          | -                 |
| R² (train)                   | 0.972                | -                       | 0.972          | -                 |
| R² (test)                    | 0.976                | -                       | 0.976          | -                 |
| R² CV (5-fold mean)          | 0.955                | -                       | 0.955          | -                 |
| MAE                          | 3.08e+30             | 1.95 (log)              | 3.14           | 2.77              |
| RMSE                         | 1.23e+31             | 3.15 (log)              | 3.50           | 4.24              |
| Standard Deviation           | 4.87e+47             | 2.26e+45                | 22.10          | 3.76              |
| Coefficient of Variation (%) | 877.50%              | 4.08%                   | 68.32%         | 11.63%            |

In [None]:
# Comparative visualization: OLS vs RANSAC
# No visible earlier cell brings `plt` into scope, so it is imported here to
# keep the cell runnable on a fresh kernel.
import matplotlib.pyplot as plt

# Fit lines are drawn in ascending-x order. Plotting them in data order makes the
# polyline retrace itself, which smears the dashed RANSAC line.
sort_idx = np.argsort(X_qm)
x_sorted = X_qm[sort_idx]

fig, axs = plt.subplots(1, 2, figsize=(14, 5))
fig.suptitle("Comparación OLS vs RANSAC", fontsize=14, weight="bold")

# Coefficients (log scale)
axs[0].scatter(X_qm, y_log, color="skyblue", label="Log(Coefficients) Original")
axs[0].plot(x_sorted, model_logexp.predict(X_log)[sort_idx], color="blue", label="OLS Fit", linewidth=2)
axs[0].plot(x_sorted, y_ransac_pred_log[sort_idx], color="navy", linestyle="--", label="RANSAC Fit", linewidth=2)
axs[0].set_xlabel("# of electrons")
axs[0].set_ylabel("log(# of coefficients)")
axs[0].set_title("Log-Coefficients Fit: OLS vs RANSAC")
axs[0].legend()
axs[0].grid(True)

# Qubits (natural scale)
axs[1].scatter(X_qm, y_qubits_qm, color="orchid", label="Qubits Original")
axs[1].plot(x_sorted, model_q.predict(X_q)[sort_idx], color="purple", label="OLS Fit", linewidth=2)
axs[1].plot(x_sorted, y_ransac_pred_q[sort_idx], color="darkmagenta", linestyle="--", label="RANSAC Fit", linewidth=2)
axs[1].set_xlabel("# of electrons")
axs[1].set_ylabel("# of qubits")
axs[1].set_title("Qubits Fit: OLS vs RANSAC")
axs[1].legend()
axs[1].grid(True)

# Leave the top 5% of the figure free for the suptitle.
plt.tight_layout(rect=[0, 0, 1, 0.95])
plt.show()
