In [1]:
%matplotlib inline

# Preliminaries 

In [2]:
import os
import pandas as pd
from ase.io import read
import numpy as np
from sklearn.model_selection import  cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn import preprocessing, linear_model, pipeline, model_selection
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor
from scipy.spatial.distance import pdist
from dscribe.descriptors import CoulombMatrix
from scipy.spatial.transform import Rotation as R

# Energies 

In [3]:
# Path setup
data_dir = "./"
# One folder of molecule geometry files per split, both under <data_dir>/atoms.
atoms_train_dir, atoms_test_dir = (
    os.path.join(data_dir, "atoms", split) for split in ("train", "test")
)
# CSV with the training targets (read with pd.read_csv in the next cell).
energies_csv = os.path.join(data_dir, "energies", "train.csv")

In [4]:
# Ids are cast to str *before* sorting, so the order is lexicographic
# ("1", "10", "100", ...) rather than numeric, as the printed head shows.
# NOTE(review): presumably chosen to match the sorted() file-name order used by
# load_all_xyz — confirm the cached scattering arrays follow the same order.
energies_df = (
    pd.read_csv(energies_csv)
    .astype({"id": str})
    .sort_values("id")
)
print("Loaded energy data:\n", energies_df.head().to_string(index=False))

Loaded energy data:
   id     energy
   1 -90.107880
  10 -69.210846
 100 -64.983899
1000 -76.559740
1001 -62.270961


# Positions and charges 

In [5]:
def extract_features(xyz_path):
    """Read one structure file and return its coordinates and atomic numbers.

    Parameters
    ----------
    xyz_path : str
        Path passed to ``ase.io.read``.

    Returns
    -------
    tuple
        ``(positions, charges)``: the results of ``get_positions()`` and
        ``get_atomic_numbers()`` on the loaded structure. The atomic numbers
        are what the rest of the notebook calls "charges".
    """
    structure = read(xyz_path)
    return structure.get_positions(), structure.get_atomic_numbers()

def load_all_xyz(folder_path, max_atoms=23):
    """Load every ``.xyz`` file of a folder into zero-padded arrays.

    Files are processed in lexicographic file-name order. Each molecule is
    written into the leading rows of a zero buffer, so unused atom slots have
    position ``(0, 0, 0)`` and charge ``0``.

    Parameters
    ----------
    folder_path : str
        Directory scanned with ``os.listdir``.
    max_atoms : int, default 23
        Number of atom slots per molecule. A molecule with more atoms makes
        the padded assignment fail with a NumPy broadcasting error.

    Returns
    -------
    positions_array : np.ndarray, shape (N, max_atoms, 3)
    charges_array : np.ndarray, shape (N, max_atoms)
    """
    xyz_names = sorted(
        name for name in os.listdir(folder_path) if name.endswith('.xyz')
    )

    padded_positions, padded_charges = [], []
    for name in xyz_names:
        pos, chg = extract_features(os.path.join(folder_path, name))
        n_atoms = pos.shape[0]

        # Fresh zero buffers per molecule; only the first n_atoms rows are filled.
        pos_buffer = np.zeros((max_atoms, 3))
        pos_buffer[:n_atoms, :] = pos
        chg_buffer = np.zeros((max_atoms,))
        chg_buffer[:n_atoms] = chg

        padded_positions.append(pos_buffer)
        padded_charges.append(chg_buffer)

    return np.stack(padded_positions), np.stack(padded_charges)

# Load scattering coefficients of orders 0, 1 and 2

## Scattering only (SO)

In [6]:
# Precomputed scattering coefficients for the J=2, L=3 setting (see file names).
# The cache root is resolved once instead of being repeated in every literal;
# set the KYMATIO_RESULTS_DIR environment variable to relocate it. The default
# is the original location, so the loaded files are unchanged.
scattering_results_dir = os.environ.get(
    "KYMATIO_RESULTS_DIR", "/home/besbesines/kymatio_cache/results"
)
# Shared tail of the four file names (note: the "orders_1_and_2" files have no
# underscore before "molecule").
_so_suffix = "molecule_L_3_J_2_sigma_2.0_MNO_(96, 64, 48)_powers_[0.5, 1.0, 2.0, 3.0].npy"

order_0_train_SO = np.load(os.path.join(scattering_results_dir, "train", "order_0_" + _so_suffix))
orders_1_and_2_train_SO = np.load(os.path.join(scattering_results_dir, "train", "orders_1_and_2" + _so_suffix))

order_0_test_SO = np.load(os.path.join(scattering_results_dir, "test", "order_0_" + _so_suffix))
orders_1_and_2_test_SO = np.load(os.path.join(scattering_results_dir, "test", "orders_1_and_2" + _so_suffix))

## Scattering with the article's settings ($J=4$, $L=3$)

In [7]:
# Precomputed scattering coefficients for the J=4, L=3 setting (see file names).
# The cache root is resolved once instead of being repeated in every literal;
# set the KYMATIO_RESULTS_DIR environment variable to relocate it. The default
# is the original location, so the loaded files are unchanged.
scattering_results_dir = os.environ.get(
    "KYMATIO_RESULTS_DIR", "/home/besbesines/kymatio_cache/results"
)
# Shared tail of the four file names (note: the "orders_1_and_2" files have no
# underscore before "molecule").
_article_suffix = "molecule_L_3_J_4_sigma_2.0_MNO_(96, 64, 48)_powers_[0.5, 1.0, 2.0, 3.0, 4.0].npy"

order_0_train_article = np.load(os.path.join(scattering_results_dir, "train", "order_0_" + _article_suffix))
orders_1_and_2_train_article = np.load(os.path.join(scattering_results_dir, "train", "orders_1_and_2" + _article_suffix))

order_0_test_article = np.load(os.path.join(scattering_results_dir, "test", "order_0_" + _article_suffix))
orders_1_and_2_test_article = np.load(os.path.join(scattering_results_dir, "test", "orders_1_and_2" + _article_suffix))

# Compute additional features

## Pairwise distance

In [8]:
def compute_pairwise_distance_stats(positions, charges):
    """Mean and standard deviation of the inter-atomic distances per molecule.

    Atom slots whose charge is 0 are ignored. A molecule with fewer than two
    remaining atoms has no pair distance and gets ``0.0`` for both statistics.

    Parameters
    ----------
    positions : np.ndarray
        Indexed as ``positions[i]`` -> coordinates of molecule ``i``.
    charges : np.ndarray
        Indexed as ``charges[i]`` -> per-slot charges of molecule ``i``.

    Returns
    -------
    (np.ndarray, np.ndarray)
        Per-molecule mean distance and per-molecule distance std.
    """
    def _distance_stats(coords):
        if len(coords) < 2:
            return 0.0, 0.0
        pair_distances = pdist(coords)
        return pair_distances.mean(), pair_distances.std()

    per_molecule = [
        _distance_stats(positions[idx][charges[idx] != 0])
        for idx in range(positions.shape[0])
    ]
    means = np.array([mean for mean, _ in per_molecule])
    stds = np.array([std for _, std in per_molecule])
    return means, stds


## Geometric features

In [9]:
def compute_geometric_features(positions, charges):
    """Spread of each molecule around the centroid of its atoms.

    Atom slots whose charge is 0 are ignored. For the remaining atoms the
    unweighted centroid is computed, then the mean and the variance of the
    atom-to-centroid distances. A molecule with no atoms gets ``0.0`` twice.

    Parameters
    ----------
    positions : np.ndarray
        Indexed as ``positions[i]`` -> coordinates of molecule ``i``.
    charges : np.ndarray
        Indexed as ``charges[i]`` -> per-slot charges of molecule ``i``.

    Returns
    -------
    (np.ndarray, np.ndarray)
        Per-molecule mean distance to the centroid and variance of those
        distances.
    """
    def _radial_stats(coords):
        if len(coords) == 0:
            return 0.0, 0.0
        radii = np.linalg.norm(coords - coords.mean(axis=0), axis=1)
        return radii.mean(), radii.var()

    per_molecule = [
        _radial_stats(positions[idx][charges[idx] != 0])
        for idx in range(positions.shape[0])
    ]
    mean_radius = np.array([mean for mean, _ in per_molecule])
    radius_variance = np.array([var for _, var in per_molecule])
    return mean_radius, radius_variance


## Atomic type frequencies

In [10]:
def compute_atomic_type_frequencies(charges, atomic_types=[1, 6, 7, 8, 9, 16]):
    """Count, per molecule, how many atom slots carry each listed atomic number.

    Parameters
    ----------
    charges : np.ndarray
        One row of per-slot charges per molecule.
    atomic_types : list of int
        Atomic numbers to count, in output-column order. The default list is
        only read, never modified.

    Returns
    -------
    np.ndarray, shape (n_molecules, len(atomic_types))
        Raw counts (not normalised frequencies).
    """
    counts_per_molecule = [
        [(molecule_charges == atomic_number).sum() for atomic_number in atomic_types]
        for molecule_charges in charges
    ]
    return np.array(counts_per_molecule)

## All features combined

In [11]:
def compute_all_features(positions, charges):
    """Assemble the hand-crafted descriptor matrix, one row per molecule.

    Columns, in order: atom count, sum of charges, mean charge over real atoms,
    std of the charge row, mean and std of pairwise distances, mean distance to
    the centroid, variance of that distance, then one count column per atomic
    type returned by ``compute_atomic_type_frequencies``.

    Parameters
    ----------
    positions : np.ndarray
        Padded coordinates, one block per molecule.
    charges : np.ndarray
        Padded charges, one row per molecule; 0 marks an empty slot.

    Returns
    -------
    np.ndarray
        Column-stacked feature matrix.
    """
    atom_counts = (charges != 0).sum(axis=1)
    charge_sums = charges.sum(axis=1)

    # Mean over real atoms only; rows without any atom keep the 0 from `out`.
    mean_atomic_number = np.divide(
        charge_sums,
        atom_counts,
        out=np.zeros(atom_counts.shape, dtype=float),
        where=atom_counts != 0,
    )
    # NOTE(review): unlike the mean above, this std runs over the whole padded
    # row, so the zero slots contribute — confirm this is intended.
    std_atomic_number = np.std(charges, axis=1)

    pair_mean, pair_std = compute_pairwise_distance_stats(positions, charges)
    radius_mean, radius_var = compute_geometric_features(positions, charges)
    type_counts = compute_atomic_type_frequencies(charges)

    columns = [
        atom_counts,
        charge_sums,
        mean_atomic_number,
        std_atomic_number,
        pair_mean,
        pair_std,
        radius_mean,
        radius_var,
        type_counts,
    ]
    return np.column_stack(columns)

In [12]:
# Load the padded geometries and build the hand-crafted descriptors for both
# splits. The folders come from the path-setup cell (atoms_train_dir /
# atoms_test_dir) instead of repeating the "./atoms/..." literals, so changing
# data_dir moves every loader at once.
pos_train, full_charges_train = load_all_xyz(atoms_train_dir, max_atoms=23)
n_molecules_train = pos_train.shape[0]
additional_features_train = compute_all_features(pos_train, full_charges_train)

pos_test, full_charges_test = load_all_xyz(atoms_test_dir, max_atoms=23)
n_molecules_test = pos_test.shape[0]
additional_features_test = compute_all_features(pos_test, full_charges_test)

# Invariance by translation, permutation and rotation

In [None]:
def test_translation_invariance(positions, charges):
    """Check that ``compute_all_features`` is unchanged by a global translation.

    A single random offset in ``[-10, 10)`` per axis is added to every position
    (padding rows included). The largest absolute feature difference is printed.

    Returns
    -------
    bool
        ``True`` when all features agree within ``atol=1e-5``.
    """
    reference = compute_all_features(positions, charges)

    # Shape (1, 1, 3) broadcasts the same offset over molecules and atoms.
    offset = np.random.uniform(-10, 10, size=(1, 1, 3))
    shifted = compute_all_features(positions + offset, charges)

    max_diff = np.abs(reference - shifted).max()
    print(f"Max difference after translation: {max_diff:.5e}")
    return np.allclose(reference, shifted, atol=1e-5)

def test_permutation_invariance(positions, charges):
    """Check that ``compute_all_features`` is unchanged by reordering atoms.

    For each molecule the first ``n_atoms`` slots (``n_atoms`` = number of
    non-zero charges) are shuffled with one random permutation applied to both
    positions and charges. This assumes the real atoms occupy the leading
    slots. The largest absolute feature difference is printed.

    Returns
    -------
    bool
        ``True`` when all features agree within ``atol=1e-5``.
    """
    reference = compute_all_features(positions, charges)

    shuffled_positions = np.array(positions, copy=True)
    shuffled_charges = np.array(charges, copy=True)

    for mol in range(positions.shape[0]):
        count = (charges[mol] != 0).sum()
        order = np.random.permutation(count)
        # Same permutation for both arrays keeps each atom with its charge.
        shuffled_positions[mol, :count] = positions[mol, :count][order]
        shuffled_charges[mol, :count] = charges[mol, :count][order]

    shuffled = compute_all_features(shuffled_positions, shuffled_charges)

    max_diff = np.abs(reference - shuffled).max()
    print(f"Max difference after permutation: {max_diff:.5e}")
    return np.allclose(reference, shuffled, atol=1e-5)


def test_rotation_invariance(positions, charges):
    """Check that ``compute_all_features`` is unchanged by rotating each molecule.

    Every molecule with at least one atom gets its own random rotation, applied
    to its first ``n_atoms`` slots only (``n_atoms`` = number of non-zero
    charges). The largest absolute feature difference is printed.

    Returns
    -------
    bool
        ``True`` when all features agree within ``atol=1e-5``.
    """
    reference = compute_all_features(positions, charges)

    spun_positions = np.array(positions, copy=True)

    for mol in range(positions.shape[0]):
        count = (charges[mol] != 0).sum()
        if count != 0:
            rot_matrix = R.random().as_matrix()
            # Row vectors, hence right-multiplication by the transpose.
            spun_positions[mol, :count] = positions[mol, :count] @ rot_matrix.T

    spun = compute_all_features(spun_positions, charges)

    max_diff = np.abs(reference - spun).max()
    print(f"Max difference after rotation: {max_diff:.5e}")
    return np.allclose(reference, spun, atol=1e-5)


In [36]:
# Run the three invariance checks on the training geometries. The checks draw
# from np.random / Rotation.random without a fixed seed, so the printed maximum
# differences vary slightly between runs.
ok_translation = test_translation_invariance(pos_train, full_charges_train)
print("Translation invariance:", "Yes" if ok_translation else "No")

ok_permutation = test_permutation_invariance(pos_train, full_charges_train)
print("Permutation invariance:", "Yes" if ok_permutation else "No")

ok_rotation = test_rotation_invariance(pos_train, full_charges_train)
print("Rotation invariance:", "Yes" if ok_rotation else "No")

Max difference after translation: 1.33227e-15
Translation invariance: Yes
Max difference after permutation: 1.77636e-15
Permutation invariance: Yes
Max difference after rotation: 2.22045e-15
Rotation invariance: Yes


# Find the best model

## Function $find\_best\_model$

In [13]:
def find_best_model(models, X_train, y_train, cross_val_folds=5):
    """Cross-validate every candidate regressor and return the overall winner.

    Each candidate is wrapped in ``StandardScaler -> estimator``. The entry
    named ``"Ridge"`` is special-cased: the instance in ``models`` is ignored
    and a fresh ``Ridge`` is tried for every alpha in ``1e-1 .. 1e-9``; the
    best alpha represents Ridge in the comparison.

    The returned pipeline is always the one that achieved the returned score.
    (Previously the Ridge pipeline, or ``None`` without a "Ridge" entry, was
    returned even when another model had the lowest RMSE.)

    Parameters
    ----------
    models : dict[str, estimator]
        Candidate name -> unfitted scikit-learn compatible regressor.
    X_train : array-like
        Training design matrix.
    y_train : array-like
        Training targets.
    cross_val_folds : int, default 5
        Passed as ``cv`` to the scikit-learn cross-validation helpers.

    Returns
    -------
    best_model : sklearn Pipeline
        Pipeline of the winning candidate. It has not been fitted on the full
        training set; for non-Ridge winners it wraps the estimator instance
        that was passed in ``models``.
    best_model_name : str
    best_rmse : float

    Raises
    ------
    ValueError
        If ``models`` is empty.
    """
    if not models:
        raise ValueError("models must contain at least one candidate regressor")

    alphas = 10.0 ** (-np.arange(1, 10))
    y_true = np.asarray(y_train)

    # name -> (cross-validated RMSE, pipeline that produced it)
    candidates = {}

    for name, model in models.items():
        if name == "Ridge":
            best_alpha = None
            best_ridge_rmse = np.inf
            best_ridge = None

            for alpha in alphas:
                regressor = pipeline.make_pipeline(
                    preprocessing.StandardScaler(),
                    linear_model.Ridge(alpha=alpha),
                )
                target_prediction = model_selection.cross_val_predict(
                    regressor, X=X_train, y=y_train, cv=cross_val_folds
                )
                # RMSE pooled over all out-of-fold predictions. The other
                # candidates use the mean of per-fold RMSEs, which is never
                # larger for equal-sized folds, so Ridge is not favoured.
                rmse = np.sqrt(np.mean((target_prediction - y_true) ** 2))

                if rmse < best_ridge_rmse:
                    best_ridge_rmse = rmse
                    best_ridge = regressor
                    best_alpha = alpha

            print(f"Ridge regression, best alpha: {best_alpha}, RMSE: {best_ridge_rmse}")
            candidates[name] = (best_ridge_rmse, best_ridge)
        else:
            candidate = pipeline.make_pipeline(preprocessing.StandardScaler(), model)
            fold_scores = model_selection.cross_val_score(
                candidate,
                X_train,
                y_train,
                scoring='neg_root_mean_squared_error',
                cv=cross_val_folds,
            )
            # The scorer returns negated RMSE; flip the sign back.
            mean_rmse = np.mean(-fold_scores)
            print(f"{name}: RMSE = {mean_rmse:.4f}")
            candidates[name] = (mean_rmse, candidate)

    # min() keeps the first entry on ties, i.e. insertion order of `models`.
    best_model_name = min(candidates, key=lambda key: candidates[key][0])
    best_rmse, best_model = candidates[best_model_name]
    print(f"\nMeilleur modèle : {best_model_name} avec une RMSE de {best_rmse:.4f}")

    return best_model, best_model_name, best_rmse

In [14]:
# Candidate regressors compared by find_best_model. The "Ridge" instance is a
# placeholder: find_best_model builds its own Ridge for every alpha it tries.
# Estimators with internal randomness get a fixed seed so that the
# cross-validated scores are reproducible from run to run.
RANDOM_STATE = 42

models = {
    "LinearRegression": LinearRegression(),
    "Ridge": Ridge(),
    "Lasso": Lasso(),
    "SVR": SVR(),
    "KNN": KNeighborsRegressor(),
    "DecisionTree": DecisionTreeRegressor(random_state=RANDOM_STATE),
    "RandomForest": RandomForestRegressor(n_estimators=100, random_state=RANDOM_STATE),
    "GradientBoosting": GradientBoostingRegressor(n_estimators=100, random_state=RANDOM_STATE),
    "XGBoost": XGBRegressor(n_estimators=100, objective='reg:squarederror', random_state=RANDOM_STATE)
}

# Scattering with $J=2$ and $L=3$

### Without additional features

In [23]:
# Order-0 block followed by the order-1/2 block, side by side per molecule
# (np.hstack joins 2-D arrays along axis 1).
train_scattering_coef_scattering = np.hstack((order_0_train_SO, orders_1_and_2_train_SO))
test_scattering_coef_scattering = np.hstack((order_0_test_SO, orders_1_and_2_test_SO))

In [30]:
# Scattering-only design matrices (J=2, L=3).
X_train, X_test = train_scattering_coef_scattering, test_scattering_coef_scattering
# Targets in energies_df row order.
# NOTE(review): assumes the cached scattering rows follow that same molecule
# order — confirm against the script that produced the cache.
y_train = energies_df['energy'].tolist()

In [31]:
# Compare all candidates by 5-fold cross-validation on the scattering-only
# features currently held in X_train / y_train.
best_model_scattering, best_model_name_scattering, best_rmse_scattering = find_best_model(models, X_train, y_train, cross_val_folds=5)

LinearRegression: RMSE = 1.4048


  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T


Ridge regression, best alpha: 0.001, RMSE: 0.9863005401003615
Lasso: RMSE = 3.9411
SVR: RMSE = 3.1570
KNN: RMSE = 2.5536
DecisionTree: RMSE = 2.4808
RandomForest: RMSE = 1.7964
GradientBoosting: RMSE = 1.8254
XGBoost: RMSE = 1.7296

Meilleur modèle : Ridge avec une RMSE de 0.9863


In [32]:
# Fit the selected pipeline on the full training set held in X_train / y_train.
best_model_scattering.fit(X_train, y_train)

In [33]:
predictions_test = best_model_scattering.predict(X_test)

# to_csv does not create missing folders, so make sure the target exists.
os.makedirs('results', exist_ok=True)

# Row k of X_test is labelled 6592 + k.
# NOTE(review): assumes the test rows are the contiguous ids starting at 6592
# (the id_start default of create_test_coulomb_matrix) — confirm.
# Rows are written in increasing predicted energy; each id stays attached to
# its own prediction.
results = (
    pd.DataFrame({
        'id': np.arange(len(predictions_test)) + 6592,
        'energy': predictions_test,
    })
    .sort_values(by='energy')
    .reset_index(drop=True)
)
results.to_csv('results/scattering_only.csv', index=False)

### With additional features 

In [17]:
# J=2 / L=3 coefficients: order-0 block followed by the order-1/2 block,
# side by side per molecule (np.hstack joins 2-D arrays along axis 1).
train_scattering_coef = np.hstack((order_0_train_SO, orders_1_and_2_train_SO))
test_scattering_coef = np.hstack((order_0_test_SO, orders_1_and_2_test_SO))

In [18]:
# Scattering coefficients followed by the hand-crafted descriptors, per molecule.
X_train = np.hstack((train_scattering_coef, additional_features_train))
X_test = np.hstack((test_scattering_coef, additional_features_test))
# Targets in energies_df row order.
# NOTE(review): assumes both feature blocks follow that same molecule order — confirm.
y_train = energies_df['energy'].tolist()

In [23]:
# Compare all candidates by 5-fold cross-validation on the J=2 scattering
# coefficients plus hand-crafted descriptors held in X_train / y_train.
best_model_SO, best_model_name_SO, best_rmse_SO = find_best_model(models, X_train, y_train, cross_val_folds=5)

LinearRegression: RMSE = 0.3254
Ridge regression, best alpha: 0.01, RMSE: 0.23510447573636828
Lasso: RMSE = 1.4311
SVR: RMSE = 2.0530
KNN: RMSE = 1.6786
DecisionTree: RMSE = 0.6922
RandomForest: RMSE = 0.5090
GradientBoosting: RMSE = 0.5171
XGBoost: RMSE = 0.3946

Meilleur modèle : Ridge avec une RMSE de 0.2351


In [24]:
# Fit the selected pipeline on the full training set held in X_train / y_train.
best_model_SO.fit(X_train, y_train)

In [25]:
predictions_test = best_model_SO.predict(X_test)

# to_csv does not create missing folders, so make sure the target exists.
os.makedirs('results', exist_ok=True)

# Row k of X_test is labelled 6592 + k.
# NOTE(review): assumes the test rows are the contiguous ids starting at 6592
# (the id_start default of create_test_coulomb_matrix) — confirm.
# Rows are written in increasing predicted energy; each id stays attached to
# its own prediction.
results = (
    pd.DataFrame({
        'id': np.arange(len(predictions_test)) + 6592,
        'energy': predictions_test,
    })
    .sort_values(by='energy')
    .reset_index(drop=True)
)
results.to_csv('results/scattering_with_other_features.csv', index=False)

# Scattering with $J=4$ and $L=3$

### Without additional features

In [15]:
# J=4 / L=3 coefficients: order-0 block followed by the order-1/2 block,
# side by side per molecule (np.hstack joins 2-D arrays along axis 1).
train_scattering_coef_article = np.hstack((order_0_train_article, orders_1_and_2_train_article))
test_scattering_coef_article = np.hstack((order_0_test_article, orders_1_and_2_test_article))

In [42]:
# Scattering-only design matrices (J=4, L=3).
X_train, X_test = train_scattering_coef_article, test_scattering_coef_article
# Targets in energies_df row order.
# NOTE(review): assumes the cached scattering rows follow that same molecule
# order — confirm against the script that produced the cache.
y_train = energies_df['energy'].tolist()

In [43]:
# Compare all candidates by 5-fold cross-validation on the J=4 scattering-only
# features held in X_train / y_train.
best_model_article, best_model_name_article, best_rmse_article = find_best_model(models, X_train, y_train, cross_val_folds=5)

LinearRegression: RMSE = 13.2485


  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T


Ridge regression, best alpha: 0.01, RMSE: 1.140476944724858
Lasso: RMSE = 3.9379
SVR: RMSE = 3.1241
KNN: RMSE = 2.7233
DecisionTree: RMSE = 2.6183
RandomForest: RMSE = 1.8438
GradientBoosting: RMSE = 1.7767
XGBoost: RMSE = 1.6889

Meilleur modèle : Ridge avec une RMSE de 1.1405


In [44]:
# Fit the selected pipeline on the full training set held in X_train / y_train.
best_model_article.fit(X_train, y_train)

In [45]:
predictions_test = best_model_article.predict(X_test)

# to_csv does not create missing folders, so make sure the target exists.
os.makedirs('results', exist_ok=True)

# Row k of X_test is labelled 6592 + k.
# NOTE(review): assumes the test rows are the contiguous ids starting at 6592
# (the id_start default of create_test_coulomb_matrix) — confirm.
# Rows are written in increasing predicted energy; each id stays attached to
# its own prediction.
results = (
    pd.DataFrame({
        'id': np.arange(len(predictions_test)) + 6592,
        'energy': predictions_test,
    })
    .sort_values(by='energy')
    .reset_index(drop=True)
)
results.to_csv('results/scattering_only_article.csv', index=False)

### With additional features

In [46]:
# J=4 scattering coefficients followed by the hand-crafted descriptors.
X_train = np.hstack((train_scattering_coef_article, additional_features_train))
X_test = np.hstack((test_scattering_coef_article, additional_features_test))
# Targets in energies_df row order.
# NOTE(review): assumes both feature blocks follow that same molecule order — confirm.
y_train = energies_df['energy'].tolist()

In [47]:
# Compare all candidates by 5-fold cross-validation on the J=4 scattering
# coefficients plus hand-crafted descriptors held in X_train / y_train.
best_model_article_and_features, best_model_name_article_and_features, best_rmse_article_and_features = find_best_model(models, X_train, y_train, cross_val_folds=5)

LinearRegression: RMSE = 2.8695
Ridge regression, best alpha: 0.01, RMSE: 0.22952343951001308
Lasso: RMSE = 1.4311
SVR: RMSE = 2.5384
KNN: RMSE = 2.4487
DecisionTree: RMSE = 0.8628
RandomForest: RMSE = 0.5480
GradientBoosting: RMSE = 0.5134
XGBoost: RMSE = 0.4346

Meilleur modèle : Ridge avec une RMSE de 0.2295


In [51]:
# Fit the selected pipeline on the full training set held in X_train / y_train.
best_model_article_and_features.fit(X_train, y_train)

In [52]:
predictions_test = best_model_article_and_features.predict(X_test)

# to_csv does not create missing folders, so make sure the target exists.
os.makedirs('results', exist_ok=True)

# Row k of X_test is labelled 6592 + k.
# NOTE(review): assumes the test rows are the contiguous ids starting at 6592
# (the id_start default of create_test_coulomb_matrix) — confirm.
# Rows are written in increasing predicted energy; each id stays attached to
# its own prediction.
results = (
    pd.DataFrame({
        'id': np.arange(len(predictions_test)) + 6592,
        'energy': predictions_test,
    })
    .sort_values(by='energy')
    .reset_index(drop=True)
)
results.to_csv('results/scattering_article_with_other_features.csv', index=False)

# Scattering with Coulomb matrix

## Create Coulomb matrix

In [17]:
def create_train_coulomb_matrix(data_dir, file_ids, target_values, n_atoms_max):
    """Build raw and row-sorted Coulomb matrices for the training molecules.

    For each id, ``<data_dir>/id_<id>.xyz`` is read and two descriptors are
    created: one with ``permutation="none"`` and one with
    ``permutation="sorted_l2"``. Each is reshaped to
    ``(n_atoms_max, n_atoms_max)``.

    Missing or unreadable files are reported on stdout and skipped. A molecule
    is added to the outputs only after *all* of its quantities were computed,
    so the five returned collections always have the same length and stay
    row-aligned. (Previously a failure midway left some lists one entry longer
    than the others.)

    Parameters
    ----------
    data_dir : str
        Folder containing the ``id_<id>.xyz`` files.
    file_ids : sequence
        Molecule ids, in the same order as ``target_values``.
    target_values : sequence
        Target of molecule ``file_ids[i]`` at position ``i``.
    n_atoms_max : int
        Size passed to ``CoulombMatrix`` and used for label padding.

    Returns
    -------
    X_raw : np.ndarray
        Unsorted Coulomb matrices, one per kept molecule.
    X_sorted : np.ndarray
        ``sorted_l2`` Coulomb matrices, one per kept molecule.
    y_raw, y_sorted : np.ndarray
        Targets of the kept molecules (identical content).
    atom_labels_list : list of list of str
        Chemical symbols per kept molecule, padded with ``""`` up to
        ``n_atoms_max`` entries.
    """
    cm_sorted = CoulombMatrix(n_atoms_max=n_atoms_max, permutation="sorted_l2")
    cm_raw = CoulombMatrix(n_atoms_max=n_atoms_max, permutation="none")

    X_raw, X_sorted, y_raw, y_sorted = [], [], [], []
    atom_labels_list = []

    for i, mol_id in enumerate(file_ids):
        xyz_path = os.path.join(data_dir, f"id_{mol_id}.xyz")
        if not os.path.isfile(xyz_path):
            print(f"Fichier manquant : {xyz_path}")
            continue

        # Compute everything first; nothing is appended if any step fails.
        try:
            atoms = read(xyz_path)
            atom_names = atoms.get_chemical_symbols()
            # A non-positive repeat count yields an empty list, so molecules
            # that already have n_atoms_max (or more) symbols are unchanged.
            atom_names += [""] * (n_atoms_max - len(atom_names))

            cm_r = cm_raw.create(atoms).reshape((n_atoms_max, n_atoms_max))
            cm_s = cm_sorted.create(atoms).reshape((n_atoms_max, n_atoms_max))
            target = target_values[i]
        except Exception as e:
            # Best-effort loading: report and skip the molecule.
            print(f"Erreur de lecture pour {xyz_path}: {e}")
            continue

        atom_labels_list.append(atom_names)
        X_raw.append(cm_r)
        y_raw.append(target)
        X_sorted.append(cm_s)
        y_sorted.append(target)

    return (
        np.array(X_raw),
        np.array(X_sorted),
        np.array(y_raw),
        np.array(y_sorted),
        atom_labels_list,
    )

In [18]:
def create_test_coulomb_matrix(data_dir, id_start=6592, id_end=8238, n_atoms_max=23):
    """Build raw and row-sorted Coulomb matrices for the test molecules.

    Ids ``id_start .. id_end`` (inclusive) are visited in increasing order and
    ``<data_dir>/id_<id>.xyz`` is read for each. Two descriptors are created,
    one with ``permutation="none"`` and one with ``permutation="sorted_l2"``,
    each reshaped to ``(n_atoms_max, n_atoms_max)``.

    Missing or unreadable files are reported on stdout and skipped. A molecule
    is added to the outputs only after both matrices were computed, so the
    three returned collections always have the same length and stay
    row-aligned. (Previously a failure on the sorted matrix left ``X_raw`` one
    entry longer than ``X_sorted`` and ``test_ids``.)

    Parameters
    ----------
    data_dir : str
        Folder containing the ``id_<id>.xyz`` files.
    id_start, id_end : int
        First and last molecule id, both inclusive.
    n_atoms_max : int, default 23
        Size passed to ``CoulombMatrix``.

    Returns
    -------
    X_raw : np.ndarray
        Unsorted Coulomb matrices, one per kept molecule.
    X_sorted : np.ndarray
        ``sorted_l2`` Coulomb matrices, one per kept molecule.
    test_ids : list of int
        Ids of the kept molecules, in row order.
    """
    cm_sorted = CoulombMatrix(n_atoms_max=n_atoms_max, permutation="sorted_l2")
    cm_raw = CoulombMatrix(n_atoms_max=n_atoms_max, permutation="none")

    X_raw, X_sorted = [], []
    test_ids = []

    for mol_id in range(id_start, id_end + 1):
        filename = f"id_{mol_id}.xyz"
        xyz_path = os.path.join(data_dir, filename)

        if not os.path.isfile(xyz_path):
            print(f"File missing: {filename}")
            continue

        # Compute both matrices first; nothing is appended if either fails.
        try:
            atoms = read(xyz_path)
            cm_r = cm_raw.create(atoms).reshape((n_atoms_max, n_atoms_max))
            cm_s = cm_sorted.create(atoms).reshape((n_atoms_max, n_atoms_max))
        except Exception as e:
            # Best-effort loading: report and skip the molecule.
            print(f"Error reading {filename}: {e}")
            continue

        X_raw.append(cm_r)
        X_sorted.append(cm_s)
        test_ids.append(mol_id)

    return np.array(X_raw), np.array(X_sorted), test_ids

### Without additional features

In [19]:
# Inputs for the Coulomb-matrix builders: matrix size, then ids and targets
# taken from energies_df in the same row order.
n_atoms_max = 23
file_ids = energies_df["id"].tolist()
target_values = energies_df["energy"].values

In [20]:
# Build the Coulomb matrices for both splits. n_atoms_max is passed explicitly
# to the test builder as well, so train and test matrices keep the same size
# if the value is changed (the test builder otherwise falls back to its own
# default of 23).
X_train_raw, X_train_sorted, y_train_raw, y_train_sorted, train_atom_labels = create_train_coulomb_matrix(
    atoms_train_dir, file_ids, target_values, n_atoms_max
)
X_test_raw, X_test_sorted, test_ids = create_test_coulomb_matrix(atoms_test_dir, n_atoms_max=n_atoms_max)

In [21]:
# Flatten each (n_atoms_max, n_atoms_max) matrix into one feature row.
X_raw_flattened_train = X_train_raw.reshape(X_train_raw.shape[0], -1)
X_sorted_flattened_train = X_train_sorted.reshape(X_train_sorted.shape[0], -1)

# Fix: the raw test features must come from X_test_raw. They were previously
# reshaped from X_test_sorted, so models trained on raw matrices were
# evaluated on sorted ones.
X_raw_flattened_test = X_test_raw.reshape(X_test_raw.shape[0], -1)
X_sorted_flattened_test = X_test_sorted.reshape(X_test_sorted.shape[0], -1)

In [22]:
# J=4 scattering coefficients followed by the flattened raw Coulomb matrix.
X_train = np.hstack((train_scattering_coef_article, X_raw_flattened_train))
X_test = np.hstack((test_scattering_coef_article, X_raw_flattened_test))
# Targets in energies_df row order.
# NOTE(review): assumes both feature blocks follow that same molecule order and
# that no molecule was skipped while building the Coulomb matrices — confirm.
y_train = energies_df['energy'].tolist()

In [23]:
# Compare all candidates by 5-fold cross-validation on the J=4 scattering
# coefficients plus raw Coulomb-matrix features held in X_train / y_train.
best_model_scattering_coulomb, best_model_name_scattering_coulomb, best_rmse_scattering_coulomb = find_best_model(models, X_train, y_train, cross_val_folds=5)

LinearRegression: RMSE = 23.4822


  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T


Ridge regression, best alpha: 0.1, RMSE: 0.4057552227619756
Lasso: RMSE = 2.9372
SVR: RMSE = 2.6690
KNN: RMSE = 1.1561
DecisionTree: RMSE = 0.8756
RandomForest: RMSE = 0.5787
GradientBoosting: RMSE = 0.6095
XGBoost: RMSE = 0.5406

Meilleur modèle : Ridge avec une RMSE de 0.4058


In [24]:
# Fit the selected pipeline on the full training set held in X_train / y_train.
best_model_scattering_coulomb.fit(X_train, y_train)

In [25]:
predictions_test = best_model_scattering_coulomb.predict(X_test)

# to_csv does not create missing folders, so make sure the target exists.
os.makedirs('results', exist_ok=True)

# Row k of X_test is labelled 6592 + k.
# NOTE(review): assumes the test rows are the contiguous ids starting at 6592
# (the id_start default of create_test_coulomb_matrix) — confirm.
# Rows are written in increasing predicted energy; each id stays attached to
# its own prediction.
results = (
    pd.DataFrame({
        'id': np.arange(len(predictions_test)) + 6592,
        'energy': predictions_test,
    })
    .sort_values(by='energy')
    .reset_index(drop=True)
)
results.to_csv('results/scattering_article_coulomb_raw_only.csv', index=False)

### With additional features

In [26]:
# J=4 scattering coefficients, flattened raw Coulomb matrix, then the
# hand-crafted descriptors, side by side per molecule.
X_train = np.hstack((train_scattering_coef_article, X_raw_flattened_train, additional_features_train))
X_test = np.hstack((test_scattering_coef_article, X_raw_flattened_test, additional_features_test))
# Targets in energies_df row order.
# NOTE(review): assumes all feature blocks follow that same molecule order — confirm.
y_train = energies_df['energy'].tolist()

In [27]:
# Compare all candidates by 5-fold cross-validation on the J=4 scattering
# coefficients, raw Coulomb-matrix features and hand-crafted descriptors held
# in X_train / y_train.
best_model_scattering_coulomb_and_features, best_model_name_scattering_coulomb_and_features, best_rmse_scattering_coulomb_and_features = find_best_model(models, X_train, y_train, cross_val_folds=5)

LinearRegression: RMSE = 8.5514


  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T


Ridge regression, best alpha: 0.01, RMSE: 0.17181378487798485
Lasso: RMSE = 1.4311
SVR: RMSE = 2.5456
KNN: RMSE = 1.1042
DecisionTree: RMSE = 0.6699
RandomForest: RMSE = 0.4884
GradientBoosting: RMSE = 0.4749
XGBoost: RMSE = 0.4272

Meilleur modèle : Ridge avec une RMSE de 0.1718


In [31]:
# Fit the selected pipeline on the full training set held in X_train / y_train.
best_model_scattering_coulomb_and_features.fit(X_train, y_train)

In [32]:
# Predict the test energies with the pipeline fitted in the previous cell.
predictions_test = best_model_scattering_coulomb_and_features.predict(X_test)

In [33]:
# to_csv does not create missing folders, so make sure the target exists.
os.makedirs('results', exist_ok=True)

# Row k of predictions_test is labelled 6592 + k.
# NOTE(review): assumes the test rows are the contiguous ids starting at 6592
# (the id_start default of create_test_coulomb_matrix) — confirm.
# Rows are written in increasing predicted energy; each id stays attached to
# its own prediction.
results = (
    pd.DataFrame({
        'id': np.arange(len(predictions_test)) + 6592,
        'energy': predictions_test,
    })
    .sort_values(by='energy')
    .reset_index(drop=True)
)
results.to_csv('results/scattering_article_coulomb_raw_with_other_features.csv', index=False)