Random Forest

In [3]:
import pandas as pd
import numpy as np

In [4]:
data_train = pd.read_csv("train/train_80.csv")
data_test = pd.read_csv("train/valid_20.csv")

In [5]:
data_train.columns
# with open("ma_liste.txt", "w", encoding="utf-8") as fichier:
#     for element in data.columns:
#         fichier.write(element + ',') 

Index(['smiles', 'BalabanJ', 'BertzCT', 'Chi0', 'Chi0n', 'Chi0v', 'Chi1',
       'Chi1n', 'Chi1v', 'Chi2n',
       ...
       'fcfc_2039', 'fcfc_2040', 'fcfc_2041', 'fcfc_2042', 'fcfc_2043',
       'fcfc_2044', 'fcfc_2045', 'fcfc_2046', 'fcfc_2047', 'class'],
      dtype='object', length=4297)

In [6]:
fcfc_cols = [col for col in data_train.columns if col.startswith("fcfc_") or col.startswith("ecfc_")]


# Remove columns that are not needed
X_train = data_train[fcfc_cols]
X_test = data_test[fcfc_cols]
y_train = data_train["class"]
y_test = data_test["class"]


In [7]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, cohen_kappa_score,make_scorer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
import optuna
from sklearn.model_selection import cross_val_score
import joblib

  from .autonotebook import tqdm as notebook_tqdm


In [8]:
def objective(trial):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 500),
        'max_depth': trial.suggest_int('max_depth', 5, 30),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 10),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 10),
        'max_features': trial.suggest_categorical('max_features', ['sqrt', 'log2']),
        'bootstrap': trial.suggest_categorical('bootstrap', [True, False]),
        
    }

    model = RandomForestClassifier(**params)
    scores = cross_val_score(model, X_train, y_train, cv=3, scoring=make_scorer(cohen_kappa_score))
    return scores.mean()

In [14]:
# Lancer l’optimisation
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=31, n_jobs=-1)

# Afficher les meilleurs résultats
print("Best params:", study.best_params)
print("Best CV score (Kappa):", study.best_value)

[I 2025-05-25 06:40:16,550] A new study created in memory with name: no-name-aba67ea3-93f8-4cff-818d-7293b654a618
[I 2025-05-25 06:40:25,387] Trial 5 finished with value: 0.5081739236556705 and parameters: {'n_estimators': 112, 'max_depth': 28, 'min_samples_split': 3, 'min_samples_leaf': 6, 'max_features': 'log2', 'bootstrap': True}. Best is trial 5 with value: 0.5081739236556705.
[I 2025-05-25 06:40:31,362] Trial 2 finished with value: 0.5258751336836655 and parameters: {'n_estimators': 230, 'max_depth': 17, 'min_samples_split': 6, 'min_samples_leaf': 2, 'max_features': 'log2', 'bootstrap': True}. Best is trial 2 with value: 0.5258751336836655.
[I 2025-05-25 06:40:32,981] Trial 4 finished with value: 0.5080853413942649 and parameters: {'n_estimators': 133, 'max_depth': 12, 'min_samples_split': 9, 'min_samples_leaf': 6, 'max_features': 'sqrt', 'bootstrap': True}. Best is trial 2 with value: 0.5258751336836655.
[I 2025-05-25 06:40:40,726] Trial 1 finished with value: 0.46643149944351503

Best params: {'n_estimators': 272, 'max_depth': 24, 'min_samples_split': 4, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'bootstrap': False}
Best CV score (Kappa): 0.6044072785131892


In [10]:

# cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# kappa_scorer = make_scorer(cohen_kappa_score)
# param_grid = {
#     'n_estimators': [225],
#     'max_depth': [None],
#     'min_samples_split': [2],
#     'min_samples_leaf' : [1]
# }

# grid_search = GridSearchCV(RandomForestClassifier(random_state=42), param_grid, scoring=kappa_scorer, cv=cv, n_jobs=-1, verbose=2)
# grid_search.fit(X_train, y_train)

# print("Best params:", grid_search.best_params_)
# print("Best CV score:", grid_search.best_score_)



In [15]:
best_rf = study.best_params
best_rf = RandomForestClassifier(**best_rf, random_state=42)
best_rf.fit(X_train, y_train)

In [17]:
# on sauvegarde
joblib.dump(best_rf, 'FP_random_forest.pkl')

['FP_random_forest.pkl']

In [18]:
# On récupère le meilleur modèle et on entraine sur l'ensemble d'entraînement
rf_model = joblib.load('FP_random_forest.pkl')

In [None]:
y_pred = rf_model.predict(X_test)
print(y_pred)
print("Cohen's Kappa:", cohen_kappa_score(y_test, y_pred))
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

proba= rf_model.predict_proba(X_test)
# Create a DataFrame for the predictions and probabilities
print(proba[:, 1])
predictions_df = pd.DataFrame({
    'smiles': data_test['smiles'],
    'proba': proba[:, 1],
})

predictions_df.to_csv('results/FP_random_forest.csv', index=False)

feature_importances = rf_model.feature_importances_
feature_importances_df = pd.DataFrame({
    'Feature': X_train.columns,
    'Importance': feature_importances
}).sort_values(by='Importance', ascending=False)
print("Feature Importances:\n", feature_importances_df)

# Save results to CSV




[0 1 1 ... 0 0 1]
Cohen's Kappa: 0.6287067355037599
Accuracy: 0.8141263940520446
Classification Report:
               precision    recall  f1-score   support

           0       0.79      0.85      0.82       916
           1       0.84      0.78      0.81       967

    accuracy                           0.81      1883
   macro avg       0.82      0.81      0.81      1883
weighted avg       0.82      0.81      0.81      1883

Confusion Matrix:
 [[775 141]
 [209 758]]
[0.12922926 0.87192226 0.81418911 ... 0.10216909 0.42639623 0.83673344]


OSError: Cannot save file into a non-existent directory: 'results'