In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV, train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier 
from sklearn.ensemble import GradientBoostingClassifier

In [2]:
df = pd.read_csv('liion.csv')
y = df['s'].values
X = df.drop(["formula","source","target","composition","family","ChemicalFamily","log_target","s","mean simul. packing efficiency","mean abs simul. packing efficiency"], axis=1)

In [3]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2 , random_state=42)

In [4]:
rf = RandomForestClassifier(random_state=42)

In [5]:
param_grid = {
    'n_estimators': [10, 50, 100, 200],
    'max_features': ['sqrt'],
    'max_depth': [4, 6, 8, 10, None],
    'criterion': ['gini', 'entropy']
}

In [6]:
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train, y_train)

In [7]:
print(f"Best parameters found: {grid_search.best_params_}")
print(f"Best cross-validation accuracy: {grid_search.best_score_:.4f}")

Best parameters found: {'criterion': 'entropy', 'max_depth': 8, 'max_features': 'sqrt', 'n_estimators': 50}
Best cross-validation accuracy: 0.8460


In [8]:
best_rf = grid_search.best_estimator_
y_pred = best_rf.predict(X_test)
print(f"Test set accuracy with best parameters: {accuracy_score(y_test, y_pred):.4f}")

Test set accuracy with best parameters: 0.8659


In [9]:
knn = KNeighborsClassifier()

In [10]:
param_grid = {
    'n_neighbors': [3, 5, 7, 9, 11],  
    'weights': ['uniform', 'distance'],  
    'metric': ['euclidean', 'manhattan'] 
}

In [11]:
grid_search = GridSearchCV(estimator=knn, param_grid=param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train, y_train)

In [12]:
print(f"Best parameters found: {grid_search.best_params_}")
print(f"Best cross-validation accuracy: {grid_search.best_score_:.4f}")

Best parameters found: {'metric': 'manhattan', 'n_neighbors': 3, 'weights': 'distance'}
Best cross-validation accuracy: 0.8552


In [13]:
best_knn = grid_search.best_estimator_
y_pred = best_knn.predict(X_test)
print(f"Test set accuracy with best parameters: {accuracy_score(y_test, y_pred):.4f}")

Test set accuracy with best parameters: 0.8598


In [14]:
svc = SVC()

In [15]:
param_grid = {
    'C': [0.1, 1, 10],  
    'gamma': [1, 0.1, 0.01, 0.001],  
    'kernel': ['linear', 'rbf']  
}

In [16]:
grid_search = GridSearchCV(estimator=svc, param_grid=param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train, y_train)

In [17]:
print(f"Best parameters found: {grid_search.best_params_}")
print(f"Best cross-validation accuracy: {grid_search.best_score_:.4f}")

Best parameters found: {'C': 1, 'gamma': 0.001, 'kernel': 'rbf'}
Best cross-validation accuracy: 0.8277


In [18]:
best_svc = grid_search.best_estimator_
y_pred = best_svc.predict(X_test)
print(f"Test set accuracy with best parameters: {accuracy_score(y_test, y_pred):.4f}")

Test set accuracy with best parameters: 0.8049


In [19]:
gb_clf = GradientBoostingClassifier(random_state=42)

In [20]:
param_grid = {
    'n_estimators': [50, 100, 150],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 4, 5],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

In [21]:
grid_search = GridSearchCV(estimator=gb_clf, param_grid=param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train, y_train)

In [22]:
print(f"Best parameters found: {grid_search.best_params_}")
print(f"Best cross-validation accuracy: {grid_search.best_score_:.4f}")

Best parameters found: {'learning_rate': 0.1, 'max_depth': 5, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50}
Best cross-validation accuracy: 0.8780


In [23]:
best_gb_clf = grid_search.best_estimator_
y_pred = best_gb_clf.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)
print("Test set accuracy with best parameters: {:.4f}".format(test_accuracy))

Test set accuracy with best parameters: 0.8780
