In [9]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [10]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Load the cleaned molecule table and show its (rows, columns) shape.
molecules = pd.read_csv('molecules_dataset_cleared.csv')
display(molecules.shape)

# Binary target: 1 when ACCUMULATION is strictly greater than 500, otherwise 0.
Y = molecules['ACCUMULATION'].gt(500).astype(int)

# Feature matrix: every column except the identifier, the two string
# representations of the molecule and the target source column.
X = molecules.drop(['id', 'SMILES', 'SELFIES', 'ACCUMULATION'], axis=1)

def my_train_test_split(test_size=0.3, random_state=35):
    """Split the notebook-level ``X``/``Y`` and standardise the features.

    Parameters
    ----------
    test_size
        Forwarded unchanged to ``train_test_split``.
    random_state : int
        Not used directly: the seed handed to ``train_test_split`` is
        ``(random_state * 569) % 37``, so values that differ by a multiple
        of 37 produce the same split.

    Returns
    -------
    tuple
        ``(X_scaled_train, X_scaled_test, Y_train, Y_test)``; the scaler is
        fitted on the training part only and then applied to the test part.
    """
    derived_seed = (random_state * 569) % 37
    features_train, features_test, Y_train, Y_test = train_test_split(
        X, Y, test_size=test_size, random_state=derived_seed
    )

    standardizer = StandardScaler()
    scaled_train = standardizer.fit_transform(features_train)
    scaled_test = standardizer.transform(features_test)
    return scaled_train, scaled_test, Y_train, Y_test

(209, 204)

In [11]:
def print_statistics(scores, non_zero_weights=None, n_properties=None):
    """Print per-split scores, their mean and, optionally, feature-selection overlap.

    Parameters
    ----------
    scores : sequence of float
        One score per train/test split.
    non_zero_weights : sequence of sequences, optional
        For each split, the names of the features that received a non-zero
        weight. When given, the size of every selection, of their union and
        of their intersection is printed before the scores. An empty
        sequence is reported with an empty union and intersection.
    n_properties : int, optional
        Value printed as "Properties count". When omitted, the column count
        of the notebook-level ``X`` is used.
    """
    if non_zero_weights is not None:
        selections = list(non_zero_weights)

        print("Non-zero weights count: ", end='')
        for selected in selections:
            print(len(selected), end=' ')

        selected_sets = [set(selected) for selected in selections]
        union_non_zero_weights = set().union(*selected_sets)
        print(f'\nUnion length: {len(union_non_zero_weights)}')

        # set.intersection needs at least one operand; with no splits at all
        # nothing can be shared, so report an empty intersection.
        if selected_sets:
            intersection_non_zero_weights = set.intersection(*selected_sets)
        else:
            intersection_non_zero_weights = set()
        print(f'Intersection length: {len(intersection_non_zero_weights)}')

        if n_properties is None:
            n_properties = X.shape[1]
        print(f'Properties count: {n_properties}')

    print('\nScores: ', end=' ')
    for score in scores:
        print(f'{score:.3f}', end=' ')
    # np.mean of an empty sequence emits a RuntimeWarning before returning
    # nan; produce the nan directly so an empty run stays quiet.
    mean_score = np.mean(scores) if len(scores) else float('nan')
    print(f'\nMean score: {mean_score:.3f}')

In [12]:
from sklearn.linear_model import LogisticRegression

# Fit an L1-penalised logistic regression on ten different splits, recording
# the test accuracy and the features with a non-zero coefficient each time.
scores = []
non_zero_weights = []
for seed in range(10):
    X_scaled_train, X_scaled_test, Y_train, Y_test = my_train_test_split(0.3, seed)

    model = LogisticRegression(solver='liblinear', penalty='l1', C=0.5)
    model.fit(X_scaled_train, Y_train)

    # Fitted parameters stay in notebook scope for ad-hoc inspection.
    weights = model.coef_
    intercept = model.intercept_

    # Keep the column names whose fitted coefficient is non-zero.
    kept_columns = X.columns[weights[0] != 0]
    non_zero_weights.append(kept_columns.tolist())

    scores.append(model.score(X_scaled_test, Y_test))

print_statistics(scores, non_zero_weights)

Non-zero weights count: 32 30 28 22 30 24 26 31 31 34 
Union length: 77
Intersection length: 2
Properties count: 200

Scores:  0.889 0.873 0.889 0.778 0.889 0.841 0.857 0.778 0.794 0.857 
Mean score: 0.844


In [13]:
from sklearn.svm import SVC

# Fit a linear-kernel SVC on the same ten splits and record, per split, the
# test accuracy and the features that carry a non-zero weight.
scores = []
non_zero_weights = []
for seed in range(10):
    X_scaled_train, X_scaled_test, Y_train, Y_test = my_train_test_split(0.3, seed)

    model = SVC(C=1, kernel='linear')
    model.fit(X_scaled_train, Y_train)

    weights = model.coef_
    # Keep the column names whose fitted weight is non-zero.
    kept_columns = X.columns[weights[0] != 0]
    non_zero_weights.append(kept_columns.tolist())

    scores.append(model.score(X_scaled_test, Y_test))

print_statistics(scores, non_zero_weights)

Non-zero weights count: 170 170 167 170 171 169 170 170 165 172 
Union length: 173
Intersection length: 158
Properties count: 200

Scores:  0.905 0.889 0.905 0.825 0.905 0.825 0.889 0.794 0.841 0.905 
Mean score: 0.868


In [14]:
from sklearn.ensemble import RandomForestClassifier

# Evaluate a random forest on the same ten splits used for the linear models.
# Only accuracies are collected: this cell computes no per-feature weights, so
# nothing is appended to `non_zero_weights` (the earlier append here reused
# the `weights` left over from the SVC cell).
scores = []
for i in range(0, 10):
    X_scaled_train, X_scaled_test, Y_train, Y_test = my_train_test_split(0.3, i)
    # Seed the forest with the split index so a re-run reproduces the scores.
    model = RandomForestClassifier(random_state=i)
    model.fit(X_scaled_train, Y_train)
    scores.append(model.score(X_scaled_test, Y_test))

print_statistics(scores)


Scores:  0.857 0.889 0.873 0.810 0.810 0.794 0.873 0.905 0.841 0.825 
Mean score: 0.848
