In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import rdkit.Chem as Chem
from rdkit.Chem import Draw

## Load dataset

In [None]:
# Read the purchasable-compound table (SMILES and prices) into a DataFrame.
df = pd.read_csv(
    filepath_or_buffer="../datasets/mcule_purchasable_in_stock_prices_valid_smiles.csv"
)

## Create Molecules

In [None]:
def create_mol(smiles):
    """Parse a SMILES string into an RDKit molecule.

    Parameters
    ----------
    smiles : str
        SMILES representation of the molecule.

    Returns
    -------
    rdkit.Chem.Mol or None
        The parsed molecule. ``None`` is returned when ``smiles`` is not a
        string (for example the NaN that pandas produces for an empty cell),
        which mirrors the ``None`` that ``Chem.MolFromSmiles`` itself returns
        for a string it cannot parse.
    """
    # Guard against missing values: passing a float/None to MolFromSmiles
    # raises an opaque binding error instead of yielding "no molecule".
    if not isinstance(smiles, str):
        return None
    return Chem.MolFromSmiles(smiles)

# Build one RDKit molecule per row from the SMILES column.
df['mol'] = df['SMILES'].apply(create_mol)

## First model: Morgan fingerprints

In [None]:
# Calculate the fingerprints
from rdkit.Chem import AllChem
from rdkit import DataStructs

def get_fingerprint(mol, radius=2, n_bits=1024):
    """Compute the Morgan bit-vector fingerprint of a molecule as a NumPy array.

    Parameters
    ----------
    mol : rdkit.Chem.Mol
        Molecule to fingerprint.
    radius : int, default 2
        Morgan radius passed to ``GetMorganFingerprintAsBitVect``.
    n_bits : int, default 1024
        Length of the bit vector (and of the returned array).

    Returns
    -------
    numpy.ndarray
        1-D float array of length ``n_bits`` filled from the bit vector.

    Raises
    ------
    ValueError
        If ``mol`` is ``None`` (e.g. the SMILES could not be parsed upstream).
    """
    if mol is None:
        raise ValueError(
            "get_fingerprint received None instead of an RDKit molecule; "
            "check for SMILES that failed to parse"
        )
    fp = AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=n_bits)
    # Allocate the destination at its final size so the conversion does not
    # depend on resizing a length-1 placeholder in place.
    arr = np.zeros((n_bits,))
    DataStructs.ConvertToNumpyArray(fp, arr)
    return arr

# Store one fingerprint array per molecule in a new column.
df['fp'] = df['mol'].map(get_fingerprint)

## Machine learning model: multiple linear regression

Split the dataset into training, validation, and test sets

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

In [None]:
def data_split(X, y, test_size=0.2, validation_size=0.01, random_state=0):
    """Split features and targets into training, test and validation sets.

    The data are first split into train/test; the training part is then split
    again into train/validation.

    Parameters
    ----------
    X, y : array-like
        Features and targets, in any form accepted by ``train_test_split``.
    test_size : float or int, default 0.2
        ``test_size`` of the first split (held-out test set).
    validation_size : float or int, default 0.01
        ``test_size`` of the second split. Note that a fraction here is
        relative to the training portion left after the first split, not to
        the full dataset.
    random_state : int, default 0
        Seed used for both splits.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test, X_validation, y_validation)``,
        in that order.
    """
    # First split: carve out the held-out test set.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state)
    # Second split: carve the validation set out of the remaining training data.
    X_train, X_validation, y_train, y_validation = train_test_split(
        X_train, y_train, test_size=validation_size, random_state=random_state)

    return X_train, X_test, y_train, y_test, X_validation, y_validation


# Split the fingerprint features and prices. The training, evaluation and plot
# cells below read X_train / X_test / y_train / y_test, so this call must run
# for the notebook to execute from a fresh kernel.
X_train, X_test, y_train, y_test, X_validation, y_validation = data_split(df['fp'], df['price 1 (USD)'])

### Train the model

In [None]:
# Fit a linear regression on at most the first 600,000 training samples.
model = LinearRegression().fit(
    X_train[:600000].tolist(),
    y_train[:600000].tolist(),
)

In [None]:
# Score the fitted model on at most the first 100,000 test samples.
from sklearn.metrics import mean_squared_error, r2_score

y_pred = model.predict(X_test[:100000].tolist())
print(f"Mean squared error: {mean_squared_error(y_test[:100000].tolist(), y_pred[:100000].tolist()):.2f}")

In [None]:
# Parity plot: predicted vs. true prices for the evaluated test subset.
y_true_plot = y_test[:100000].tolist()
y_pred_plot = y_pred[:100000].tolist()

# Compute each metric once instead of re-slicing and re-scoring per label.
r2 = r2_score(y_true_plot, y_pred_plot)
mse = mean_squared_error(y_true_plot, y_pred_plot)

fig, ax = plt.subplots()
ax.scatter(y_true_plot, y_pred_plot, marker='+', color='black')

# Identity line (perfect prediction): two end points are enough, there is no
# need to draw a segment through every unsorted sample.
price_range = [min(y_true_plot), max(y_true_plot)]
ax.plot(price_range, price_range, color='red')

ax.set_xlabel('True prices [USD]')
ax.set_ylabel('Predicted prices [USD]')

# Anchor the annotation in axes-fraction coordinates so it stays in the
# top-left corner whatever the price range is (data coordinates such as
# (0, 550) can land outside the axes or on top of the points).
ax.text(0.05, 0.95, f'$R^2$={r2:.2f}\nMSE={mse:.2f}',
        transform=ax.transAxes, va='top')

fig.savefig("morgan_fp_linear_regression_test_100000_train_600000.png", dpi=800)
plt.show()


## Second model: Mordred descriptors with a random forest

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
# import randomforest 
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Read the descriptor table; note that this rebinds `df` to a new DataFrame.
df = pd.read_csv(
    filepath_or_buffer="../datasets/mordred_descriptors.csv"
)

In [None]:
# Target: the price column. Features: every remaining column.
df_y = df["price 1 (USD)"]
df_X = df.drop(columns="price 1 (USD)")

In [None]:
# Train a random-forest regressor on the descriptor features.
X_train, X_test, y_train, y_test, X_validation, y_validation = data_split(df_X, df_y)

# Fix the seed so the fitted forest (and the reported metrics) are
# reproducible across runs; 0 matches the seed used inside data_split.
model = RandomForestRegressor(n_estimators=200, max_depth=30, random_state=0)
model.fit(X_train, y_train.tolist())

In [None]:
# Predict prices for the whole held-out test set.
from sklearn.metrics import mean_squared_error, r2_score

y_pred = model.predict(X=X_test)

In [None]:
# Parity plot: predicted vs. true prices on the held-out test set.
y_true_plot = y_test.tolist()
y_pred_plot = y_pred.tolist()

# Compute each metric once instead of re-scoring per label.
r2 = r2_score(y_true_plot, y_pred_plot)
mse = mean_squared_error(y_true_plot, y_pred_plot)

fig, ax = plt.subplots()
ax.scatter(y_true_plot, y_pred_plot, marker='+', color='black')

# Identity line (perfect prediction): two end points are enough, there is no
# need to draw a segment through every unsorted sample.
price_range = [min(y_true_plot), max(y_true_plot)]
ax.plot(price_range, price_range, color='red')

ax.set_xlabel('True prices [USD]')
ax.set_ylabel('Predicted prices [USD]')

# Anchor the annotation in axes-fraction coordinates so it stays in the
# top-left corner whatever the price range is (data coordinates such as
# (0, 550) can land outside the axes or on top of the points).
ax.text(0.05, 0.95, f'$R^2$={r2:.2f}\nMSE={mse:.2f}',
        transform=ax.transAxes, va='top')

# NOTE(review): the filename says "morgan_fp" but this model is trained on the
# descriptor CSV; kept unchanged in case other material references the file.
fig.savefig("morgan_fp_RF_test_200_train_800.png", dpi=800)
plt.show()