In [45]:
import json
import os
import pickle

import numpy as np
import pandas as pd
import tensorflow as tf
from rdkit import Chem
from sklearn.model_selection import train_test_split
from tensorflow.keras import Input
from tensorflow.keras.callbacks import LearningRateScheduler, EarlyStopping
from tensorflow.keras.layers import Input, Dense, Concatenate, Dropout, BatchNormalization, LSTM
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam


## Dataset

In [46]:
# Location of the pickled sample table. The original absolute path is kept as
# the default so existing runs are unaffected; set RANDOM_SAMPLES_PATH to point
# the notebook at the file on another machine without editing this cell.
path = os.environ.get(
    'RANDOM_SAMPLES_PATH',
    '/Users/utilizador/Desktop/ML-data-analysis-main/random_samples_final.pkl',
)

In [47]:
# Load the sample table from the pickle at `path`.
# NOTE: unpickling can execute arbitrary code stored in the file, so only load
# pickles that come from a trusted source.
random_samples = pd.read_pickle(path)

In [48]:
random_samples.shape


(10000, 148)

In [49]:
# Feature columns (drug fingerprint + cell-line expression) and regression target.
# NOTE: `input` shadows the Python builtin; the name is kept because the
# split cell below reads it.
FEATURE_COLUMNS = ['morgan_fingerprints', 'Gene_expression']
TARGET_COLUMN = 'Y'

input = random_samples.loc[:, FEATURE_COLUMNS]
output = random_samples.loc[:, TARGET_COLUMN]


## Dataset Split

In [50]:
# Two-stage split: 60% train, then the 40% hold-out is halved into
# validation and test (20% each). Both stages use the same seed.
SPLIT_SEED = 42

x_train, x_holdout, y_train, y_holdout = train_test_split(
    input, output, test_size=0.4, random_state=SPLIT_SEED
)
x_val, x_test, y_val, y_test = train_test_split(
    x_holdout, y_holdout, test_size=0.5, random_state=SPLIT_SEED
)

In [51]:
def _split_drug_and_cell_columns(frame, drug_pattern, anchor_column):
    """Return ``(drug_part, cell_part)`` column subsets of ``frame``.

    ``drug_part`` keeps the columns whose name matches the regex
    ``drug_pattern``; ``cell_part`` keeps every column positioned after
    ``anchor_column``.
    """
    first_cell_position = frame.columns.get_loc(anchor_column) + 1
    return frame.filter(regex=drug_pattern, axis=1), frame.iloc[:, first_cell_position:]


x_train_drugs, x_train_cell_lines = _split_drug_and_cell_columns(x_train, 'morgan', 'morgan_fingerprints')
x_val_drugs, x_val_cell_lines = _split_drug_and_cell_columns(x_val, 'morgan', 'morgan_fingerprints')
x_test_drugs, x_test_cell_lines = _split_drug_and_cell_columns(x_test, 'morgan', 'morgan_fingerprints')

In [52]:
def expand_fingerprint_column(frame, column='morgan_fingerprints', prefix='fp'):
    """Replace a column of per-row fingerprints with one column per bit.

    Parameters
    ----------
    frame : pd.DataFrame
        Frame holding ``column``; every cell must be iterable.
    column : str
        Name of the column to expand.
    prefix : str
        Prefix of the generated column names (``fp0``, ``fp1``, ...).

    Returns
    -------
    pd.DataFrame
        ``frame`` without ``column``, followed by the expanded bit columns,
        with the original index preserved.

    Raises
    ------
    KeyError
        If ``column`` is not present (e.g. when the cell is re-run after the
        column has already been expanded).
    """
    # Build the whole bit matrix in one constructor call instead of creating a
    # pd.Series per row through `.apply`, which is far slower on thousands of rows.
    bits = pd.DataFrame([list(fingerprint) for fingerprint in frame[column]], index=frame.index)
    bits.columns = [f'{prefix}{position}' for position in range(bits.shape[1])]
    return pd.concat([frame.drop(columns=[column]), bits], axis=1)


x_train_drugs = expand_fingerprint_column(x_train_drugs)
x_test_drugs = expand_fingerprint_column(x_test_drugs)
x_val_drugs = expand_fingerprint_column(x_val_drugs)

In [53]:
x_train_drugs

Unnamed: 0,fp0,fp1,fp2,fp3,fp4,fp5,fp6,fp7,fp8,fp9,...,fp2038,fp2039,fp2040,fp2041,fp2042,fp2043,fp2044,fp2045,fp2046,fp2047
71716,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
152829,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12619,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
149429,0,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
164112,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
47981,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
48999,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
11139,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
32449,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [54]:
def expand_gene_expression(frame, column='Gene_expression', prefix='gene_'):
    """Replace a column of per-row expression vectors with one column per gene.

    Parameters
    ----------
    frame : pd.DataFrame
        Frame holding ``column``.
    column : str
        Name of the column to expand.
    prefix : str
        Prefix of the generated column names (``gene_0``, ``gene_1``, ...).

    Returns
    -------
    pd.DataFrame
        ``frame`` without ``column``, followed by the expanded value columns,
        with the original index preserved.

    Raises
    ------
    KeyError
        If ``column`` is not present (e.g. when the cell is re-run after the
        column has already been expanded).
    """
    # One DataFrame construction replaces `.apply(pd.Series)`, which builds a
    # Series per row and is very slow for thousands of rows x thousands of genes.
    # NOTE(review): assumes every cell is a list/array-like of values — confirm
    # against how `Gene_expression` was stored in the pickle.
    values = pd.DataFrame(frame[column].tolist(), index=frame.index)
    values.columns = [f'{prefix}{position}' for position in range(values.shape[1])]
    return pd.concat([frame.drop(columns=[column]), values], axis=1)


# The intermediate x_*_cell frames of the previous version are no longer kept
# around; they duplicated the expanded matrices in memory.
x_train_cell_lines = expand_gene_expression(x_train_cell_lines)
x_test_cell_lines = expand_gene_expression(x_test_cell_lines)
x_val_cell_lines = expand_gene_expression(x_val_cell_lines)

In [55]:
x_train_cell_lines

Unnamed: 0,gene_0,gene_1,gene_2,gene_3,gene_4,gene_5,gene_6,gene_7,gene_8,gene_9,...,gene_13339,gene_13340,gene_13341,gene_13342,gene_13343,gene_13344,gene_13345,gene_13346,gene_13347,gene_13348
71716,7.188877,8.928756,4.663555,3.416280,3.096305,6.826444,3.323095,5.147193,6.679149,3.238289,...,3.235135,4.905903,4.098310,2.811946,5.429300,2.770339,8.937286,2.599858,9.414679,7.273168
152829,6.629262,9.313841,3.961542,3.407503,3.027680,6.590986,8.095744,7.727150,6.079737,3.240734,...,3.320698,4.280165,4.017937,2.634867,5.302352,3.331810,8.152317,3.389978,9.206188,7.482943
12619,3.454855,9.246209,4.628587,4.432062,3.354918,3.255320,6.697093,5.299671,5.436861,3.538260,...,3.474007,4.141157,4.737547,2.636211,3.529433,3.240956,9.338429,9.267335,8.664311,8.074440
149429,8.072295,9.169414,3.702357,3.329170,3.034988,4.603880,8.439930,8.285421,4.541539,3.426734,...,3.739961,4.688540,3.064446,2.681717,4.279433,2.920413,8.477269,3.314085,9.177474,7.965078
164112,9.241336,9.865577,3.932945,3.688377,3.240228,5.724733,9.110355,5.433205,5.685882,3.340002,...,3.244485,4.215788,3.566160,2.619875,4.485273,3.121554,8.716633,3.061928,9.722662,6.754493
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
47981,5.051663,9.024239,4.196993,3.726976,3.267616,3.389011,6.646962,5.949845,5.393282,3.981700,...,3.004081,4.765050,3.968262,2.657251,3.338593,3.107874,8.656260,2.720959,8.986667,8.070330
48999,3.303111,10.344945,6.027395,3.439446,4.974175,3.269596,7.853677,5.432638,5.932224,3.683839,...,3.185001,3.562078,5.118068,2.671580,5.755938,3.181971,8.000509,3.078481,9.206829,8.991101
11139,3.363446,9.604299,3.931429,3.835969,3.216174,4.528885,3.272706,6.099419,5.824356,3.635076,...,4.028962,4.106226,4.059121,2.602437,4.653821,3.083361,8.595628,6.496206,9.026961,7.325749
32449,7.646055,9.467647,4.024392,3.400185,3.406030,9.578489,7.716921,6.283608,4.776286,3.580554,...,3.187405,4.545189,3.256456,2.576372,4.336076,2.948622,8.031265,3.333082,8.756423,7.684665


In [56]:
x_train_drugs.shape, x_train_cell_lines.shape, y_train.shape

((6000, 2048), (6000, 13349), (6000,))

In [57]:
x_test_drugs.shape, x_test_cell_lines.shape, y_test.shape

((2000, 2048), (2000, 13349), (2000,))

In [58]:
x_val_drugs.shape, x_val_cell_lines.shape, y_val.shape

((2000, 2048), (2000, 13349), (2000,))

In [59]:
y_train

71716    -1.050204
152829    1.140108
12619     4.773284
149429    2.032094
164112    4.692048
            ...   
47981     3.856404
48999     1.494062
11139     3.466243
32449     2.484426
54008     3.507105
Name: Y, Length: 6000, dtype: float64

In [60]:
y_val

4468      0.515744
73063     3.724621
78403     5.581613
114638   -1.723433
142341   -1.488753
            ...   
151943   -3.099915
8219      3.981576
155396    1.500072
27332     1.022270
107440    0.761226
Name: Y, Length: 2000, dtype: float64

In [61]:
# Sanity check of the training target: container type, dtype and first five values.
target_report = (
    ("Tipo de y_train:", type(y_train)),
    ("Dtype de y_train:", y_train.dtype),
    ("Exemplo de y_train:", y_train[:5]),
)
for label, value in target_report:
    print(label, value)


Tipo de y_train: <class 'pandas.core.series.Series'>
Dtype de y_train: float64
Exemplo de y_train: 71716    -1.050204
152829    1.140108
12619     4.773284
149429    2.032094
164112    4.692048
Name: Y, dtype: float64


In [63]:
# Downcast the training target to 32-bit floats.
y_train = y_train.astype(np.float32)


In [66]:
# Shapes of the three training inputs, in order: cell lines, drugs, target.
for training_part in (x_train_cell_lines, x_train_drugs, y_train):
    print(training_part.shape)


(6000, 13349)
(6000, 2048)
(6000,)


## Dense Model

In [62]:
from models import DenseModel

# NOTE(review): the recorded run of this cell aborted during the first epoch with
# "TypeError: Expected float32, but got ic50_prediction_dense_output of type 'str'."
# The cause is not visible from this notebook; check how models.DenseModel
# compiles and fits the network before trusting the results of this cell.

# Two-branch dense network: one branch per input (cell-line expression, drug
# fingerprint), sized from the training frames' column counts.
model = DenseModel(
    x_train_cell_lines.shape[1],
    x_train_drugs.shape[1],
    expr_hlayers_sizes='[625, 312]',
    drug_hlayers_sizes='[1000, 500, 250]',
    predictor_hlayers_sizes='[1000]',
    hidden_dropout=0.3,
    optimizer='Adam',
    learn_rate=0.0001,
)
# summary() returned None in the recorded run, so wrapping it in print() only
# emitted a stray "None"; call it directly.
model.summary()

model.train(x_train_cell_lines, x_train_drugs, y_train, epochs=100, batch_size=126)

# These metrics are computed on the validation split, so label them as such
# (they were previously printed as "Test").
val_loss, val_mae = model.evaluate(x_val_cell_lines, x_val_drugs, y_val)
print(f'Validation Loss: {val_loss}, Validation MAE: {val_mae}')

# Predictions on the held-out test split.
predictions = model.predict(x_test_cell_lines, x_test_drugs)
print(predictions)

None
Epoch 1/100


TypeError: Expected float32, but got ic50_prediction_dense_output of type 'str'.

In [None]:
#plot loss and mae from training and validation data side by side

# plt.figure(figsize=(20, 10))
# plt.subplot(1, 2, 1)
# plt.plot(model.history.history['loss'], label='train')
# plt.plot(model.history.history['val_loss'], label='validation')
# plt.title('Loss')
# plt.legend()

# plt.subplot(1, 2, 2)
# plt.plot(model.history.history['mae'], label='train')
# plt.plot(model.history.history['val_mae'], label='validation')
# plt.title('MAE')
# plt.legend()
# plt.show()

In [None]:
# from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score


# print('Dense model')
# print('MAE: ', mean_absolute_error(y_test, predictions))
# print('MSE: ', mean_squared_error(y_test, predictions))
# print('RMSE: ', np.sqrt(mean_squared_error(y_test, predictions)))
# print('R2: ', r2_score(y_test, predictions))

## GAT Model

In [None]:
# GAT variant: the drug is represented by its SMILES string instead of a fingerprint.
input_gat, output_gat = (
    random_samples[['SMILES', 'Gene_expression']],
    random_samples['Y'],
)

In [None]:
input_gat

In [None]:
# Two-stage split for the GAT data: 70% train, then the 30% remainder is split
# 60/40 into validation (18% overall) and test (12% overall).
# NOTE: this rebinds x_train/x_val/x_test and y_train/y_val/y_test, replacing
# the splits created for the dense model earlier in the notebook.
GAT_SPLIT_SEED = 42

x_train, x_remainder, y_train, y_remainder = train_test_split(
    input_gat, output_gat, test_size=0.3, random_state=GAT_SPLIT_SEED
)
x_val, x_test, y_val, y_test = train_test_split(
    x_remainder, y_remainder, test_size=0.4, random_state=GAT_SPLIT_SEED
)

In [None]:
# Drug part of each split: the columns whose name matches 'SMILES'.
x_train_drugs, x_val_drugs, x_test_drugs = (
    split.filter(regex='SMILES', axis=1) for split in (x_train, x_val, x_test)
)

# Cell-line part of each split: every column positioned after 'SMILES'.
x_train_cell_lines, x_val_cell_lines, x_test_cell_lines = (
    split.iloc[:, split.columns.get_loc('SMILES') + 1:] for split in (x_train, x_val, x_test)
)

In [None]:
x_train_drugs

In [None]:
# Print the complete target series. The row limit is lifted only inside this
# `with` block, so the outputs of other cells keep pandas' default truncation
# (a global pd.set_option would change every later display as well).
# pandas is already imported as `pd` in the first cell.
with pd.option_context('display.max_rows', None):
    print(y_train)


In [None]:
from graphfeaturizer import GraphFeaturizer

# Each split gets its own featurizer instance; featurize_df is called with the
# split's drug frame and the name of its SMILES column, and its result is
# unpacked into (node features, adjacency matrix).
featurizer_train = GraphFeaturizer()
train_graphs = featurizer_train.featurize_df(x_train_drugs, 'SMILES')
node_features_train, adjacency_matrix_train = train_graphs

featurizer_val = GraphFeaturizer()
val_graphs = featurizer_val.featurize_df(x_val_drugs, 'SMILES')
node_features_val, adjacency_matrix_val = val_graphs

featurizer_test = GraphFeaturizer()
test_graphs = featurizer_test.featurize_df(x_test_drugs, 'SMILES')
node_features_test, adjacency_matrix_test = test_graphs

In [None]:
# Shape and container type of the featurized training graphs.
graph_report = [
    ("Forma de node_features_train:", node_features_train.shape),
    ("Tipo de node_features_train:", type(node_features_train)),
    ("Forma de adjacency_matrix_train:", adjacency_matrix_train.shape),
    ("Tipo de adjacency_matrix_train:", type(adjacency_matrix_train)),
]
for caption, detail in graph_report:
    print(caption, detail)


In [None]:
# Count NaN entries in the featurized training arrays.
# numpy is already imported as `np` in the first cell.
nan_count_nodes = np.isnan(node_features_train).sum()
nan_count_adjacency = np.isnan(adjacency_matrix_train).sum()

print("Valores nulos em node_features_train:", nan_count_nodes)
print("Valores nulos em adjacency_matrix_train:", nan_count_adjacency)


In [None]:
from models import DrugGATModel

# GAT-based model: the expression branch is sized from the cell-line frame's
# column count; the drug branch consumes the featurized graphs.
model = DrugGATModel(
    expr_dim=x_train_cell_lines.shape[1],
    expr_hlayers_sizes='[156, 156]',
    drug_gat_layers='[125, 75]',
    predictor_hlayers_sizes='[125]',
    hidden_dropout=0.3,
    optimizer='Adam',
    learn_rate=0.0001,
)
# NOTE(review): if summary() prints the table itself and returns None, this
# print() only adds a stray "None" to the output — confirm and drop the print.
print(model.summary())
model.train(x_train_cell_lines, node_features_train, adjacency_matrix_train, y_train, epochs=100, batch_size=64)

# These metrics are computed on the validation split, so label them as such
# (they were previously printed as "Test").
val_loss, val_mae = model.evaluate(x_val_cell_lines, node_features_val, adjacency_matrix_val, y_val)
print(f'Validation Loss: {val_loss}, Validation MAE: {val_mae}')

# Predictions on the held-out test split.
predictions = model.predict(x_test_cell_lines, node_features_test, adjacency_matrix_test)
print(predictions)

In [None]:
# Plot loss and MAE per epoch, training and validation curves side by side.
import matplotlib.pyplot as plt

# Mapping from metric name to its per-epoch values, as recorded on the model.
training_log = model.history.history

fig, axes = plt.subplots(1, 2, figsize=(20, 10))
for ax, (metric, title) in zip(axes, (('loss', 'Loss'), ('mae', 'MAE'))):
    ax.plot(training_log[metric], label='train')
    # Validation curves exist only if training recorded validation metrics;
    # skip the curve instead of failing with a KeyError when they are absent.
    validation_key = f'val_{metric}'
    if validation_key in training_log:
        ax.plot(training_log[validation_key], label='validation')
    ax.set_title(title)
    ax.set_xlabel('Epoch')
    ax.set_ylabel(title)
    ax.legend()
plt.show()

In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Regression metrics of the GAT predictions on the test split.
test_mae = mean_absolute_error(y_test, predictions)
test_mse = mean_squared_error(y_test, predictions)
test_r2 = r2_score(y_test, predictions)

print('GAT model')
print('MAE: ', test_mae)
print('MSE: ', test_mse)
print('RMSE: ', np.sqrt(test_mse))
print('R2: ', test_r2)