In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

## Dataset

In [None]:
# Load the sampled dataset from a pickle file in the working directory.
# NOTE(security): unpickling can execute arbitrary code, so only load a
# 'random_samples.pkl' that comes from a trusted source.
random_samples = pd.read_pickle('random_samples.pkl')

In [None]:
# Show the (rows, columns) shape of the loaded frame as the cell output.
random_samples.shape


In [None]:
# Features for the dense model: one column holding the Morgan fingerprint and
# one holding the gene-expression values of each sample.
# NOTE(review): `input` shadows the Python builtin; the name is kept because
# the split cell below reads it.
dense_feature_columns = ['morgan_fingerprints', 'Gene_expression']
input = random_samples[dense_feature_columns]

# Regression target.
output = random_samples['Y']


## Dataset Split

In [None]:
# Two-stage split: hold out 40% of the rows, then cut that hold-out in half,
# which yields 60% train / 20% validation / 20% test. Both stages use seed 42.
x_train, x_holdout, y_train, y_holdout = train_test_split(
    input, output, test_size=0.4, random_state=42
)
x_val, x_test, y_val, y_test = train_test_split(
    x_holdout, y_holdout, test_size=0.5, random_state=42
)

In [None]:
def split_fingerprint_and_expression(frame):
    """Split a feature frame into its drug part and its cell-line part.

    The drug part is every column whose name matches the regex 'morgan'; the
    cell-line part is every column positioned after 'morgan_fingerprints'.

    Returns:
        (drug_part, cell_line_part) as two DataFrames sharing the frame's index.
    """
    drug_part = frame.filter(regex='morgan', axis=1)
    first_cell_line_position = frame.columns.get_loc('morgan_fingerprints') + 1
    cell_line_part = frame.iloc[:, first_cell_line_position:]
    return drug_part, cell_line_part


x_train_drugs, x_train_cell_lines = split_fingerprint_and_expression(x_train)
x_val_drugs, x_val_cell_lines = split_fingerprint_and_expression(x_val)
x_test_drugs, x_test_cell_lines = split_fingerprint_and_expression(x_test)

In [None]:
def expand_fingerprint_column(frame):
    """Replace the 'morgan_fingerprints' column by one column per fingerprint entry.

    Each cell of 'morgan_fingerprints' is iterated and its i-th element is
    stored in a new column named 'fp{i}'. All other columns and the index are
    kept unchanged.

    The expanded frame is built with a single DataFrame constructor call
    instead of creating one pd.Series per row through `apply`, which is far
    slower for long fingerprints.

    Args:
        frame: DataFrame containing a 'morgan_fingerprints' column of iterables.

    Returns:
        A new DataFrame with 'morgan_fingerprints' dropped and the 'fp*'
        columns appended.

    Raises:
        KeyError: if 'morgan_fingerprints' is not a column of `frame`.
    """
    bits = pd.DataFrame(
        [list(fingerprint) for fingerprint in frame['morgan_fingerprints']],
        index=frame.index,
    )
    bits.columns = [f'fp{i}' for i in range(bits.shape[1])]
    return pd.concat([frame.drop(columns=['morgan_fingerprints']), bits], axis=1)


x_train_drugs = expand_fingerprint_column(x_train_drugs)
x_test_drugs = expand_fingerprint_column(x_test_drugs)
x_val_drugs = expand_fingerprint_column(x_val_drugs)

In [None]:
# Display the training drug features after the fingerprint column has been
# expanded into 'fp*' columns by the previous cell.
x_train_drugs

In [None]:
def gene_expression_frame(expression_column):
    """Turn a Series of per-sample expression vectors into a wide DataFrame.

    The frame is built with one DataFrame constructor call rather than
    `Series.apply(pd.Series)`, which creates one Series per row and is much
    slower. Columns are named 'gene_0', 'gene_1', ... by position and the
    Series' index is preserved.

    Args:
        expression_column: Series whose cells are sequences of values.

    Returns:
        DataFrame with one 'gene_{i}' column per position in the sequences.
    """
    wide = pd.DataFrame(expression_column.tolist(), index=expression_column.index)
    wide.columns = [f'gene_{i}' for i in range(wide.shape[1])]
    return wide


# Training split: swap the 'Gene_expression' column for its expanded columns.
x_train_cell = gene_expression_frame(x_train_cell_lines['Gene_expression'])
x_train_cell_lines = pd.concat([x_train_cell_lines.drop(columns=['Gene_expression']), x_train_cell], axis=1)

# Test split.
x_test_cell = gene_expression_frame(x_test_cell_lines['Gene_expression'])
x_test_cell_lines = pd.concat([x_test_cell_lines.drop(columns=['Gene_expression']), x_test_cell], axis=1)

# Validation split.
x_val_cell = gene_expression_frame(x_val_cell_lines['Gene_expression'])
x_val_cell_lines = pd.concat([x_val_cell_lines.drop(columns=['Gene_expression']), x_val_cell], axis=1)

In [None]:
# Display the training cell-line features after 'Gene_expression' has been
# expanded into 'gene_*' columns by the previous cell.
x_train_cell_lines

In [None]:
# Shapes of the training drug features, cell-line features and targets.
x_train_drugs.shape, x_train_cell_lines.shape, y_train.shape

In [None]:
# Shapes of the test drug features, cell-line features and targets.
x_test_drugs.shape, x_test_cell_lines.shape, y_test.shape

In [None]:
# Shapes of the validation drug features, cell-line features and targets.
x_val_drugs.shape, x_val_cell_lines.shape, y_val.shape

In [None]:
# Display the training targets (the 'Y' column of the training rows).
y_train

In [None]:
# Display the validation targets (the 'Y' column of the validation rows).
y_val

## Dense Model

In [None]:
from models import DenseModel

# Build the dense model. The first two positional arguments are the number of
# cell-line feature columns and the number of drug feature columns.
model = DenseModel(
    x_train_cell_lines.values.shape[1],
    x_train_drugs.values.shape[1],
    expr_hlayers_sizes='[625, 312]',
    drug_hlayers_sizes='[1000, 500, 250]',
    predictor_hlayers_sizes='[1000]',
    hidden_dropout=0.3,
    optimizer='Adam',
    learning_rate=0.0001,
)
print(model.summary())

# NOTE(review): batch_size=126 is unusual; confirm it is intended (not 128).
# NOTE(review): the plotting cell below reads 'val_loss'/'val_mae' from the
# training history, but no validation data is passed here; confirm that
# DenseModel.train records validation metrics on its own.
model.train(x_train_cell_lines.values, x_train_drugs.values, y_train.values, epochs=100, batch_size=126)

# These metrics come from the validation split, so they are labeled as such
# (the previous message called them "Test" metrics).
val_loss, val_mae = model.evaluate(x_val_cell_lines.values, x_val_drugs.values, y_val.values)
print(f'Validation Loss: {val_loss}, Validation MAE: {val_mae}')

# Predictions on the held-out test split; scored in a later cell.
predictions = model.predict(x_test_cell_lines.values, x_test_drugs.values)
print(predictions)

In [None]:
# Plot the training and validation curves for loss and MAE side by side.
history = model.history.history

# One entry per panel: (title, training-history key, validation-history key).
panels = (
    ('Loss', 'loss', 'val_loss'),
    ('MAE', 'mae', 'val_mae'),
)

plt.figure(figsize=(20, 10))
for position, (panel_title, train_key, validation_key) in enumerate(panels, start=1):
    plt.subplot(1, 2, position)
    plt.plot(history[train_key], label='train')
    plt.plot(history[validation_key], label='validation')
    plt.title(panel_title)
    plt.legend()
plt.show()

In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Score the dense model's test-set predictions. The MSE is computed once and
# reused for the RMSE.
dense_mse = mean_squared_error(y_test, predictions)
dense_scores = (
    ('MAE: ', mean_absolute_error(y_test, predictions)),
    ('MSE: ', dense_mse),
    ('RMSE: ', np.sqrt(dense_mse)),
    ('R2: ', r2_score(y_test, predictions)),
)

print('Dense model')
for score_label, score_value in dense_scores:
    print(score_label, score_value)

## GAT Model

In [None]:
# Features for the GAT model: the SMILES string of the drug and the
# gene-expression values of the cell line.
gat_feature_columns = ['SMILES', 'Gene_expression']
input_gat = random_samples[gat_feature_columns]

# Regression target (same 'Y' column as for the dense model).
output_gat = random_samples['Y']

In [None]:
# Display the GAT feature frame (columns 'SMILES' and 'Gene_expression').
input_gat

In [None]:
# Use the same two-stage 60/20/20 split (same sizes, same seed) as the dense
# model. With an identical row count and random_state, train_test_split
# selects the same rows, so both models are trained, validated and tested on
# the same samples and their test metrics are directly comparable.
# The earlier 0.3 / 0.4 split gave 70/18/12 and a different test set.
x_train, x_holdout, y_train, y_holdout = train_test_split(
    input_gat, output_gat, test_size=0.4, random_state=42
)
x_val, x_test, y_val, y_test = train_test_split(
    x_holdout, y_holdout, test_size=0.5, random_state=42
)

In [None]:
def split_smiles_and_expression(frame):
    """Split a GAT feature frame into drug SMILES and numeric cell-line features.

    The drug part is every column whose name matches the regex 'SMILES'. The
    cell-line part is every column positioned after 'SMILES', with the
    'Gene_expression' column expanded into one 'gene_{i}' column per value.

    Without this expansion the cell-line frame is a single object column of
    sequences, so `x_train_cell_lines.shape[1]` (used as the model's
    `expr_dim`) would be 1 and the model would receive non-numeric input.
    The expansion mirrors what the dense-model pipeline does.

    Args:
        frame: DataFrame with a 'SMILES' column followed by 'Gene_expression'.

    Returns:
        (drug_part, cell_line_part) as two DataFrames sharing the frame's index.
    """
    drug_part = frame.filter(regex='SMILES', axis=1)
    cell_line_part = frame.iloc[:, frame.columns.get_loc('SMILES') + 1:]

    expression = pd.DataFrame(
        cell_line_part['Gene_expression'].tolist(), index=cell_line_part.index
    )
    expression.columns = [f'gene_{i}' for i in range(expression.shape[1])]
    cell_line_part = pd.concat(
        [cell_line_part.drop(columns=['Gene_expression']), expression], axis=1
    )
    return drug_part, cell_line_part


x_train_drugs, x_train_cell_lines = split_smiles_and_expression(x_train)
x_val_drugs, x_val_cell_lines = split_smiles_and_expression(x_val)
x_test_drugs, x_test_cell_lines = split_smiles_and_expression(x_test)

In [None]:
# Display the training drug frame for the GAT model (the 'SMILES' column).
x_train_drugs

In [None]:
from graphfeaturizer import GraphFeaturizer


def featurize_split(drug_frame):
    """Featurize the 'SMILES' column of one split with a fresh GraphFeaturizer.

    Returns:
        (featurizer, node_features, adjacency_matrix): the featurizer instance
        that was used and the two values returned by its `featurize_df` call.
    """
    featurizer = GraphFeaturizer()
    node_features, adjacency_matrix = featurizer.featurize_df(drug_frame, 'SMILES')
    return featurizer, node_features, adjacency_matrix


# Each split gets its own featurizer instance, in train / val / test order.
featurizer_train, node_features_train, adjacency_matrix_train = featurize_split(x_train_drugs)
featurizer_val, node_features_val, adjacency_matrix_val = featurize_split(x_val_drugs)
featurizer_test, node_features_test, adjacency_matrix_test = featurize_split(x_test_drugs)

In [None]:
from models import DrugGATModel

# Build the GAT model. expr_dim is the number of columns of the cell-line
# feature frame.
# NOTE(review): make sure x_train_cell_lines holds one numeric column per gene
# here, not a single column of lists; otherwise expr_dim is 1.
# NOTE(review): this constructor is given `learn_rate` while the dense model is
# given `learning_rate`; confirm the keyword against models.DrugGATModel.
model = DrugGATModel(
    expr_dim=x_train_cell_lines.shape[1],
    expr_hlayers_sizes='[156, 156]',
    drug_gat_layers='[125, 75]',
    predictor_hlayers_sizes='[125]',
    hidden_dropout=0.3,
    optimizer='Adam',
    learn_rate=0.0001,
)
print(model.summary())
model.train(x_train_cell_lines, node_features_train, adjacency_matrix_train, y_train, epochs=100, batch_size=64)

# These metrics come from the validation split, so they are labeled as such
# (the previous message called them "Test" metrics).
val_loss, val_mae = model.evaluate(x_val_cell_lines, node_features_val, adjacency_matrix_val, y_val)
print(f'Validation Loss: {val_loss}, Validation MAE: {val_mae}')

# Predictions on the held-out test split; scored in a later cell.
predictions = model.predict(x_test_cell_lines, node_features_test, adjacency_matrix_test)
print(predictions)

In [None]:
# Plot the training and validation curves for loss and MAE side by side.
import matplotlib.pyplot as plt

gat_history = model.history.history

# One entry per panel: (title, training-history key, validation-history key).
gat_panels = (
    ('Loss', 'loss', 'val_loss'),
    ('MAE', 'mae', 'val_mae'),
)

plt.figure(figsize=(20, 10))
for position, (panel_title, train_key, validation_key) in enumerate(gat_panels, start=1):
    plt.subplot(1, 2, position)
    plt.plot(gat_history[train_key], label='train')
    plt.plot(gat_history[validation_key], label='validation')
    plt.title(panel_title)
    plt.legend()
plt.show()

In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Score the GAT model's test-set predictions. The MSE is computed once and
# reused for the RMSE.
gat_mse = mean_squared_error(y_test, predictions)
gat_scores = (
    ('MAE: ', mean_absolute_error(y_test, predictions)),
    ('MSE: ', gat_mse),
    ('RMSE: ', np.sqrt(gat_mse)),
    ('R2: ', r2_score(y_test, predictions)),
)

print('GAT model')
for score_label, score_value in gat_scores:
    print(score_label, score_value)