<a href="https://colab.research.google.com/github/porekhov/drug_design_2024/blob/main/QSAR_intro.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import pearsonr

# Load the skin permeability coefficient dataset (tab-separated text file).
# Data source: https://www.nature.com/articles/s41598-021-89587-5#MOESM1
df = pd.read_csv(
    'logKp_dataset.csv',
    delimiter='\t',
)

In [None]:
# Pull out the two columns used by the fits: logP as the predictor
# and logKp as the response.
logp = df['logP']
logkp = df['logKp']

In [None]:
# 1D case: relate skin permeability (logKp) to logP alone.
# Pearson correlation plus a degree-1 polynomial fit logKp = slope*logP + intercept.
corr, pval = pearsonr(logp, logkp)
slope, intercept = np.polyfit(logp, logkp, deg=1)

fig1 = plt.figure()
ax = fig1.add_subplot(1, 1, 1)

# Data points from the dataset
ax.scatter(logp, logkp, marker='o', c='r')

# Fitted straight line, drawn over the fixed logP window [-4, 4];
# the legend entry reports the correlation rounded to two decimals.
line_x = np.array([-4, 4])
ax.plot(
    line_x,
    slope * line_x + intercept,
    color='k',
    linestyle=':',
    label='ρ = ' + str(round(corr, 2)),
)

ax.set_xlabel('logP')
ax.set_ylabel('logKp')
ax.legend()
plt.show()

In [None]:
# 3D case: extend the linear model with two more descriptors, MW and TPSA

mw, tpsa = df['MW'], df['TPSA']

# Design matrix with one row per dataset entry and columns [logP, MW, TPSA, 1].
# The trailing column of ones lets the least-squares fit include an intercept.
# Its length is taken from the data instead of being hard-coded, so the cell
# keeps working if the dataset does not contain exactly 20 rows.
params = np.vstack((logp, mw, tpsa, np.ones(len(logp)))).T
print(params)
# Least-squares solution; element [0] of the lstsq result holds the coefficients,
# ordered like the columns of `params` (the last one is the intercept).
koefs = np.linalg.lstsq(params, logkp, rcond=None)[0]
print(koefs)
# Predictions of the fitted linear model for every row of the design matrix
logkp_pred = np.dot(params, koefs)

In [None]:
# Parity plot: dataset logKp values on x against model-predicted logKp on y.
fig = plt.figure()
ax = fig.add_subplot(111)

ax.plot(logkp, logkp_pred, marker = '^', color = 'g', lw = 0, ms = 10)
# y = x reference line; a point lying on it is predicted exactly
ax.plot([-5, 0], [-5, 0], lw = 3, color = 'gray')

# logkp (values read from the dataset) is passed as x and logkp_pred
# (output of the linear model) as y, so the labels must follow that order.
ax.set_xlabel('Experimental data')
ax.set_ylabel('Model data')

plt.show()

In [None]:
# Root-mean-square error (RMSE): square root of the mean squared residual
print(np.sqrt(np.mean((logkp_pred - logkp)**2)))

# Pearson's correlation coefficient computed directly from its definition:
# sum of products of the centered values divided by the product of the
# square roots of their summed squares.
logkp_centered = logkp - np.mean(logkp)
logkp_pred_centered = logkp_pred - np.mean(logkp_pred)
print(
    np.sum(logkp_centered * logkp_pred_centered)
    / (np.sqrt(np.sum(logkp_centered**2)) * np.sqrt(np.sum(logkp_pred_centered**2)))
)

# Same coefficient using SciPy (pearsonr is already imported in the first cell);
# the result also carries a p-value for the null hypothesis of no correlation.
print(pearsonr(logkp_pred, logkp))

# Same coefficient using NumPy: corrcoef returns the 2x2 correlation matrix,
# whose off-diagonal entries are the coefficient between the two inputs.
print(np.corrcoef(logkp_pred, logkp))