In [51]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

#Silence all warnings for the rest of the session
import warnings
warnings.filterwarnings(action="ignore")

#Load the merged summary CSV into a DataFrame and report its (rows, columns)
DATA_PATH = 'Molecular_Properties_CSV'
data = pd.read_csv(DATA_PATH)
data.shape

(7626, 11)

In [45]:
#Split the data on 'Activity Summary' using boolean masks.
#Masks select row by row, so the split stays correct even if index labels
#repeat (a label-based drop would remove every row sharing a dropped label).
#Rows with any other (or missing) summary value end up in BOTH frames.
activity = data['Activity Summary']

#Inactive frame: every row that is neither an active agonist nor an active antagonist
df_inactive = data[~activity.isin(['active agonist', 'active antagonist'])]

#Active frame: every row whose summary is not 'inactive'
df_active = data[activity != 'inactive']

In [46]:
#Report how many rows the active-compound frame holds
n_active = len(df_active.index)
print("Number of active compounds in data: ")
print(n_active)

Number of active compounds in data: 
647


In [47]:
#Preview the first five rows of the inactive-compound frame
df_inactive.head(n=5)

Unnamed: 0,PUBCHEM_CID,Activity Summary,Ratio Potency (uM),Ratio Efficacy (%),MolecularFormula,MolecularWeight,CanonicalSMILES,IsomericSMILES,InChI,ExactMass,TPSA
1,197883.0,inactive,0.0,0.0,C4H6O4S2,182.208,C(C(C(=O)O)S)(C(=O)O)S,[C@H]([C@@H](C(=O)O)S)(C(=O)O)S,"InChI=1S/C4H6O4S2/c5-3(6)1(9)2(10)4(7)8/h1-2,9...",181.971,76.6
2,441358.0,inactive,0.0,0.0,C19H22ClN,299.842,CNCCC=C1C2=CC=CC=C2CCC3=CC=CC=C31.Cl,CNCCC=C1C2=CC=CC=C2CCC3=CC=CC=C31.Cl,InChI=1S/C19H21N.ClH/c1-20-14-6-11-19-17-9-4-2...,299.144,12.0
5,2726.0,inactive,0.0,0.0,C17H19ClN2S,318.863,CN(C)CCCN1C2=CC=CC=C2SC3=C1C=C(C=C3)Cl,CN(C)CCCN1C2=CC=CC=C2SC3=C1C=C(C=C3)Cl,InChI=1S/C17H19ClN2S/c1-19(2)10-5-11-20-14-6-3...,318.096,31.8
10,4044.0,inactive,0.0,0.0,C15H15NO2,241.29,CC1=C(C(=CC=C1)NC2=CC=CC=C2C(=O)O)C,CC1=C(C(=CC=C1)NC2=CC=CC=C2C(=O)O)C,InChI=1S/C15H15NO2/c1-10-6-5-9-13(11(10)2)16-1...,241.11,49.3
11,6103.0,inactive,0.0,0.0,C7H5ClN2O,168.58,C1=CC2=C(C=C1Cl)N=C(O2)N,C1=CC2=C(C=C1Cl)N=C(O2)N,InChI=1S/C7H5ClN2O/c8-4-1-2-6-5(3-4)10-7(9)11-...,168.009,52.0


In [48]:
#Inspect the dtype of each column in the inactive-compound frame
df_inactive.dtypes

PUBCHEM_CID           float64
Activity Summary       object
Ratio Potency (uM)    float64
Ratio Efficacy (%)    float64
MolecularFormula       object
MolecularWeight       float64
CanonicalSMILES        object
IsomericSMILES         object
InChI                  object
ExactMass             float64
TPSA                  float64
dtype: object

In [56]:
#Build the numeric feature table from the inactive compounds by discarding
#the text columns, plus PUBCHEM_CID (an identifier, not a feature)
non_feature_columns = [
    'PUBCHEM_CID',
    'Activity Summary',
    'MolecularFormula',
    'CanonicalSMILES',
    'IsomericSMILES',
    'InChI',
]
df_numerical = df_inactive.drop(columns = non_feature_columns)
df_numerical.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6979 entries, 1 to 7160
Data columns (total 5 columns):
Ratio Potency (uM)    6979 non-null float64
Ratio Efficacy (%)    6979 non-null float64
MolecularWeight       6979 non-null float64
ExactMass             6979 non-null float64
TPSA                  6979 non-null float64
dtypes: float64(5)
memory usage: 327.1 KB


# PCA

In [61]:
#Standardise every feature column with StandardScaler
#NOTE: this rebinds df_numerical from a DataFrame to a NumPy array
df_numerical = StandardScaler().fit_transform(df_numerical)

#Covariance matrix of the features (np.cov treats rows as variables, hence the transpose)
cov_mat = np.cov(df_numerical.T)

#A covariance matrix is symmetric, so use eigh rather than the general eig:
#eigh always returns real eigenvalues and orthonormal eigenvectors, while eig
#can leak complex round-off for repeated or near-zero eigenvalues.
#eigh returns eigenvalues in ascending order; the pairs are re-sorted below.
eig_vals, eig_vecs = np.linalg.eigh(cov_mat)

#Make a list of (eigenvalue magnitude, eigenvector) tuples; column i of eig_vecs
#is the eigenvector belonging to eig_vals[i]
eig_pairs = [(np.abs(eig_vals[i]), eig_vecs[:,i]) for i in range(len(eig_vals))]

#Sort the (eigenvalue, eigenvector) tuples from high to low eigenvalue
eig_pairs.sort(key = lambda x: x[0], reverse = True)

In [68]:
#Calculating "explained variance percentage"
##Quantifies how much info/variance can be attributed to each of the principal components
#Only the leading components needed to pass a prespecified threshold are kept

#Threshold expressed as a FRACTION of total variance (0.50 == 50%).
#NOTE(review): the previous comment here said 97% while the value was 0.50 --
#confirm which threshold is intended and adjust this value if necessary.
exp_var_percentage = 0.50

#Use eigenvalue magnitudes so the ordering matches eig_pairs, which is sorted by np.abs
eig_mags = np.abs(eig_vals)
tot = sum(eig_mags)

#Per-component and cumulative explained variance, both in PERCENT (0-100)
var_exp = [(i / tot)*100 for i in sorted(eig_mags, reverse=True)]
cum_var_exp = np.cumsum(var_exp)

#The cumulative values are percentages, so scale the fractional threshold to
#the same units before comparing (comparing against the raw 0.50 would mean
#0.5%, which the first component always exceeds)
threshold_percent = exp_var_percentage * 100

#Fall back to keeping every component if round-off leaves the cumulative sum
#at or below the threshold
num_vec_to_keep = len(cum_var_exp)

for index, percentage in enumerate(cum_var_exp):
    if percentage > threshold_percent:
        num_vec_to_keep = index + 1
        break

In [69]:
#Assemble the projection matrix: one column per retained eigenvector.
#Multiplying the data by this matrix projects each row onto those vectors.

num_features = df_numerical.shape[1]

#The top-ranked eigenvector is always included, so at least one column is built
n_columns = max(1, num_vec_to_keep)
proj_mat = np.hstack(
    [eig_pairs[rank][1].reshape(num_features, 1) for rank in range(n_columns)]
)

# Project the data
pca_data = df_numerical.dot(proj_mat)

In [70]:
#Display the projected data
pca_data

array([[ 0.54530597],
       [ 0.2284907 ],
       [-0.05764222],
       ...,
       [-4.27597644],
       [-4.9198135 ],
       [-3.5380351 ]])

In [50]:
#KMEANS
#Cluster the rows of df_numerical with k-means
X = np.array(df_numerical)

#n_clusters = 647 equals the active-compound count printed earlier in the notebook
#NOTE(review): confirm that this is the intended choice of k
#random_state pins the centroid initialisation so the clustering is repeatable
kmeans = KMeans(n_clusters = 647, random_state = 0)
kmeans.fit(X)

NameError: name 'X' is not defined