<a href="https://colab.research.google.com/github/russodanielp/intro_cheminformatics/blob/google_colab/Lab%20XX%20-%20Cheminformatics%20Tools/PCA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Principal Component Analysis

## Script variables

The variables you need to set are defined in the code cell below.  The script requires that you provide five pieces of information.  

1) `SDFILE_DIR`: the filepath to the SDFile containing the chemicals to analyze  
2) `ACTIVITY_COLUMN`: the name of the column/property in the SDFile that contains the activity you would like to color the points by  
3) `NAME_COLUMN`: the name of the column/property in the SDFile that contains the name or identifier of the molecule  
4) `ONE_COLOR`: the color for chemicals whose activity is "1"  
5) `ZERO_COLOR`: the color for chemicals whose activity is "0"  

The acceptable colors are:
```
'aliceblue', 'antiquewhite', 'aqua', 'aquamarine', 'azure', 'beige', 'bisque', 'black', 'blanchedalmond', 'blue', 'blueviolet', 'brown', 'burlywood', 'cadetblue', 'chartreuse', 'chocolate', 'coral', 'cornflowerblue', 'cornsilk', 'crimson', 'cyan', 'darkblue', 'darkcyan', 'darkgoldenrod', 'darkgray', 'darkgreen', 'darkgrey', 'darkkhaki', 'darkmagenta', 'darkolivegreen', 'darkorange', 'darkorchid', 'darkred', 'darksalmon', 'darkseagreen', 'darkslateblue', 'darkslategray', 'darkslategrey', 'darkturquoise', 'darkviolet', 'deeppink', 'deepskyblue', 'dimgray', 'dimgrey', 'dodgerblue', 'firebrick', 'floralwhite', 'forestgreen', 'fuchsia', 'gainsboro', 'ghostwhite', 'gold', 'goldenrod', 'gray', 'green', 'greenyellow', 'grey', 'honeydew', 'hotpink', 'indianred', 'indigo', 'ivory', 'khaki', 'lavender', 'lavenderblush', 'lawngreen', 'lemonchiffon', 'lightblue', 'lightcoral', 'lightcyan', 'lightgoldenrodyellow', 'lightgray', 'lightgreen', 'lightgrey', 'lightpink', 'lightsalmon', 'lightseagreen', 'lightskyblue', 'lightslategray', 'lightslategrey', 'lightsteelblue', 'lightyellow', 'lime', 'limegreen', 'linen', 'magenta', 'maroon', 'mediumaquamarine', 'mediumblue', 'mediumorchid', 'mediumpurple', 'mediumseagreen', 'mediumslateblue', 'mediumspringgreen', 'mediumturquoise', 'mediumvioletred', 'midnightblue', 'mintcream', 'mistyrose', 'moccasin', 'navajowhite', 'navy', 'oldlace', 'olive', 'olivedrab', 'orange', 'orangered', 'orchid', 'palegoldenrod', 'palegreen', 'paleturquoise', 'palevioletred', 'papayawhip', 'peachpuff', 'peru', 'pink', 'plum', 'powderblue', 'purple', 'rebeccapurple', 'red', 'rosybrown', 'royalblue', 'saddlebrown', 'salmon', 'sandybrown', 'seagreen', 'seashell', 'sienna', 'silver', 'skyblue', 'slateblue', 'slategray', 'slategrey', 'snow', 'springgreen', 'steelblue', 'tan', 'teal', 'thistle', 'tomato', 'turquoise', 'violet', 'wheat', 'white', 'whitesmoke', 'yellow', 'yellowgreen'
```

In [70]:
# Path of the SDFile that is loaded for the analysis.
SDFILE_DIR = "Training.sdf"
# SDFile property whose value is used to color the plotted points.
ACTIVITY_COLUMN = "Composite category"
# SDFile property holding each molecule's name/identifier.
# NOTE(review): the trailing space is part of the value; presumably it matches
# the property name as stored in the SDFile -- confirm before "fixing" it.
NAME_COLUMN = "CASRN "
# Plot colors for the '1' class and the '0' class, respectively.
ONE_COLOR = "red"
ZERO_COLOR = "blue"

In [43]:
!pip install rdkit

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


### Imports and Code

In [44]:
from rdkit import Chem 
from rdkit.Chem import PandasTools
import pandas as pd
import plotly.express as px
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.ML.Descriptors import MoleculeDescriptors
from rdkit.Chem import Descriptors
import pandas as pd
from rdkit import DataStructs
from rdkit.ML.Cluster import Butina

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from matplotlib import colors

# Names of the RDKit descriptors used as input features for the PCA.
desc_set = [
    'MolWt',
    'TPSA',
    'NumRotatableBonds',
    'NumHDonors',
    'NumHAcceptors',
    'MolLogP',
]

def calc_descriptors_from_frame(df: pd.DataFrame, scale=True, desc_set=None) -> pd.DataFrame:
    """Compute RDKit descriptors for the molecules held in a data frame.

    Args:
        df: Frame with an ``ROMol`` column of RDKit molecules and an
            identifier column named by the module-level ``NAME_COLUMN``.
        scale: When true, standardize every descriptor column with
            ``StandardScaler`` before returning.
        desc_set: Descriptor names to compute.  Names that do not appear in
            ``Descriptors.descList`` are ignored.  When empty or ``None``,
            every descriptor in ``Descriptors.descList`` is computed.

    Returns:
        A frame indexed by ``df[NAME_COLUMN]`` with one column per
        descriptor.  Rows containing any missing value are dropped.
    """
    known_names = [entry[0] for entry in Descriptors.descList]
    if desc_set:
        # Filtering the known names keeps the columns in descList order,
        # regardless of the order the caller listed them in.
        chosen = [name for name in known_names if name in desc_set]
    else:
        chosen = known_names

    calculator = MoleculeDescriptors.MolecularDescriptorCalculator(chosen)

    rows = [list(calculator.CalcDescriptors(mol)) for mol in df['ROMol']]
    table = pd.DataFrame(rows,
                         columns=list(calculator.GetDescriptorNames()),
                         index=df[NAME_COLUMN])

    # Keep only molecules for which every descriptor has a value.
    complete_rows = table.notnull().all(axis=1)
    table = table[complete_rows]

    if not scale:
        return table

    scaled = StandardScaler().fit_transform(table)
    return pd.DataFrame(scaled, index=table.index, columns=table.columns)

## PCA

In [64]:
# Load the molecules and compute their (scaled) descriptor matrix.
df = PandasTools.LoadSDF(SDFILE_DIR)
descriptors = calc_descriptors_from_frame(df, desc_set=desc_set)
pca = PCA()

# Keep the first three principal components for the 3-D plot.
latent_space = pd.DataFrame(pca.fit_transform(descriptors)[:, 0:3])
latent_space.columns = ['PC1', 'PC2', 'PC3']

# Attach the labels by position (plain arrays) instead of assigning Series:
# latent_space has a fresh 0..n-1 index, and pandas aligns Series on index, so
# a direct assignment mislabels points whenever the descriptor step dropped
# rows or df's index is not 0..n-1.
names = descriptors.index.to_numpy()  # descriptors is indexed by df[NAME_COLUMN]
if len(descriptors) == len(df):
    # No molecule was dropped, so the rows are in one-to-one order with df.
    classes = df[ACTIVITY_COLUMN].to_numpy()
else:
    # Some molecules were dropped; recover the activity of the survivors by
    # identifier.  If an identifier is duplicated, its first occurrence wins.
    class_by_name = (df.drop_duplicates(subset=NAME_COLUMN)
                       .set_index(NAME_COLUMN)[ACTIVITY_COLUMN])
    classes = class_by_name.reindex(names).to_numpy()

latent_space['Class'] = classes
latent_space[NAME_COLUMN] = names

## Colors

In [67]:
# Convert the configured color names into 0-255 RGB channel values, which are
# later formatted into rgba() strings for the plot.
r_1, g_1, b_1 = (channel * 255 for channel in colors.to_rgb(ONE_COLOR.lower()))
r_0, g_0, b_0 = (channel * 255 for channel in colors.to_rgb(ZERO_COLOR.lower()))

## Plot

In [68]:
# Explained-variance ratio of each component, shown as a percentage in the
# axis titles.
explained = pca.explained_variance_ratio_

fig = px.scatter_3d(
    latent_space,
    x='PC1',
    y='PC2',
    z='PC3',
    color='Class',
    hover_name=NAME_COLUMN,
    hover_data=['Class', 'PC1', 'PC2', 'PC3'],
    labels={
        "PC1": f"PC1 ({explained[0]:.2%})",
        "PC2": f"PC2 ({explained[1]:.2%})",
        "PC3": f"PC3 ({explained[2]:.2%})",
    },
    height=800,
    # Class values are matched as the strings '1' and '0'.
    color_discrete_map={
        '1': f"rgba({r_1}, {g_1}, {b_1}, 0.5)",
        '0': f"rgba({r_0}, {g_0}, {b_0}, 0.5)",
    },
)

# The axes of a 3-D plot live under layout.scene; fig.update_xaxes() and
# fig.update_yaxes() only target 2-D cartesian axes and would have no effect
# here, so the axis-line style is applied to the scene axes instead.
axis_line = dict(showline=True, linewidth=2, linecolor='black')
fig.update_layout(
    template='plotly_white',
    scene=dict(
        aspectratio=dict(x=1, y=1, z=1),
        xaxis=axis_line,
        yaxis=axis_line,
        zaxis=axis_line,
    ),
)
fig.update_traces(marker=dict(size=6, opacity=0.5))

# Initial camera position: on the +y axis, looking toward the origin.
camera = dict(
    eye=dict(x=0., y=2.5, z=0),
)

fig.update_layout(scene_camera=camera, scene_dragmode='orbit')
fig.show()