In [1]:
# imports
import pandas as pd
import os
import seaborn as sns
import numpy as np

from scipy import stats

import matplotlib.pyplot as plt
import matplotlib

from rdkit import Chem
from rdkit.Chem import AllChem

from sklearn.model_selection import train_test_split
from sklearn.linear_model import Lasso
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.inspection import permutation_importance
from sklearn.decomposition import PCA

In [2]:
# Load the source dataset.
# The path can be overridden without editing the notebook by setting the
# NF_DATASET_CSV environment variable; when it is unset, the original
# location is used, so existing runs behave exactly as before.
df = pd.read_csv(
    os.environ.get(
        'NF_DATASET_CSV',
        '/Users/reesaespera/Desktop/tjl/Non-Fullerene Dataset/data/1-s2.0-S2542435117301307-mmc2.csv',
    )
)

In [3]:
import rdkit.Chem.Descriptors as d

# Output column name -> RDKit descriptor function.
# Dict order is the order in which the columns are appended to `df`.
#
# 'Molecular Weight' holds d.MolWt. A d.ExactMolWt pass that was written to
# the same column and then immediately overwritten by d.MolWt has been
# removed; if the exact mass is wanted, add it here under its own column name.
rdkit_descriptors = {
    "Molecular Weight": d.MolWt,
    "Heavy Atom Molecular Weight": d.HeavyAtomMolWt,
    "Max Abs Partial Charge": d.MaxAbsPartialCharge,
    "Max Partial Charge": d.MaxPartialCharge,
    "Min Abs Partial Charge": d.MinAbsPartialCharge,
    "Min Partial Charge": d.MinPartialCharge,
    "Radical Electrons": d.NumRadicalElectrons,
    "Valence Electrons": d.NumValenceElectrons,
    "NHOH Count": d.NHOHCount,
    "NO Count": d.NOCount,
    "H Acceptors": d.NumHAcceptors,
    "H Donors": d.NumHDonors,
    "Ring Count": d.RingCount,
    "Aliphatic Rings": d.NumAliphaticRings,
    "Aromatic Rings": d.NumAromaticRings,
    "Saturated Rings": d.NumSaturatedRings,
    "Aromatic Carbocycles": d.NumAromaticCarbocycles,
    "Aromatic Heterocycles": d.NumAromaticHeterocycles,
    "Heteroatoms": d.NumHeteroatoms,
    "Rotatable Bonds": d.NumRotatableBonds,
    "Saturated Carbocycles": d.NumSaturatedCarbocycles,
    "Saturated Heterocycles": d.NumSaturatedHeterocycles,
}

# Parse each SMILES exactly once and evaluate every descriptor on that one
# molecule, instead of re-parsing the string once per descriptor column.
# Values are collected per column first, so `df` is left untouched if any
# row fails.
rdkit_values = {column: [] for column in rdkit_descriptors}
for smi in df['smiles']:
    mol = Chem.MolFromSmiles(smi)
    if mol is None:
        # MolFromSmiles signals a parse failure by returning None; fail here
        # with the offending string rather than inside a descriptor call.
        raise ValueError(f"RDKit could not parse SMILES: {smi!r}")
    for column, func in rdkit_descriptors.items():
        rdkit_values[column].append(func(mol))

for column, values in rdkit_values.items():
    df[column] = values

In [4]:
from mordred import AtomCount
from mordred import BondCount

# Output column name -> mordred descriptor instance.
# The instances are built once here rather than once per row; dict order is
# the order in which the columns are appended to `df`.
mordred_descriptors = {
    "H Count": AtomCount.AtomCount('H'),
    "C Count": AtomCount.AtomCount('C'),
    "N Count": AtomCount.AtomCount('N'),
    "F Count": AtomCount.AtomCount('F'),
    "Halogen Count": AtomCount.AtomCount('X'),
    "Double Bonds": BondCount.BondCount('double', False),
    "Triple Bonds": BondCount.BondCount('triple', False),
}

# Parse each SMILES exactly once and evaluate all counts on that molecule,
# instead of re-parsing the string for every column. Values are collected
# first, so `df` is left untouched if any row fails.
mordred_values = {column: [] for column in mordred_descriptors}
for smi in df['smiles']:
    mol = Chem.MolFromSmiles(smi)
    if mol is None:
        # MolFromSmiles signals a parse failure by returning None; fail here
        # with the offending string rather than inside mordred.
        raise ValueError(f"RDKit could not parse SMILES: {smi!r}")
    for column, descriptor in mordred_descriptors.items():
        mordred_values[column].append(descriptor(mol))

for column, values in mordred_values.items():
    df[column] = values

In [5]:
# Remove the source-CSV columns that are not carried forward.
# `drop` raises KeyError when a label is missing, so this cell cannot be
# re-run on its own once `df` has been rebound; re-run from the load cell.
df = df.drop(
    columns=[
        'index', 'inchikey', 'GAP_calib', 'molW',
        'LUMO_calib', 'LUMO_calib_stds', 'HOMO_calib', 'HOMO_calib_stds',
        'GAP_calc', 'PCE_calc', 'Voc_calc', 'Jsc_calc', 'FF_calc', 'EQE_calc',
        'PCE_calib', 'Voc_calib', 'Jsc_calib', 'FF_calib', 'EQE_calib',
        'PCE_cdiff', 'PCE_calib_plus',
    ]
)

# Persist the descriptor table to the working directory. Default `to_csv`
# options are used, so the row index is written as the first, unnamed column.
df.to_csv('NF_Descriptors.csv')

In [6]:
# Show the descriptor table as this cell's output; the rendered view below
# is abbreviated (elided rows and columns appear as "...").
df

Unnamed: 0,smiles,HOMO_calc,LUMO_calc,Molecular Weight,Heavy Atom Molecular Weight,Max Abs Partial Charge,Max Partial Charge,Min Abs Partial Charge,Min Partial Charge,Radical Electrons,...,Rotatable Bonds,Saturated Carbocycles,Saturated Heterocycles,H Count,C Count,N Count,F Count,Halogen Count,Double Bonds,Triple Bonds
0,CN1C(=O)C(=Cc2occc2C(=O)c2ccc(-c3ncc(C4=CC=CC5...,-6.236230,-3.127170,588.552,570.408,0.464170,0.332907,0.332907,-0.464170,0,...,5,0,1,18,28,6,2,2,9,0
1,CN1C(=O)C(=CC2=CC=C(c3cnc(-c4ccc(C(=O)c5ccoc5)...,-6.339430,-3.704750,588.552,570.408,0.471837,0.332907,0.332907,-0.471837,0,...,5,0,1,18,28,6,2,2,9,0
2,CN1C(=O)C(=Cc2ncc(-c3ccc(C(=O)c4cocc4C=Cc4ccnc...,-6.406977,-3.581290,527.522,510.386,0.471186,0.332907,0.332907,-0.471186,0,...,6,0,1,17,25,7,0,0,6,0
3,CN1C(=O)C(=Cc2nccc(C=Cc3cocc3C(=O)c3ccc(-c4cnc...,-6.438564,-3.095938,527.522,510.386,0.471186,0.332907,0.332907,-0.471186,0,...,6,0,1,17,25,7,0,0,6,0
4,CN1C(=O)C(=Cc2sc(C#N)c(-c3ncc(-c4ccsc4C(=O)c4c...,-6.859940,-3.555118,554.569,540.457,0.471837,0.332907,0.332907,-0.471837,0,...,5,0,1,14,26,6,0,0,5,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
51251,CN1C(=O)C(=Cc2ccsc2C(=O)c2cocc2-c2ccnc(-c3scc4...,-5.658348,-3.153557,594.721,576.577,0.471102,0.332907,0.332907,-0.471102,0,...,5,0,1,18,26,4,0,0,5,0
51252,CN1C(=O)C(=Cc2c(F)cc(-c3c(-c4ccncn4)sc4ccc(C#N...,-6.630212,-3.389053,555.576,541.464,0.332907,0.332907,0.268330,-0.268330,0,...,3,0,1,14,26,7,1,1,4,1
51253,CN1C(=O)C(=Cc2nccc(-c3sc4ccc(C#N)cc4c3-c3cc(F)...,-6.634797,-2.933484,555.576,541.464,0.332907,0.332907,0.268328,-0.268328,0,...,3,0,1,14,26,7,1,1,4,1
51254,CN1C(=O)C(=Cc2ccc3nc(-c4sc5ccc(C#N)cc5c4-c4ccc...,-6.471826,-3.272118,559.563,542.427,0.443458,0.332907,0.332907,-0.443458,0,...,3,0,1,17,30,5,0,0,4,1
