In [13]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Lasso
import matplotlib.pyplot as plt
import matplotlib
import rdkit.Chem as chem
from collections import defaultdict

In [6]:
def _parse_hopv_records(lines):
    """Extract one (SMILES, HOMO, LUMO) triple per molecule record.

    The record layout is the one this notebook has always assumed:

    * the record's first line is the SMILES string,
    * the line 4 past the start holds the number of conformers,
    * the line 6 past the start holds the number of atoms,
    * each conformer occupies ``num_atom + 6`` lines after a 5-line header,
    * the comma-separated line 3 before the next record carries HOMO in
      field 1 and LUMO in field 2.

    Parameters
    ----------
    lines : list of str
        Raw lines of the data file, as returned by ``readlines()``.

    Returns
    -------
    tuple of (list of str, list of float, list of float)
        SMILES strings (surrounding whitespace removed), HOMO values and
        LUMO values. The three lists always have the same length.

    Raises
    ------
    ValueError
        If a count line or a HOMO/LUMO field is not numeric.
    """
    conformer_count_offset = 4   # record start -> "number of conformers" line
    atom_count_offset = 6        # record start -> "number of atoms" line
    header_length = 5            # lines preceding the first conformer
    conformer_overhead = 6       # non-atom lines inside each conformer
    result_line_back = 3         # next record start -> HOMO/LUMO line

    smiles, homo, lumo = [], [], []
    total = len(lines)
    start = 0
    while start < total:
        # Stop on trailing blank lines or a header that is cut short instead
        # of indexing past the end of the file.
        if not lines[start].strip() or start + atom_count_offset >= total:
            break

        num_conf = int(lines[start + conformer_count_offset])
        num_atom = int(lines[start + atom_count_offset])
        next_start = start + header_length + (num_atom + conformer_overhead) * num_conf

        # A truncated final record is dropped as a whole, so the three
        # lists can never end up with different lengths.
        if next_start > total:
            break

        fields = lines[next_start - result_line_back].split(",")
        smiles.append(lines[start].strip())   # drop the trailing newline
        homo.append(float(fields[1]))         # store numbers, not text
        lumo.append(float(fields[2]))
        start = next_start

    return smiles, homo, lumo


# Read the whole file first so the handle is closed before parsing starts.
with open("HOPV_15_revised_2.data", 'r') as data:
    lines = data.readlines()

smiles, homo, lumo = _parse_hopv_records(lines)

# Built inline rather than through a variable called `dict`, which would
# shadow the builtin for every later cell.
df = pd.DataFrame({'SMILES' : smiles, 'HOMO' : homo, 'LUMO' : lumo})

# NOTE: written with the default index=True, so the row index becomes an
# unnamed first column that shows up as "Unnamed: 0" when the file is read
# back with a plain pd.read_csv.
df.to_csv('Homo_Lumo_Data.csv')

                                                SMILES    HOMO    LUMO
0    Cc1ccc(-c2c3cc(-c4ccc(-c5sc(-c6cccs6)c6cc(S(C)...   -0.17  -0.118
1    CN1c2ccccc2C(=C2c3ccc(-c4cccs4)cc3N(C)C2=O)C1=O\n  -0.183  -0.135
2    Cc1cc(-c2cc3c4nsnc4c(-c4cc(C)c(-c5cccs5)s4)cc3...  -0.173  -0.133
3    Cn1c2ccccc2c2ccc(-c3ccc(-c4ccc(C5=C(c6ccccc6)C...  -0.158  -0.112
4    Cc1cc(-c2c3cc(-c4cccs4)sc3c(-c3cc(C)c(C)s3)c3c...   -0.17  -0.137
..                                                 ...     ...     ...
345  CC1(C)c2ccccc2-c2ccc(-c3ccc(C4=C(c5ccccc5)C(c5...  -0.164  -0.106
346  Cc1ccc(-c2c3ccsc3c(-c3ccc(C)s3)c3cc(-c4ccc(C5=...  -0.167  -0.124
347  C[Si]1(C)c2ccccc2-c2ccc(-c3ccc(-c4ccc(-c5cccs5...  -0.176  -0.126
348  N#CC(=Cc1ccc(C=Cc2ccc(-c3ccc(N(c4ccccc4)c4cccc...  -0.176  -0.131
349  C=Cc1cc(OC)c(C=Cc2ccc(N(c3ccccc3)c3ccc(C=C(C#N...  -0.174  -0.129

[350 rows x 3 columns]


In [16]:
import rdkit.Chem.Descriptors as d

# Parse each SMILES exactly once and reuse the Mol objects for every
# descriptor below, instead of re-parsing the string for each column.
mols = [chem.MolFromSmiles(s) for s in df['SMILES']]

# MolFromSmiles returns None for strings it cannot parse; fail here with a
# readable message rather than inside the first descriptor call.
unparsed_smiles = [s for s, mol in zip(df['SMILES'], mols) if mol is None]
if unparsed_smiles:
    raise ValueError(
        "RDKit could not parse {} SMILES string(s); first one: {!r}".format(
            len(unparsed_smiles), unparsed_smiles[0]
        )
    )

# Column name -> RDKit descriptor function. Dict order is the column order.
# "Molecular Weight" holds d.MolWt: this cell used to fill that column with
# d.ExactMolWt first and then overwrite it with d.MolWt, so the exact mass
# never survived. To keep it, add it here under a distinct column name.
rdkit_descriptors = {
    "Molecular Weight": d.MolWt,
    "Heavy Atom Molecular Weight": d.HeavyAtomMolWt,
    "Max Absolute Partial Charge": d.MaxAbsPartialCharge,
    "Max Partial Charge": d.MaxPartialCharge,
    "Min Abs Partial Charge": d.MinAbsPartialCharge,
    "Min Partial Charge": d.MinPartialCharge,
    "Radical Electrons": d.NumRadicalElectrons,
    "Valence Electrons": d.NumValenceElectrons,
    "NHOH Count": d.NHOHCount,
    "NO Count": d.NOCount,
    "H Acceptors": d.NumHAcceptors,
    "H Donors": d.NumHDonors,
    "Ring Count": d.RingCount,
    "Aliphatic Rings": d.NumAliphaticRings,
    "Aromatic Rings": d.NumAromaticRings,
    "Saturated Rings": d.NumSaturatedRings,
    "Aromatic Carbocycles": d.NumAromaticCarbocycles,
    "Aromatic Heterocycles": d.NumAromaticHeterocycles,
    "Heteroatoms": d.NumHeteroatoms,
    "Rotatable Bonds": d.NumRotatableBonds,
    "Saturated Carbocycles": d.NumSaturatedCarbocycles,
    "Saturated Heterocycles": d.NumSaturatedHeterocycles,
}

for column_name, descriptor_fn in rdkit_descriptors.items():
    # One value per row, in row order, so positional assignment lines up.
    df[column_name] = [descriptor_fn(mol) for mol in mols]

In [20]:
from mordred import AtomCount
from mordred import BondCount

# Parse each SMILES once for this cell; the same Mol objects feed every
# count below instead of being rebuilt for each column.
count_mols = [chem.MolFromSmiles(s) for s in df['SMILES']]

# MolFromSmiles returns None for strings it cannot parse; report that
# clearly before handing anything to mordred.
bad_smiles = [s for s, mol in zip(df['SMILES'], count_mols) if mol is None]
if bad_smiles:
    raise ValueError(
        "RDKit could not parse {} SMILES string(s); first one: {!r}".format(
            len(bad_smiles), bad_smiles[0]
        )
    )

# Column name -> mordred descriptor instance, built once and then called on
# each molecule. Dict order is the column order.
mordred_counts = {
    "H Count": AtomCount.AtomCount('H'),
    "C Count": AtomCount.AtomCount('C'),
    "N Count": AtomCount.AtomCount('N'),
    "F Count": AtomCount.AtomCount('F'),
    "Halogen Count": AtomCount.AtomCount('X'),
    "Double Bonds": BondCount.BondCount('double', False),
    "Triple Bonds": BondCount.BondCount('triple', False),
}

for column_name, descriptor in mordred_counts.items():
    # One value per row, in row order, so positional assignment lines up.
    df[column_name] = [descriptor(mol) for mol in count_mols]

In [21]:
# Persist the descriptor table to disk. Default to_csv settings are used, so
# the row index is written as the first column.
df.to_csv(path_or_buf='Full_Descriptors.csv')

In [22]:
# Bare last expression: the notebook renders the descriptor table as this
# cell's output.
df

Unnamed: 0,SMILES,HOMO,LUMO,Molecular Weight,Heavy Atom Molecular Weight,Max Absolute Partial Charge,Max Partial Charge,Min Abs Partial Charge,Min Partial Charge,Radical Electrons,...,Rotatable Bonds,Saturated Carbocycles,Saturated Heterocycles,H Count,C Count,N Count,F Count,Halogen Count,Double Bonds,Triple Bonds
0,Cc1ccc(-c2c3cc(-c4ccc(-c5sc(-c6cccs6)c6cc(S(C)...,-0.17,-0.118,763.162,740.986,0.223158,0.184422,0.184422,-0.223158,0,...,6,0,0,22,35,0,0,0,2,0
1,CN1c2ccccc2C(=C2c3ccc(-c4cccs4)cc3N(C)C2=O)C1=O\n,-0.183,-0.135,372.449,356.321,0.310764,0.259055,0.259055,-0.310764,0,...,1,0,0,16,22,2,0,0,3,0
2,Cc1cc(-c2cc3c4nsnc4c(-c4cc(C)c(-c5cccs5)s4)cc3...,-0.173,-0.133,600.866,584.738,0.172251,0.113593,0.113593,-0.172251,0,...,4,0,0,16,28,4,0,0,0,0
3,Cn1c2ccccc2c2ccc(-c3ccc(-c4ccc(C5=C(c6ccccc6)C...,-0.158,-0.112,770.158,734.878,0.343591,0.117083,0.117083,-0.343591,0,...,7,0,0,35,47,1,0,0,2,0
4,Cc1cc(-c2c3cc(-c4cccs4)sc3c(-c3cc(C)c(C)s3)c3c...,-0.17,-0.137,899.194,866.938,0.308118,0.259140,0.259140,-0.308118,0,...,5,0,0,32,48,2,2,2,3,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
345,CC1(C)c2ccccc2-c2ccc(-c3ccc(C4=C(c5ccccc5)C(c5...,-0.164,-0.106,618.943,584.671,0.144560,0.117083,0.117083,-0.144560,0,...,5,0,0,34,41,0,0,0,2,0
346,Cc1ccc(-c2c3ccsc3c(-c3ccc(C)s3)c3cc(-c4ccc(C5=...,-0.167,-0.124,709.002,684.810,0.309143,0.260649,0.260649,-0.309143,0,...,5,0,0,24,36,2,0,0,4,0
347,C[Si]1(C)c2ccccc2-c2ccc(-c3ccc(-c4ccc(-c5cccs5...,-0.176,-0.126,508.769,488.609,0.172222,0.113599,0.113599,-0.172222,0,...,3,0,0,20,28,2,0,0,0,0
348,N#CC(=Cc1ccc(C=Cc2ccc(-c3ccc(N(c4ccccc4)c4cccc...,-0.176,-0.131,530.674,508.498,0.477114,0.345992,0.345992,-0.477114,0,...,8,0,0,22,32,2,0,0,3,1
