# Dataset preprocessing

In [29]:
import os
import numpy as np
import pandas as pd
from rdkit import Chem
import matplotlib.pyplot as plt
from collections import Counter, OrderedDict

In [2]:
def compute_stats(data):
    """Print summary statistics for the molecules and HOMO/LUMO values in ``data``.

    Args:
        data: Mapping-like table (the notebook passes a pandas DataFrame) that
            provides a 'SMILES' column of SMILES strings and 'HOMO' and 'LUMO'
            columns of numbers.

    Prints:
        * mean and standard deviation of the per-molecule atom, bond and ring
          counts, and of the ring sizes pooled over all molecules, together
          with the number of distinct atom symbols seen;
        * mean and standard deviation of the 'HOMO' and 'LUMO' columns
          (computed over every row, including rows whose SMILES was skipped).

    Entries whose SMILES is not a string or cannot be parsed are skipped and
    their count is reported, instead of aborting the whole run.

    Returns:
        None.
    """
    # Gather info
    mol_stats_info = {'num_atom': [], 'num_bond': [], 'num_ring': [], 'ring_size': []}
    atom_vocab = set()
    num_skipped = 0

    for smiles in data['SMILES']:
        # MolFromSmiles signals a parse failure by returning None rather than
        # raising; missing values (e.g. NaN) are not strings at all.
        mol = Chem.MolFromSmiles(smiles) if isinstance(smiles, str) else None
        if mol is None:
            num_skipped += 1
            continue

        rings = Chem.GetSymmSSSR(mol)

        mol_stats_info['num_atom'].append(mol.GetNumAtoms())
        mol_stats_info['num_bond'].append(mol.GetNumBonds())
        mol_stats_info['num_ring'].append(len(rings))
        # Ring sizes are pooled across molecules, not averaged per molecule
        mol_stats_info['ring_size'].extend(len(ring) for ring in rings)

        atom_vocab.update(atom.GetSymbol() for atom in mol.GetAtoms())

    if num_skipped:
        print('Skipped', num_skipped, 'unparseable SMILES')

    # Compute stats for molecular graph (NaN when nothing was collected)
    mol_stats = {
        key: [np.mean(val), np.std(val)] if val else [np.nan, np.nan]
        for key, val in mol_stats_info.items()
    }
    mol_stats['atom_vocab_size'] = len(atom_vocab)
    print('Stats of molecular graph', mol_stats)

    # Compute stats for properties
    print('Stats of HOMO', np.mean(data['HOMO']), np.std(data['HOMO']))
    print('Stats of LUMO', np.mean(data['LUMO']), np.std(data['LUMO']))

## Curated-OPV

### Clean data

In [3]:
# Load the cleaned Curated-OPV table. The path is relative to the project root,
# like the other dataset cells in this notebook, so it is not tied to one machine.
data = pd.read_csv('data/chem_data/cleaned_data.csv')

### Analysis

In [4]:
# Print molecular-graph and HOMO/LUMO statistics for the table currently bound to `data`
compute_stats(data)

Stats of molecular graph {'num_atom': [98.73333333333333, 46.627555038720274], 'num_bond': [110.9, 52.258874844374525], 'num_ring': [13.166666666666666, 5.9137316664033905], 'ring_size': [5.50253164556962, 0.4999935907296312], 'atom_vocab_size': 8}
Stats of HOMO -5.50340425531915 0.3903259772197215
Stats of LUMO -3.8730612244897964 0.4658904315509661


## HOPV15

In [5]:
# Load the HOPV15 table; this rebinds `data`, replacing the previously loaded dataset
data = pd.read_csv('data/hopv15/hopv15.csv')

In [6]:
# Print the same statistics for HOPV15.
# NOTE(review): the recorded HOMO/LUMO means here (about -0.19 / -0.10) are on a
# different scale from the Curated-OPV ones (about -5.5 / -3.9) - presumably
# different units; confirm before comparing the two datasets.
compute_stats(data)

Stats of molecular graph {'num_atom': [42.78285714285714, 13.812157909698577], 'num_bond': [49.30285714285714, 16.09542056814006], 'num_ring': [7.525714285714286, 2.596298778441876], 'ring_size': [5.37661351556568, 0.48453666059333345], 'atom_vocab_size': 7}
Stats of HOMO -0.18954285714285712 0.010778928802179231
Stats of LUMO -0.09989428571428571 0.01604966297567189


## QM9

In [7]:
# Load the QM9 table; this rebinds `data`, replacing the previously loaded dataset
data = pd.read_csv('data/qm9/qm9.csv')

In [None]:
# Print the same statistics for QM9.
# NOTE(review): this cell has no execution count or recorded output, so it has
# not been run in the saved notebook.
compute_stats(data)

## Motif Vocab

In [3]:
# Read the motif vocabulary, one entry per line (trailing newlines are kept)
with open('trio_polymer_vocab.txt', 'r') as file:
    motif_list = list(file)

In [47]:
import re
from collections import defaultdict

motif_dict = defaultdict(int)       # motif SMILES -> number of vocab entries for it
motif_size_dict = defaultdict(int)  # motif SMILES -> atom count from GetNumAtoms()
attm_pos_list = []                  # per entry: number of ':<digits>' labels, floored at 1

for entry in motif_list:
    # Each vocab line has exactly three whitespace-separated fields;
    # the third one is not used here.
    motif, attm, _ = entry.split()

    # Parse each distinct motif only once
    if motif not in motif_size_dict:
        motif_size_dict[motif] = Chem.MolFromSmiles(motif).GetNumAtoms()

    motif_dict[motif] += 1

    # ':<digits>' labels in the second field - presumably attachment points
    attm_pos_list.append(max(1, len(re.findall(r':[0-9]+', attm))))

entries_per_motif = list(motif_dict.values())
atoms_per_motif = list(motif_size_dict.values())

# Stats for attachments: total entries, entries per motif, labels per entry
print(len(motif_list))
print(np.mean(entries_per_motif), np.std(entries_per_motif))
print(np.mean(attm_pos_list), np.std(attm_pos_list))

# Stats for motifs: atoms per distinct motif
print(np.mean(atoms_per_motif), np.std(atoms_per_motif))

6215
8.619972260748959 13.488160545278527
2.3549477071600964 1.345352559343009
6.160887656033287 1.7865800521672286


In [48]:
# Sanity check: True if any motif has a negative entry count
any(count < 0 for count in motif_dict.values())

False

In [49]:
# Tally the per-motif entry counts: maps "entries per motif" -> number of motifs
x = Counter(motif_dict.values())

In [51]:
# Display the full motif -> entry-count mapping (the output is long)
motif_dict

defaultdict(int,
            {'C': 1,
             'C#C': 2,
             'C#CC': 3,
             'C#CC#CC': 3,
             'C#CC(C)=O': 3,
             'C#CC(C)C': 4,
             'C#CC(C)O': 3,
             'C#CCC': 3,
             'C#CCCC': 3,
             'C#CCN': 3,
             'C#N': 3,
             'C1#CC=CCC=CC=C1': 1,
             'C1#CC=CCCCC=C1': 1,
             'C1#CC=CCOCC=C1': 1,
             'C1#CCC=CC=CC=C1': 1,
             'C1#CCC=CCC=CC1': 1,
             'C1#CCC=CCCC=C1': 1,
             'C1#CCCC=CC=CC1': 1,
             'C1#CCCC=CCC=C1': 1,
             'C1#CCCCC=CC=C1': 1,
             'C1#CCCCC=CCC1': 1,
             'C1#CCCCCC=CC1': 1,
             'C1#CCCCCCC=C1': 1,
             'C1#CCCCCCCC1': 1,
             'C1#CCCCOCC=C1': 1,
             'C1#CCCCOCCC1': 1,
             'C1#CCCOC=NCC1': 1,
             'C1#CCCOCC=CC1': 1,
             'C1#CCCOCCC=C1': 1,
             'C1#CCCOCCCC1': 1,
             'C1#CCCOCOCC1': 1,
             'C1#CCN=CNC=NC1': 1,
   

In [54]:
# Largest key of the Counter `x`, i.e. the highest number of entries any single motif has
max(x)

124

In [58]:
# Show every raw vocab line that contains the motif SMILES 'C1CCNCC1'
[entry for entry in motif_list if 'C1CCNCC1' in entry]

['C1CCNCC1 C1CC[CH2:1]NC1 False\n',
 'C1CCNCC1 C1CC[CH2:2][NH:1]C1 False\n',
 'C1CCNCC1 C1CC[NH:1]CC1 False\n',
 'C1CCNCC1 C1CC[NH:1][CH2:1]C1 False\n',
 'C1CCNCC1 C1CC[NH:2][CH2:1]C1 False\n',
 'C1CCNCC1 C1CNC[CH2:1]C1 False\n',
 'C1CCNCC1 C1CN[CH2:1][CH2:1]C1 False\n',
 'C1CCNCC1 C1CN[CH2:1][CH2:2]C1 False\n',
 'C1CCNCC1 C1CN[CH2:2][CH2:1]C1 False\n',
 'C1CCNCC1 C1C[CH2:1]CCN1 False\n',
 'C1CCNCC1 C1C[CH2:1]C[CH2:2]N1 False\n',
 'C1CCNCC1 C1C[CH2:1]C[NH:2]C1 False\n',
 'C1CCNCC1 C1C[CH2:1]N[CH2:2]C1 False\n',
 'C1CCNCC1 C1C[CH2:1][CH2:1][CH2:1]N1 False\n',
 'C1CCNCC1 C1C[CH2:1][CH2:1][CH2:2]N1 False\n',
 'C1CCNCC1 C1C[CH2:1][CH2:1][NH:1]C1 False\n',
 'C1CCNCC1 C1C[CH2:1][CH2:2]CN1 False\n',
 'C1CCNCC1 C1C[CH2:1][CH2:2][CH2:2]N1 False\n',
 'C1CCNCC1 C1C[CH2:1][CH2:2][NH:2]C1 False\n',
 'C1CCNCC1 C1C[CH2:1][NH:1][CH2:2]C1 False\n',
 'C1CCNCC1 C1C[CH2:1][NH:2][CH2:2]C1 False\n',
 'C1CCNCC1 C1C[CH2:2]CC[NH:1]1 False\n',
 'C1CCNCC1 C1C[CH2:2]CN[CH2:1]1 False\n',
 'C1CCNCC1 C1C[CH2:2]C[CH2