# IUPAC to SMILES converter


In [2]:
# Imports, if any
from pprint import pprint

In [3]:
# Abstract intermediate form, more easily convertible to SMILES

# Lookup table from functional-group name to the SMILES fragment used for it.
# 'pref' holds groups that are named as prefixes of an IUPAC name.
# 'iodo' and 'amino' are included because the tokenizer vocabulary accepts
# them; new entries are appended so the order of existing keys is unchanged.
FN = {
    'pref': {
        'nitro': '[N+](=O)[O-]',
        'hydroxy': 'O',
        'cyano': 'C#N',
        'oxo': '=O',
        'chloro': 'Cl',
        'bromo': 'Br',
        'fluoro': 'F',
        'iodo': 'I',
        'amino': 'N',
    },
}

class Chain:
    """A carbon chain or ring of the intermediate form, renderable as SMILES.

    Attributes:
        len: number of carbon atoms in the chain.
        prop: maps a 1-based atom position to a ChemArray holding the groups
            attached at that position.
        is_cyclic: falsy for an open chain, truthy for a ring; the value 2
            makes the atoms lowercase ("c") instead of "C".
        yl: 1-based position of the atom that is written first in the
            formula (presumably the attachment point of a substituent).
    """

    def __init__(self, length=1, props=None, cyclic=False, yl=1):
        """Build a chain.

        Args:
            length: number of carbon atoms.
            props: iterable of (position, group) pairs; groups sharing a
                position are collected into one ChemArray. ``None`` (the
                default) means no attached groups.
            cyclic: ring flag, see the class docstring.
            yl: 1-based position of the first-written atom.
        """
        self.len = length
        self.prop = {}
        # A None default avoids sharing one mutable list between calls.
        if props is None:
            props = ()
        for pair in props:
            position, group = pair[0], pair[1]
            if position in self.prop:
                self.prop[position].append(group)
            else:
                self.prop[position] = ChemArray([group])

        self.is_cyclic = cyclic
        self.yl = yl

    def formula(self, index=1):
        """Return the parenthesised SMILES text of this chain.

        Args:
            index: ring-closure label used when the chain is cyclic.

        Raises:
            IndexError: if ``yl`` is greater than the chain length.
        """
        atoms = []
        for pos in range(1, self.len + 1):
            curr = "c" if self.is_cyclic == 2 else "C"
            if self.is_cyclic and pos == 1:
                curr += str(index)
            if pos in self.prop:
                # Groups hanging off a ring get the next label so that their
                # own ring closures do not collide with this ring's label.
                child_index = index + 1 if self.is_cyclic else index
                curr += "".join(
                    item.formula(index=child_index)
                    for item in self.prop[pos].arr
                )
            atoms.append(curr)

        if self.yl > 1:
            # Start writing at atom ``yl``: the atoms after it become a
            # branch, the atoms before it follow the branch.
            start = self.yl - 1
            tail = "".join(atoms[start + 1:])
            # No atoms after ``yl`` means no branch; "()" is not valid SMILES.
            branch = f"({tail})" if tail else ""
            body = atoms[start] + branch + "".join(atoms[:start])
        else:
            body = "".join(atoms)

        if self.is_cyclic:
            # Close the ring with the same label that opened it.
            body += str(index)
        return f"({body})"

    def __str__(self):
        return "Chain: [" + self.formula() + '] ' + str(self.prop)

class Fngp:
    """A functional group, or a bond marker ('=' / '#'), in SMILES form."""

    def __init__(self, formula):
        # SMILES text of the group, stored without surrounding parentheses.
        self._formula = formula

    def formula(self, index=None):
        """Return the SMILES text of this group.

        Bond markers ('=' and '#') are returned bare; every other group is
        wrapped in parentheses.

        Args:
            index: accepted so the call signature matches Chain.formula; it
                is not used. It is optional so that ``__repr__`` can call
                this method without supplying one.
        """
        if self._formula in ('=', '#'):
            return self._formula
        return f"({self._formula})"

    def __repr__(self):
        return f"FnGroup: [{self.formula()}]"

class ChemArray:
    """The groups attached at one chain position, with their cached formulas.

    Attributes:
        arr: the group objects themselves (the list passed in is kept, not
            copied).
        _arr: the formula string of each group, rendered with ``index=0``
            at the moment the group was added.
    """

    def __init__(self, array):
        self.arr = array
        self._arr = [item.formula(index=0) for item in array]

    def append(self, item):
        """Add a group and cache its formula.

        The formula is rendered with ``index=0``, exactly as in ``__init__``;
        calling ``item.formula()`` with no argument would fail for groups
        whose ``formula`` requires an index.
        """
        self.arr.append(item)
        self._arr.append(item.formula(index=0))

    def formula(self, index):
        """Return the cached formulas concatenated; ``index`` is not used."""
        return ''.join(self._arr)

    def __repr__(self):
        return ';'.join(self._arr)

In [37]:
# Converts IUPAC name to intermediate format
# Particularly confusing since names are supposed to be human-readable
# and because of my inherent lack of ability to program


def format(text):
	"""Split a whitespace-separated vocabulary block into a list of words.

	``str.split()`` with no argument drops the empty strings that the
	leading and trailing newlines of a triple-quoted block would otherwise
	leave in the list.

	NOTE: this name shadows the ``format`` builtin for the rest of the
	module; it is kept because the name is part of the module namespace.
	"""
	return text.split()


# Vocabulary of the tokenizer, grouped by role. The order of the keys matters
# to code that scans every category and keeps the last match.
WORDS = {
	# Parent-chain stems.
	"nuclear": format(
		"""
meth eth prop but pent hex hept oct non dec
benzen
"""
	),
	# Substituent names and markers. "fluoro" and "hydroxy" are spelled as in
	# the FN table.
	"prefix": format(
		"""
yl ylidene N
oxy hydroxy
amino nitro cyano oxo
chloro fluoro bromo iodo
phen
cyclo
"""
	),
	"suffix": format(
		"""
ane ene yne
an en yn
ne n
ol al one oicacid oate carboxylicacid carbaldehyde amide amine
nitrile
mine mide
"""
	),
	# Stems that double as multipliers, plus digits and punctuation.
	"common": format(
		"""
pent hex hept oct non dec
di tri tetr
a e
1 2 3 4 5 6 7 8 9
( [ , ] )
"""
	),
	"trueignore": format(
		"""
an ane di tri tetra -
"""
	),
}


class Word:
	"""One token of a name together with the category assigned to it.

	Attributes:
		val: the token itself.
		type: name of the category the token belongs to.
		cyclic: integer flag, 0 unless given.
	"""

	def __init__(self, val, type, cyclic: int = 0):
		self.val, self.type, self.cyclic = val, type, cyclic

	def __repr__(self):
		# Rendered as "<val> (<type>)".
		return "{} ({})".format(self.val, self.type)


def wordlist(types):
	"""Return the words of the named WORDS categories as one flat list.

	Categories are concatenated in the order given; a name that is not a
	key of WORDS raises KeyError.
	"""
	return [entry for category in types for entry in WORDS[category]]


def splitWords(input):
	"""Tokenize a name into words of the known vocabulary.

	Spaces are removed first. The text is then consumed left to right, each
	step taking the longest vocabulary word that matches at the current
	position (the earliest-listed word wins a tie).

	Args:
		input: the name to tokenize.

	Returns:
		The list of matched words, in order.

	Raises:
		ValueError: if no vocabulary word matches at some position.
	"""
	# Built once: the vocabulary does not change while tokenizing.
	vocabulary = wordlist(["prefix", "suffix", "nuclear", "common", "trueignore"])

	txt = input.replace(" ", "")  # spaces carry no information here
	res = []
	i = 0
	while i < len(txt):
		# max() returns the first of several equally long candidates, and
		# "" when nothing matches at all.
		bl = max(
			(word for word in vocabulary if txt.startswith(word, i)),
			key=len,
			default="",
		)
		if not bl:
			raise ValueError(
				"Unrecognized sequence: '" + txt[i : i + 7] + "...' at index " + str(i)
			)
		res.append(bl)
		i += len(bl)
	return res


# Warning: highly unusual and esoteric code
def categorize(words):
	"""Tag every token with a category and fold bracketed runs into lists.

	Side effect: (re)creates WORDS["ignore"] as the set of "common" words
	that are not "nuclear" words.

	Returns a list of Word objects. Tokens between "(" and ")" are collected
	into a single Word of type "list" whose ``val`` is the list of the
	enclosed Words; the parentheses themselves are dropped.
	"""
	WORDS["ignore"] = set(wordlist(["common"])).difference(wordlist(["nuclear"]))

	# Pass 1: every category is checked, so the last matching one wins;
	# tokens found nowhere keep the placeholder type "??".
	tagged = []
	for token in words:
		category = "??"
		for name, members in WORDS.items():
			if token in members:
				category = name
		tagged.append(Word(token, category))

	# Pass 2: collect everything between "(" and ")" into one list Word.
	grouped = []
	inside = False
	for entry in tagged:
		if entry.val == "(":
			inside = True
			grouped.append(Word([], "list"))
		elif entry.val == ")":
			inside = False
		elif inside:
			grouped[-1].val.append(entry)
		else:
			grouped.append(entry)
	return grouped


def recognize(words):
	"""Split categorized words into prefix, nucleus and suffix.

	The nucleus is the first word that is in the "nuclear" vocabulary and is
	not followed, anywhere later in the list, by "yl" or "ylidene": each of
	those discards the candidate found so far and restarts the search.

	Args:
		words: list of Word objects.

	Returns:
		A dict with keys "prefix" (words before the nucleus), "nuclear" (the
		nucleus Word), "suffix" (words after it) and "cyclic" (1 if the word
		right before the nucleus is "cyclo", otherwise False).

	Raises:
		ValueError: if no nucleus can be found.
	"""
	# Built once instead of once per word.
	nuclear_words = wordlist(["nuclear"])

	nuclindex = None
	for i, word in enumerate(words):
		if nuclindex is None and word.val in nuclear_words:
			nuclindex = i
		if word.val in ("yl", "ylidene"):
			# Whatever was found so far belongs to a substituent.
			nuclindex = None

	if nuclindex is None:
		raise ValueError("No nucleus found in name")

	# Guard against index -1: a nucleus at position 0 has no preceding word.
	preceded_by_cyclo = nuclindex > 0 and words[nuclindex - 1].val == 'cyclo'

	return {
		"prefix": words[:nuclindex],
		"nuclear": words[nuclindex],
		"suffix": words[nuclindex + 1:],
		"cyclic": 1 if preceded_by_cyclo else False,
	}


def get_properties(dat):
	"""Derive chain properties from a recognized name (work in progress).

	Args:
		dat: dict with at least the keys 'prefix' (a list of objects with a
			``val`` attribute) and 'cyclic'.

	Returns:
		A dict with 'props' (currently always an empty list) and 'cyclic'
		(copied from ``dat``).

	Side effect: pretty-prints ``dat``.
	"""
	output = {
		'props': [],
		'cyclic': dat['cyclic']
	}

	prefixes = dat['prefix']

	# TODO: the locants are parsed here but not yet attached to anything in
	# ``output``.
	locant = [0]
	for i, word in enumerate(prefixes):
		# Bracketed groups carry a list in ``val``; only text can be a digit.
		if not (isinstance(word.val, str) and word.val.isdigit()):
			continue
		# ``i > 0`` keeps the look-behind from wrapping to the last prefix.
		if i > 0 and prefixes[i - 1].val == ',':
			locant.append(int(word.val))
		else:
			# Consecutive digits form one multi-digit locant.
			locant[-1] = locant[-1] * 10 + int(word.val)

	pprint(dat)
	return output

In [38]:
# Test case: run the whole pipeline (splitWords -> categorize -> recognize ->
# get_properties) on a sample name and pretty-print the result.
pprint(get_properties(recognize(categorize(splitWords("2,5-dimethylhexane")))))

{'cyclic': False,
 'nuclear': hex (common),
 'prefix': [2 (ignore),
            , (ignore),
            5 (ignore),
            - (trueignore),
            di (ignore),
            meth (nuclear),
            yl (prefix)],
 'suffix': [ane (trueignore)]}
{'cyclic': False, 'props': []}
