## Mass Balance Tool

In [64]:
# Install the conda packages this notebook needs into the current Jupyter kernel.
# IPython interpolates `{sys.prefix}` with the prefix of the running interpreter,
# and `--prefix` tells conda to install into that location.
import sys
!conda install --yes --prefix {sys.prefix} periodictable
!conda install --yes --prefix {sys.prefix} pubchempy

Collecting package metadata (current_repodata.json): done
Solving environment: done

# All requested packages already installed.

Collecting package metadata (current_repodata.json): done
Solving environment: done

# All requested packages already installed.



In [65]:
import periodictable as pt
import pubchempy as pcp
import numpy as np
import pandas as pd
import re

In [66]:
# raw_EDS_data_path is a string with the full path to the CSV containing the raw data from EDS.
# NOTE(review): this is an absolute path on one machine; update it before running elsewhere.
raw_EDS_data_path = '/Users/ec18006/github_repos/mass-balance-tool/Raw-EDS-Data.csv'

In [67]:
# Quick check of what the raw EDS data looks like.
# header=1 makes pandas take the column names from the second line of the file
# (row numbers are 0-indexed), so the table header rows ("Formula", "mass%", ...)
# end up as ordinary data rows, as the printed output shows.
raw_EDS_df = pd.read_csv(raw_EDS_data_path, header=1)
print(raw_EDS_df)

        001 Unnamed: 1 Unnamed: 2 Unnamed: 3 Unnamed: 4 Unnamed: 5 Unnamed: 6
0   Formula      mass%      Atom%      Sigma        Net    K ratio       Line
1         C       7.66      56.28       0.04      11738  0.0602071          K
2         O       5.42      29.93       0.04       8068  0.0455136          K
3        Al       0.18        0.6       0.01        896  0.0009698          K
4        Si       0.27       0.84       0.01       1480  0.0017827          K
..      ...        ...        ...        ...        ...        ...        ...
69       Cl       0.21       0.32       0.01       1317  0.0021795          K
70      Cu*       0.07       0.06       0.01         89  0.0005901          K
71       Sn       3.91       1.78       0.03      12078  0.0318764          L
72       Ba       2.82       1.11       0.03       6560   0.023085          L
73    Total      31.45        100        NaN        NaN        NaN        NaN

[74 rows x 7 columns]


In [68]:
def process_raw_data(raw_data_path):
    """Split a raw EDS CSV export into one DataFrame per table.

    The file is read with ``header=1`` (the second line supplies the column
    names). Every row whose first cell is ``'Formula'`` starts a new table:
    that row becomes the table's column headers and the rows after it become
    its data, up to and including the first row whose first cell is
    ``'Total'``. Anything after the ``'Total'`` row (for example the label row
    of the next table) is discarded.

    Parameters
    ----------
    raw_data_path : str, path-like or file-like
        Anything accepted by ``pandas.read_csv``.

    Returns
    -------
    list of pandas.DataFrame
        One frame per ``'Formula'`` row, in file order. Values are left
        exactly as read (no numeric conversion). The list is empty when the
        file contains no ``'Formula'`` row.

    Notes
    -----
    * The first column is located by position, so the function does not
      depend on the first table being labelled ``'001'``.
    * Rows that precede the first ``'Formula'`` row are ignored.
    * A table with no ``'Total'`` row is returned whole; with several
      ``'Total'`` rows it is cut after the first one.
    """
    raw_df = pd.read_csv(raw_data_path, header=1)
    if raw_df.empty:
        return []

    # Locate the header row of every table by position in the first column.
    first_col = raw_df.iloc[:, 0]
    starts = np.flatnonzero((first_col == 'Formula').to_numpy())
    # Each table runs until the next header row (or the end of the file).
    stops = np.append(starts[1:], len(raw_df))

    new_dfs = []
    for start, stop in zip(starts, stops):
        block = raw_df.iloc[start:stop]
        headers = block.iloc[0]
        new_df = pd.DataFrame(block.values[1:], columns=headers)

        # Keep everything up to and including the first 'Total' row, if any.
        total_rows = np.flatnonzero((new_df.iloc[:, 0] == 'Total').to_numpy())
        if total_rows.size:
            new_df = new_df.iloc[: total_rows[0] + 1].copy()

        new_dfs.append(new_df)

    return new_dfs

In [69]:
# Split the raw export into one table per 'Formula' header row and inspect the fourth one.
new_dfs = process_raw_data(raw_EDS_data_path)
print(new_dfs[3])

# An asterisk after an element symbol (e.g. 'Al*') means dad added that element manually.
# TODO: add a column saying whether each element was automatically detected or not.

37 Formula  mass%  Atom% Sigma    Net    K ratio Line
0        C   6.73  54.65  0.03  12191  0.0625313    K
1        O   5.16  31.48  0.03   8539  0.0481695    K
2      Al*   0.03   0.11  0.01    153  0.0001658    K
3       Si   0.13   0.44  0.01    711  0.0008563    K
4        S   0.19   0.57  0.01   1189  0.0018717    K
5      Cl*   0.06   0.16  0.01    388  0.0006423    K
6      Fe*   0.02   0.04  0.01     51  0.0002082    K
7      Cu*   0.14   0.21  0.02    188  0.0012458    K
8       Sn  14.18  11.65  0.06  46574  0.1229145    L
9      Ba*   0.95   0.68  0.03   2088  0.0073491    L
10   Total  27.59    100   NaN    NaN        NaN  NaN


In [70]:
def interpret_chemical_formula(formula):
    """Expand a chemical formula string into a flat list of element symbols.

    Each symbol appears once per atom, e.g. ``"CO2"`` -> ``["C", "O", "O"]``.
    Parenthesised groups followed by a multiplier are expanded as a unit, so
    ``"Ca(OH)2"`` -> ``["Ca", "O", "H", "O", "H"]``; groups may be nested.

    Parameters
    ----------
    formula : str
        Chemical formula. An element is a capital letter optionally followed
        by one lower-case letter; a count is an optional run of digits
        (missing means 1). Characters that fit none of the recognised tokens
        are skipped.

    Returns
    -------
    list of str
        Element symbols in order of appearance, repeated by their counts.
        Empty for an empty formula.
    """
    # Raw strings so that "\d" reaches the regex engine instead of being an
    # (invalid) Python string escape.
    token_pat = re.compile(
        r"(?P<element>[A-Z][a-z]?)(?P<count>\d*)"
        r"|(?P<open>\()"
        r"|(?P<close>\))(?P<multiplier>\d*)"
    )

    # groups[-1] collects the atoms of the innermost open parenthesis;
    # groups[0] is the formula as a whole.
    groups = [[]]
    for token in token_pat.finditer(formula):
        if token.group("element"):
            count = int(token.group("count") or 1)
            groups[-1].extend([token.group("element")] * count)
        elif token.group("open"):
            groups.append([])
        elif len(groups) > 1:
            # Closing parenthesis: repeat the finished group into its parent.
            finished = groups.pop()
            groups[-1].extend(finished * int(token.group("multiplier") or 1))
        # A ')' with no matching '(' is ignored, like any other stray character.

    # Tolerate a missing ')' by treating still-open groups as multiplied by 1.
    while len(groups) > 1:
        finished = groups.pop()
        groups[-1].extend(finished)

    return groups[0]

In [76]:
# Compound database: chemical formulas whose total masses are computed below.

compound_list = ["CO2", "C2H4", "CH4", "Al2O3"]

# Deconstruct compounds into atoms 
def calculate_total_mass(compound):
    """Return the summed atomic mass of every atom in a chemical formula.

    The formula string is expanded into a flat list of element symbols (one
    entry per atom) and the ``mass`` that periodictable reports for each
    symbol is added up. An empty atom list gives 0.
    """
    return sum(
        pt.elements.symbol(symbol).mass
        for symbol in interpret_chemical_formula(compound)
    )

# Print the total mass of every compound in compound_list, one value per line.
for compound in compound_list:
    mass = calculate_total_mass(compound)
    print(mass)

44.0095
28.053160000000005
16.04246
101.96127599999998


In [77]:
# Testing the PubChemPy database.
# NOTE(review): each call below presumably queries PubChem over the network -
# confirm against the PubChemPy docs. The results are only bound to names here.
# Look up compounds by the molecular formula C20H41Br, as a DataFrame.
df1 = pcp.get_compounds('C20H41Br', 'formula', as_dataframe=True)    
# Fetch the substances with identifiers 1-4, as a DataFrame.
df2 = pcp.get_substances([1, 2, 3, 4], as_dataframe=True)
# Request the IsomericSMILES property via a superstructure search on the SMILES 'CC'.
p = pcp.get_properties('IsomericSMILES', 'CC', 'smiles', searchtype='superstructure')

In [73]:
# TODO: add a cell that checks that Atom% always adds up to 100 and raises an error if it does not.

# TODO: add a cell that is able to subtract a spectrum from a reference.

# TODO: figure out the best database to use. PubChem? https://pubchempy.readthedocs.io/en/latest/
# Then pull the compounds that matter out of the database and create a sub-database.