In [70]:
import numpy as np
import pandas as pd
from groq import Groq
import os
import time

In [71]:
# Load the two input spreadsheets (paths are relative to the notebook's folder).
response = pd.read_excel(io='../../dataframes/llama3-70B-OMIEC_RESPONSES.xlsx')
raw_data = pd.read_excel(io='../../data/raw/OMIEC_07_08_24.xls')

In [491]:
# Put both tables side by side (column-wise concat aligns rows on the index),
# then discard the 'index' column from the combined frame.
result = pd.concat([response, raw_data], axis=1).drop(columns='index')

In [492]:
def prepare_corpus(df):
    """Flatten the ``response`` column of ``df`` into a Series of tokens.

    Each response is split on ``;`` and on newlines. Every token is stripped
    of surrounding whitespace and has ``/`` and ``:`` replaced by ``-``.
    Tokens are de-duplicated per row *after* that normalization, so spelling
    variants such as ``'PEDOT:PSS'`` and ``' PEDOT/PSS'`` inside one response
    are counted once.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``response`` column. Missing values (NaN/None) are
        skipped instead of being turned into the literal token ``'nan'``.

    Returns
    -------
    pandas.Series
        The distinct, non-empty normalized tokens of every row, in row order
        and, within a row, in order of first appearance.
    """
    corpus = []
    for response in df['response']:
        # A missing response carries no tokens.
        if pd.isna(response):
            continue
        pieces = str(response).replace('\n', ';').split(';')
        normalized = (
            piece.strip().replace('/', '-').replace(':', '-')
            for piece in pieces
        )
        # dict.fromkeys de-duplicates while keeping first-seen order; a set
        # of str iterates in an order that can change between interpreter
        # runs, which made the corpus order non-reproducible.
        corpus.extend(token for token in dict.fromkeys(normalized) if token)

    # dtype=object keeps the .str accessor usable even when no token was found.
    return pd.Series(corpus, dtype=object)

In [493]:
# Tokenize every response, then drop placeholder tokens (the literal string
# 'nan' and empty strings) and renumber the remaining entries from zero.
corpus = prepare_corpus(result)
corpus = corpus[~corpus.isin(['nan', ''])].reset_index(drop=True)

In [494]:
# Number of tokens currently in the corpus.
corpus.size

622

In [495]:
# Sorted distinct tokens that contain 'PEDOT' (case-sensitive match).
np.unique(corpus.loc[corpus.str.contains(r'PEDOT', regex=True)].tolist())

array(['EG-GOPS-PEDOT-PSS',
       'Glycolated Poly(3,4-ethylenedioxythiophene) (PEDOT)', 'PEDOT',
       'PEDOT- PSS', 'PEDOT-Cl', 'PEDOT-EDOTCOOH', 'PEDOT-EDOTS',
       'PEDOT-PBA', 'PEDOT-PSS', 'PEDOT-PolyDADMA FSI',
       'PEDOT-PolyDADMA TFSI', 'PEDOT-PolyDADMA Tos', 'PEDOT-SBPH',
       'PEDOT-b-PEG', 'PEDOT-tos', 'PEO-PEDOT', 'PEO-PEDOT-KOTf',
       'Poly(3,4-ethylene-dioxythiophene) (PEDOT)',
       'Poly(3,4-ethylenedioxythiophene) (PEDOT)',
       'Poly(3,4-ethylenedioxythiophene)-poly(4-styrenesulfonate) (PEDOT-PSS)',
       'Poly(3,4-ethylenedioxythiophene)-poly(styrenesulfonate) (PEDOT-PSS)',
       'poly(3,4-ethylenedioxythiophene) (PEDOT)', 'poly(PEDOT)'],
      dtype='<U69')

In [496]:
# Ordered regex -> replacement rules used to collapse naming variants onto a
# single label. The rules are applied one after another in insertion order by
# the loop over `replacements.items()`, so a later rule sees the output of the
# earlier ones (e.g. the PEDOT-PSS rules rely on the long PEDOT name having
# already been shortened). That loop passes case=False, so matching is
# case-insensitive regardless of the `[Pp]` character classes.
replacements = {
    # For polymers written as 'polymer_name (polymer_abbreviation)', keep only
    # the abbreviation.
    r'.*\s\(([^)]+)\)$': r'\1',
    r'[Pp]oly\(3,4-ethylenedioxythiophene\)': 'PEDOT',
    r'\b[Pp]EDOT[-\s]?[Pp]oly\(?[0-9]*[-\s]*styrenesulfonate\)?\b': 'PEDOT-PSS',
    r'\b[Pp]EDOT\s*[-\s]*PSS\)?': 'PEDOT-PSS',
    # Removes a closing parenthesis left at the end of a PEDOT-PSS token.
    r'\bPEDOT-PSS\)$': 'PEDOT-PSS',
    r'[Pp]olypyrrole': 'PPy',
    r'[Pp]olyethylenedioxythiophene': 'PEDOT',
    r'[Pp]olythiophene[s]?': 'PT',  
    r'[Pp]oly\((thiophene)\)': 'PT',
    r'[Pp]oly\(3-methylthiophene\)': 'P3MT',
    r'[Pp]oly\(3-hexylthiophene\)': 'P3HT',
    r'[Pp]oly\(3-hexylthiophene[-\d,]*diyl\)': 'P3HT'

}

# Regex patterns for tokens to discard: any token containing a match is
# removed from the corpus. The removal loop does not pass case=False, so only
# the patterns carrying the inline (?i) flag are case-insensitive; 'based' and
# 'conjug' match lowercase text only.
items_to_remove = [r'(?i)\bmixed\b',
                   r'(?i)\bnone\b',
                   r'OMIEC.*',
                   'based', 
                   r'[Pp]olymer',
                   r'OECT.*',
                   'conjug'
                   ]


In [497]:
# Collapse naming variants: rules run in dict order and match case-insensitively.
for regex, canonical in replacements.items():
    corpus = corpus.str.replace(regex, canonical, case=False, regex=True)

# Flag every token that matches at least one removal pattern, then drop the
# flagged tokens in a single step.
unwanted = pd.Series(False, index=corpus.index)
for regex in items_to_remove:
    unwanted |= corpus.str.contains(regex, regex=True)
corpus = corpus[~unwanted]

In [498]:
# Number of tokens currently in the corpus.
corpus.size

529

In [499]:
# Number of distinct tokens (missing values, if any, count as one value).
corpus.nunique(dropna=False)

357

In [500]:
# The 50 most frequent tokens with their occurrence counts.
corpus.value_counts().iloc[:50]

PEDOT-PSS                                        68
PEDOT                                            32
P3HT                                             20
PEO                                               8
PT                                                7
TTF-CA                                            6
PPy                                               6
PSS                                               5
PPV                                               4
PANI                                              4
BBL                                               3
P3MT                                              3
PAN                                               3
p(g2T-T)                                          2
Polyaniline                                       2
Homo-gDPP                                         2
MEH-PPV                                           2
IDTBT                                             2
PEO-PVP                                           2
PSSNa       

In [502]:
# Sorted distinct tokens that contain 'PEDOT' (case-sensitive match).
np.unique(corpus.loc[corpus.str.contains(r'PEDOT', regex=True)].tolist())

array(['EG-GOPS-PEDOT-PSS', 'PEDOT', 'PEDOT-Cl', 'PEDOT-EDOTCOOH',
       'PEDOT-EDOTS', 'PEDOT-PBA', 'PEDOT-PSS', 'PEDOT-PolyDADMA FSI',
       'PEDOT-PolyDADMA TFSI', 'PEDOT-PolyDADMA Tos', 'PEDOT-SBPH',
       'PEDOT-b-PEG', 'PEDOT-tos', 'PEO-PEDOT', 'PEO-PEDOT-KOTf',
       'poly(PEDOT)'], dtype='<U20')