# Bibliotecas

In [1]:
import pandas as pd

# Carregando os dados

In [2]:
# Load the SentiLex-lem-PT02 CSV; the file is semicolon-delimited.
df_sentilex = pd.read_csv(
    "../data/SentiLex-lem-PT02.csv",
    sep=";",
)

In [3]:
df_sentilex.head() # show the first rows

Unnamed: 0,word.part_of_speech,target,polarity_n0,polarity_n1,anotation
0,à-vontade.PoS=N,TG=HUM:N0,POL:N0=1,ANOT=MAN,
1,abafado.PoS=Adj,TG=HUM:N0,POL:N0=-1,ANOT=JALC,
2,abafante.PoS=Adj,TG=HUM:N0,POL:N0=-1,ANOT=MAN,
3,abaixado.PoS=Adj,TG=HUM:N0,POL:N0=-1,ANOT=JALC,
4,abalado.PoS=Adj,TG=HUM:N0,POL:N0=-1,ANOT=JALC,


# Separando as palavras das classes gramaticais

In [4]:
# Split "word.PoS=tag" into its two parts on the literal ".PoS=" separator.
word_and_pos = df_sentilex['word.part_of_speech'].str.split(r'\.PoS=', expand=True)

# drop() returns a new frame, so the original dataset is left untouched.
df_sentilex_sep = df_sentilex.drop(columns=['word.part_of_speech'])

# The two new columns are appended at the end of the frame.
df_sentilex_sep[['word', 'part_of_speech']] = word_and_pos

In [5]:
df_sentilex_sep.head() # preview: 'word' and 'part_of_speech' are now separate columns

Unnamed: 0,target,polarity_n0,polarity_n1,anotation,word,part_of_speech
0,TG=HUM:N0,POL:N0=1,ANOT=MAN,,à-vontade,N
1,TG=HUM:N0,POL:N0=-1,ANOT=JALC,,abafado,Adj
2,TG=HUM:N0,POL:N0=-1,ANOT=MAN,,abafante,Adj
3,TG=HUM:N0,POL:N0=-1,ANOT=JALC,,abaixado,Adj
4,TG=HUM:N0,POL:N0=-1,ANOT=JALC,,abalado,Adj


# Reorganizando as colunas

In [6]:
# Put the word and its part of speech first, followed by the remaining columns.
# Columns are selected by name rather than by position, so the result does not
# depend on the column order of df_sentilex_sep.
df_sentilex_reordered = df_sentilex_sep[
    ['word', 'part_of_speech', 'target', 'polarity_n0', 'polarity_n1', 'anotation']
]

In [7]:
df_sentilex_reordered.head() # preview the frame with the reordered columns

Unnamed: 0,word,part_of_speech,target,polarity_n0,polarity_n1,anotation
0,à-vontade,N,TG=HUM:N0,POL:N0=1,ANOT=MAN,
1,abafado,Adj,TG=HUM:N0,POL:N0=-1,ANOT=JALC,
2,abafante,Adj,TG=HUM:N0,POL:N0=-1,ANOT=MAN,
3,abaixado,Adj,TG=HUM:N0,POL:N0=-1,ANOT=JALC,
4,abalado,Adj,TG=HUM:N0,POL:N0=-1,ANOT=JALC,


# Movendo as anotações para a coluna certa

In [8]:
df_cor_anotation = df_sentilex_reordered.copy() # work on a copy; the cells below modify it in place

In [9]:
# Some rows carry the annotation ("ANOT=...") in 'polarity_n1' instead of
# 'anotation'. Fix them in one vectorized step: copy the value to 'anotation'
# and empty the 'polarity_n1' cell.
anot_in_n1 = df_cor_anotation['polarity_n1'].map(
    lambda value: isinstance(value, str) and "ANOT=" in value  # non-strings (missing cells) never match
).astype(bool)
df_cor_anotation.loc[anot_in_n1, 'anotation'] = df_cor_anotation.loc[anot_in_n1, 'polarity_n1']
df_cor_anotation.loc[anot_in_n1, 'polarity_n1'] = None

In [10]:
df_cor_anotation.head() # preview after moving the "ANOT=" values into 'anotation'

Unnamed: 0,word,part_of_speech,target,polarity_n0,polarity_n1,anotation
0,à-vontade,N,TG=HUM:N0,POL:N0=1,,ANOT=MAN
1,abafado,Adj,TG=HUM:N0,POL:N0=-1,,ANOT=JALC
2,abafante,Adj,TG=HUM:N0,POL:N0=-1,,ANOT=MAN
3,abaixado,Adj,TG=HUM:N0,POL:N0=-1,,ANOT=JALC
4,abalado,Adj,TG=HUM:N0,POL:N0=-1,,ANOT=JALC


# Verificando polaridades n1 na coluna n0

In [11]:
# List the rows where a "POL:N1" value ended up in the 'polarity_n0' column.
n0_values = df_cor_anotation['polarity_n0']
for row_label, cell in n0_values.items():
    if not isinstance(cell, str):
        continue  # skip missing / non-text cells
    if "POL:N1" in cell:
        print(row_label, ": " + cell)

5092 : POL:N1=-1
6185 : POL:N1=-1
6295 : POL:N1=-1
6304 : POL:N1=-1
6305 : POL:N1=-1
6592 : POL:N1=-1


# Movendo polaridades n1 da coluna n0 para a coluna n1

In [12]:
# Collect the labels of rows whose 'polarity_n0' cell holds a "POL:N1" value...
misplaced_rows = [
    row_label
    for row_label, cell in df_cor_anotation['polarity_n0'].items()
    if isinstance(cell, str) and "POL:N1" in cell
]

# ...then move each value to 'polarity_n1' and leave None behind in 'polarity_n0'.
for row_label in misplaced_rows:
    df_cor_anotation.at[row_label, 'polarity_n1'] = df_cor_anotation.at[row_label, 'polarity_n0']
    df_cor_anotation.at[row_label, 'polarity_n0'] = None

# Verificando quais valores se tornaram None

In [13]:
# Print the labels of the rows whose 'polarity_n0' is now missing; they should
# match the rows listed by the "POL:N1" check above.
# isna() is used instead of an `is None` identity test so the check still works
# if pandas stores the missing value as NaN or pd.NA rather than None.
missing_n0_rows = df_cor_anotation.index[df_cor_anotation['polarity_n0'].isna()]
for row_label in missing_n0_rows:
    print(row_label)

5092
6185
6295
6304
6305
6592


In [14]:
df_cor_anotation.head() # preview after moving the "POL:N1" values into 'polarity_n1'

Unnamed: 0,word,part_of_speech,target,polarity_n0,polarity_n1,anotation
0,à-vontade,N,TG=HUM:N0,POL:N0=1,,ANOT=MAN
1,abafado,Adj,TG=HUM:N0,POL:N0=-1,,ANOT=JALC
2,abafante,Adj,TG=HUM:N0,POL:N0=-1,,ANOT=MAN
3,abaixado,Adj,TG=HUM:N0,POL:N0=-1,,ANOT=JALC
4,abalado,Adj,TG=HUM:N0,POL:N0=-1,,ANOT=JALC


# Removendo as colunas desnecessárias

In [15]:
# Keep only the word and its two polarity columns. drop() returns a new frame,
# so df_cor_anotation itself is not modified.
unused_columns = ['part_of_speech', 'target', 'anotation']
df_cleared = df_cor_anotation.drop(columns=unused_columns)

In [16]:
df_cleared.head() # preview with only 'word', 'polarity_n0' and 'polarity_n1' left

Unnamed: 0,word,polarity_n0,polarity_n1
0,à-vontade,POL:N0=1,
1,abafado,POL:N0=-1,
2,abafante,POL:N0=-1,
3,abaixado,POL:N0=-1,
4,abalado,POL:N0=-1,


# Removendo texto desnecessário

In [17]:
# Strip the leading "POL:N0=" tag so only the value after it remains.
polarity_n0_tagged = df_cleared['polarity_n0']
df_cleared['polarity_n0'] = polarity_n0_tagged.str.replace(r'^POL:N0=', '', regex=True)

In [18]:
df_cleared.head() # preview after stripping the "POL:N0=" prefix

Unnamed: 0,word,polarity_n0,polarity_n1
0,à-vontade,1,
1,abafado,-1,
2,abafante,-1,
3,abaixado,-1,
4,abalado,-1,


In [19]:
# Strip the leading "POL:N1=" tag so only the value after it remains.
polarity_n1_tagged = df_cleared['polarity_n1']
df_cleared['polarity_n1'] = polarity_n1_tagged.str.replace(r'^POL:N1=', '', regex=True)

In [20]:
df_cleared.head() # preview after stripping the "POL:N1=" prefix

Unnamed: 0,word,polarity_n0,polarity_n1
0,à-vontade,1,
1,abafado,-1,
2,abafante,-1,
3,abaixado,-1,
4,abalado,-1,


# Verificando inconsistências

In [21]:
df_cleared.info() # check the column dtypes and non-null counts

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7014 entries, 0 to 7013
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   word         7014 non-null   object
 1   polarity_n0  7008 non-null   object
 2   polarity_n1  462 non-null    object
dtypes: object(3)
memory usage: 164.5+ KB


In [22]:
df_cleared['word'] = df_cleared['word'].astype('string') # convert "word" to the pandas string dtype

In [23]:
# Convert "polarity_n0" to the nullable Int64 dtype so numeric range checks can
# be run on it while missing cells stay missing.
# The text values are parsed explicitly with pd.to_numeric first (it raises on
# anything that is not a number) instead of relying on a direct object -> Int64 cast.
df_cleared['polarity_n0'] = pd.to_numeric(df_cleared['polarity_n0']).astype("Int64")

In [24]:
# Convert "polarity_n1" to the nullable Int64 dtype, keeping missing cells missing.
# The text values are parsed explicitly with pd.to_numeric first (it raises on
# anything that is not a number) instead of relying on a direct object -> Int64 cast.
df_cleared['polarity_n1'] = pd.to_numeric(df_cleared['polarity_n1']).astype("Int64")

In [25]:
# Select the rows whose polarity_n0 is below -1 or above 1, i.e. outside the
# polarity scale.
n0_out_of_range = (df_cleared['polarity_n0'] < -1) | (df_cleared['polarity_n0'] > 1)
values_not_onezerominusone_n0 = df_cleared[n0_out_of_range]

In [26]:
values_not_onezerominusone_n0.head() # there are 4 values in polarity_n0 outside the expected range
                                     # (probably typos or something similar, since the paper
                                     # does not mention any values other than 1, 0 and -1)

Unnamed: 0,word,polarity_n0,polarity_n1
4318,intrépido e sereno,8,
4319,intrépido,7,
5602,ponto fraco,-2,
5603,ponto fraco,-3,


In [27]:
# Run the same range check on polarity_n1: rows below -1 or above 1.
n1_out_of_range = (df_cleared['polarity_n1'] < -1) | (df_cleared['polarity_n1'] > 1)
values_not_onezerominusone_n1 = df_cleared[n1_out_of_range]

In [28]:
values_not_onezerominusone_n1.head() # no value is outside the expected range

Unnamed: 0,word,polarity_n0,polarity_n1


# Corrigindo inconsistências

In [29]:
# Force both polarity columns into [-1, 1]: values above 1 become 1 and
# values below -1 become -1.
for polarity_col in ('polarity_n0', 'polarity_n1'):
    df_cleared[polarity_col] = df_cleared[polarity_col].clip(lower=-1, upper=1)

In [30]:
# Re-run the polarity_n0 range check after clipping.
n0_still_out_of_range = (df_cleared['polarity_n0'] < -1) | (df_cleared['polarity_n0'] > 1)
values_not_onezerominusone_n0 = df_cleared[n0_still_out_of_range]

In [31]:
values_not_onezerominusone_n0.head() # no value is outside the expected range

Unnamed: 0,word,polarity_n0,polarity_n1


In [32]:
# Re-run the polarity_n1 range check after clipping.
n1_still_out_of_range = (df_cleared['polarity_n1'] < -1) | (df_cleared['polarity_n1'] > 1)
values_not_onezerominusone_n1 = df_cleared[n1_still_out_of_range]

In [33]:
values_not_onezerominusone_n1.head() # no value is outside the expected range

Unnamed: 0,word,polarity_n0,polarity_n1


# Salvando os dados como CSV

In [34]:
# Write the cleaned frame to disk as a semicolon-separated UTF-8 CSV, without the index.
df_cleared.to_csv(
    "../data/sentilex_v1.0.0.csv",
    sep=";",
    encoding="utf-8",
    index=False,
)