Code for expanding the lexicon

In [42]:
import xml.etree.ElementTree as ET
import pandas as pd
import numpy as np
import csv
from xml.dom.minidom import Document


In [43]:
import csv

def convert_text_to_csv(input_file, output_file):
    # Open the input text file for reading
    with open(input_file, 'r', encoding='utf-8') as f:
        # Skip lines until the first line starting with underscore
        for line in f:
            if line.startswith('_'):
                break
        
        # Initialize a list to store the data
        data = []
        
        # Read the remaining lines
        for line in f:
            # Split the line by tabs and remove leading/trailing whitespace
            columns = line.strip().split('\t')
            
            # Append the columns to the data list
            data.append(columns)

    # Open the output CSV file for writing
    with open(output_file, 'w', newline='', encoding='utf-8') as f:
        # Create a CSV writer object
        writer = csv.writer(f)
        
        # Write the data to the CSV file
        writer.writerows(data)

    print("Conversion completed. CSV file saved as", output_file)

# Example usage:
input_file_path = 'saldo20v03.txt'
output_file_path = 'saldo20v03CSV.csv'
convert_text_to_csv(input_file_path, output_file_path)

Conversion completed. CSV file saved as saldo20v03CSV.csv


In [44]:
import csv

# Path to the SenSALDO text file (as uploaded)
sensaldo_path = 'sensaldo-base-v02.txt'

# Path to the output CSV file
sensaldo_csv_path = 'sensaldo.csv'

# Open the SenSALDO file and create a CSV file for output
with open(sensaldo_path, 'r', encoding='utf-8') as infile, \
     open(sensaldo_csv_path, 'w', newline='', encoding='utf-8') as outfile:
    writer = csv.writer(outfile)
    
    # Write the headers for the CSV file
    headers = ['SALDO sense ID', 'polarity label']
    writer.writerow(headers)
    
    # Read the SenSALDO file line by line
    for line in infile:
        # Skip empty lines and lines starting with '#' (comments)
        if not line.strip() or line.startswith('#'):
            continue
        
        # Split the line into components based on tabs
        sense_id, polarity = line.strip().split('\t')
        
        # Write the components to the CSV file
        writer.writerow([sense_id, polarity])

print(f"Conversion complete. The CSV file is saved at {sensaldo_csv_path}.")

Conversion complete. The CSV file is saved at sensaldo.csv.


Expansion

In [45]:
saldoDf = pd.read_csv('saldo20v03CSV.csv', names=['betydelse', 'prim', 'sek', 'lemma', 'ord', 'ordklass', 'deskr' ], encoding='utf-8')
sensaldoDf = pd.read_csv('sensaldo.csv', encoding='utf-8')

In [46]:
saldoDfC = saldoDf.copy().drop_duplicates(subset=['betydelse'])
sensaldoDfC = sensaldoDf.copy().drop_duplicates(subset=['SALDO sense ID'])

In [47]:
sensaldoDfC.head()

Unnamed: 0,SALDO sense ID,polarity label
0,absolut..1,0
1,absolution..1,1
2,abstinens..1,-1
3,abstinensbesvär..1,-1
4,absurd..1,-1


In [48]:
saldoDf.head()

Unnamed: 0,betydelse,prim,sek,lemma,ord,ordklass,deskr
0,_1984..1,roman..1,dystopi..1,nittonhundraåttiofyra..pm.1,nittonhundraåttiofyra,pm,pm_uwb_hemsöborna
1,_1984..1,roman..1,dystopi..1,Nittonhundraåttiofyra..pm.1,Nittonhundraåttiofyra,pm,pm_uwb_hemsöborna
2,a..1,bokstav..1,PRIM..1,a..nn.1,a,nn,nn_vn_alfa_abc
3,A..1,initial..1,PRIM..1,A..nn.1,A,nn,nn_vn_alfa_abc
4,à..1,per..1,PRIM..1,á..pp.1,á,pp,pp_i_i


In [49]:
saldoDfC.loc[saldoDfC['betydelse']=='abborre..1']



Unnamed: 0,betydelse,prim,sek,lemma,ord,ordklass,deskr
27,abborre..1,fisk..1,PRIM..1,abborre..nn.1,abborre,nn,nn_2u_vinge


In [50]:
saldoDfC

Unnamed: 0,betydelse,prim,sek,lemma,ord,ordklass,deskr
0,_1984..1,roman..1,dystopi..1,nittonhundraåttiofyra..pm.1,nittonhundraåttiofyra,pm,pm_uwb_hemsöborna
2,a..1,bokstav..1,PRIM..1,a..nn.1,a,nn,nn_vn_alfa_abc
3,A..1,initial..1,PRIM..1,A..nn.1,A,nn,nn_vn_alfa_abc
4,à..1,per..1,PRIM..1,á..pp.1,á,pp,pp_i_i
6,a..2,ton..1,PRIM..1,a..nn.1,a,nn,nn_vn_alfa_abc
...,...,...,...,...,...,...,...
137124,övre_medelklass..1,medelklass..1,PRIM..1,övre_medelklass..nnm.1,övre medelklass,nnm,nnm_0u0_frid
137125,Övre_Volta..1,Burkina_Faso..1,PRIM..1,Övre_Volta..pmm.1,Övre Volta,pmm,pmm_n0lp_sri_lanka
137126,övrig..1,annan..1,PRIM..1,övrig..pn.1,övrig,pn,pn_o_all
137127,övärld..1,ögrupp..1,PRIM..1,övärld..nn.1,övärld,nn,nn_2u_mening


In [51]:
# Assuming "betydelse" is the common column name in saldoDFC and "SALDO_sense_ID" in sensaldoDFC

# Perform a left join
merged_df = saldoDfC.merge(sensaldoDfC, left_on="betydelse", right_on="SALDO sense ID", how="left", indicator=True)

# Filter out the rows where "betydelse" is not found in "SALDO_sense_ID"
result_df = merged_df[merged_df["_merge"] == "left_only"]

# Drop the indicator column
result_df = result_df.drop(columns="_merge")

# Now result_df contains all the rows where "betydelse" is not found in "SALDO_sense_ID"

In [52]:
def polarityChecker(word, saldoD, sensaldoD):
    wRow = saldoD[saldoD['betydelse'] == word]
    #print(wRow)
    primVal = wRow['prim'].iloc[0]
    #print(primVal)
    if(primVal != 'PRIM..1'):
        mask = sensaldoD['SALDO sense ID'] == primVal
        if (mask).any():
            senRow = sensaldoD[mask].iloc[0]
            #updatedSaldoD = saldoD.copy()
            #updatedSaldoD.loc[updatedSaldoD['betydelse'] == word, 'polarity label'] = senRow['polarity label']
            return senRow['polarity label']
        else:
            return (polarityChecker(primVal, saldoD, sensaldoD))
            #print("value not found", word, primVal)
            #return saldoD
    else:
        sekVal = wRow['sek'].iloc[0]
        if(sekVal != 'PRIM..1'):
            mask = sensaldoD['SALDO sense ID'] == sekVal
            if (mask).any():
                senRow = sensaldoD[mask].iloc[0]
                #updatedSaldoD.loc[updatedSaldoD['betydelse'] == word, 'polarity label'] = senRow['polarity label']
                return senRow['polarity label']
            else:
                #print("value not found", word, primVal)
                return "NA"
        else:
            #print("Value not found att all", word, primVal, sekVal)
            return "NA"


In [53]:
def addingChecker(word, saldoD, sensaldoD):
    #print(word)
    polVal = polarityChecker(word, saldoD, sensaldoD)
    #updatedSensaldoD = sensaldoD.copy()
    if(polVal != "NA"):
        #print(polVal)
        newRow = {'SALDO sense ID': word, 'polarity label': polVal}
        sensaldoD = pd.concat([sensaldoD, pd.DataFrame([newRow])], ignore_index=True)
    return sensaldoD

In [54]:
newAdded = addingChecker("abakus..1", saldoDf, sensaldoDf)
newAdded[newAdded['SALDO sense ID'] == 'abakus..1']
newAdded

Unnamed: 0,SALDO sense ID,polarity label
0,absolut..1,0
1,absolution..1,1
2,abstinens..1,-1
3,abstinensbesvär..1,-1
4,absurd..1,-1
...,...,...
12283,överväldiga..1,1
12284,överväldigande..1,0
12285,överväldigande..2,0
12286,överväldigande..3,1


In [55]:
newAdded2 = addingChecker("adventsstjärna..1", saldoDf, sensaldoDf)
#print(newAdded2)
newAdded2[newAdded2['SALDO sense ID'] == 'adventsstjärna..1']
#newAdded2

Unnamed: 0,SALDO sense ID,polarity label
12287,adventsstjärna..1,1


In [56]:
sensaldoDfC['polarity label']

0        0
1        1
2       -1
3       -1
4       -1
        ..
12282    0
12283    1
12284    0
12285    0
12286    1
Name: polarity label, Length: 12287, dtype: int64

In [57]:
# Check if the value exists in the column 'SALDO sense id'
mask = sensaldoDfC['SALDO sense ID'] == 'räkna..1'

# If the value exists, retrieve the corresponding row
if mask.any():
    row = sensaldoDfC[mask].iloc[0]
    # 'row' now contains the first row where the value exists in the column 'SALDO sense id'
    print(row)
    print(row['polarity label'])
else:
    print("Value not found in DataFrame.")

SALDO sense ID    räkna..1
polarity label           0
Name: 8122, dtype: object
0


In [58]:
saldoDfC[saldoDfC['betydelse'] == 'abborre..1']

Unnamed: 0,betydelse,prim,sek,lemma,ord,ordklass,deskr
27,abborre..1,fisk..1,PRIM..1,abborre..nn.1,abborre,nn,nn_2u_vinge


Only the wordclasses we want

In [59]:
rensad = result_df.copy()

rensad = rensad[rensad['ordklass'].isin(['nn', 'vb', 'av'])]
rensad

Unnamed: 0,betydelse,prim,sek,lemma,ord,ordklass,deskr,SALDO sense ID,polarity label
1,a..1,bokstav..1,PRIM..1,a..nn.1,a,nn,nn_vn_alfa_abc,,
2,A..1,initial..1,PRIM..1,A..nn.1,A,nn,nn_vn_alfa_abc,,
4,a..2,ton..1,PRIM..1,a..nn.1,a,nn,nn_vn_alfa_abc,,
12,abakus..1,räkna..1,PRIM..1,abakus..nn.1,abakus,nn,nn_3u_karbid,,
13,abalone..1,havsöra..1,PRIM..1,abalone..nn.1,abalone,nn,nn_4u_linje,,
...,...,...,...,...,...,...,...,...,...
131010,övningsskjutning..1,skjutning..1,övning..1,övningsskjutning..nn.1,övningsskjutning,nn,nn_2u_mening,,
131011,övningsskytte..1,övningsskjutning..1,PRIM..1,övningsskytte..nn.1,övningsskytte,nn,nn_0n_syre,,
131012,övningsämne..1,skolämne..1,praktisk..1,övningsämne..nn.1,övningsämne,nn,nn_5n_dike,,
131013,övre..1,uppe..1,PRIM..1,övre..av.1,övre,av,av_2k_bakre,,


In [60]:
rensadTest = result_df.copy()

rensadTest = rensadTest[rensadTest['ordklass'].isin(['nn', 'vb', 'av'])]
rensadTest = rensadTest.head(10)
rensadTest

Unnamed: 0,betydelse,prim,sek,lemma,ord,ordklass,deskr,SALDO sense ID,polarity label
1,a..1,bokstav..1,PRIM..1,a..nn.1,a,nn,nn_vn_alfa_abc,,
2,A..1,initial..1,PRIM..1,A..nn.1,A,nn,nn_vn_alfa_abc,,
4,a..2,ton..1,PRIM..1,a..nn.1,a,nn,nn_vn_alfa_abc,,
12,abakus..1,räkna..1,PRIM..1,abakus..nn.1,abakus,nn,nn_3u_karbid,,
13,abalone..1,havsöra..1,PRIM..1,abalone..nn.1,abalone,nn,nn_4u_linje,,
14,abandon..1,otvungen..1,PRIM..1,abandon..nn.1,abandon,nn,nn_0u_månsing,,
19,abbé..1,abbot..1,PRIM..1,abbé..nn.1,abbé,nn,nn_3u_akademi,,
21,abbedissa..1,kloster..1,PRIM..1,abbedissa..nn.1,abbedissa,nn,nn_1u_olja,,
22,abborre..1,fisk..1,PRIM..1,abborre..nn.1,abborre,nn,nn_2u_vinge,,
23,abborrfisk..1,abborrliknande..1,PRIM..1,abborrfisk..nn.1,abborrfisk,nn,nn_2u_sten,,


In [61]:
def updateDataFrameWithWords(words, saldoD, sensaldoD):
    expandedSensaldoD = sensaldoD.copy()
    for word in words:
        expandedSensaldoD = addingChecker(word, saldoD, expandedSensaldoD)
    return expandedSensaldoD

# Assuming `rensad` DataFrame contains words in the column 'betydelse'
words_to_add = rensad['betydelse']

# Update the DataFrame with the words from `rensad`
expandedSensaldoDF = updateDataFrameWithWords(words_to_add, saldoDf, sensaldoDf)

In [62]:
expandedPath = "expandedSenSaldoLexicon.csv"
expandedSensaldoDF.to_csv(expandedPath, index=False)

In [63]:
hj = result_df[result_df['prim'] == "PRIM..1"]
hj[hj['sek'] != "PRIM..1"]

Unnamed: 0,betydelse,prim,sek,lemma,ord,ordklass,deskr,SALDO sense ID,polarity label


In [64]:
# Sort the DataFrame by 'SALDO sense ID' column
loadedExpandedSensaldoDF = pd.read_csv('expandedSenSaldoLexicon.csv', encoding='utf-8')
sorted_df = loadedExpandedSensaldoDF.sort_values(by='SALDO sense ID')

# Save the sorted DataFrame to a CSV file
sorted_df.to_csv('expandedSensaldoSorted.csv', index=False)