In [102]:
import xml.etree.ElementTree as ET
import pandas as pd
import numpy as np
import csv
from xml.dom.minidom import Document


Converting the data file to xml and then the annotated file back into csv with all the columns

In [103]:

def csv_to_xmlFile(csv_file, xml_file):
    # Create a new XML document
    doc = Document()
    
    # XML declaration is added automatically with minidom
    # Create the root element
    root = doc.createElement('root')
    doc.appendChild(root)
    
    # Open and read the CSV file
    with open(csv_file, 'r', encoding='utf-8') as csvfile:
        csvreader = csv.reader(csvfile)
        
        for row in csvreader:
            if len(row) < 3:
                continue  # Skip rows with less than 3 columns
            
            middle_column_text = row[1]
            
            # Create a 'row' element for each row in the CSV
            row_element = doc.createElement('row')
            root.appendChild(row_element)
            
            # Assuming the dynamic tag name and value from the middle column is desired
            custom_tag = doc.createElement(middle_column_text)
            custom_tag_text = doc.createTextNode(middle_column_text)
            custom_tag.appendChild(custom_tag_text)
            
            row_element.appendChild(custom_tag)
    
    # Generate the formatted XML string
    xml_str = doc.toprettyxml(indent="    ")  # Adjust indent as necessary
    
    # Write the XML to a file
    with open(xml_file, 'w', encoding='utf-8') as xmlf:
        xmlf.write(xml_str)

# Specify the CSV and XML file paths
csv_file_path = 'testtestorg.csv'
xml_file_path = 'testtestorgXML.xml'

# Generate the XML
csv_to_xmlFile(csv_file_path, xml_file_path)

print(f"XML file generated at: {xml_file_path}")

XML file generated at: testtestorgXML.xml


In [104]:

def xml_to_csvFile(xml_path):
    tree = ET.parse(xml_path)
    root = tree.getroot()
    
    data = []
    paragrafnummer = 1  # Initialize XML structure
    
    for paragraph in root.findall('.//paragraph'):
        for sentence in paragraph.findall('.//sentence'):
            for word in sentence.findall('.//w'):
                data.append({
                    "paragrafnummer": paragrafnummer,
                    "token": word.text,
                    "msd": word.get('msd', ''),
                    "lemma": word.get('lemma', ''),
                    "lex": word.get('lex', ''),
                    "sense": word.get('sense', ''),
                    "complemgram": word.get('complemgram', ''),
                    "compwf": word.get('compwf', ''),
                    "sentimentclass": word.get('sentimentclass', ''),
                    "ref": word.get('ref', ''),
                    "dephead": word.get('dephead', ''),
                    "deprel": word.get('deprel', '')
                })
        paragrafnummer += 1  # Increment for each paragraph
    
    df = pd.DataFrame(data)
    df['dephead'] = df['dephead'].replace("", 0)
    # Convert sentimentclass values to numerical values
    sentiment_mapping = {'positive': 1, 'neutral': 0, 'negative': -1, "": ""}
    df['sentimentclass'] = df['sentimentclass'].map(sentiment_mapping)

    csv_path = xml_path.replace('.xml', '.csv')  # Or specify your own path
    df.to_csv(csv_path, index=False)
    return csv_path



# Use the function
xml_path = 'konvertR.xml'
csv_path = xml_to_csvFile(xml_path)
print(f"CSV file created at: {csv_path}")


CSV file created at: konvertR.csv


Loads and checks how the annotated data looks like

In [105]:
testtestdata_df= pd.read_csv(csv_path, encoding='utf-8')
testtestorg_df=pd.read_csv('testtestorg.csv', header=None, names=['sentimentannotation', 'sentence', 'category'], encoding='utf-8')
testtestdata_df

Unnamed: 0,paragrafnummer,token,msd,lemma,lex,sense,complemgram,compwf,sentimentclass,ref,dephead,deprel
0,1,Vårens,NN.UTR.SIN.DEF.GEN,|vår|,|vår..nn.1|,|vår..1:-1.000|,|,|,0.0,1,3,DT
1,1,första,RO.NOM,|,|,|,|,|,,2,3,DT
2,1,blommor,NN.UTR.PLU.IND.NOM,|blomma|,|blomma..nn.1|,|blomma..1:-1.000|,|blom..nn.1+mor..nn.1:1.816e-11|blomma..nn.1+m...,|blom+mor|,0.0,3,4,SS
3,1,har,VB.PRS.AKT,|ha|,|ha..vb.1|,|ha..3:0.504|ha..1:0.496|,|,|,0.0,4,0,ROOT
4,1,börjat,VB.SUP.AKT,|börja|,|börja..vb.1|börja..vb.2|,|börja..2:0.515|börja..1:0.485|,|,|,0.0,5,4,VG
...,...,...,...,...,...,...,...,...,...,...,...,...
1847,172,är,VB.PRS.AKT,|vara|,|vara..vb.1|,|vara..1:-1.000|,|,|,0.0,2,0,ROOT
1848,172,jag,PN.UTR.SIN.DEF.SUB,|jag|,|jag..pn.1|,|jag..1:-1.000|,|,|,,3,2,SS
1849,172,inte,AB,|inte|,|inte..ab.1|,|inte..1:-1.000|,|,|,,4,2,
1850,172,glad,JJ.POS.UTR.SIN.IND.NOM,|glad|,|glad..av.1|,|glad..1:-1.000|,|,|,1.0,5,2,SP


In [106]:
testtestorg_df

Unnamed: 0,sentimentannotation,sentence,category
0,1,Vårens första blommor har börjat blomma i träd...,egna meningar
1,1,Jag blev upplyft av det stöd jag fick från min...,egna meningar
2,1,Den nya kaffebaren i grannskapet serverar utsö...,egna meningar
3,1,Att lyssna på musik på morgonen gör mig glad h...,egna meningar
4,1,Jag är tacksam för de lugna stunderna jag får ...,egna meningar
...,...,...,...
167,0,Linda Haglund sade att hon var oskyldig och he...,8 Sidor
168,0,Ingen vet hur mycket det kommer att kosta att ...,8 Sidor
169,0,Fram till 2010 ska regeringen lägga nästan 8 m...,8 Sidor
170,−1,Jag har inga vänner.,egna meningar


In [107]:
# Add paragrafnummer from testtestorg to testtestdata
testtestdata_df['sentimentval'] = testtestdata_df['paragrafnummer'].apply(
    lambda x: testtestorg_df.at[x-1, 'sentimentannotation'] if x-1 < len(testtestorg_df) else None
)

# Map the category from testtestorg to testtestdata
testtestdata_df['category'] = testtestdata_df['paragrafnummer'].apply(
    lambda x: testtestorg_df.at[x-1, 'category'] if x-1 < len(testtestorg_df) else None
)

# Replace âˆ’ with - in the entire DataFrame
testtestdata_df = testtestdata_df.replace('−', '-', regex=True)

# Save the final corrected DataFrame to a new CSV file
final_csv_path = 'final_corrected_testtestdata.csv'
testtestdata_df.to_csv(final_csv_path, index=False)


In [108]:
df= pd.read_csv('final_corrected_testtestdata.csv', encoding='utf-8')
unique_values = df['sentimentval'].unique()
print(unique_values)
df = df['dephead'].fillna(0)


[ 1  0 -1]


In [109]:
predictDf = pd.read_csv('final_corrected_testtestdata.csv', encoding='utf-8')
predictDf.tail(11)

Unnamed: 0,paragrafnummer,token,msd,lemma,lex,sense,complemgram,compwf,sentimentclass,ref,dephead,deprel,sentimentval,category
1841,171,Jag,PN.UTR.SIN.DEF.SUB,|jag|,|jag..pn.1|,|jag..1:-1.000|,|,|,,1,2,SS,-1,egna meningar
1842,171,har,VB.PRS.AKT,|ha|,|ha..vb.1|,|ha..1:0.745|ha..3:0.255|,|,|,0.0,2,0,ROOT,-1,egna meningar
1843,171,inga,DT.UTR+NEU.PLU.IND,|ingen|,|ingen..pn.1|,|ingen..1:-1.000|,|,|,,3,4,DT,-1,egna meningar
1844,171,vänner,NN.UTR.PLU.IND.NOM,|vän|,|vän..nn.1|,|vän..1:-1.000|,|,|,1.0,4,2,OO,-1,egna meningar
1845,171,.,MAD,|,|,|,|,|,,5,2,IP,-1,egna meningar
1846,172,Idag,AB,|idag|,|idag..ab.1|,|i_dag..1:-1.000|i_dag..2:-1.000|,|,|,,1,2,TA,-1,egna meningar
1847,172,är,VB.PRS.AKT,|vara|,|vara..vb.1|,|vara..1:-1.000|,|,|,0.0,2,0,ROOT,-1,egna meningar
1848,172,jag,PN.UTR.SIN.DEF.SUB,|jag|,|jag..pn.1|,|jag..1:-1.000|,|,|,,3,2,SS,-1,egna meningar
1849,172,inte,AB,|inte|,|inte..ab.1|,|inte..1:-1.000|,|,|,,4,2,,-1,egna meningar
1850,172,glad,JJ.POS.UTR.SIN.IND.NOM,|glad|,|glad..av.1|,|glad..1:-1.000|,|,|,1.0,5,2,SP,-1,egna meningar


In [110]:
def add_average_sentiment_column(df, group_column, value_column, average_column_name='average_sentiment'):
    """
    Add a column to the DataFrame containing the average value of a specified column grouped by another column.
    
    Args:
    - df: DataFrame to be modified.
    - group_column: Name of the column to group by.
    - value_column: Name of the column whose average will be calculated.
    - average_column_name: Name for the new column containing the average values.
    
    Returns:
    - DataFrame with the new column added.
    """
    # Group by group_column and calculate the average of value_column
    average_values = df.groupby(group_column)[value_column].mean().reset_index()
    
    # Merge the average values back into the original DataFrame
    df = df.merge(average_values, on=group_column, suffixes=('', '_avg'))
    
    # Rename the columns for clarity
    df.rename(columns={value_column + '_avg': average_column_name}, inplace=True)
    
    return df

# Example usage:
predictDf = pd.read_csv('final_corrected_testtestdata.csv', encoding='utf-8')
predictDf = add_average_sentiment_column(predictDf, 'paragrafnummer', 'sentimentclass')
predictDf

Unnamed: 0,paragrafnummer,token,msd,lemma,lex,sense,complemgram,compwf,sentimentclass,ref,dephead,deprel,sentimentval,category,average_sentiment
0,1,Vårens,NN.UTR.SIN.DEF.GEN,|vår|,|vår..nn.1|,|vår..1:-1.000|,|,|,0.0,1,3,DT,1,egna meningar,0.0
1,1,första,RO.NOM,|,|,|,|,|,,2,3,DT,1,egna meningar,0.0
2,1,blommor,NN.UTR.PLU.IND.NOM,|blomma|,|blomma..nn.1|,|blomma..1:-1.000|,|blom..nn.1+mor..nn.1:1.816e-11|blomma..nn.1+m...,|blom+mor|,0.0,3,4,SS,1,egna meningar,0.0
3,1,har,VB.PRS.AKT,|ha|,|ha..vb.1|,|ha..3:0.504|ha..1:0.496|,|,|,0.0,4,0,ROOT,1,egna meningar,0.0
4,1,börjat,VB.SUP.AKT,|börja|,|börja..vb.1|börja..vb.2|,|börja..2:0.515|börja..1:0.485|,|,|,0.0,5,4,VG,1,egna meningar,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1847,172,är,VB.PRS.AKT,|vara|,|vara..vb.1|,|vara..1:-1.000|,|,|,0.0,2,0,ROOT,-1,egna meningar,0.5
1848,172,jag,PN.UTR.SIN.DEF.SUB,|jag|,|jag..pn.1|,|jag..1:-1.000|,|,|,,3,2,SS,-1,egna meningar,0.5
1849,172,inte,AB,|inte|,|inte..ab.1|,|inte..1:-1.000|,|,|,,4,2,,-1,egna meningar,0.5
1850,172,glad,JJ.POS.UTR.SIN.IND.NOM,|glad|,|glad..av.1|,|glad..1:-1.000|,|,|,1.0,5,2,SP,-1,egna meningar,0.5


In [111]:
lista_negationer = ["inte", "ej", "icke", "ingen", "inget", "inga", "knappast", "aldrig", "ingenting", "ingenstans", "sällan"]

In [112]:
def negation_score_optimized(dataF, negList):
    # Create a copy of the DataFrame to avoid modifying the original
    result = dataF.copy()
    
    # Find indices of negation words
    neg_indices = result[result['token'].isin(negList)].index
    
    # Map each negation word to its dependent sentence or token
    for neg_index in neg_indices:
        dep_head = result.at[neg_index, 'dephead']
        
        # Find all tokens that are dependent on this negation word
        dependent_indices = result[result['ref'] == dep_head].index
        
        # Negate sentiment class for all dependent nodes and their children
        for dependent_index in dependent_indices:
            child_indices = result[result['dephead'] == result.at[dependent_index, 'ref']].index
            # Negate sentiment class for the dependent node
            result.loc[dependent_index, 'sentimentclass'] *= -1
            # Negate sentiment class for all child nodes
            result.loc[child_indices, 'sentimentclass'] *= -1
        
        # Optional: Print statement to indicate operation
        print(f"Found negation word '{result.at[neg_index, 'token']}' in row {neg_index}")

    return result

# Example usage
# negation_score_optimized(testingtest, lista_negationer)
testingtest = pd.read_csv('final_corrected_testtestdata.csv', encoding='utf-8')
testingtest.tail(11)
# negation_score_optimized(testingtest, lista_negationer).tail(11)


Unnamed: 0,paragrafnummer,token,msd,lemma,lex,sense,complemgram,compwf,sentimentclass,ref,dephead,deprel,sentimentval,category
1841,171,Jag,PN.UTR.SIN.DEF.SUB,|jag|,|jag..pn.1|,|jag..1:-1.000|,|,|,,1,2,SS,-1,egna meningar
1842,171,har,VB.PRS.AKT,|ha|,|ha..vb.1|,|ha..1:0.745|ha..3:0.255|,|,|,0.0,2,0,ROOT,-1,egna meningar
1843,171,inga,DT.UTR+NEU.PLU.IND,|ingen|,|ingen..pn.1|,|ingen..1:-1.000|,|,|,,3,4,DT,-1,egna meningar
1844,171,vänner,NN.UTR.PLU.IND.NOM,|vän|,|vän..nn.1|,|vän..1:-1.000|,|,|,1.0,4,2,OO,-1,egna meningar
1845,171,.,MAD,|,|,|,|,|,,5,2,IP,-1,egna meningar
1846,172,Idag,AB,|idag|,|idag..ab.1|,|i_dag..1:-1.000|i_dag..2:-1.000|,|,|,,1,2,TA,-1,egna meningar
1847,172,är,VB.PRS.AKT,|vara|,|vara..vb.1|,|vara..1:-1.000|,|,|,0.0,2,0,ROOT,-1,egna meningar
1848,172,jag,PN.UTR.SIN.DEF.SUB,|jag|,|jag..pn.1|,|jag..1:-1.000|,|,|,,3,2,SS,-1,egna meningar
1849,172,inte,AB,|inte|,|inte..ab.1|,|inte..1:-1.000|,|,|,,4,2,,-1,egna meningar
1850,172,glad,JJ.POS.UTR.SIN.IND.NOM,|glad|,|glad..av.1|,|glad..1:-1.000|,|,|,1.0,5,2,SP,-1,egna meningar


In [113]:
# Apply negation handling logic to each paragraph group in the DataFrame
def apply_negation_by_paragraph(df, negList):
    # Apply the negation score function to each group and combine the results
    result_df = df.groupby('paragrafnummer', group_keys=False).apply(negation_score_optimized, negList=negList)
    return result_df

tttt = apply_negation_by_paragraph(testingtest, lista_negationer)

Found negation word 'inte' in row 212


Found negation word 'inte' in row 265
Found negation word 'inte' in row 301
Found negation word 'ingenting' in row 564
Found negation word 'inte' in row 810
Found negation word 'inte' in row 1068
Found negation word 'inte' in row 1101
Found negation word 'inte' in row 1122
Found negation word 'inte' in row 1149
Found negation word 'aldrig' in row 1230
Found negation word 'inte' in row 1244
Found negation word 'inte' in row 1263
Found negation word 'inte' in row 1278
Found negation word 'inte' in row 1303
Found negation word 'aldrig' in row 1310
Found negation word 'inte' in row 1372
Found negation word 'inte' in row 1501
Found negation word 'inte' in row 1607
Found negation word 'inte' in row 1622
Found negation word 'inte' in row 1638
Found negation word 'ingen' in row 1669
Found negation word 'inte' in row 1699
Found negation word 'inte' in row 1767
Found negation word 'inga' in row 1843
Found negation word 'inte' in row 1849


In [114]:
testingtest.tail(11)

Unnamed: 0,paragrafnummer,token,msd,lemma,lex,sense,complemgram,compwf,sentimentclass,ref,dephead,deprel,sentimentval,category
1841,171,Jag,PN.UTR.SIN.DEF.SUB,|jag|,|jag..pn.1|,|jag..1:-1.000|,|,|,,1,2,SS,-1,egna meningar
1842,171,har,VB.PRS.AKT,|ha|,|ha..vb.1|,|ha..1:0.745|ha..3:0.255|,|,|,0.0,2,0,ROOT,-1,egna meningar
1843,171,inga,DT.UTR+NEU.PLU.IND,|ingen|,|ingen..pn.1|,|ingen..1:-1.000|,|,|,,3,4,DT,-1,egna meningar
1844,171,vänner,NN.UTR.PLU.IND.NOM,|vän|,|vän..nn.1|,|vän..1:-1.000|,|,|,1.0,4,2,OO,-1,egna meningar
1845,171,.,MAD,|,|,|,|,|,,5,2,IP,-1,egna meningar
1846,172,Idag,AB,|idag|,|idag..ab.1|,|i_dag..1:-1.000|i_dag..2:-1.000|,|,|,,1,2,TA,-1,egna meningar
1847,172,är,VB.PRS.AKT,|vara|,|vara..vb.1|,|vara..1:-1.000|,|,|,0.0,2,0,ROOT,-1,egna meningar
1848,172,jag,PN.UTR.SIN.DEF.SUB,|jag|,|jag..pn.1|,|jag..1:-1.000|,|,|,,3,2,SS,-1,egna meningar
1849,172,inte,AB,|inte|,|inte..ab.1|,|inte..1:-1.000|,|,|,,4,2,,-1,egna meningar
1850,172,glad,JJ.POS.UTR.SIN.IND.NOM,|glad|,|glad..av.1|,|glad..1:-1.000|,|,|,1.0,5,2,SP,-1,egna meningar


In [115]:
tttt.tail(11)

In [116]:
#predictDf = pd.read_csv('final_corrected_testtestdata.csv', encoding='utf-8')
ttttPred = add_average_sentiment_column(tttt, 'paragrafnummer', 'sentimentclass')
ttttPred.tail(11)

KeyError: 'paragrafnummer'

Räknar antal av varje klass i datan

In [None]:
# def count_classes(csv_file):
#     """
#     Count the number of instances of each class in the first column of a CSV file.

#     Parameters:
#     - csv_file: Path to the CSV file.

#     Returns:
#     - class_counts: Dictionary containing the count of instances for each class.
#     """
#     # Read the CSV file into a DataFrame
#     df = pd.read_csv(csv_file, header=None)

#     # Count the occurrences of each class in the first column
#     class_counts = df[0].value_counts().to_dict()

#     return class_counts

# # Example usage
# class_counts = count_classes("testdata - Blad1 (2).csv")
# print(class_counts)