In [1]:
import pandas as pd 
import json

In [97]:
def calculate_relationship_accuracy(actual_relationships, predicted_relationships):
    # Initialize variables to count correct matches and partial matches
    correct_matches = 0
    partial_matches = 0
    
    # Check for correct matches and partial matches
    for a_rel in actual_relationships:
        for p_rel in predicted_relationships:
            print("Comparing:", a_rel, p_rel)
            if (a_rel['from'] == p_rel['from'] and
                a_rel['relation'] == p_rel['relation'] and
                a_rel['target'] == p_rel['target']):
                correct_matches += 1
                print("Correct match found.")
            elif (a_rel['from'] == p_rel['target'] and
                  a_rel['relation'] == p_rel['relation'] and
                  a_rel['target'] == p_rel['from']):
                partial_matches += 1
                print("Partial match found.")
            else:
                print("No match found.")
    
    # Calculate accuracy
    total_relationships = len(actual_relationships)
    if total_relationships > 0:
        accuracy = (correct_matches + partial_matches) / total_relationships
    else:
        accuracy = 0
    
    return accuracy

# Actual and predicted relationships (identical)
actual_relationships = [{"from": "Norge", "relation": "contains", "target": "Oslo"}]
predicted_relationships = [{"from": "Norge", "relation": "contains", "target": "Sogndal"}]

# Calculate accuracy
accuracy = calculate_relationship_accuracy(actual_relationships, predicted_relationships)
print("Accuracy:", accuracy)


Comparing: {'from': 'Norge', 'relation': 'contains', 'target': 'Oslo'} {'from': 'Norge', 'relation': 'contains', 'target': 'Sogndal'}
No match found.
Accuracy: 0.0


In [11]:
def calculate_relationship_accuracy(actual_relationships, predicted_relationships):
    # Initialize variables to count correct matches and partial matches
    correct_matches = 0
    partial_matches = 0
    
    # Check for correct matches and partial matches
    for a_rel in actual_relationships:
        match_found = False  # Track whether a match is found for the current actual relationship
        for p_rel in predicted_relationships:
            if (a_rel['from'] == p_rel['from'] and
                a_rel['relation'] == p_rel['relation'] and
                a_rel['target'] == p_rel['target']):
                correct_matches += 1
                match_found = True
                break
            elif (a_rel['from'] == p_rel['target'] and
                  a_rel['relation'] == p_rel['relation'] and
                  a_rel['target'] == p_rel['from']):
                partial_matches += 0.5  # Assign a weight of 0.5 for each partial match
                match_found = True
                break  # Exit the loop once a partial match is found
        
        # If no match is found for the current actual relationship, count it as a partial match
        if not match_found:
            partial_matches += 0.5
    
    # Calculate accuracy
    total_relationships = len(actual_relationships)
    if total_relationships > 0:
        accuracy = (correct_matches + partial_matches) / total_relationships
    else:
        accuracy = 0
    
    return accuracy


# Actual and predicted relationships (identical)
actual_relationships = [{"from": "Norge", "relation": "contains", "target": "Sogndal"}]
predicted_relationships = [{"from": "Norge", "relation": "contains", "target": "Sogndal"}]

# Calculate accuracy
accuracy = calculate_relationship_accuracy(actual_relationships, predicted_relationships)
print("Accuracy:", accuracy)

Accuracy: 1.0


In [71]:
def calculate_relationship_accuracy(actual_relationships, predicted_relationships):
    # Initialize variables to count correct matches, partial matches, and incorrect matches
    correct_matches = 0
    partial_matches = 0
    
    # Convert tuples to dictionaries for easier comparison
    actual_set = {(rel['from'], rel['relation'], rel['target']) if isinstance(rel, dict) else rel for rel in actual_relationships}
    predicted_set = {(rel['from'], rel['relation'], rel['target']) if isinstance(rel, dict) else rel for rel in predicted_relationships}
    
    # Check for correct matches, partial matches, and incorrect matches
    for rel in actual_set:
        if rel in predicted_set:
            correct_matches += 1
        else:
            # Check for partial matches
            partial_found = False
            for pred_rel in predicted_set:
                if ((rel[0] == pred_rel[0] and rel[1] == pred_rel[1] and rel[2] != pred_rel[2]) or
                    (rel[0] != pred_rel[0] and rel[1] == pred_rel[1] and rel[2] == pred_rel[2])):
                    partial_matches += 0.5
                    partial_found = True
                    break
            if not partial_found:
                # If no partial match found, count it as incorrect
                partial_matches += 0
    
    # Calculate accuracy
    total_relationships = len(actual_relationships)
    if total_relationships > 0:
        accuracy = (correct_matches + partial_matches) / total_relationships
    else:
        accuracy = 0
    
    return accuracy

# Actual and predicted relationships (identical)
actual_relationships = [{"from": "Norge", "relation": "contains", "target": "Sgondal"}]
predicted_relationships = [{"from": "Norge", "relation": "contains", "target": "Sogndal"}]

# Calculate accuracy
accuracy = calculate_relationship_accuracy(actual_relationships, predicted_relationships)
print("Accuracy:", accuracy)


Accuracy: 0.5


In [139]:
def calculate_accuracy(row):
    predicted = json.loads(row['predictedgpt3'])
    actual = json.loads(row['actual'])

    # Convert relationships to lowercase keys for consistency
    def convert_relationships(obj):
        return [{k.lower(): v for k, v in rel.items()} for rel in obj.get('relationships', [])]

    predicted['relationships'] = convert_relationships(predicted)
    actual['relationships'] = convert_relationships(actual)

    # Initialize accuracy and accuracy components
    location_accuracy = 0
    relationship_accuracy = 0

    # Compare number of locations
    predicted_locations = len(predicted.get('locations', []))
    actual_locations = len(actual.get('locations', []))
    if predicted_locations == actual_locations:
        location_accuracy = 1
    elif predicted_locations > actual_locations:
        location_accuracy = actual_locations / predicted_locations
    else:
        location_accuracy = predicted_locations / actual_locations

    # Extract predicted and actual relationships as dictionaries
    def tuple_to_dict(rel_tuple):
        if len(rel_tuple) == 3:
            return {'from': rel_tuple[0], 'relation': rel_tuple[1], 'target': rel_tuple[2]}
        else:
            return {'from': None, 'relation': None, 'target': None}


    predicted_relationships = [tuple_to_dict(rel) for rel in predicted.get('relationships', [])]
    actual_relationships = [tuple_to_dict(rel) for rel in actual.get('relationships', [])]

    # Check if predicted relationships is empty and convert it to the JSON format
    if not predicted_relationships:
        predicted_relationships = []
    # Check if actual relationships is empty and convert it to the JSON format
    if not actual_relationships:
        actual_relationships = []

    # Calculate relationship accuracy using the provided function
    relationship_accuracy = calculate_relationship_accuracy(actual_relationships, predicted_relationships)

    # Combine location accuracy and relationship accuracy
    overall_accuracy = (location_accuracy + relationship_accuracy) / 2

    print(f"Location Accuracy For Current Row: {location_accuracy}")
    print(f"Relationship Accuracy For Current Row: {relationship_accuracy}")
    print(f"Accuracy For Current Row: {overall_accuracy}")
    print("---------------------------------------------------")

    return overall_accuracy


In [140]:
def evaluate_accuracy(data):
    total_accuracy = 0
    total_locations = 0
    total_correct_locations = 0
    
    for index, row in data.iterrows():
        # Calculate accuracy for each row
        accuracy = calculate_accuracy(row)
        total_accuracy += accuracy
        
        # Extract actual and predicted locations
        actual_locations = len(json.loads(row['actual']).get('locations', []))
        predicted_locations = len(json.loads(row['predictedgpt3']).get('locations', []))
        
        # Accumulate total locations and total correct locations
        total_locations += actual_locations
        total_correct_locations += min(predicted_locations, actual_locations)
    
    # Calculate overall accuracy
    overall_accuracy = total_accuracy / len(data)
    
    # Calculate overall location accuracy
    overall_location_accuracy = total_correct_locations / total_locations
    
    return overall_accuracy, overall_location_accuracy


In [4]:
def convert_floats_to_strings(dataframe):
    for col in dataframe.columns:
        for i, value in enumerate(dataframe[col]):
            if isinstance(value, float):
                dataframe.at[i, col] = str(value)
    return dataframe

In [9]:
def fix_json_string(json_str):
    return json_str.replace('""', '"')

In [141]:
df = pd.read_csv('test_processedgpt3.csv')

In [142]:
# Apply the function to the DataFrame
df = convert_floats_to_strings(df)

  dataframe.at[i, col] = str(value)


In [143]:
df[["actual", "predictedgpt3"]]

Unnamed: 0,actual,predictedgpt3
0,"{""locations"": [{""name"": ""Norge"", ""type"": ""Loca...","{""locations"": [{""name"": ""Norge"", ""type"": ""Loca..."
1,"{""locations"": [{""name"": ""Voss"", ""type"": ""Locat...","{""locations"": [{""name"": ""Voss"", ""type"": ""Locat..."
2,"{""locations"": [{""name"": ""Nærøydalen"", ""type"": ...","{""locations"": [{""name"": ""Vestland"", ""type"": ""L..."
3,"{""locations"": [{""name"": ""Lærdalstunnelen"", ""ty...","{""locations"": [{""name"": ""SognOgFjordane"", ""typ..."
4,"{""locations"": [{""name"": ""SognOgFjordane"", ""typ...","{""locations"": [{""name"": ""Pensjonistforbundet"",..."
...,...,...
140,"{""locations"": [{""name"": ""Gaupne"", ""type"": ""Loc...","{""locations"": [{""name"": ""Gaupne"", ""type"": ""Loc..."
141,"{""locations"": [{""name"": ""Lavik"", ""type"": ""Loca...","{""locations"": [{""name"": ""Hellebøstranda"", ""typ..."
142,"{""locations"": [{""name"": ""Haukelifjell"", ""type""...","{""locations"": [{""name"": ""Vind"", ""type"": ""Locat..."
143,"{""locations"": [{""name"": ""MichaelKrohnGate"", ""t...","{""locations"": [{""name"": ""Mandag"", ""type"": ""Loc..."


In [144]:
overall_accuracy = evaluate_accuracy(df)

KeyError: 0

In [28]:
overall_accuracy

(0.11407919275118833, 0.9561586638830898)

In [63]:
pd.set_option('display.max_colwidth', None)
first_row_values = df.iloc[20][['actual', 'predictedgpt3']]

# Print out the values
print("Values of column1 and column2 on the first row:")
print(first_row_values)

Values of column1 and column2 on the first row:
actual                                                                                                                              {"locations": [{"name": "Kaupanger", "type": "Location", "entity": "Land"}, {"name": "Sogndal", "type": "Location", "entity": "Land"}, {"name": "Norge", "type": "Location", "entity": "Land"}, {"name": "Kristiansand", "type": "Location", "entity": "Land"}, {"name": "Voss", "type": "Location", "entity": "Land"}, {"name": "Sogndal", "type": "Location", "entity": "Land"}, {"name": "Førde", "type": "Location", "entity": "Land"}, {"name": "Oslo", "type": "Location", "entity": "Land"}, {"name": "Bergen", "type": "Location", "entity": "Land"}], "relationships": [{"from": "Norge", "relation": "contains", "target": "Sogndal"}]}
predictedgpt3    {"locations": [{"name": "Kaupanger", "type": "Location"}, {"name": "Sogndal", "type": "Location"}, {"name": "Voss", "type": "Location"}, {"name": "Førde", "type": "Location"}, {

In [11]:
def is_float(value):
    try:
        float(value)
        return True
    except ValueError:
        return False

In [17]:
def extract_float_values(obj):
    float_values = []
    if isinstance(obj, dict):
        for key, value in obj.items():
            if isinstance(value, (int, float)):
                float_values.append(value)
            elif isinstance(value, (list, dict)):
                float_values.extend(extract_float_values(value))
    elif isinstance(obj, list):
        for item in obj:
            float_values.extend(extract_float_values(item))
    return float_values

In [18]:
def parse_and_extract(json_str):
    try:
        data = json.loads(json_str)
        return extract_float_values(data)
    except json.JSONDecodeError:
        return []

In [14]:
float_values = {'actual': {}, 'predicted': {}}
for column in ['actual', 'predicted']:
    index_position = 0
    for index, value in df[column].items():
        if is_float(value):
            if column not in float_values:
                float_values[column] = {}
            float_values[column][index] = {'value': value, 'index_position': index_position}
        index_position += 1

# Print float values and their corresponding row index in a structured format
for column, values in float_values.items():
    print(f"Float values in '{column}' column:")
    for index, value_info in values.items():
        print(f"Row index: {index}, Value: {value_info['value']}, Index position in column: {value_info['index_position']}")
    print()

Float values in 'actual' column:
Row index: 13, Value: nan, Index position in column: 13

Float values in 'predicted' column:



In [43]:
def print_rows_without_from_in_relationships(df, column):
    for index, row in df.iterrows():
        relationships = row[column]
        if relationships.startswith('{') and relationships.endswith('}'):
            try:
                relationships_dict = json.loads(relationships)
                relationships_list = relationships_dict.get("relationships", [])
                if isinstance(relationships_list, list) and not any("from" in rel.get("from", "") for rel in relationships_list):
                    for i, rel in enumerate(relationships_list):
                        if "from" not in rel.get("from", ""):
                            print(f"Row {index}, {column}[{i}]: {rel}")
            except json.JSONDecodeError:
                pass


In [46]:
def fix_json_quotes(json_str):
    return json_str.replace("'", '"')

In [47]:
# Apply the helper function to the columns
df['actual'] = df['actual'].apply(fix_json_quotes)

In [48]:
print("Rows without 'from' in the 'relationships' array (unless empty) in the 'actual' column:")
print_rows_without_from_in_relationships(df, 'actual')

Rows without 'from' in the 'relationships' array (unless empty) in the 'actual' column:
Row 1, actual[0]: {'from': 'Voss', 'relation': 'contains', 'target': 'Tråstølen'}
Row 1, actual[1]: {'from': 'Voss', 'relation': 'isNearby', 'target': 'Lærdalen'}
Row 1, actual[2]: {'from': 'Gudvangtunnelen', 'relation': 'isPassingThrough', 'target': 'Aurland'}
Row 1, actual[3]: {'from': 'Sivletunnelen', 'relation': 'isPassingThrough', 'target': 'Voss'}
Row 1, actual[4]: {'from': 'Stalheimtunnelen', 'relation': 'isPassingThrough', 'target': 'Voss'}
Row 1, actual[5]: {'from': 'Voss', 'relation': 'contains', 'target': 'Nærøydalen'}
Row 1, actual[6]: {'from': 'Stad', 'relation': 'contains', 'target': 'Nordfjord'}
Row 1, actual[7]: {'from': 'Sauda', 'relation': 'isNearby', 'target': 'Bergen'}
Row 2, actual[0]: {'from': 'Lærdalstunnelen', 'relation': 'isPassingThrough', 'target': 'SognOgFjordane'}
Row 10, actual[0]: {'from': 'Norge', 'relation': 'contains', 'target': 'Bergen'}
Row 10, actual[1]: {'from':