# Validate GPT results using pattern matching

In [42]:
import json
import pandas as pd
import re

## Processed GPT Text Data


In [43]:
# Location of the processed GPT output for each platform. Every file follows
# the same "<platform>_results/all_processed_<platform>.json" layout, so the
# mapping is generated from the platform names instead of being spelled out.
# NOTE(review): the root below is an absolute, machine-specific path; consider
# making the data directory configurable so the notebook runs elsewhere.
gpt_data_paths = {
    name: (
        '/Users/hadleydixon/Desktop/gamification_data_analysis/Data/'
        f'{name}_results/all_processed_{name}.json'
    )
    for name in ('kaggle', 'aicrowd', 'drivendata')
}

In [44]:
# Read each platform's processed GPT JSON into memory, keyed by platform name.
# A missing or malformed file only produces a warning; that platform is then
# simply absent from `processed_gpt_results`.
processed_gpt_results = {}
for platform, path in gpt_data_paths.items():
    try:
        with open(path, 'r', encoding='utf-8') as handle:
            records = json.load(handle)
    except FileNotFoundError:
        print(f"Warning: {path} not found")
        continue
    except json.JSONDecodeError:
        print(f"Warning: Error parsing JSON from {path}")
        continue

    # Only reached when the file was opened and parsed successfully.
    processed_gpt_results[platform] = records
    print(f"Loaded {len(records)} processed results from {platform}")

Loaded 300 processed results from kaggle
Loaded 221 processed results from aicrowd
Loaded 65 processed results from drivendata


In [45]:
# Turn each platform's loaded GPT records into a DataFrame.
# Platforms whose loaded data is empty (falsy) are skipped entirely.
processed_gpt_dfs = {}
for platform, data in processed_gpt_results.items():
    if not data:
        continue
    frame = pd.DataFrame(data)
    processed_gpt_dfs[platform] = frame
    print(f"Created DataFrame for {platform} with {len(frame)} rows")

Created DataFrame for kaggle with 300 rows
Created DataFrame for aicrowd with 221 rows
Created DataFrame for drivendata with 65 rows


## Processed Pattern Matching

In [46]:
# Location of the pattern-matching output for each platform. The file name is
# the same everywhere; only the "<platform>_results" directory changes.
# NOTE(review): the root below is an absolute, machine-specific path; consider
# making the data directory configurable so the notebook runs elsewhere.
pattern_match_paths = {
    name: (
        '/Users/hadleydixon/Desktop/gamification_data_analysis/Data/'
        f'{name}_results/all_pattern_match_results.json'
    )
    for name in ('kaggle', 'aicrowd', 'drivendata')
}

In [47]:
# Read each platform's pattern-matching JSON into memory, keyed by platform name.
# A missing or malformed file only produces a warning; that platform is then
# simply absent from `pattern_match_results`.
pattern_match_results = {}
for platform, path in pattern_match_paths.items():
    try:
        with open(path, 'r', encoding='utf-8') as handle:
            records = json.load(handle)
    except FileNotFoundError:
        print(f"Warning: {path} not found")
        continue
    except json.JSONDecodeError:
        print(f"Warning: Error parsing JSON from {path}")
        continue

    # Only reached when the file was opened and parsed successfully.
    pattern_match_results[platform] = records
    print(f"Loaded {len(records)} processed results from {platform}")

Loaded 300 processed results from kaggle
Loaded 221 processed results from aicrowd
Loaded 65 processed results from drivendata


In [48]:
# Turn each platform's loaded pattern-matching records into a DataFrame.
# Platforms whose loaded data is empty (falsy) are skipped entirely.
pattern_match_dfs = {}
for platform, data in pattern_match_results.items():
    if not data:
        continue
    frame = pd.DataFrame(data)
    pattern_match_dfs[platform] = frame
    print(f"Created DataFrame for {platform} with {len(frame)} rows")

Created DataFrame for kaggle with 300 rows
Created DataFrame for aicrowd with 221 rows
Created DataFrame for drivendata with 65 rows


In [49]:
# Preview the first rows of the Kaggle pattern-matching frame to inspect its
# columns before comparing against the GPT results.
pattern_match_dfs["kaggle"].head()

Unnamed: 0,name,url,fairness_bias_mentioned,how_fairness,data_privacy,how_data_privacy,toy,how_toy,red_team,how_red_team,transparency_mentioned,how_transparency
0,CPROD1: Consumer PRODucts contest #1,https://www.kaggle.com/competitions/cprod1,no,,no,,yes,"example, sample",no,,no,
1,EMC Israel Data Science Challenge,https://www.kaggle.com/competitions/emc-data-s...,no,,no,,no,,no,,no,
2,Practice Fusion Diabetes Classification,https://www.kaggle.com/competitions/pf2012-dia...,no,,yes,de-identified,yes,"starter, practice",no,,no,
3,Detecting Insults in Social Commentary,https://www.kaggle.com/competitions/detecting-...,no,,no,,no,,no,,no,
4,Cause-effect pairs,https://www.kaggle.com/competitions/cause-effe...,yes,bias,no,,yes,sample,no,,no,


In [50]:
# Preview the first rows of the Kaggle GPT frame to inspect its columns
# before comparing against the pattern-matching results.
processed_gpt_dfs["kaggle"].head()

Unnamed: 0,category,fairness_bias_mentioned,how_fairness,data_privacy,how_data_privacy,toy,red_team,transparency_mentioned,how_transparency,name,url
0,Natural Language Processing,no,,no,,no,no,no,,CPROD1: Consumer PRODucts contest #1,https://www.kaggle.com/competitions/cprod1
1,Data Science,no,,no,,no,no,no,,EMC Israel Data Science Challenge,https://www.kaggle.com/competitions/emc-data-s...
2,Healthcare,no,,yes,The data involves de-identified medical record...,no,no,no,n/a: The competition emphasizes a random fores...,Practice Fusion Diabetes Classification,https://www.kaggle.com/competitions/pf2012-dia...
3,Natural Language Processing,no,,yes,Potential privacy issues due to publicly sourc...,no,no,no,"n/a, the competition focuses on single-class c...",Detecting Insults in Social Commentary,https://www.kaggle.com/competitions/detecting-...
4,Data Science and Machine Learning,yes,Normalization and quantization were adjusted t...,no,,no,no,no,"n/a, focus is on ranking cause-effect pairs fo...",Cause-effect pairs,https://www.kaggle.com/competitions/cause-effe...


## Align Entries

In [None]:
# Short per-platform handles for the GPT-derived frames ...
gpt_kaggle_df, gpt_aicrowd_df, gpt_drivendata_df = (
    processed_gpt_dfs["kaggle"],
    processed_gpt_dfs["aicrowd"],
    processed_gpt_dfs["drivendata"],
)

# ... and for the pattern-matching frames, in the same platform order.
pattern_kaggle_df, pattern_aicrowd_df, pattern_drivendata_df = (
    pattern_match_dfs["kaggle"],
    pattern_match_dfs["aicrowd"],
    pattern_match_dfs["drivendata"],
)

In [55]:
def _merge_gpt_with_pattern(gpt_df, pattern_df, platform):
    """Inner-join one platform's GPT and pattern-matching results on ``url``.

    The merged frame is exactly what ``pd.merge(..., on='url')`` returns:
    columns present in both inputs (other than ``url``) get the ``_gpt`` /
    ``_pattern`` suffixes. Because an inner join silently discards rows whose
    ``url`` exists in only one input, this helper additionally prints a
    warning with the number of such rows so that coverage gaps are visible
    before the two sources are compared.

    Parameters
    ----------
    gpt_df : pandas.DataFrame
        GPT-derived results; must contain a ``url`` column.
    pattern_df : pandas.DataFrame
        Pattern-matching results; must contain a ``url`` column.
    platform : str
        Label used only in the warning message.

    Returns
    -------
    pandas.DataFrame
        The inner-joined frame.
    """
    merged = pd.merge(
        gpt_df, pattern_df,
        on='url',
        suffixes=('_gpt', '_pattern')
    )

    # Count rows on each side whose url has no counterpart on the other side;
    # these are the rows the inner join above left out.
    gpt_only = int((~gpt_df['url'].isin(pattern_df['url'])).sum())
    pattern_only = int((~pattern_df['url'].isin(gpt_df['url'])).sum())
    if gpt_only or pattern_only:
        print(
            f"Warning: {platform} merge dropped {gpt_only} GPT-only and "
            f"{pattern_only} pattern-only rows with no matching url"
        )
    return merged


kaggle_merged_df = _merge_gpt_with_pattern(gpt_kaggle_df, pattern_kaggle_df, 'kaggle')
aicrowd_merged_df = _merge_gpt_with_pattern(gpt_aicrowd_df, pattern_aicrowd_df, 'aicrowd')
drivendata_merged_df = _merge_gpt_with_pattern(gpt_drivendata_df, pattern_drivendata_df, 'drivendata')

## Compare Fields