# Pattern matching on competition data

In [57]:
import json
import os
import re

import pandas as pd

## Raw Text Data

In [58]:
# Directory holding the raw scraping dumps (one JSON file per platform).
_RAW_SCRAPING_DIR = '/Users/hadleydixon/Desktop/gamification_data_analysis/Data/raw_scraping'

# Platform name -> full path of its raw JSON dump.
raw_data_paths = {
    platform_name: f'{_RAW_SCRAPING_DIR}/{file_name}'
    for platform_name, file_name in (
        ('kaggle', 'kaggle_data_first_page_2.json'),
        ('aicrowd', 'aicrowd_raw.json'),
        ('drivendata', 'drivendata_filtered_results.json'),
    )
}

In [59]:
# Read each platform's raw JSON dump into `raw_data`, keyed by platform name.
# A missing or unparseable file only prints a warning, so that platform is
# simply left out of `raw_data`.
raw_data = {}
for platform, path in raw_data_paths.items():
    try:
        with open(path, 'r', encoding='utf-8') as source_file:
            loaded = json.load(source_file)
    except FileNotFoundError:
        print(f"Warning: {path} not found")
    except json.JSONDecodeError:
        print(f"Warning: Error parsing JSON from {path}")
    else:
        raw_data[platform] = loaded
        print(f"Loaded {len(loaded)} raw competitions from {platform}")

Loaded 300 raw competitions from kaggle
Loaded 221 raw competitions from aicrowd
Loaded 65 raw competitions from drivendata


In [60]:
# One DataFrame per platform; platforms whose loaded data is empty are skipped.
raw_dfs = {}
for platform, data in raw_data.items():
    if not data:
        continue
    platform_df = pd.DataFrame(data)
    raw_dfs[platform] = platform_df
    print(f"Created DataFrame for {platform} with {len(platform_df)} rows")

Created DataFrame for kaggle with 300 rows
Created DataFrame for aicrowd with 221 rows
Created DataFrame for drivendata with 65 rows


In [61]:
# Preview the first rows of the Kaggle frame to check which text columns were loaded.
raw_dfs["kaggle"].head()

Unnamed: 0,name,url,overview_text,description_text,dataset_text
0,CPROD1: Consumer PRODucts contest #1,https://www.kaggle.com/competitions/cprod1,Overview text not found,A significant proportion of web usage relates ...,The CPROD1 competition involves the release of...
1,EMC Israel Data Science Challenge,https://www.kaggle.com/competitions/emc-data-s...,Overview text not found,The EMC source code classification challenge r...,Dataset description not found
2,Practice Fusion Diabetes Classification,https://www.kaggle.com/competitions/pf2012-dia...,Overview text not found,In the first phase of this prediction challeng...,Update: this dataset has been removed at the r...
3,Detecting Insults in Social Commentary,https://www.kaggle.com/competitions/detecting-...,Overview text not found,The challenge is to detect when a comment fr...,The data consists of a label column followed b...
4,Cause-effect pairs,https://www.kaggle.com/competitions/cause-effe...,Overview text not found,Come to our NIPS workshop (dec 9 or 10 in Taho...,"This is the July 1, 2013 final data release. ..."


## Keyword Dictionary

In [62]:
# Category name -> set of keywords/phrases searched for in competition text.
# create_regex_patterns() wraps every entry in word boundaries and compiles it
# case-insensitively, so 'PII' also matches 'pii' and 'bias' does not match
# 'biases'.
# NOTE(review): several single-word entries are very general. For example
# 'learning' matches inside "machine learning" and 'practice' matches the
# company name in "Practice Fusion", so 'toy_competition' hits in particular
# should be treated as candidates rather than confirmed labels.
ethics_keywords = {
    'fairness_bias': {
        'fairness', 'bias', 'biased', 'unbiased', 'discrimination',
        'equity', 'equitable', 'disparate', 'protected class',
        'underrepresented', 'minority', 'marginalized', 'algorithmic bias'
    },
    
    'data_privacy': {
        'privacy', 'confidential', 'sensitive',
        'PII', 'personal data', 'personal information',
        'GDPR', 'consent', 'anonymized', 'pseudonymized',
        'de-identified', 'data protection', 'right to be forgotten', 'biometric'
    },
    
    'red_teaming': { 
        'red team', 'red teaming', 'redteaming', 'adversarial', 
        'attack', 'exploit', 'exploitation', 'vulnerability',
        'penetration testing', 'offensive security'

    },
    
    'transparency_interpretability': {
        'transparency', 'transparent', 'explainable', 'explainability', 'XAI',
        'interpretable', 'interpretability', 'black box', 'white box', 'glass box',
        'model explanation', 'feature importance', 'audit', 'accountability',
        'traceability', 'traceable'
    },
    
    # Keywords used to flag practice/teaching-style competitions.
    'toy_competition': {
        'toy', 'toy problem', 'practice', 'practice problem',
        'learning', 'learning exercise', 'learning project',
        'educational', 'tutorial', 'demo', 'demonstration',
        'example', 'sample', 'beginner', 'beginner friendly',
        'introductory', 'starter', 'getting started', 'sandbox', 'playground'
    }
}

## Regex Patterns

In [63]:
def create_regex_patterns(keyword_dicts):
    """Compile one case-insensitive regex per keyword category.

    Parameters
    ----------
    keyword_dicts : dict
        Maps a category name to an iterable of keyword/phrase strings.

    Returns
    -------
    dict
        Maps every category name to a compiled pattern that matches any of
        the category's keywords as a whole word or phrase (word boundary on
        both sides). A category without usable keywords gets a pattern that
        never matches.
    """
    regex_patterns = {}

    for category, keywords in keyword_dicts.items():
        # Longest keyword first: regex alternation stops at the first
        # alternative that matches, so 'toy problem' has to be tried before
        # 'toy' or the phrase is never reported. The alphabetical tie-break
        # makes the pattern independent of set iteration order, which can
        # differ between interpreter runs.
        ordered_keywords = sorted(
            {keyword for keyword in keywords if keyword},
            key=lambda keyword: (-len(keyword), keyword),
        )

        if not ordered_keywords:
            # Joining nothing yields '', which matches at every position and
            # would flag every text; use a pattern that can never match.
            regex_patterns[category] = re.compile(r'(?!)')
            continue

        combined_pattern = '|'.join(
            r'\b' + re.escape(keyword) + r'\b' for keyword in ordered_keywords
        )
        regex_patterns[category] = re.compile(combined_pattern, re.IGNORECASE)

    return regex_patterns


In [64]:
# Compile the keyword dictionary once; check_keywords_in_text() looks patterns
# up in this module-level mapping.
regex_patterns = create_regex_patterns(ethics_keywords)

### Helper Functions

In [65]:
def check_keywords_in_text(text, category, patterns=None):
    """
    Check if text contains keywords for a specific category

    Parameters
    ----------
    text : str
        Text to search. Anything that is not a string (for example None or a
        NaN coming out of a DataFrame) is treated as containing no keywords.
    category : str
        Key of the compiled pattern to apply.
    patterns : dict, optional
        Mapping of category -> compiled regex. Defaults to the notebook-level
        ``regex_patterns``.

    Returns
    -------
    tuple
        ``(found, matches)`` where ``matches`` is the alphabetically sorted
        list of distinct matched keywords, lower-cased, and ``found`` is True
        when that list is non-empty. An unknown category gives ``(False, [])``.
    """
    if patterns is None:
        patterns = regex_patterns

    if category not in patterns or not isinstance(text, str):
        return False, []

    matches = patterns[category].findall(text)

    # Sorting the de-duplicated matches keeps the output stable between runs;
    # converting a set straight to a list gives an arbitrary order.
    unique_matches = sorted({match.lower() for match in matches})

    return len(unique_matches) > 0, unique_matches

In [66]:
def check_fairness_bias(text: str):
    """Run the 'fairness_bias' keyword check on ``text``.

    Returns the result of ``check_keywords_in_text`` for that category.
    """
    category = 'fairness_bias'
    return check_keywords_in_text(text, category)

def check_data_privacy(text: str):
    """Run the 'data_privacy' keyword check on ``text``.

    Returns the result of ``check_keywords_in_text`` for that category.
    """
    category = 'data_privacy'
    return check_keywords_in_text(text, category)

def check_red_teaming(text: str):
    """Run the 'red_teaming' keyword check on ``text``.

    Returns the result of ``check_keywords_in_text`` for that category.
    """
    category = 'red_teaming'
    return check_keywords_in_text(text, category)

def check_transparency(text: str):
    """Run the 'transparency_interpretability' keyword check on ``text``.

    Returns the result of ``check_keywords_in_text`` for that category.
    """
    category = 'transparency_interpretability'
    return check_keywords_in_text(text, category)

def check_toy_competition(text: str):
    """Run the 'toy_competition' keyword check on ``text``.

    Returns the result of ``check_keywords_in_text`` for that category.
    """
    category = 'toy_competition'
    return check_keywords_in_text(text, category)

In [67]:
def analyze_all_ethics_categories(text: str):
    """Run every keyword category against ``text``.

    Returns a dict keyed by category name. Each value is a dict with
    ``'found'`` (whether any keyword matched), ``'matches'`` (the list returned
    by ``check_keywords_in_text``) and ``'match_count'`` (length of that list).
    """
    category_names = (
        'fairness_bias',
        'data_privacy',
        'red_teaming',
        'transparency_interpretability',
        'toy_competition',
    )

    def summarise(category_name):
        # One lookup per category, reshaped into the summary dict.
        was_found, hits = check_keywords_in_text(text, category_name)
        return {
            'found': was_found,
            'matches': hits,
            'match_count': len(hits),
        }

    return {category_name: summarise(category_name) for category_name in category_names}

In [68]:
def analyze_competition_ethics(competition_data):
    """
    Analyze a competition for all ethical considerations

    The 'name', 'description_text', 'dataset_text' and 'overview_text' entries
    of ``competition_data`` are joined with single spaces and passed to
    ``analyze_all_ethics_categories``, whose result is returned unchanged.
    Missing, empty, or non-string entries are left out of the joined text.
    """
    field_names = ('name', 'description_text', 'dataset_text', 'overview_text')
    text_fields = [competition_data.get(field_name, '') for field_name in field_names]

    # Only non-empty strings are joined: str.join raises TypeError on any
    # other truthy value (a number, a list, NaN), which would make the caller
    # lose the whole competition over one malformed field.
    combined_text = ' '.join(
        field for field in text_fields if isinstance(field, str) and field
    )

    return analyze_all_ethics_categories(combined_text)


## Format results as JSON records

In [69]:
def analyze_competition_ethics_json(competition_data):
    """Flatten the ethics analysis of one competition into a flat record.

    The record starts with the competition's 'name' and 'url' (empty string
    when absent). For each category it then holds a 'yes'/'no' flag column and
    a detail column containing the comma-separated matches, or 'n/a' when
    nothing was found.
    """
    # (analysis key, flag column, detail column), listed in output order.
    column_layout = (
        ('fairness_bias', 'fairness_bias_mentioned', 'how_fairness'),
        ('data_privacy', 'data_privacy', 'how_data_privacy'),
        ('toy_competition', 'toy', 'how_toy'),
        ('red_teaming', 'red_team', 'how_red_team'),
        ('transparency_interpretability', 'transparency_mentioned', 'how_transparency'),
    )

    ethics_analysis = analyze_competition_ethics(competition_data)

    record = {
        "name": competition_data.get('name', ''),
        "url": competition_data.get('url', ''),
    }

    for category, flag_column, detail_column in column_layout:
        outcome = ethics_analysis.get(category, {})
        if outcome.get('found'):
            record[flag_column] = 'yes'
            record[detail_column] = ', '.join(outcome.get('matches', []))
        else:
            record[flag_column] = 'no'
            record[detail_column] = 'n/a'

    return record

In [70]:
# Test: run the record formatter on a single Kaggle competition (index 2 of
# the raw list) and pretty-print the resulting record.
sample_competition = raw_data['kaggle'][2]
json_result = analyze_competition_ethics_json(sample_competition)

print(json.dumps(json_result, indent=2))

{
  "name": "Practice Fusion Diabetes Classification",
  "url": "https://www.kaggle.com/competitions/pf2012-diabetes",
  "fairness_bias_mentioned": "no",
  "how_fairness": "n/a",
  "data_privacy": "yes",
  "how_data_privacy": "de-identified",
  "toy": "yes",
  "how_toy": "starter, practice",
  "red_team": "no",
  "how_red_team": "n/a",
  "transparency_mentioned": "no",
  "how_transparency": "n/a"
}


## Execute on raw data

In [71]:
# Process all three datasets
platforms = ['kaggle', 'aicrowd', 'drivendata']
results = {}

for platform in platforms:
    # The loading cell only prints a warning when a file is missing or cannot
    # be parsed, so a platform may have no entry in raw_data. Skip it with a
    # message instead of failing with KeyError.
    competitions = raw_data.get(platform)
    if competitions is None:
        print(f"Warning: no raw data loaded for {platform}, skipping")
        continue

    print(f"Processing {platform} dataset...")

    # Initialize list for this platform
    platform_results = []
    failed_count = 0

    # Process each competition in the platform
    for i, competition in enumerate(competitions):
        try:
            result = analyze_competition_ethics_json(competition)
            platform_results.append(result)

            # Progress indicator every 50 competitions
            if (i + 1) % 50 == 0:
                print(f"  Processed {i + 1} competitions...")

        except Exception as e:
            # Deliberately best-effort: one bad record must not abort the
            # whole run, but the failure is reported and counted.
            failed_count += 1
            print(f"  Error processing competition {i}: {e}")
            continue

    if failed_count:
        print(f"  {failed_count} competitions could not be processed")

    results[platform] = platform_results
    print(f"Completed {platform}: {len(platform_results)} competitions processed\n")

# Save results to JSON files
# Platform name -> destination file for that platform's pattern-match records.
output_paths = {
    'kaggle': '/Users/hadleydixon/Desktop/gamification_data_analysis/Data/kaggle_results/all_pattern_match_results.json',
    'aicrowd': '/Users/hadleydixon/Desktop/gamification_data_analysis/Data/aicrowd_results/all_pattern_match_results.json',
    'drivendata': '/Users/hadleydixon/Desktop/gamification_data_analysis/Data/drivendata_results/all_pattern_match_results.json'
}

for platform, results_list in results.items():
    output_path = output_paths[platform]

    # Ensure directory exists (`os` comes from the notebook's import cell
    # rather than being re-imported on every loop iteration).
    os.makedirs(os.path.dirname(output_path), exist_ok=True)

    # Save to JSON file; ensure_ascii=False keeps non-ASCII characters
    # readable instead of escaping them.
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(results_list, f, indent=2, ensure_ascii=False)

    print(f"Saved {len(results_list)} results to {output_path}")

print("\nAll pattern matching results saved!")

Processing kaggle dataset...
  Processed 50 competitions...
  Processed 100 competitions...
  Processed 150 competitions...
  Processed 200 competitions...
  Processed 250 competitions...
  Processed 300 competitions...
Completed kaggle: 300 competitions processed

Processing aicrowd dataset...
  Processed 50 competitions...
  Processed 100 competitions...
  Processed 150 competitions...
  Processed 200 competitions...
Completed aicrowd: 221 competitions processed

Processing drivendata dataset...
  Processed 50 competitions...
Completed drivendata: 65 competitions processed

Saved 300 results to /Users/hadleydixon/Desktop/gamification_data_analysis/Data/kaggle_results/all_pattern_match_results.json
Saved 221 results to /Users/hadleydixon/Desktop/gamification_data_analysis/Data/aicrowd_results/all_pattern_match_results.json
Saved 65 results to /Users/hadleydixon/Desktop/gamification_data_analysis/Data/drivendata_results/all_pattern_match_results.json

All pattern matching results saved