In [1]:
import pandas as pd
import os
import json
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


folder = "C:/Users/raoul/Documents/UpworkProjects/json_league_mapping"

underdog_file_path = os.path.join(folder, "underdog.json")
prize_file_path = os.path.join(folder, "prizepick.json")


# Load data from JSON files
with open(underdog_file_path, "r") as f:
    underdog_data = json.load(f)

with open(prize_file_path, "r") as f:
    prizepick_data = json.load(f)

In [2]:
#Check underdog keys missing in Prizes 
print(set(underdog_data.keys()) - (set(prizepick_data.keys())))

{'DOTA2', 'NASCAR_SZN', 'CBB', 'LOL', 'TGL'}


In [3]:
# Fucntion to standardize stat_types
def normalize_stat_type(stat_type):
    stat_type = stat_type.lower()
    stat_type = ''.join(ch for ch in stat_type if ch.isalnum() or ch.isspace())
    return stat_type

In [5]:
# Extract stat_type values
similarity_threshold = 0.8  # Adjust as needed

for league_name, current_underdog_data in underdog_data.items():

    if league_name not in list(prizepick_data.keys()):
        print(f"{league_name} not in Prize file")
        continue
    
    underdog_stats = set()
    for match in current_underdog_data:
        for stat in match["stats"]:
            underdog_stats.add(stat["stat_type"])

    prizepick_stats = set()
    for match in prizepick_data[league_name]:
        for stat in match["stats"]:
            prizepick_stats.add(stat["stat_type"])

    # Calculate similarity and map values
    vectorizer = TfidfVectorizer()
    all_stats = underdog_stats.union(prizepick_stats)
    tfidf_matrix = vectorizer.fit_transform(all_stats)

    stat_type_mapping = {}
    for underdog_stat in underdog_stats:
        underdog_stat_norm = normalize_stat_type(underdog_stat)
        underdog_vector = vectorizer.transform([underdog_stat_norm])

        for prizepick_stat in prizepick_stats:
            prizepick_stat_norm = normalize_stat_type(prizepick_stat)
            prizepick_vector = vectorizer.transform([prizepick_stat_norm])

            similarity = cosine_similarity(underdog_vector, prizepick_vector)[0][0]
            if similarity >= similarity_threshold:
                stat_type_mapping[underdog_stat] = prizepick_stat
                break 

    for underdog_stat, prizepick_stat in stat_type_mapping.items():
        print(f"League: {league_name}, Underdog: {underdog_stat} -> Prizepick: {prizepick_stat}")


League: NFL, Underdog: Longest Rush -> Prizepick: Longest Rush
League: NFL, Underdog: Longest Reception -> Prizepick: Longest Reception
League: NFL, Underdog: Receiving Yards -> Prizepick: Receiving Yards
League: NFL, Underdog: Pass + Rush Yards -> Prizepick: Pass Yards
League: NFL, Underdog: Kicking Points -> Prizepick: Kicking Points
League: NFL, Underdog: Sacks Taken -> Prizepick: Sacks Taken
League: NFL, Underdog: Receptions -> Prizepick: Receptions
League: NFL, Underdog: FG Made -> Prizepick: FG Made
League: NBA, Underdog: Assists -> Prizepick: Assists
League: NBA, Underdog: Points -> Prizepick: Points
League: NBA, Underdog: Rebounds -> Prizepick: Rebounds
League: NBA, Underdog: 3-Pointers Made -> Prizepick: 3-PT Made
League: NBA, Underdog: Steals -> Prizepick: Steals
League: NBA, Underdog: Turnovers -> Prizepick: Turnovers
League: CFB, Underdog: Longest Rush -> Prizepick: Longest Rush
League: CFB, Underdog: Longest Reception -> Prizepick: Longest Reception
League: CFB, Underdog: 