# Milestone II Project: Pokemon Showdown Battle Predictor

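This notebook contains the code used to parse JSON battle replay logs scraped from [Pokemon Showdown](https://replay.pokemonshowdown.com/) and flatten each battle into a single tabular row: both players' teams (species, item, ability, and moves for each Pokémon) plus a label indicating whether player 1 won.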

## Imports

In [7]:
import json
import pandas as pd
import glob
import os

## Helper Functions

In [75]:
def parse_showteam(log, player_tag):
    """
    Read one player's team from the first matching ``|showteam|`` log line.

    Args:
        log: Full battle log as a single newline-separated string.
        player_tag: Side identifier as it appears in the log, e.g. 'p1' or 'p2'.

    Returns:
        A list with one dict per Pokémon entry, each holding 'species',
        'item', 'ability' (stripped string, or None when blank/missing) and
        'moves' (list of non-blank, stripped move names). An empty list is
        returned when the log has no showteam line for ``player_tag``.
    """
    marker = f'|showteam|{player_tag}|'
    team_line = next(
        (entry for entry in log.split('\n') if entry.startswith(marker)),
        None,
    )
    if team_line is None:
        return []

    def field(parts, pos):
        # Blank and missing fields both collapse to None.
        raw = parts[pos] if pos < len(parts) else None
        return raw.strip() if raw else None

    # Everything after the third pipe is the team payload; entries end with ']'.
    packed = team_line.split('|', 3)[-1]
    roster = []
    for chunk in packed.split(']'):
        if not chunk.strip():
            continue
        # Entry layout: Name||Item|Ability|Move1,Move2,Move3,Move4||||||50|,,,,,Normal
        # NOTE(review): field 0 is taken as the species - confirm showteam
        # lines never carry a separate nickname in that position.
        parts = chunk.strip('[]').split('|')
        move_field = parts[4] if len(parts) > 4 else ''
        roster.append({
            'species': field(parts, 0),
            'item': field(parts, 2),
            'ability': field(parts, 3),
            'moves': [mv.strip() for mv in move_field.split(',') if mv.strip()],
        })
    return roster

def parse_winner(log, player1, player2):
    """
    Turn the log's ``|win|`` line into a binary label for player 1.

    Args:
        log: Full battle log as a single newline-separated string.
        player1: Name of the first player.
        player2: Name of the second player.

    Returns:
        1 if the declared winner equals ``player1``, 0 if it equals
        ``player2``, and None when no ``|win|`` line names either player.
    """
    labels = ((player1, 1), (player2, 0))
    for entry in log.split('\n'):
        if not entry.startswith('|win|'):
            continue
        declared = entry.split('|')[2].strip()
        for player, label in labels:
            if declared == player:
                return label
    # No usable win line: the outcome cannot be determined.
    return None

    
def canonicalize_team(team, max_pokemon=6, max_moves=4):
    """
    Return an order-invariant, fixed-width copy of a parsed team.

    Entries with an empty species are dropped, the rest are sorted by
    species name, every moveset is cut or None-padded to exactly
    ``max_moves`` entries, and all-None placeholder slots are appended until
    the team has at least ``max_pokemon`` entries. A team that already has
    more than ``max_pokemon`` entries is returned at its full length.

    The input is not modified: each returned slot is a new dict with a new
    moves list, so the same parsed team can safely be reused by the caller.

    Args:
        team: List of dicts with at least a 'species' key; 'moves' is
            optional and may be missing or None.
        max_pokemon: Minimum number of slots in the result.
        max_moves: Exact number of move entries per slot.

    Returns:
        A new list of slot dicts sorted by species, followed by padding slots.
    """
    ordered = sorted(
        (poke for poke in team if poke['species']),
        key=lambda poke: poke['species'],
    )

    canonical = []
    for poke in ordered:
        # Copy before resizing so the caller's dicts and lists stay untouched.
        moves = list(poke.get('moves') or [])[:max_moves]
        moves += [None] * (max_moves - len(moves))
        canonical.append({**poke, 'moves': moves})

    # Each placeholder gets its own moves list to avoid shared mutable state.
    while len(canonical) < max_pokemon:
        canonical.append({
            'species': None,
            'item': None,
            'ability': None,
            'moves': [None] * max_moves,
        })
    return canonical

def team_to_flat_features(team, prefix, max_pokemon=6, max_moves=4):
    """
    Spread a team across flat, 1-indexed feature keys for one DataFrame row.

    For slot ``i`` the keys are ``{prefix}_species_{i}``, ``{prefix}_item_{i}``,
    ``{prefix}_ability_{i}`` and ``{prefix}_move_{i}_{j}`` for each move ``j``.

    Args:
        team: List of dicts with 'species', 'item', 'ability' and 'moves' keys.
        prefix: Key prefix identifying the side, e.g. 'p1'.
        max_pokemon: Accepted for signature symmetry; not used here, the
            number of slots comes from ``team`` itself.
        max_moves: Accepted for signature symmetry; not used here, the
            number of move keys comes from each entry's 'moves' list.

    Returns:
        A dict mapping feature names to values, in slot order.
    """
    flat = {}
    for slot, member in enumerate(team, start=1):
        flat.update({
            f'{prefix}_species_{slot}': member['species'],
            f'{prefix}_item_{slot}': member['item'],
            f'{prefix}_ability_{slot}': member['ability'],
        })
        for pos, move in enumerate(member['moves'], start=1):
            flat[f'{prefix}_move_{slot}_{pos}'] = move
    return flat

def process_file(json_path, max_pokemon=6, max_moves=4):
    """
    Load one replay JSON file and extract both teams and the outcome.

    Args:
        json_path: Path to a replay file containing 'log' and 'players' keys.
        max_pokemon: Forwarded to ``canonicalize_team``.
        max_moves: Forwarded to ``canonicalize_team``.

    Returns:
        None when the replay does not list exactly two players; otherwise a
        dict with 'p1_team', 'p2_team' (canonicalized teams), 'p1_player',
        'p2_player' (names from the 'players' list) and 'p1_win' (result of
        ``parse_winner``).
    """
    with open(json_path, 'r', encoding='utf-8') as handle:
        replay = json.load(handle)
    battle_log, names = replay['log'], replay['players']

    # Only two-player replays can be labelled as a p1 win/loss.
    if len(names) != 2:
        return None

    teams = {
        tag: canonicalize_team(
            parse_showteam(battle_log, tag),
            max_pokemon=max_pokemon,
            max_moves=max_moves,
        )
        for tag in ('p1', 'p2')
    }
    return {
        'p1_team': teams['p1'],
        'p2_team': teams['p2'],
        'p1_player': names[0],
        'p2_player': names[1],
        'p1_win': parse_winner(battle_log, names[0], names[1]),
    }

### MAIN BATCH PROCESSING

def build_dataset_from_jsons(directory, max_pokemon=6, max_moves=4):
    """
    Build one DataFrame row per usable replay JSON file in ``directory``.

    Each row holds the flattened p1 and p2 team features followed by
    'p1_win', 'p1_player' and 'p2_player'. Files for which ``process_file``
    returns None are skipped.

    Args:
        directory: Folder searched (non-recursively) for ``*.json`` files.
        max_pokemon: Forwarded to ``process_file`` / ``team_to_flat_features``.
        max_moves: Forwarded to ``process_file`` / ``team_to_flat_features``.

    Returns:
        A pandas DataFrame with one row per processed replay, ordered by
        file path; empty when no file qualifies.
    """
    data_rows = []
    # glob yields paths in filesystem-dependent order; sorting keeps the row
    # order (and any CSV written from it) reproducible across machines.
    for path in sorted(glob.glob(os.path.join(directory, '*.json'))):
        game = process_file(path, max_pokemon=max_pokemon, max_moves=max_moves)
        if game is None:
            continue
        data_rows.append({
            **team_to_flat_features(game['p1_team'], 'p1', max_pokemon=max_pokemon, max_moves=max_moves),
            **team_to_flat_features(game['p2_team'], 'p2', max_pokemon=max_pokemon, max_moves=max_moves),
            'p1_win': game['p1_win'],
            'p1_player': game['p1_player'],
            'p2_player': game['p2_player'],
        })
    return pd.DataFrame(data_rows)

# Usage:
# df = build_dataset_from_jsons("replays")
# print(df.head())

In [83]:
# df.to_csv("battle_data.csv", index=False)

In [90]:
# Column-count sanity check. Assuming df was built by build_dataset_from_jsons
# with the default sizes, each side contributes 6 slots x (3 attributes +
# 4 moves) = 42 columns, plus p1_win, p1_player and p2_player: 2 * 42 + 3 = 87.
len(df.columns)

87