# Bulk Card data EDA

In [None]:
import json,os,sys,gzip
import pandas as pd
import numpy as np
from tithe_extractor.scryfall_api import make_api_request
from tithe_extractor.constants import HEADERS, TIMEOUT
from tithe_extractor.datautils import load_raw_cards_data

# Paths
# paths.json maps logical names to file locations on this machine; edit that
# file (or replace the lookups below) to point at your own copies.
# The context manager closes the file handle as soon as it has been parsed.
with open('paths.json', encoding='utf-8') as paths_file:
    paths_dict = json.load(paths_file)
metadata_path = paths_dict['metadata']  # write your own path here - metadata path
card_json_path = paths_dict['cards_json']  # write your own path here - card path
card_csv_path = paths_dict['cards_csv']  # write your own path here - card csv path

## Download card data (if you have metadata)

In [None]:
# Get the bulk card data path from the metadata.
# The download location is read from the first entry of the metadata's 'data' list.
with open(metadata_path, encoding='utf-8') as metadata_file:
    metadata = json.load(metadata_file)
download_uri = metadata['data'][0]['download_uri']
print(download_uri)

In [None]:
# Download the bulk card data
response = make_api_request(download_uri,headers=HEADERS, timeout=TIMEOUT)

In [None]:
# Look at one of the json objects in the response
response.json()[0]

In [None]:
# Save it to a file (uncompressed - assuming it's not too big).
# Each card object is written on its own line (newline-delimited JSON) so the
# file can later be parsed one line at a time.
with open(card_json_path, "w", encoding="utf-8") as f:
    for item in response.json():
        json.dump(item, f)
        f.write("\n")
# Report success only once the file has been closed (and therefore flushed).
print("Bulk card data saved to file.")

# Load the card data

### If you have the card data saved to a csv:

In [None]:
# If you already have the bulk card data saved to a csv file, you can load it like this.
# NOTE: at this point load_raw_cards_data is the helper imported from
# tithe_extractor.datautils; running the later cell that defines a function of
# the same name replaces it for the rest of the session.
df = load_raw_cards_data(card_csv_path)

### If you don't have the card data saved to csv:

In [None]:
# Parse the newline-delimited JSON file back into a list of card dicts.
extracted_objs = []
err_count = 0
# Read with the same encoding the file was written with rather than the
# platform default.
with open(card_json_path, 'rt', encoding='utf-8') as file:
    for line_number, line in enumerate(file, start=1):
        # Keep the try body to the one call that can raise.
        try:
            json_obj = json.loads(line)
        except json.JSONDecodeError as exc:
            # Report which line failed so it can be inspected afterwards.
            print(f'Line {line_number} is not a valid JSON object: {exc}')
            err_count += 1
        else:
            extracted_objs.append(json_obj)
# Only successfully parsed lines are counted.
line_count = len(extracted_objs)
print(f"Extracted {line_count} JSON objects with {err_count} errors.")


## EDA

### Validate Saving/Loading the DataFrame to/from file

In [None]:
extracted_objs[0]

In [None]:
# Time to start thinking about how to model the data. A good first step is to
# collect the keys of the first object in the list and see how many there are.
x = list(extracted_objs[0])
len(x)

In [None]:
# let's print the keys in a readable way
for i in range(0, len(x), 5):
    print(x[i:i+5])

In [None]:
# Some keys of interest: the subset of card fields that is kept as DataFrame
# columns when the DataFrame is built from the extracted objects.
keys_of_interest = [
    'id',
    'oracle_id',
    'name',
    'mana_cost',
    'cmc',
    'type_line',
    'oracle_text',
    'colors',
    'color_identity',
    'set',
    'rarity',
    'power',
    'toughness',
    'loyalty',
    'keywords',
    'legalities',
    'game_changer',
    'edhrec_rank',
    'prices',
    'rulings_uri',
    'related_uris',
    'purchase_uris',
    'image_uris'
]

In [None]:
# Create a pandas DataFrame from the extracted objects using keys of interest.
# Passing columns= selects those keys while the frame is being constructed, so
# the full-width frame (every key of every card) is never built, and a key that
# no card carries becomes an all-NaN column instead of raising a KeyError.
df = pd.DataFrame(extracted_objs, columns=keys_of_interest)
df.head()

In [None]:
df.dtypes

In [None]:
# Cast the object columns to string so every non-numeric column holds plain text.
# NOTE: astype(str) stringifies whatever Python objects these columns hold, and
# with object-backed strings a missing value typically becomes the text 'nan'.
for col in df.select_dtypes(include='object').columns:
    df[col] = df[col].astype(str)
df.dtypes

In [None]:
df.head()

In [None]:
# Save out this dataframe to a csv file
df.to_csv(card_csv_path, index=False)

In [None]:
# Reload the csv to check the round trip. keep_default_na=False stops pandas
# from converting empty fields and marker strings such as 'nan' or 'NA' into
# NaN, so those values come back as text.
df2 = pd.read_csv(card_csv_path,keep_default_na=False)
df2.head()

In [None]:
df2.dtypes

In [None]:
# edhrec_rank needs to be float: turn empty strings into NaN first, then cast
# the whole column in one chained expression.
df2['edhrec_rank'] = df2['edhrec_rank'].replace('', np.nan).astype(float)

In [None]:
# Check for mismatches between the original and reloaded dataframes.
# df[df != df2] keeps the original value wherever the two frames differ and
# puts NaN everywhere else.
mismatched = df[df != df2]
# Look at the mismatches and see if you can account for them all before relying on the reloaded data.
# NOTE: isna().sum() counts, per column, the cells that are NaN in `mismatched`:
# cells that did not differ, plus differing cells whose original value is itself
# NaN (NaN != NaN evaluates to True, so missing values register as differences).
mismatched.isna().sum()

In [None]:
# Let's define a function to load the cards.csv data and transform it correctly
def load_raw_cards_data(path) -> pd.DataFrame:
    """
    Load the raw cards csv and restore the column types of the original data loaded from json.

    Once defined, this function replaces the same-named helper imported from
    tithe_extractor.datautils at the top of the file.

    Args:
        path: Location of the cards csv file (anything pandas.read_csv accepts).

    Returns:
        A DataFrame in which edhrec_rank is a float column with NaN for empty
        fields. Every other column is left as read_csv parsed it.

    Raises:
        KeyError: If the csv has no edhrec_rank column.
        ValueError: If edhrec_rank holds a non-empty value that is not a number.
    """
    # keep_default_na=False keeps empty fields and marker strings such as 'nan'
    # or 'NA' as text instead of converting them to NaN on load.
    df = pd.read_csv(path, keep_default_na=False)
    # edhrec_rank is the one column that must be numeric: map empty strings to
    # NaN explicitly, then cast to float.
    df['edhrec_rank'] = df['edhrec_rank'].replace('', np.nan).astype(float)
    return df

In [None]:
# Quickly validate the function: compare its output with the original frame,
# the same way the plain reload was checked above (mask the original frame with
# the element-wise inequality).
df3 = load_raw_cards_data(card_csv_path)
mismatched = df[df != df3]
mismatched.isna().sum()

With that, the saved csv round-trips correctly and we are all set to start exploring the data!

### Data Cleaning

#### Numeric Values (cmc)

In [None]:
# Quick look at our numeric columns
df.describe()

In [None]:
# CMC has a huge range, let's look at the distribution
df['cmc'].value_counts().sort_index()

In [None]:
# Might be worth it to just remove the cards with weird cmc values.
# Keeping only the whole numbers 0-15 drops every cmc above 15 and every
# fractional cmc (such as 0.05) in a single filter, so no second pass is needed.
df = df[df['cmc'].isin(range(16))]
# Look at the value counts again
df['cmc'].value_counts().sort_index()

#### Mana Cost

In [None]:
df.columns

In [None]:
df.loc[:10,'mana_cost':'cmc']

In [None]:
import re

# Let's encode the mana cost by the number of colored mana symbols
# First we will make a function to count the number of colored mana symbols of each color in a string
def count_colored_mana_symbols(mana_cost):
    """
    Count the mana symbols in a mana cost string such as '{2}{W}{W}'.

    Args:
        mana_cost: The mana cost text. A missing value (None, NaN or any other
            non-string) is treated as an empty cost.

    Returns:
        A dict with one entry per colour letter ('W', 'U', 'B', 'R', 'G')
        holding how often that letter occurs in the string, 'C' holding the
        number of '{C}' symbols, and 'uncolored' holding the sum of all purely
        numeric symbols such as '{2}' or '{10}'.

    Note:
        Colour letters are counted wherever they occur, so a symbol like
        '{W/U}' adds one to both 'W' and 'U'. Symbols that are not purely
        numeric (for example '{X}') add nothing to 'uncolored'.
    """
    colored_mana_symbols = ['W', 'U', 'B', 'R', 'G']

    # Missing costs are not strings; treat them as "no cost" rather than
    # failing on .count().
    if not isinstance(mana_cost, str):
        mana_cost = ''

    # One count per colour letter, in WUBRG order.
    count_dict = {symbol: mana_cost.count(symbol) for symbol in colored_mana_symbols}

    # Colorless mana is only counted as the complete '{C}' symbol.
    count_dict['C'] = mana_cost.count('{C}')

    # Uncolored mana: add up the numbers inside purely numeric symbols.
    count_dict['uncolored'] = sum(int(amount) for amount in re.findall(r'\{(\d+)\}', mana_cost))

    return count_dict

In [None]:
# Next we test the function and also check out the different mana cost values. We also need to account for hybrid costs like {W/U}.