# Bulk Card data EDA

In [None]:
import json,os,sys,gzip
import pandas as pd
from tithe_extractor.scryfall_api import make_api_request
from tithe_extractor.constants import HEADERS, TIMEOUT
# Fill in both paths before running the notebook; as written these assignments
# have no right-hand side and will not parse.
# NOTE(review): dpath is first read as the bulk-data metadata JSON, but the save
# cell further down also writes the downloaded card data to dpath (overwriting
# the metadata) and the loading cell reads it back from there -- consider
# using a separate path for the card data.
dpath =  # write your own path here - metadata path
# cpath is where the selected-columns DataFrame is written as CSV and reloaded.
cpath = # write your own path here - card path

## Download card data (if you have metadata)

In [None]:
# Get the bulk card data path from the metadata

# Read the metadata inside a context manager so the file handle is closed
# right away, and decode it as UTF-8 instead of the platform default encoding.
with open(dpath, encoding="utf-8") as metadata_file:
    metadata = json.load(metadata_file)
# The download link is taken from the first entry under the 'data' key.
download_uri = metadata['data'][0]['download_uri']
print(download_uri)

In [None]:
# Fetch the bulk card data from the URI found in the metadata,
# using the shared request headers and timeout.
response = make_api_request(
    download_uri,
    headers=HEADERS,
    timeout=TIMEOUT,
)

In [None]:
# Look at one of the json objects in the response
# (displays the first element of the decoded body; .json() is called again
# in the save cell below)
response.json()[0]

In [None]:
# Write the response out uncompressed (assuming it's not too big), one JSON
# object per line, so the file can later be parsed line by line.
# NOTE(review): the target is dpath, the same path the metadata was read
# from, so this replaces the metadata file with the card data.
with open(dpath, "w+", encoding="utf-8") as f:
    f.writelines(json.dumps(item) + "\n" for item in response.json())
    print("Bulk data saved to file.")

## Load the card data

In [None]:
# create a list to store extracted json objects
extracted_objs = []
err_count = 0   # lines that failed to parse
line_count = 0  # lines that parsed successfully
# Open with the same explicit UTF-8 encoding the save cell uses, rather than
# whatever the platform default happens to be.
with open(dpath, 'rt', encoding='utf-8') as file:
    # Each line is expected to hold one JSON object
    for line in file:
        # Only the parse itself is guarded; the bookkeeping for a successful
        # parse lives in the else branch.
        try:
            json_obj = json.loads(line)
        except json.JSONDecodeError:
            # Best effort: report the bad line, count it, and keep going
            print('Line is not a valid JSON object')
            err_count += 1
        else:
            extracted_objs.append(json_obj)
            line_count += 1
print(f"Extracted {line_count} JSON objects with {err_count} errors.")


## EDA

In [None]:
# Display the first object parsed from the file to eyeball its structure
extracted_objs[0]

In [None]:
# Time to start thinking about how to model the data: collect the keys of the
# first object in the list and see how many there are.
x = list(extracted_objs[0].keys())
len(x)

In [None]:
# Print the keys five at a time so the output stays readable
for start in range(0, len(x), 5):
    chunk = x[start:start + 5]
    print(chunk)

In [None]:
# Some keys of interest, kept in the order the DataFrame columns should appear
# (the group labels below are only a reading aid)
keys_of_interest = [
    # identifiers
    'id', 'oracle_id', 'name',
    # cost, type and rules text
    'mana_cost', 'cmc', 'type_line', 'oracle_text',
    # colors
    'colors', 'color_identity',
    # printing
    'set', 'rarity',
    # stats
    'power', 'toughness', 'loyalty',
    # play-related metadata
    'keywords', 'legalities', 'game_changer', 'edhrec_rank',
    # prices and links
    'prices', 'rulings_uri', 'related_uris', 'purchase_uris', 'image_uris',
]

In [None]:
# Create a pandas DataFrame from the extracted objects using keys of interest.
# Passing columns= to the constructor builds only the wanted columns instead of
# materialising every field of every object and then selecting a subset.
# A key that is absent from all objects (or an empty object list) yields an
# all-missing / empty column rather than a KeyError -- check df.isna().sum()
# if a column looks suspiciously empty.
df = pd.DataFrame(extracted_objs, columns=keys_of_interest)
df.head()

In [None]:
# Show the dtype pandas inferred for each selected column
df.dtypes

In [None]:
# Turn every object-typed column into strings, one column at a time.
# As observed further down in this notebook, missing values end up as the
# literal text 'nan' in df after this step.
object_columns = df.select_dtypes(include='object').columns
for column_name in object_columns:
    df[column_name] = df[column_name].astype(str)
df.dtypes

In [None]:
# Preview the first rows after the string cast
df.head()

In [None]:
# Save out this dataframe to a csv file
# (written to cpath; index=False leaves the row index out of the file)
df.to_csv(cpath, index=False)

In [None]:
# Reload the CSV from cpath so it can be compared with the in-memory dataframe
df2 = pd.read_csv(cpath)
# Preview the first rows of the reloaded data
df2.head()

In [None]:
# Compare the original and reloaded dataframes cell by cell and keep only the
# original values that differ; cells that match are blanked out as missing.
differs = df != df2
mismatched = df.where(differs)
# Go through the mismatches and make sure each one can be explained before relying on the reloaded data.

In [None]:
# color_identity as held in the in-memory dataframe (df)
df['color_identity']

In [None]:
# color_identity from the reloaded dataframe (df2), cast to str, first rows only
df2['color_identity'].astype(str).head()

In [None]:
# mana_cost as held in the in-memory dataframe (df)
df['mana_cost']

In [None]:
# mana_cost from the reloaded dataframe (df2)
df2['mana_cost']

In [None]:
# Count missing values per column in the reloaded dataframe
df2.isna().sum()

In [None]:
# In df we have 'nan' whereas df2 has NaN. We can replace 'nan' with pandas' version of NA or NaN to ensure they count as the same thing.
# The mapping must be a dict ({old: new}); the previous {'nan', pd.NA} was a
# set literal, which does not say what 'nan' should be replaced with.
mismatched.replace({'nan': pd.NA}, inplace=True)  # Replace 'nan' with pd.NA