In [1]:
#import mysql
import json
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics.pairwise import cosine_similarity

## Downloand database

In [2]:
# def connect_to_database():
#     try:
#         with open('secret.txt', 'r') as file:
#             connection = mysql.connector.connect(
#                 host='localhost',
#                 database='magic_cards',
#                 user='root',
#                 password=file.readline()
#             )
#         if connection.is_connected():
#             print("Connected to MySQL database")
#             return connection
#     except mysql.connector.Error as e:
#         print(f"Error while connecting to MySQL: {e}")
#     return None

# def export_to_csv(connection, filename='magic_cards.csv'):
#     query = """
#     SELECT name, colorIdentity, keywords, supertypes
#     FROM cards
#     """
#     df = pd.read_sql(query, connection)
#     df.to_csv(filename, index=False)
#     print(f"Data exported to {filename}")



In [3]:
# Main execution
# connection = connect_to_database()
# if connection:
#     export_to_csv(connection)
#     connection.close()
# else:
#     print("Failed to connect to the database.")

## Analysis portion

In [None]:
def load_card_data(filename='cards.csv'):
	return pd.read_csv(filename)

def prepare_features(df):
	# Process colors
	color_map = {'W': 'White', 'U': 'Blue', 'B': 'Black', 'R': 'Red', 'G': 'Green'}
	df['colorIdentity'] = df['colorIdentity'].fillna('').apply(lambda x: [color_map.get(c.strip(), c.strip()) for c in x.split(',')] if isinstance(x, str) else [])
	mlb_colors = MultiLabelBinarizer()
	color_features = pd.DataFrame(
		mlb_colors.fit_transform(df['colorIdentity']),
		columns=mlb_colors.classes_,
		index=df.index
	)
	
	# Process keywords
	df['keywords'] = df['keywords'].fillna('').apply(lambda x: [kw.strip() for kw in x.split(',')] if isinstance(x, str) else [])
	mlb_keywords = MultiLabelBinarizer()
	keyword_features = pd.DataFrame(
		mlb_keywords.fit_transform(df['keywords']),
		columns=mlb_keywords.classes_,
		index=df.index
	)
	
	# Process supertypes
	df['is_legendary'] = df['supertypes'].fillna('').str.contains('Legendary')
	
	# Combine features
	features = pd.concat([color_features, keyword_features, df[['is_legendary']]], axis=1)
	
	return features, mlb_colors, mlb_keywords

def cluster_cards(features, n_clusters=10):
	kmeans = KMeans(n_clusters=n_clusters, random_state=69, n_init=10)
	return kmeans.fit_predict(features)

def find_related_cards(df, features, legendary_card_name, top_n=1000):
	legendary_card = df[df['name'] == legendary_card_name].index[0]
	legendary_features = features.iloc[legendary_card].values.reshape(1, -1)
	
	similarities = cosine_similarity(legendary_features, features)
	similar_indices = similarities[0].argsort()[::-1][1:top_n+1]
	
	return df.iloc[similar_indices]

In [5]:
df = load_card_data()
df.drop_duplicates(subset='name', keep="last")
features, mlb_colors, mlb_keywords = prepare_features(df)

# Perform clustering
clusters = cluster_cards(features)
df['cluster'] = clusters

  return pd.read_csv(filename)


In [6]:
print("Sample of clustered cards:")
print(df[['name', 'colorIdentity', 'keywords', 'is_legendary', 'cluster']].head(5))

Sample of clustered cards:
                name colorIdentity        keywords  is_legendary  cluster
0  Ancestor's Chosen       [White]  [First strike]         False        7
1  Ancestor's Chosen       [White]  [First strike]         False        7
2     Angel of Mercy       [White]        [Flying]         False        7
3     Angel of Mercy       [White]        [Flying]         False        7
4   Angelic Blessing       [White]              []         False        1


In [None]:
# Find related cards for a legendary creature
legendary_card_name = "Reya Dawnbringer"  # An example from your data
if legendary_card_name in df['name'].values:
	related_cards = find_related_cards(df, features, legendary_card_name)
	pd.set_option('display.max_rows', None)
	print(f"\nCards related to {legendary_card_name}:")
	print(related_cards[['name', 'colorIdentity', 'keywords', 'is_legendary', 'text']])
else:
	print(f"Legendary creature '{legendary_card_name}' not found in the database.")


Cards related to Reya Dawnbringer:
                                                    name   colorIdentity  \
13059                         Teshar, Ancestor's Apostle         [White]   
26166                                Zeriam, Golden Wind         [White]   
20525                         Teshar, Ancestor's Apostle         [White]   
76342                        Mavinda, Students' Advocate         [White]   
26242                                Zeriam, Golden Wind         [White]   
85668                        Mavinda, Students' Advocate         [White]   
79694                         Linvala, Keeper of Silence         [White]   
87725                                        Major Teroh         [White]   
76343                        Mavinda, Students' Advocate         [White]   
49                                      Reya Dawnbringer         [White]   
48                                      Reya Dawnbringer         [White]   
26677                                  Lieutenant Ki

In [8]:
# Print some statistics
print("\nCluster Statistics:")
print(df['cluster'].value_counts())

print("\nMost common keywords:")
print(df['keywords'].explode().value_counts().head(10))

print("\nColor distribution:")
print(df['colorIdentity'].explode().value_counts())


Cluster Statistics:
cluster
0    17077
6    15611
2    13611
4    10813
3    10324
1    10252
7     6593
8     6157
9     4250
5     3004
Name: count, dtype: int64

Most common keywords:
keywords
             57285
Flying        8811
Enchant       3294
Trample       2441
Haste         1784
Vigilance     1675
Equip         1463
Mill          1427
Flash         1384
Scry          1345
Name: count, dtype: int64

Color distribution:
colorIdentity
Green    22038
Black    21896
White    21690
Blue     21666
Red      21650
         10813
Name: count, dtype: int64


In [None]:
cards = pd.read_csv('cards.csv')
cards = cards[cards.availability.str.contains('paper')].drop_duplicates(subset='name', keep='last')
cards = cards[['name', 'colorIdentity', 'keywords', 'text', 'edhrecRank', 'manaValue', 'uuid', 'availability', 'isOnlineOnly', 'isTextless']]
print(len(cards.index))
with open('Keywords.json') as json_file:
	keywords = json.load(json_file)

#load dict of all keywords
keywords = pd.DataFrame({'keywords': keywords})

# Perform K-Means clustering based on the EDHRec Rank attribute

# Get range of values in legal card set
# valid_cards = cards[]
print(set(cards[cards.name == 'Shalai and Hallar']['colorIdentity'].split(', ')))

# Use elbow-knee to select best K

# Perform Apriori association rule mining based on the rules text similarity score

#split rules text into all possible permutations, then do this for each candidate and compare amount of matching options to increase support
# def split_rules(rules_text):
#     splits = []
#     #split at first occurrence and store, then second, and so on. then iterate and do it again
#     parsed_rules = rules_text.split("\\n")[-1].split(" ")

#     for i in range(len(parsed_rules)):
#         cut_rules = " ".join(parsed_rules[:len(parsed_rules) - i])
#         for j in range(len(cut_rules.split(" "))):
#             #split at last occurrence, then take the first value in split to get full string
#             sliced_rules = cut_rules.split("\\n")[-1].split(" ", j)
#             # for j in range(len(bruh) - i):
#             splits.append(sliced_rules[-1])
#     return splits

# def apriori(ref_name, related_cards):
#     # ref_name is L_1
#     candidates = []
#     rules_matrix = []

#     for text in related_cards['text'].tolist():
#         rules_matrix.append(split_rules(text))

	# Get initial comparison score
	  
	# While k < 63, append candidates
	
	# Run for each cluster starting with most related

	# Generate candidate based on highest average score against all in list
	# Iterate over existing list size k
	# Take average and find max among all cards


# cell_value = cards.where(cards=='Reya Dawnbringer').dropna(how='all').dropna(axis=1)
# # print(related_cards[['name', 'text']])
# #increase support value based on shared keywords, shared terms in rules text, and triggers
# pd.set_option('display.max_colwidth', None)
# #print(cards.loc[cell_value.index[-1], 'text']) 
# # rules = split_rules(cards.loc[cell_value.index[-1], 'text'])

# candidates = apriori(legendary_card_name, related_cards)
# for i in candidates: print(i)

  cards = pd.read_csv('cards.csv')


29639


AttributeError: 'set' object has no attribute 'split'

In [None]:
cards = pd.read_csv('cards.csv')
cards = cards[cards.availability.str.contains('paper')].drop_duplicates(subset='name', keep='last')
cards = cards[['name', 'colorIdentity', 'keywords', 'text', 'edhrecRank', 'manaValue', 'uuid', 'availability', 'isOnlineOnly', 'isTextless']]
print(len(cards.index))
with open('Keywords.json') as json_file:
	keywords = json.load(json_file)

#load dict of all keywords
keywords = pd.DataFrame({'keywords': keywords})

# Perform K-Means clustering based on the EDHRec Rank attribute

# Get range of values in legal card set
# valid_cards = cards[]
print(set(cards[cards.name == 'Shalai and Hallar']['colorIdentity'].iloc[0].split(', ')))

commander_identity = set(cards[cards.name == 'Shalai and Hallar']['colorIdentity'].iloc[0].split(', '))

# Get all indices of compliant cards
sum = 0
indices = []
print(cards.index)

# print(cards.index[0])
print(cards.loc[2684]['colorIdentity'])
for row in cards.index:
	if type(cards.loc[row]['colorIdentity']) is float:
		identity = set()
	else:
		identity = set(cards.loc[row]['colorIdentity'].split(', '))
	if identity <= commander_identity:
		indices.append(row)
		sum += 1
	
print(sum)

valid_cards = cards[cards.index.isin(indices)][['name', 'colorIdentity']]
	

  cards = pd.read_csv('cards.csv')


29639
{'G', 'R', 'W'}
Index([ 2684,  2686,  2688,  2690,  2692,  2694,  2696,  2698,  2700,  2702,
       ...
       97682, 97683, 97684, 97685, 97686, 97687, 97688, 97689, 97690, 97691],
      dtype='int64', length=29639)
W
17391
                                                    name colorIdentity
2684                         And They Shall Know No Fear             W
2686                         Celestine, the Living Saint             W
2688                               Defenders of Humanity             W
2690                                    For the Emperor!             W
2692                                 Grey Knight Paragon             W
2694                             Space Marine Devastator             W
2696                                  Space Marine Scout             W
2698                                 Thunderwolf Cavalry             W
2700                          Triumph of Saint Katherine             W
2702                           Ultramarines Honour Guard   