# In [None]:
# The purpose of this code is to convert a tabular dataset of MLS (USA) teams into a graph.
# The graph contains teams as nodes, with edges between teams that have similar stadium capacities.
# This will ultimately be used to determine similarities between teams with similar stadium capacities.
# Broader question: In what ways does stadium capacity affect a team?
import itertools

import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd

# Load the MLS team table directly from the raw GitHub CSV.
url = "https://raw.githubusercontent.com/gavinr/usa-soccer/master/mls.csv"

# Columns kept for the analysis; any other column in the CSV is dropped.
kept_columns = [
    "team", "city", "state", "latitude", "longitude", "stadium",
    "stadium_capacity", "joined", "head_coach", "url", "wikipedia_url",
]
teams_df = pd.read_csv(url)[kept_columns]

# win_percentage is not read from the CSV: it is filled with seeded random
# draws from [0.3, 0.7), rounded to 3 decimals, so reruns give the same values.
np.random.seed(42)
teams_df = teams_df.assign(
    win_percentage=np.random.uniform(0.3, 0.7, size=len(teams_df)).round(3)
)

# Sort the DataFrame by the year each team joined, earliest first.
teams_df = teams_df.sort_values("joined")
print("Number of Teams: ", len(teams_df))

# Bare expression kept as a notebook preview of the table; it prints nothing
# when the file is run as a plain script.
teams_df.head(100)
 
# Build an undirected graph with one node per team.
G = nx.Graph()

# Node attribute name -> DataFrame column it is copied from. Only the fields
# needed later (capacity, win percentage, location) are stored on the node.
node_attr_columns = {
    "capacity": "stadium_capacity",
    "win_percentage": "win_percentage",
    "latitude": "latitude",
    "longitude": "longitude",
}

# Each item is (node name, attribute dict); the team name is the node key.
G.add_nodes_from(
    (
        record["team"],
        {attr: record[column] for attr, column in node_attr_columns.items()},
    )
    for _, record in teams_df.iterrows()
)
    
# Two teams are connected when their stadium capacities differ by at most
# this many seats; the same value also scales the edge weight below.
threshold = 2500

# (team name, stadium capacity) pairs in the DataFrame's current row order.
team_capacities = list(zip(teams_df['team'], teams_df['stadium_capacity']))

# combinations() yields every unordered pair of rows exactly once, so no
# index comparison is needed to skip self-pairs or duplicate edges, and the
# result does not depend on the DataFrame's index labels.
for (name_a, cap_a), (name_b, cap_b) in itertools.combinations(team_capacities, 2):
    cap_diff = abs(cap_a - cap_b)
    if cap_diff <= threshold:
        # Weight falls linearly from 1 (identical capacities) down to 0
        # (difference exactly equal to the threshold).
        weight = 1 - (cap_diff / threshold)
        G.add_edge(name_a, name_b,
                   weight=weight,
                   capacity_diff=cap_diff)

# Just for testing purposes
print(f"Number of nodes: {G.number_of_nodes()}")
print(f"Number of edges: {G.number_of_edges()}")

# Create a new figure with size 15x15 inches
plt.figure(figsize=(15, 15))

# Compute a position for every node with NetworkX's spring layout, which
# treats edges as springs pulling connected nodes together. Used purely to
# get a tidy picture; positions are unrelated to latitude/longitude.
pos = nx.spring_layout(G, k=0.5)

# One line width per edge, in the same order G.edges() yields them:
# more similar stadium capacities (higher weight) get thicker lines.
edge_widths = [attrs['weight'] for _, _, attrs in G.edges(data=True)]

# Draw the actual network
nx.draw(
    G,                       # The graph to draw
    pos,                     # Dictionary of positions for each node
    node_color='lightblue',  # Color of the team nodes
    node_size=1000,          # Size of the team nodes
    with_labels=True,        # Show team names on nodes
    font_size=8,             # Size of team name text
    font_weight='bold',      # Make team names bold
    edge_color='gray',       # Color of connections between teams
    width=edge_widths,       # Per-edge line widths computed above
)

# Title is derived from `threshold` so it cannot drift from the value that
# was actually used to build the edges (2500 renders as "±2,500").
plt.title(f"MLS Teams Connected by Similar Stadium Capacity (±{threshold:,})")

# Display the plot
plt.show()