In [69]:
import networkx as nx
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.graph_objects as goimport matplotlib.cm as cm

In [3]:
#load data
stocks = pd.read_csv("all_stocks_5yr.csv")

In [4]:
stocks.columns

Index(['date', 'open', 'high', 'low', 'close', 'volume', 'Name'], dtype='object')

In [5]:
#drop unnessecary columns

In [6]:
stocks = stocks.drop(columns=['high', 'low', 'volume'])

In [7]:
stocks.columns

Index(['date', 'open', 'close', 'Name'], dtype='object')

In [8]:
#add percent return column
stocks['percent_return'] = (stocks['close'] - stocks['open']) / stocks['open']

In [9]:
stocks.head()

Unnamed: 0,date,open,close,Name,percent_return
0,2013-02-08,15.07,14.75,AAL,-0.021234
1,2013-02-11,14.89,14.46,AAL,-0.028878
2,2013-02-12,14.45,14.27,AAL,-0.012457
3,2013-02-13,14.3,14.66,AAL,0.025175
4,2013-02-14,14.94,13.99,AAL,-0.063588


In [10]:
#finds the correlation between percent_return of stock [name1] and stock [name2] [offset] days later
def correlation(name1, name2, offset, stocks_pivot):
    correlation = stocks_pivot[name1].corr(stocks_pivot[name2])
    return correlation

In [11]:
#finds the correlation between percent_return of stock [name1] and stock [name2] [offset] days later
def correlation_offset(name1, name2, offset):
    if offset < 0: return 'Offset must be non-negative'
    stocks_filtered = stocks.loc[(stocks['Name'] == name1) | (stocks['Name'] == name2)]
    stocks_pivot = stocks_filtered.pivot(index='date', columns='Name', values='percent_return')
    stocks_pivot[name2] = stocks_pivot[name2].shift(-offset)
    correlation = stocks_pivot[name1].corr(stocks_pivot[name2])
    return correlation

In [12]:
#sample call to correlation function
correlation_offset('NVDA', 'AMD', 1)

0.030439793761227307

In [13]:
#list of distinct stocks
unique = stocks['Name'].unique()
print(unique[:10])

['AAL' 'AAPL' 'AAP' 'ABBV' 'ABC' 'ABT' 'ACN' 'ADBE' 'ADI' 'ADM']


In [14]:
def populate_edges(unique, offset=1):
    if offset < 0: return 'Offset must be non-negative'
    stocks_pivot = stocks.pivot(index='date', columns='Name', values='percent_return')
    edges = np.zeros((len(unique),len(unique)))
    for i, stock2 in enumerate(unique):
        stocks_pivot[stock2] = stocks_pivot[stock2].shift(-offset)
        for j, stock1 in enumerate(unique):
            if edges[i][j] != 0: continue
            corr = correlation(stock1, stock2, offset, stocks_pivot)
            edges[i,j] = corr
            edges[j, i] = corr
        stocks_pivot[stock2] = stocks_pivot[stock2].shift(offset)
    return edges

In [15]:
#create adjacency matrices with naming convention weights{offset in days}
weights0 = populate_edges(unique, 0)
weights1 = populate_edges(unique, 1)
weights7 = populate_edges(unique, 7)
weights30 = populate_edges(unique, 30)

In [16]:
print(weights0.shape)
print(weights1.shape)
print(weights7.shape)
print(weights30.shape)

(505, 505)
(505, 505)
(505, 505)
(505, 505)


In [17]:
#initialize graphs corresponding to each adjacency matrix of weights
G0 = nx.DiGraph()
G1 = nx.DiGraph()
G7 = nx.DiGraph()
G30 = nx.DiGraph()

In [18]:
def add_nodes(graph):
    graph.add_nodes_from([(i, {"name" : stock}) for i, stock in enumerate(unique)])

In [19]:
def add_edges(graph, edges):
    rows, cols = edges.shape
    for row in range(rows):
        for col in range(cols):
            if row != col: graph.add_edge(row, col, weight=edges[row, col])

In [20]:
def make_graph(graph, edges):
    add_nodes(graph)
    add_edges(graph, edges)

In [21]:
make_graph(G0, weights0)
make_graph(G1, weights1)
make_graph(G7, weights7)
make_graph(G30, weights30)

In [22]:
print(G0.number_of_edges())
print(G0.number_of_nodes())
print(G0.out_degree(0))

254520
505
504


In [24]:
#compute the min and max spanning trees
min_st0 = nx.minimum_spanning_arborescence(G0)
'''min_st1 = nx.minimum_spanning_arborescence(G1)
min_st7 = nx.minimum_spanning_arborescence(G7)
min_st30 = nx.minimum_spanning_arborescence(G30)
max_st0 = nx.maximum_spanning_arborescence(G0)
max_st1 = nx.maximum_spanning_arborescence(G1)
max_st7 = nx.maximum_spanning_arborescence(G7)
max_st30 = nx.maximum_spanning_arborescence(G30)'''

KeyboardInterrupt: 

In [23]:
#finds the {tag} degree node in the graph such that tag is in {max, min}
def get_degree_extrema(graph, tag):
    if tag not in ['max', 'min']: return 'invalid tag'
    node_sums = {}
    for node in graph.nodes:
        edge_weights = graph.edges(node, data='weight')
        sum_of_edges = sum(weight for _, _, weight in edge_weights)
        node_sums[node] = sum_of_edges
    if tag == 'max': return graph.nodes[max(node_sums, key=node_sums.get)]
    else: return graph.nodes[min(node_sums, key=node_sums.get)]

In [24]:
print(f'max degree in G0 is: {get_degree_extrema(G0, "max")}')
print(f'max degree in G1 is: {get_degree_extrema(G1, "max")}')
print(f'max degree in G7 is: {get_degree_extrema(G7, "max")}')
print(f'max degree in G30 is: {get_degree_extrema(G30, "max")}')
print(f'min degree in G0 is: {get_degree_extrema(G0, "min")}')
print(f'min degree in G1 is: {get_degree_extrema(G1, "min")}')
print(f'min degree in G7 is: {get_degree_extrema(G7, "min")}')
print(f'min degree in G30 is: {get_degree_extrema(G30, "min")}')

max degree in G0 is: {'name': 'HON'}
max degree in G1 is: {'name': 'WLTW'}
max degree in G7 is: {'name': 'APTV'}
max degree in G30 is: {'name': 'UA'}
min degree in G0 is: {'name': 'CHD'}
min degree in G1 is: {'name': 'DWDP'}
min degree in G7 is: {'name': 'ABBV'}
min degree in G30 is: {'name': 'ABBV'}


In [25]:
# Use the Louvain community detection algorithm to partition the graph into connecrted communities
def find_communities(graph):
    return nx.community.louvain_communities(graph, seed=123)

In [28]:
# Use the Louvain community detection algorithm to partition the graph into connecrted communities
communities0 = find_communities(G0)
communities1 = find_communities(G1)
communities7 = find_communities(G7)
communities30 = find_communities(G30)

In [46]:
#filter communites to only include multi-node communities (remove communities of single stock)
def filter_communities(community):
    return [c for c in community if len(c) > 1]

In [47]:
communities0 = filter_communities(communities0)
communities1 = filter_communities(communities1)
communities7 = filter_communities(communities7)
communities30 = filter_communities(communities30)

In [49]:
print(len(communities0))
print(len(communities1))
print(len(communities7))
print(len(communities30))

55
2
32
2


In [64]:
def get_community_names(communities, graph):
    res = list()
    for community in communities:
        add = list()
        for stock in community:
            add.append(graph.nodes[stock]['name'])
        res.append(add)
    return res

In [66]:
community_names0 = get_community_names(communities0, G0)
community_names1 = get_community_names(communities1, G1)
community_names7 = get_community_names(communities7, G7)
community_names30 = get_community_names(communities30, G30)

In [73]:
data = community_names0
# Get the sizes of each category
category_sizes = [len(category) for category in data]

# Generate a colormap based on the number of groups
cmap = cm.get_cmap('tab20')

# Create a scatter plot
fig, ax = plt.subplots()

for i, category in enumerate(data):
    x = [i] * len(category)  # X-coordinates (group index)
    y = [1] * len(category)  # Y-coordinates (arbitrary value)
    sizes = [category_sizes[i]] * len(category)  # Bubble sizes
    color = cmap(i / len(data))  # Get a color from the colormap
    
    ax.scatter(x, y, s=sizes, color=color, alpha=0.5, label=f'Group {i+1}')
    for j, name in enumerate(category):
        ax.annotate(name, (x[j], y[j]), ha='center', va='center')

# Customize the plot
ax.set_xticks(range(len(data)))
ax.set_xticklabels(range(1, len(data) + 1))
ax.set_xlabel('Group')
ax.set_title('Category Sizes')

ax.legend()
plt.tight_layout()

# Display the plot
plt.show()

NameError: name 'cm' is not defined