# Tag Network Analysis

As we've seen in this competition, the anonymized features seem to have strong underlying structure.  I thought I'd take a look and try to tease out some of this structure using graph analysis with [Gephi](https://gephi.org/) and [NetworkX](https://networkx.org/).

I encourage you to take a look at Gephi: it's a powerful tool for examining network topologies.

Here's an example:
![Gephi](https://media.giphy.com/media/oxyhw1j0L1GjK8pRYR/giphy.gif)

The features.csv file essentially says that certain features are related to one another in some unknown way.  As noted in the excellent notebook by Andrea Politano [here](https://www.kaggle.com/apolitano20/jane-street-features-hierarchical-clustering/), we can represent these relationships as an adjacency matrix and conduct various analyses on it.

In [None]:
import copy
import numpy as np
import pandas as pd
import datatable as dt
from itertools import permutations
import networkx as nx
!apt install -y graphviz libgraphviz-dev pkg-config
!pip install pygraphviz
#import pygraphviz
import pylab

import plotly.graph_objs as go

In [None]:
# Kaggle input locations for the competition data.
train_file = '/kaggle/input/jane-street-market-prediction/train.csv'
tags_file = '/kaggle/input/jane-street-market-prediction/features.csv'

# Feature-by-tag table: the first CSV column becomes the index (one row per
# feature) and every remaining column is a tag.
# NOTE(review): cells are presumably booleans -- confirm against features.csv.
tags_df = pd.read_csv(tags_file, index_col=0)

# Feature labels, in file order; used to index the correlation matrix.
feat_list = tags_df.index.tolist()


def nx_graph_info(G):
    """Print a handful of NetworkX summary statistics for graph ``G``.

    One line is printed per statistic, in this order: radius, diameter,
    eccentricity, center, periphery, density.

    NOTE(review): the distance-based measures presumably require a connected
    graph -- confirm against the NetworkX documentation before calling this on
    a thresholded graph.
    """
    metrics = (
        ("radius", nx.radius),
        ("diameter", nx.diameter),
        ("eccentricity", nx.eccentricity),
        ("center", nx.center),
        ("periphery", nx.periphery),
        ("density", nx.density),
    )
    for label, metric in metrics:
        print(f"{label}: {metric(G)}")


def edge_weights(rows=1_000_000):
    """Return absolute pairwise correlations between the tagged features.

    Reads ``train_file`` with datatable, converts it to pandas, and computes
    the correlation matrix over the first ``rows`` rows.

    Args:
        rows: Number of leading rows of the training data to correlate.

    Returns:
        A DataFrame of absolute correlation coefficients whose rows and
        columns are both restricted to (and ordered by) ``feat_list``.
    """
    train = dt.fread(train_file).to_pandas()
    sample = train.iloc[:rows, :]
    abs_corr = sample.corr().abs()

    # Keep only feature-vs-feature entries; other columns are not in the tags list.
    return abs_corr.loc[feat_list, feat_list]


def graph_data(corrs, tags=None):
    """Build an undirected edge list linking features that share a tag.

    For every tag, each pair of features flagged with that tag becomes an
    edge. A pair that shares several tags is kept once, labelled with the
    first tag (in iteration order) that contains both features.

    Args:
        corrs: Square DataFrame indexed and columned by feature label;
            ``corrs.loc[source, target]`` supplies the edge weight.
        tags: Optional list of tag column names from ``tags_df`` to use.
            Defaults to every column of ``tags_df``.

    Returns:
        DataFrame with columns ``Source``, ``Target``, ``Tag``, ``Weight``
        and ``Type`` (always ``'Undirected'``), one row per unordered feature
        pair. The frame is empty (same columns) when no tag has at least two
        member features.

    Raises:
        KeyError: If a requested tag is not a column of ``tags_df`` or a
            feature is missing from ``corrs``.
    """
    tag_list = tags_df.columns.to_list() if tags is None else tags

    # Collect every ordered pair per tag in one flat list; growing a
    # DataFrame with pd.concat inside the loop is quadratic.
    pairs = []
    for tag in tag_list:
        # `== True` (not plain truthiness) mirrors how membership is flagged
        # in the tags table and tolerates non-bool columns.
        is_member = (tags_df[tag] == True).to_numpy()  # noqa: E712
        members = tags_df.index[is_member].to_list()
        pairs.extend((src, dst, tag) for src, dst in permutations(members, 2))

    columns = ['Source', 'Target', 'Tag', 'Weight', 'Type']
    if not pairs:
        return pd.DataFrame(columns=columns)

    d = pd.DataFrame(pairs, columns=['Source', 'Target', 'Tag'])
    d = d.drop_duplicates().reset_index(drop=True)

    # Sort each (Source, Target) pair so (a, b) and (b, a) compare equal,
    # then keep only the first occurrence of every unordered pair.
    unordered = pd.DataFrame(np.sort(d[['Source', 'Target']], axis=1))
    d = d[~unordered.duplicated()].reset_index(drop=True)

    # Plain list lookup instead of Series.append, which is quadratic and no
    # longer exists in pandas >= 2.0.
    d['Weight'] = [corrs.loc[src, dst]
                   for src, dst in zip(d['Source'], d['Target'])]
    d['Type'] = 'Undirected'
    return d


def get_nx_graph(d, corrs, min_corr=.0001):
    """Turn an edge-list DataFrame into a NetworkX graph.

    Args:
        d: Edge list with ``Source``, ``Target``, ``Tag`` and ``Weight``
            columns.
        corrs: Not used by this function; accepted so existing call sites
            keep working.
        min_corr: Only edges whose ``Weight`` is strictly greater than this
            value are added to the graph.

    Returns:
        The graph built from the surviving edges, with ``Tag`` and ``Weight``
        stored as edge attributes and any self-loops removed.
    """
    strong_enough = d['Weight'] > min_corr
    graph = nx.from_pandas_edgelist(
        d.loc[strong_enough],
        source='Source',
        target='Target',
        edge_attr=['Tag', 'Weight'],
    )
    self_loops = nx.selfloop_edges(graph)
    graph.remove_edges_from(self_loops)
    return graph

In [None]:
def do_nx_plotly(G, title='', edge_colors_by_tag=False):
    """Draw graph ``G`` as an interactive Plotly figure and display it.

    Args:
        G: Graph whose nodes each carry a ``'pos'`` attribute that unpacks
            into an ``(x, y)`` pair. When ``edge_colors_by_tag`` is true,
            every edge must also carry a ``'Tag'`` attribute.
        title: Figure title.
        edge_colors_by_tag: When true, edges are split into one line trace
            per tag and each tag gets its own half-transparent colour sampled
            from matplotlib's ``magma`` colormap. Otherwise all edges share a
            single grey trace.

    Returns:
        None. The figure is shown via ``fig.show()``.
    """
    # Group edge segments by tag. Dicts preserve insertion order, so colours
    # are assigned in first-seen order and stay stable between runs (iterating
    # a set of tag strings does not). Without tag colouring everything lands
    # under the single key None.
    segments = {}
    for u, v in G.edges():
        x0, y0 = G.nodes[u]['pos']
        x1, y1 = G.nodes[v]['pos']
        key = G[u][v]['Tag'] if edge_colors_by_tag else None
        xs, ys = segments.setdefault(key, ([], []))
        # The trailing None breaks the line so separate edges are not joined.
        xs.extend([x0, x1, None])
        ys.extend([y0, y1, None])

    edge_traces = []
    if edge_colors_by_tag:
        cmap = pylab.get_cmap('magma')
        num_tags = len(segments)
        for tag_num, (xs, ys) in enumerate(segments.values()):
            # The colormap yields channels in 0..1, while rgba() strings take
            # 0..255 for red/green/blue; alpha is fixed at 0.5 to lighten.
            r, g, b = (int(round(c * 255))
                       for c in cmap(tag_num / num_tags)[:3])
            edge_traces.append(go.Scatter(
                x=xs, y=ys,
                line=dict(width=0.5, color=f'rgba({r},{g},{b},0.5)'),
                hoverinfo='none', mode='lines'))
    else:
        xs, ys = segments.get(None, ([], []))
        edge_traces.append(go.Scatter(
            x=xs, y=ys,
            line=dict(width=0.5, color='#888'),
            hoverinfo='none', mode='lines'))

    # One pass over the adjacency view keeps coordinates, labels, degree and
    # hover text aligned node-for-node.
    node_x, node_y = [], []
    labels, degrees, hover = [], [], []
    for node, neighbors in G.adjacency():
        x, y = G.nodes[node]['pos']
        node_x.append(x)
        node_y.append(y)
        # str() so a non-string node still gets a label and cannot shift the
        # labels of the nodes after it.
        label = str(node).replace('feature_', 'f_')
        labels.append(label)
        degrees.append(len(neighbors))
        hover.append(f'{label}: # of connections: {len(neighbors)}')

    node_trace = go.Scatter(
        x=node_x, y=node_y, mode='markers+text',
        text=labels, textposition="top left",
        hovertext=hover, hoverinfo='text',
        marker=dict(
            showscale=True, colorscale='YlGnBu', reversescale=True,
            color=degrees, size=10,
            # Nested title dict rather than the deprecated `titleside`.
            colorbar=dict(
                thickness=15,
                title=dict(text='Node Connections', side='right'),
                xanchor='left',
            ),
            line=dict(width=2),
        ),
    )

    fig = go.Figure(
        layout=go.Layout(
            # Nested title dict rather than the deprecated `titlefont_size`.
            title=dict(text=title, font=dict(size=16)),
            showlegend=False,
            hovermode='closest',
            margin=dict(b=20, l=5, r=5, t=40),
            xaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
            yaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
            height=600,
        )
    )
    fig.add_traces(edge_traces)
    fig.add_trace(node_trace)
    fig.show()


In [None]:
# Computing correlation coefficients over 1_000_000 rows is slow, so print
# progress around the call.
print('Calculating correlation coefficients...', end='')
corrs = edge_weights(rows=1_000_000)
print('done.')

# Feature-by-feature matrix of ones: gives every edge the same weight so the
# layout reflects the tag topology only.
equal_weights = pd.DataFrame(1.0, index=feat_list, columns=feat_list)

## Network Graph Plots
Let's take a simple look at what the graph of these relationships looks like by setting all relationship weights to 1 and using a simple graph layout algorithm. (You can zoom in and examine using plotly's interface).

In [None]:
# Every edge weighted 1: shows the raw tag topology.
d = graph_data(equal_weights)
G = get_nx_graph(d, equal_weights)

# Store the layout coordinates on each node under 'pos', where the plotting
# helper reads them.
pos = nx.drawing.nx_agraph.graphviz_layout(G, prog='neato')
nx.set_node_attributes(G, pos, 'pos')

do_nx_plotly(G, title='All Tags With Weights Set To 1')

Ok that shows what we know: there's plenty of structure here.  Let's use the correlation coefficients between all the features as weights for the graph.

In [None]:
# Same graph, now weighted by absolute correlation (default threshold).
d = graph_data(corrs)
G = get_nx_graph(d, corrs)

# Store the layout coordinates on each node under 'pos'.
pos = nx.drawing.nx_agraph.graphviz_layout(G, prog='neato')
nx.set_node_attributes(G, pos, 'pos')

do_nx_plotly(
    G,
    title='All Tags with Correlation Coefficients as Edge Weights',
)

Ok, that was unhelpful.  Let's invalidate edges below a minimum correlation coefficient.

In [None]:
# Keep only edges whose absolute correlation exceeds 0.4.
d = graph_data(corrs)
G = get_nx_graph(d, corrs, min_corr=.4)

# Store the layout coordinates on each node under 'pos'.
pos = nx.drawing.nx_agraph.graphviz_layout(G, prog='neato')
nx.set_node_attributes(G, pos, 'pos')

# get_nx_graph keeps edges with Weight > min_corr, so the surviving edges are
# the ones above .4 (the title previously said "<.4").
do_nx_plotly(G, title='All Tags with Correlation Coefficients >.4 as Edge Weights')

Interesting. We're starting to see some clusters break off. I took a look in Gephi and it was clear that there are groups. Let's plot these and color the edges by tag:

In [None]:
# Restrict the edge list to the seven tags of interest; the order here decides
# which tag labels a pair that shares several of them.
cluster_tags = ['tag_15', 'tag_6', 'tag_14', 'tag_17', 'tag_20', 'tag_22', 'tag_21']
d = graph_data(corrs, tags=cluster_tags)
G = get_nx_graph(d, corrs)

# Store the layout coordinates on each node under 'pos'.
pos = nx.drawing.nx_agraph.graphviz_layout(G, prog='neato')
nx.set_node_attributes(G, pos, 'pos')

do_nx_plotly(
    G,
    title='Tags 6, 14, 15, 17, 20, 21, 22',
    edge_colors_by_tag=True,
)

We have 7 clusters here. Tags 6, 14, 15, 17, 20, 21, 22 create [fully connected subgraphs](https://en.wikipedia.org/wiki/Complete_graph).

Interestingly, tags 15 and 17 form distinct fully connected subgraphs, which are almost fully connected by tag 23:

In [None]:
# Tags 15 and 17 plus tag 23, which links the two groups.
bridge_tags = ['tag_15', 'tag_17', 'tag_23']
d = graph_data(corrs, tags=bridge_tags)
G = get_nx_graph(d, corrs)

# Force-directed layout here instead of graphviz; coordinates are stored on
# each node under 'pos'.
pos = nx.fruchterman_reingold_layout(G, k=.5)
nx.set_node_attributes(G, pos, 'pos')

do_nx_plotly(G, title='Tags 15, 17, 23', edge_colors_by_tag=True)

What's going on with features 44, 45, 46, 49, and 50?

* Features 45, 46, and 50 have tag 17 but not tag 23
* Features 49 and 44 have tag 15 but not tag 23

There are also some other distinct subgraphs (Tags 5, 14, 19, 18, 20, 0, 1, 22, 2, 3, and 4):

In [None]:
# Single source of truth for the tag numbers; the title text and the column
# names are both derived from it.
tag_ids = [5, 14, 19, 18, 20, 0, 1, 22, 2, 3, 4]
tag_str = ', '.join(str(tag_id) for tag_id in tag_ids)
tags = [f'tag_{tag_id}' for tag_id in tag_ids]

d = graph_data(corrs, tags=tags)
G = get_nx_graph(d, corrs)

# Store the layout coordinates on each node under 'pos'.
pos = nx.drawing.nx_agraph.graphviz_layout(G, prog='neato')
nx.set_node_attributes(G, pos, 'pos')

do_nx_plotly(G, title='Tags ' + tag_str)

## Ideas and Further Research
What does this get us?  These subgraphs potentially help de-anonymize the features.

It may help to run each of these sets of feature groups through a non-linear dimension reduction algorithm in order to derive a cleaner signal. These feature groups represent properties of the underlying data probably scaled over different time and/or modal domains.  For example, tags 15 and 17 may represent trading Volume and price Volatility connected by tag 23, which may represent Time.

By isolating these feature groups and examining how they relate to each other, we can engineer better features.

What do you think?
