We can see that the features fall into groups, because some columns have missing values at the same time (in the same rows).

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from tqdm import tqdm
from itertools import combinations
import networkx as nx
import gc
import plotly.graph_objects as go

!pip install datatable > /dev/null
import datatable as dt

# Notebook-wide setup: show every column when printing frames, suppress
# warnings, and switch matplotlib to seaborn's default theme.
pd.set_option('display.max_columns', None)
warnings.filterwarnings('ignore')
sns.set()


# Read the training CSV through datatable and convert it straight to pandas.
# The intermediate datatable frame is never bound to a name, so it can be
# reclaimed by the explicit collection below.
train = dt.fread('/kaggle/input/jane-street-market-prediction/train.csv').to_pandas()
gc.collect()

# Downcast every float64 column to float32 (half the bytes per value).
train = train.astype(
    dict.fromkeys(train.select_dtypes(include='float64').columns, np.float32)
)
# Every column whose name contains "feature" is treated as a model feature.
feature_cols = [name for name in train.columns if "feature" in name]

- cup_ij : number of rows having a missing value in ***column i*** or ***column j***
- cap_ij : number of rows having a missing value in ***column i*** and ***column j***

Then, the ***missing value share rate*** between ***column i*** and ***column j*** is cap_ij / cup_ij (defined as 0 when cup_ij is 0, i.e. when neither column has any missing value).

In [None]:
def calc_missing_value_share_rate(is_nan_df):
    """Compute the pairwise "missing value share rate" between columns.

    For columns i and j the rate is ``cap / cup`` where ``cap`` is the number
    of rows in which both columns are missing and ``cup`` is the number of
    rows in which at least one of them is missing (a Jaccard index of the two
    missing-row sets). The rate is 0 when ``cup`` is 0.

    Args:
        is_nan_df: DataFrame of booleans (e.g. the result of
            ``DataFrame.isnull()``); True marks a missing value.

    Returns:
        A square float64 DataFrame indexed and labelled by
        ``is_nan_df.columns`` holding the share rate of every column pair.
        The matrix is symmetric; a diagonal entry is 1.0 for a column that
        has at least one missing value and 0.0 otherwise.
    """
    columns = is_nan_df.columns
    n_cols = len(columns)

    # Pull each column's mask out of the frame once instead of re-extracting
    # both columns inside the inner loop.
    masks = [is_nan_df.iloc[:, pos].to_numpy(dtype=bool) for pos in range(n_cols)]
    n_missing = [int(np.count_nonzero(mask)) for mask in masks]

    # Pre-filled with 0.0 so that pairs with an empty intersection need no
    # work and the result dtype is float regardless of the data.
    share = np.zeros((n_cols, n_cols), dtype=np.float64)

    for i, mask_i in enumerate(masks):
        if n_missing[i] == 0:
            # No missing rows: the intersection with any column is empty.
            continue
        share[i, i] = 1.0
        # The rate is symmetric, so only the upper triangle is computed.
        for j in range(i + 1, n_cols):
            if n_missing[j] == 0:
                continue
            cap = int(np.count_nonzero(mask_i & masks[j]))
            # Inclusion-exclusion: |A or B| = |A| + |B| - |A and B|.
            cup = n_missing[i] + n_missing[j] - cap
            share[i, j] = share[j, i] = cap / cup

    return pd.DataFrame(share, index=columns, columns=columns)


def plot_share_rate_hist(mv_sr_df):
    """Plot the distribution of pairwise missing value share rates.

    Draws two side-by-side histograms of the same data: the left one with
    automatic axis limits, the right one restricted to rates in [0.5, 1.0]
    and counts in [0, 400].

    Args:
        mv_sr_df: Square DataFrame of share rates whose index labels are also
            its column labels. Only pairs of distinct labels, each counted
            once, contribute to the histogram.
    """
    # One value per unordered pair of labels (upper triangle, no diagonal).
    pair_rates = [
        mv_sr_df.loc[first, second]
        for first, second in combinations(mv_sr_df.index, 2)
    ]

    _, (ax_full, ax_zoom) = plt.subplots(1, 2, figsize=(15, 6))
    for ax in (ax_full, ax_zoom):
        ax.hist(pair_rates, bins=100)
        ax.set_ylabel('number')
        ax.set_xlabel('missing value share rate')

    # Restrict the second panel to the high-rate region.
    ax_zoom.set_xlim((0.5, 1.0))
    ax_zoom.set_ylim(0, 400)
    plt.show()


def find_high_share_rate_cluster(mv_sr_df, thr):
    """Group labels whose pairwise share rates all reach a threshold.

    Builds an undirected graph with one edge per pair of distinct labels
    whose share rate is at least ``thr`` and enumerates its cliques with
    ``networkx.find_cliques``.

    Args:
        mv_sr_df: Square DataFrame of share rates whose index labels are also
            its column labels.
        thr: Minimum share rate for two labels to be connected.

    Returns:
        A list of cliques, each a list of labels. Labels that are not part of
        any qualifying pair never enter the graph and so appear in no clique.
    """
    strong_pairs = [
        (first, second)
        for first, second in combinations(mv_sr_df.index, 2)
        if mv_sr_df.loc[first, second] >= thr
    ]

    share_graph = nx.Graph()
    share_graph.add_edges_from(strong_pairs)
    return list(nx.find_cliques(share_graph))

In [None]:
# Pairwise share rates of the missing-value masks for every feature column
# except the last two entries of feature_cols.
# NOTE(review): the reason for dropping the last two features is not visible
# here - confirm it is intentional.
mv_sr_df = calc_missing_value_share_rate(train[feature_cols[:-2]].isnull())
# Last expression of the cell: the matrix styled with a blue background
# gradient.
mv_sr_df.style.background_gradient(cmap='Blues')

We can use graph representation to find feature groups with highly shared missing values.
- node: feature
- edge: pair of features that share missing values ('missing value share rate' >= 'threshold')

The features in a group form a complete graph (a clique), in which every pair of nodes is connected by an edge.

※ Note that self-loops and multiple edges are ignored.

In [None]:
# Feature groups in which every pair of features has a share rate of at
# least 0.9; each element of grp is a list of feature names.
grp = find_high_share_rate_cluster(mv_sr_df, 0.9)

- example

In [None]:
# Inspect a single group by its position in grp (this indexing fails with
# IndexError if grp has fewer than six entries).
i = 5
print(grp[i])
# Share-rate sub-matrix restricted to that group's features, colored on a
# fixed 0..1 scale.
mv_sr_df.loc[grp[i], grp[i]].style.background_gradient(cmap='Blues', vmax=1, vmin=0)

In [None]:
def plot_correlation_graph(groups, n_size=3, figsize=(1500, 1000), k=0.3):
    """Draw feature groups as a graph with plotly.

    Every group becomes a fully connected set of nodes. Node positions come
    from ``networkx.spring_layout`` with a fixed seed, so repeated calls with
    the same arguments request the same layout.

    Args:
        groups: Iterable of groups, each an iterable of feature names of the
            form ``'feature_<integer>'``; the integer part is used as the
            node id and in the node label.
        n_size: Marker size of the nodes.
        figsize: ``(width, height)`` passed to the plotly layout.
        k: Forwarded to ``networkx.spring_layout`` as its ``k`` argument.
    """
    graph = nx.Graph()
    for members in groups:
        feature_ids = [int(name.replace('feature_', '')) for name in members]
        graph.add_nodes_from(feature_ids)
        # Connect every pair inside the group.
        graph.add_edges_from(combinations(feature_ids, 2))

    layout = nx.spring_layout(graph, k=k, seed=1)
    # Keep the coordinates on the nodes as well, under the "pos" attribute.
    for node_id, coords in layout.items():
        graph.nodes[node_id]["pos"] = coords

    node_ids = list(graph.nodes())
    node_trace = go.Scatter(
        x=[layout[node_id][0] for node_id in node_ids],
        y=[layout[node_id][1] for node_id in node_ids],
        mode="markers+text",
        marker=dict(size=n_size, line=dict(width=2)),
        text=[f"feature {node_id}" for node_id in node_ids],
        textposition="top center",
        hoverinfo='none'
    )

    # All edges share one line trace; a None entry after each segment breaks
    # the line so consecutive edges are not joined.
    edge_x = []
    edge_y = []
    for src, dst in graph.edges():
        x0, y0 = layout[src]
        x1, y1 = layout[dst]
        edge_x.extend((x0, x1, None))
        edge_y.extend((y0, y1, None))

    edge_trace = go.Scatter(
        x=edge_x,
        y=edge_y,
        mode="lines",
        line=dict(width=2),
    )

    hidden_axis = dict(showgrid=False, zeroline=False, showticklabels=False)
    width, height = figsize[0], figsize[1]
    fig = go.Figure(
        # Edges are listed first so the node markers are drawn on top.
        data=[edge_trace, node_trace],
        layout=go.Layout(
            showlegend=False,
            xaxis=dict(hidden_axis),
            yaxis=dict(hidden_axis),
            plot_bgcolor="rgba(0, 0, 0, 0)",
            paper_bgcolor="rgba(0, 0, 0, 0)",
            width=width,
            height=height
        ),
    )
    fig.show()

In [None]:
# Draw the feature groups found above. n_size and figsize repeat the
# function's defaults; k=0.5 replaces the default of 0.3 for the spring
# layout.
plot_correlation_graph(
    grp,
    n_size=3,
    figsize=(1500, 1000),
    k=0.5
)

Thank you for watching!!