# Short analysis of the molecule data

Sébastien MEYER

In [None]:
import pickle

import numpy as np

import networkx as nx

import matplotlib.pyplot as plt
import seaborn as sns

# Distribution plots

In [None]:
# Load the pickled training and test data. Context managers guarantee that the
# file handles are closed as soon as each load completes.
# NOTE(review): pickle.load can execute arbitrary code while unpickling; only
# run this on data files from a trusted source.
with open("data/training_data.pkl", "rb") as train_file:
    graph_list_train = pickle.load(train_file)
with open("data/test_data.pkl", "rb") as test_file:
    graph_list_test = pickle.load(test_file)

# Parallel lists so that every later cell can loop over both sets uniformly.
graph_lists = [graph_list_train, graph_list_test]
graph_names = ["train", "test"]

for graph_list, graph_name in zip(graph_lists, graph_names):
    print(f"There are {len(graph_list)} graphs in the {graph_name} set.")

In [None]:
# For each data set: report the min/max node count, then draw a density
# histogram of the per-graph node count with a KDE overlay and a marker at the
# mean.
for graph_list, graph_name in zip(graph_lists, graph_names):
    nb_nodes_list = [graph.number_of_nodes() for graph in graph_list]

    # Summary statistics over all graphs of the current set.
    min_nb_nodes, max_nb_nodes = np.min(nb_nodes_list), np.max(nb_nodes_list)
    mean_nb_nodes = np.mean(nb_nodes_list)

    print(f"Minimal number of nodes is {min_nb_nodes} in {graph_name} set")
    print(f"Maximal number of nodes is {max_nb_nodes} in {graph_name} set")

    fig, ax = plt.subplots(constrained_layout=True, figsize=(8, 6))

    # Histogram normalised to a density so it shares a scale with the KDE.
    sns.histplot(
        nb_nodes_list,
        ax=ax,
        bins=50,
        stat="density",
        fill=True,
        alpha=0.6,
    )
    sns.kdeplot(nb_nodes_list, ax=ax, color="black")

    # Vertical line at the mean, annotated just to its right.
    ax.axvline(mean_nb_nodes, color="red")
    ax.text(mean_nb_nodes + 1, 0.05, f"mean: {mean_nb_nodes:.2f}")

    ax.set(
        xlabel="Number of nodes (atoms)",
        title=f"Number of nodes in {graph_name} set",
    )

    plt.show()

In [None]:
# For each data set: collect the first entry of every node's "labels"
# attribute (reported below as the atom identifier) and print how many
# distinct values occur, their range, and their frequency.
for graph_list, graph_name in zip(graph_lists, graph_names):

    # Iterate over the node attribute dicts directly rather than indexing
    # nodes with range(number_of_nodes()), which only works when node ids are
    # exactly 0..n-1.
    atoms_list = [
        attrs["labels"][0]
        for graph in graph_list
        for _, attrs in graph.nodes(data=True)
    ]

    unique_atoms, atoms_counts = np.unique(atoms_list, return_counts=True)
    nb_unique_atoms = len(unique_atoms)
    min_atom = np.min(unique_atoms)
    max_atom = np.max(unique_atoms)

    # Map atom -> count, ordered from most to least frequent.
    atoms_dist = dict(
        sorted(
            zip(unique_atoms, atoms_counts),
            key=lambda item: item[1],
            reverse=True,
        )
    )

    print(f"There are {nb_unique_atoms} unique atoms in {graph_name} set ranging from {min_atom} to {max_atom}")
    print(f"Their distribution for {graph_name} set is {atoms_dist}")

In [None]:
# For each data set: draw a density histogram of the per-graph edge count with
# a KDE overlay and a marker at the mean, and report the min/max edge count.
for graph_list, graph_name in zip(graph_lists, graph_names):

    nb_edges_list = [graph.number_of_edges() for graph in graph_list]

    mean_nb_edges = np.mean(nb_edges_list)
    min_nb_edges = np.min(nb_edges_list)
    max_nb_edges = np.max(nb_edges_list)

    fig, ax = plt.subplots(figsize=(8, 6), constrained_layout=True)

    # Histogram normalised to a density so it shares a scale with the KDE.
    sns.histplot(nb_edges_list, stat="density", bins=50, fill=True, alpha=0.6, ax=ax)

    sns.kdeplot(nb_edges_list, color="black", ax=ax)

    # Vertical line at the mean, annotated just to its right.
    ax.axvline(mean_nb_edges, color="red")
    ax.text(mean_nb_edges + 1, 0.04, f"mean: {mean_nb_edges:.2f}")

    # Edges are bonds between atoms; the axis label previously said "(atoms)",
    # copied over from the node-count plot.
    ax.set_xlabel("Number of edges (bonds)")
    ax.set_title(f"Number of edges in {graph_name} set")

    print(f"Minimal number of edges is {min_nb_edges} in {graph_name} set")
    print(f"Maximal number of edges is {max_nb_edges} in {graph_name} set")

    plt.show()

In [None]:
# First entry of the "labels" attribute of every edge of one sample graph
# (index 42 of the test set).
# This used to index the loop variable `graph_list`, which refers to the test
# set only when the preceding loop cell has just finished; naming the list
# explicitly makes the cell independent of execution order.
# NOTE(review): switch to graph_list_train if the training set was intended.
sample_graph = graph_list_test[42]
edges_list = [sample_graph.edges[e]["labels"][0] for e in sample_graph.edges()]

In [None]:
# For each data set: count how many graphs have no edge at all.
for graph_list, graph_name in zip(graph_lists, graph_names):
    # Per-graph edge counts as an array so the comparison is vectorised.
    edges_list = np.array([graph.number_of_edges() for graph in graph_list])

    # Boolean mask of edge-less graphs; summing it counts the True entries.
    has_no_edge = edges_list <= 0
    nb_graph_with_no_edge = has_no_edge.sum()

    print(f"There are {nb_graph_with_no_edge} molecules without edges in {graph_name} set")

In [None]:
# For each data set: count the graphs that are NOT connected.
for graph_list, graph_name in zip(graph_lists, graph_names):

    # True where the graph is connected. The explicit bool dtype keeps the
    # negation below valid even if a set contains no graphs.
    # NOTE(review): nx.is_connected is expected to raise on a graph with zero
    # nodes - confirm the data contains none.
    connected_list = np.array(
        [nx.is_connected(graph) for graph in graph_list], dtype=bool
    )

    # Negate before summing: the previous version summed the mask as-is and
    # therefore reported the number of connected graphs as "unconnected".
    nb_unco_graphs = np.sum(~connected_list)

    print(f"There are {nb_unco_graphs} unconnected graphs in {graph_name} set")

In [None]:
# Load the training labels and print each distinct label with its number of
# occurrences. The context manager closes the file handle right after loading.
# NOTE(review): pickle.load can execute arbitrary code while unpickling; only
# run this on data files from a trusted source.
with open("data/training_labels.pkl", "rb") as labels_file:
    y_train = pickle.load(labels_file)
print(np.unique(y_train, return_counts=True))

---