# DATATON 2019

## Import Data

In [1]:
# Librerias
import numpy as np
import pandas as pd
import networkx as nx

In [6]:
# Data extracted from the PND and preprocessed in R
B = pd.read_csv("./Data/buyers.csv")
TS = pd.read_csv("./Data/tenderers_suppliers.csv")
CP = pd.read_csv("./Data/cp.csv")

# Node property table. It is read in this cell because the sanctioned-node
# lookup below needs it; relying on a later cell to define `N` makes a fresh
# "Restart & Run All" fail with NameError.
N = pd.read_csv("./Grafo/tabla_nodos.psv", delimiter='|')

# Sanctioned nodes: the text file is split on newlines, one identifier per
# line. np.atleast_1d keeps `san` one-dimensional even when the file has a
# single line (genfromtxt would otherwise return a 0-d array, which
# Series.isin rejects).
san = np.atleast_1d(
    np.genfromtxt("./Data/nodos_san.txt", dtype="unicode", delimiter="\n")
)
# Translate the sanctioned identifiers ("id" column) into node keys ("nodo").
nodos_san = N[N["id"].isin(san)]['nodo'].values

In [6]:
# Full network read from GraphML (described by the original author as the
# complete multilayer network)
G = nx.read_graphml("base_S6_hacky.graphml")

# Single-layer copy. nx.Graph is networkx's undirected simple-graph class, so
# g is undirected and has at most one edge per node pair (the earlier note
# here called it "directed", which does not match the constructor used).
g = nx.Graph(G)

# Node and edge property tables
N = pd.read_csv("./Grafo/tabla_nodos.psv",delimiter='|')
E = pd.read_csv("./Grafo/tabla_edges.csv")

In [7]:
# Node types (per the original note):
#    Buyer, CP, Party
#
# Encode each distinct label as an integer, numbered by order of first
# appearance in the node table. Looping over every label avoids an IndexError
# when fewer than three labels are present, and stops any label beyond the
# third from being left at code 0 and confused with the first one.

rols = N['labels'].unique()
N['Rol'] = 0
for code, label in enumerate(rols):
    N.loc[N['labels'] == label, 'Rol'] = code

In [8]:
# Edge types (per the original note):
#    Buyer (dependency) - Contract
#    Contract - Supplier/Tenderer
#
# Encode each distinct role as an integer, numbered by order of first
# appearance in the edge table. Looping over every role avoids an IndexError
# when fewer than three roles are present, and stops any role beyond the
# third from being left at code 0 and confused with the first one.

roles = E['roles'].unique()
E['Role'] = 0
for code, role in enumerate(roles):
    E.loc[E['roles'] == role, 'Role'] = code

In [9]:
# Distinct source endpoints of every edge whose role code is not 0
# (per the original note, the supplier/tenderer edges).

supp = E.loc[E['Role'] != 0, 'source'].unique()

In [28]:
# Projection of the network onto the supplier/tenderer nodes in `supp`.
# NOTE(review): per the networkx docs, weighted_projected_graph links two of
# the given nodes when they share a neighbour in g and stores the number of
# shared neighbours as the edge "weight" — confirm against the installed version.

from networkx.algorithms import bipartite

Gs = bipartite.weighted_projected_graph(g,supp)

In [30]:
# Funciones auxiliares para construir DataFrames

def get_attr(x, attr="labels"):
    """Look up one attribute in an attribute dictionary.

    x: dict, attribute names as keys.
    attr: str, name of the attribute to fetch.

    Returns the stored value, or None when the key is absent.
    """
    return x.get(attr)

def n_list(x):
    """Return the number of elements in the list encoded by the string x.

    x: str or None, Python literal for a list (e.g. "[1, 2, 3]").

    Returns None when x is None, otherwise the length of the parsed value.

    Raises ValueError or SyntaxError when x is not a valid Python literal.
    """
    # Imported locally so the function does not depend on the notebook's
    # import cell.
    import ast

    if x is None:
        return None

    # The strings come from data files, so parse them with literal_eval:
    # it only accepts literals (lists, tuples, numbers, strings, ...) and
    # never executes code, unlike eval.
    return len(ast.literal_eval(x))

In [125]:
# Load the projected graph's nodes into a dataframe: one row per node, with
# the node key in "nodo" and its attribute dictionary in "attrs".
df_nodes = pd.DataFrame(Gs.nodes(data=True), columns=["nodo", "attrs"])

# Collect every attribute name that appears on at least one node.
# The names are gathered straight from the dictionaries: wrapping the per-node
# key lists in np.array breaks on recent NumPy as soon as two nodes carry a
# different number of attributes (ragged nested lists are rejected).
key_nodes = {key for attrs in df_nodes["attrs"] for key in attrs}

In [126]:
# Show the attribute names found on the projected nodes (key_nodes is a set,
# so the order in which the names print is not fixed).
print(f"Los atributos disponibles son:\n {key_nodes}")

Los atributos disponibles son:
 {'labels', 'name', 'id'}


In [127]:
# Expand the attribute dictionaries into one column per attribute name;
# get_attr yields None for nodes that lack a given attribute.
for attr in key_nodes:
    df_nodes[attr] = df_nodes["attrs"].apply(
        lambda node_attrs, key=attr: get_attr(node_attrs, attr=key)
    )

# The raw dictionaries are no longer needed once expanded.
df_nodes = df_nodes.drop(columns="attrs")

In [71]:
# Degree of every node listed in df_nodes, as (node, degree) rows.
# The resulting columns are named 0 (node key) and 1 (degree).

degrees = pd.DataFrame(
    list(Gs.degree(df_nodes["nodo"].unique()))
)
degrees.to_csv("degrees_supp.csv", index=False)

In [131]:
# Node table enriched with degrees: left-join the degree table on the node
# key, discard the duplicated key column (0) and name the degree column (1).

df_node_d = (
    df_nodes
    .merge(degrees, how="left", left_on="nodo", right_on=0)
    .drop(columns=0)
    .rename(columns={1: 'degree'})
)

### Write out

In [132]:
# Save the node table.
# NOTE(review): this writes df_nodes, which has no degree column; the merged
# df_node_d built in the previous cell is not written anywhere in the visible
# cells — confirm which table is meant to go to nodos_supp.csv.
df_nodes.to_csv("nodos_supp.csv", index=False)

# Aristas

Se crea una tabla que contiene, para cada arista, el valor de cada uno de sus atributos.

In [37]:
# Load the projected graph's edges into a dataframe: one row per edge, with
# its endpoints in "source"/"target" and its attribute dictionary in "attrs".
df_edges = pd.DataFrame(Gs.edges(data=True), columns=["source", "target", "attrs"])

# Collect every attribute name that appears on at least one edge.
# The names are gathered straight from the dictionaries: wrapping the per-edge
# key lists in np.array breaks on recent NumPy as soon as two edges carry a
# different number of attributes (ragged nested lists are rejected).
key_edges = {key for attrs in df_edges["attrs"] for key in attrs}

In [40]:
# Expand the edge attribute dictionaries into one column per attribute name;
# get_attr yields None for edges that lack a given attribute.
for attr in key_edges:
    df_edges[attr] = df_edges["attrs"].apply(
        lambda edge_attrs, key=attr: get_attr(edge_attrs, attr=key)
    )

# The raw dictionaries are no longer needed once expanded.
df_edges = df_edges.drop(columns="attrs")

# Red inducida por los nodos sancionados en la proyección

In [96]:
# Subgraph of the supplier projection induced by the sanctioned nodes
# (nodos_san holds their node keys as looked up in the node table).
# NOTE(review): networkx's subgraph() appears to skip keys that are not in Gs
# without raising, so len(G_san) may be smaller than len(nodos_san) — verify.
G_san = Gs.subgraph(nodos_san)

In [308]:
# Funcion para medir features de red

def measure(G):
    """Summarise the structure of a graph as a flat list of numbers.

    G: networkx graph to analyse.

    Returns a list with, in this order:
      CC        number of connected components
      CC_max    number of nodes in the largest connected component
      deg_mean  mean node degree
      deg_std   standard deviation of the node degrees (np.std default)
      deg_max   largest node degree
      clust     average clustering coefficient
      neig      mean over nodes of the average neighbour degree
      conn      mean of the average degree connectivity over degree values
      assort    degree assortativity coefficient
    """
    components = list(nx.connected_components(G))
    component_sizes = [len(members) for members in components]
    CC = len(components)
    CC_max = max(component_sizes)

    node_degrees = [degree for _, degree in G.degree]
    deg_mean, deg_std, deg_max = (
        np.mean(node_degrees),
        np.std(node_degrees),
        max(node_degrees),
    )

    clust = nx.average_clustering(G)
    neig = np.mean(list(nx.average_neighbor_degree(G).values()))
    conn = np.mean(list(nx.average_degree_connectivity(G).values()))
    assort = nx.assortativity.degree_assortativity_coefficient(G)

    return [CC, CC_max, deg_mean, deg_std, deg_max, clust, neig, conn, assort]

In [246]:
# Extract the largest connected component of the sanctioned-node subgraph.
# The components are sorted by node count so that comps[0] really is the
# largest one (nx.connected_components does not yield them ordered by size).
# They are built with G_san.subgraph(...) because
# nx.connected_component_subgraphs was removed in NetworkX 2.4.

comps = sorted(
    (G_san.subgraph(c).copy() for c in nx.connected_components(G_san)),
    key=len,
    reverse=True,
)
pd.DataFrame(comps[0].nodes).to_csv("component.csv", index=False)

# Comparación con modelos nulos

In [None]:
# Null models: draw random node sets of the same size as the sanctioned set,
# take the subgraph each one induces in the projection, and measure the same
# parameters as for the sanctioned subgraph.

nodos_totales = list(Gs.nodes)

n = len(G_san)

# Set of sanctioned nodes, built once so the overlap count inside the loop
# does not rebuild and scan a list for every sampled node.
nodos_san_red = set(G_san.nodes)

# Fixed seed so the null-model table is reproducible from run to run.
rng = np.random.RandomState(2019)

# First record: the observed sanctioned subgraph. Its last field (column "#",
# the number of sanctioned nodes in the sample) is n by construction.
temp = measure(G_san)
temp.append(n)
data = [temp]
for _ in range(100):
    # replace=False: sampling with replacement can pick the same node twice,
    # which would make the null subgraph smaller than the sanctioned one and
    # bias every size-dependent measure.
    nodos_temp = rng.choice(nodos_totales, n, replace=False)
    G_temp = Gs.subgraph(nodos_temp)
    temp = measure(G_temp)
    temp.append(sum(nod in nodos_san_red for nod in G_temp.nodes))
    data.append(temp)

### Write out

In [334]:
# Column names of the measurement table: the nine values returned by
# measure() followed by "#", the count of sanctioned nodes in each sample.
params = [
    "CC", "CC_max", "deg_mean", "deg_std", "deg_max",
    "clust", "neig", "conn", "assort", "#",
]

# One row per measured network, ordered by decreasing overlap with the
# sanctioned set; the original row labels are kept.
df_ran = pd.DataFrame(data, columns=params).sort_values(by=['#'], ascending=False)

df_ran.to_csv("nulls.csv", index=False)

In [338]:
# z-scores against the null models.
# Row label 0 of df_ran is the observed sanctioned network (it is the first
# record put into the table; sorting keeps the row labels), and every other
# row is a random null model. The reference mean and standard deviation are
# taken from the null rows only: including the observed row in them would
# pull the reference distribution towards the value being tested.

nulls = df_ran.drop(index=0)

data = []
for param in params:
    null_mean = np.mean(nulls[param])
    null_std = np.std(nulls[param])
    # Every row (observed and null) is standardised with the null statistics.
    data.append(list((df_ran[param] - null_mean) / null_std))

# `data` holds one list per parameter; transpose to get one column per parameter.
df_z = pd.DataFrame(data).T
df_z.columns = params

df_z.to_csv("z_nulls.csv", index=False)