In [25]:
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

from pylab import mpl

import pickle
import json
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

import scipy.sparse as sp
import dgl
import torch
# Shared MinMaxScaler instance; a later cell fits it on the feature columns
# of the company table to normalise them column by column.
min_max_scaler = preprocessing.MinMaxScaler()

In [3]:
from IPython.core.interactiveshell import InteractiveShell
# Echo the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"

In [26]:
# The ids of the seed companies are read from "seed company.json".
# The high/low risk labels of the seed companies live in the company feature CSV.
# In that CSV a label of 0 means that Tianyancha did not give a risk rating.
path=r".\latest\seed company.json"
# Context manager so the file handle is closed as soon as the JSON is parsed.
with open(path) as seed_file:
    seedcompany=json.load(seed_file)
len(seedcompany)

500

In [27]:
# One-column frame of seed company ids, cast to int64 so the values can be
# looked up in the company-id mapping built further down.
seedcomdf = pd.DataFrame(seedcompany, columns=['company_id']).astype({'company_id': 'int64'})
seedcomdf.dtypes

company_id    int64
dtype: object

In [28]:
# Load the per-company feature table (it also contains the seed companies).
# The path is a raw string: '\l' and '\c' are invalid escape sequences in a
# normal literal, which Python already warns about and will reject in future
# versions. The resulting path value is unchanged.
allcomdf=pd.read_csv(r'.\latest\company_feature_new.csv')  # include seed company
# Quick sanity checks: first rows, number of rows, number of columns.
print(allcomdf.head())
print(len(allcomdf))
print(len(allcomdf.columns))

   company_id  强制清算高风险信息  涉金融黑名单高风险信息  失信被执行人高风险信息  限制消费令高风险信息  被执行人高风险信息   
0  2355808591        0.0          0.0          0.0         0.0        0.0  \
1  2350579476        0.0          0.0          0.0         0.0        0.0   
2  2819169316        0.0          0.0          0.0         0.0        0.0   
3  2417188561        0.0          0.0          0.0         0.0        0.0   
4  5067556280        0.0          0.0          0.0         0.0        0.0   

   终本案件高风险信息  司法协助警示信息  破产案件警示信息  开庭公告警示信息  ...  5000万以上  无注册资本  国资企业  外资企业   
0        0.0       0.0       0.0       0.0  ...        1      0     0     0  \
1        0.0       0.0       0.0       0.0  ...        0      0     0     0   
2        0.0       0.0       0.0       5.0  ...        0      0     0     0   
3        0.0       0.0       0.0       0.0  ...        1      0     0     0   
4        0.0       0.0       0.0       0.0  ...        0      0     0     0   

   合伙企业  独资内资企业  非独资内资企业  其他类型组织    存续时间  label  
0     0     

In [29]:
# Give every raw company_id a dense, 0-based index: the position of its row in
# allcomdf (if an id were duplicated, the last occurrence would win).
idx_map = {
    company_id: row_number
    for row_number, company_id in enumerate(allcomdf['company_id'].values)
}
# Persist the mapping as the textual repr of the dict.
with open('dict_companyIdMapping.txt', 'w') as mapping_file:
    mapping_file.write(str(idx_map))


In [30]:
# Translate the raw ids of the seed companies into their dense indices
# (ids missing from idx_map become None) and store the result on disk.
seedcompanyid = [idx_map.get(raw_id) for raw_id in seedcomdf['company_id'].values]
np.save('seedcompany.npy', np.array(seedcompanyid))


In [31]:
# Peek at the first ten mapped seed-company indices.
seedcompanyid[:10]

[7727, 4119, 1548, 281, 16250, 944, 2585, 16429, 9433, 9354]

In [32]:
# encoding labels
# Tianyancha labels -> classes: 'low' -> 0, 'high' -> 1, and everything else
# ('0', 'middle', 'higher') -> 2.
label_to_class = {'0':2, 'middle':2, 'higher':2,'low':0,'high':1} # labels from tianyancha

# Only encode while the column still holds the raw string labels. Without this
# guard, re-running the cell would map the already-encoded integers to NaN.
if not pd.api.types.is_integer_dtype(allcomdf['label']):
    encoded_label = allcomdf['label'].map(label_to_class)
    # Fail here with the offending values instead of later at astype(int).
    unknown_labels = allcomdf.loc[encoded_label.isna(), 'label'].unique()
    if len(unknown_labels):
        raise ValueError(f"unexpected label values: {unknown_labels!r}")
    allcomdf['label'] = encoded_label.astype('int64')

allcomdf['label'].unique()

array([1, 0, 2], dtype=int64)

In [33]:
# Column-wise min-max normalisation of the feature matrix.
# The slice 1:-1 drops the first column (company_id) and the last one (label).
feature_values = allcomdf.iloc[:, 1:-1].values
minmax_x = min_max_scaler.fit_transform(feature_values)
print(minmax_x)

[[0.00000000e+00 0.00000000e+00 0.00000000e+00 ... 1.00000000e+00
  0.00000000e+00 1.05782160e-01]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 ... 0.00000000e+00
  0.00000000e+00 8.88812719e-02]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 ... 0.00000000e+00
  1.00000000e+00 8.54536346e-02]
 ...
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 ... 0.00000000e+00
  0.00000000e+00 1.16671501e-01]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 ... 1.00000000e+00
  0.00000000e+00 7.11893901e-04]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 ... 1.00000000e+00
  0.00000000e+00 7.12421230e-02]]


In [34]:
# Persist the model inputs: normalised features (x) and integer class labels (y).
allxx = np.array(minmax_x)
# `labels` is an alias of the label vector; a later cell uses its length to
# size the company adjacency matrix.
labels = allyy = allcomdf['label'].astype(int).values

for target_file, payload in (('data_x.npy', allxx), ('data_y.npy', allyy)):
    np.save(target_file, payload)

In [35]:
# load relational data
path=r".\latest\relations.json"
# Context manager so the file handle is closed as soon as the JSON is parsed.
with open(path) as relations_file:
    relations=json.load(relations_file)
print(len(relations))

43255


In [36]:
# Split the relations into person->company and company->company endpoint pairs.
# A relation counts as person-related when either endpoint id is longer than
# 10 characters; only the two endpoints are kept, the relation type is dropped.
# NOTE(review): a later cell splits the same data with `len(item[0]) > 13`;
# the recorded counts are identical for both rules — confirm which is intended.
ccrelation, pcrelation = [], []
for relation in relations:
    endpoints = [relation[0], relation[1]]
    involves_person = any(len(endpoint) > 10 for endpoint in endpoints)
    target = pcrelation if involves_person else ccrelation
    target.append(endpoints)
print(len(pcrelation),len(ccrelation))

29355 13900


In [38]:
# build c2c edges with the relational data
# ccrelation is a plain Python list of [id, id] string pairs, so it has no
# .flatten()/.shape, and string ids would never match the integer keys of
# idx_map. Convert it to an (n_edges, 2) int64 array first; reshape(-1, 2)
# keeps the shape valid when there are no edges.
cc_pairs = np.array(ccrelation, dtype=np.int64).reshape(-1, 2)
# Direct indexing (not .get) so an id missing from idx_map raises a KeyError
# naming that id, rather than a confusing None -> int32 conversion error.
edges = np.array([idx_map[company_id] for company_id in cc_pairs.flatten()],dtype=np.int32).reshape(cc_pairs.shape)
# Square company-by-company adjacency matrix with one 1.0 entry per edge,
# sized by the number of labelled companies.
adj = sp.coo_matrix((np.ones(edges.shape[0]), (edges[:, 0], edges[:, 1])),shape=(labels.shape[0], labels.shape[0]),dtype=np.float32)

In [29]:
# Show the summary repr of the sparse company adjacency matrix.
adj

<17975x17975 sparse matrix of type '<class 'numpy.float32'>'
	with 13900 stored elements in COOrdinate format>

In [40]:
# Second split of the raw relations, this time keeping the whole record
# (including the relation type in the third field). A record is treated as
# person->company when its first id is longer than 13 characters.
relation_pc = [record for record in relations if len(record[0]) > 13]
relation_cc = [record for record in relations if not len(record[0]) > 13]

# person2company: list the relation types that occur and the number of records
relation3 = np.array(relation_pc)
person_relation_types = np.unique(relation3[:, 2])
print('person to company', person_relation_types)
print(len(relation_pc))

# company2company: same summary
relation4 = np.array(relation_cc)
company_relation_types = np.unique(relation4[:, 2])
print('company to company', company_relation_types)
print(len(relation_cc))

person to company ['INVEST_H' 'OWN' 'SERVE']
29355
company to company ['BRANCH' 'INVEST_C' 'OWN_C']
13900


In [41]:
# Dense 0-based index for every distinct person id (first column of the
# person->company relations), in the sorted order returned by np.unique.
uniquePerson = np.unique(relation3[:, 0])
len(uniquePerson)  # number of distinct persons (4562 in the recorded run)

idxPerson_map = {person_id: position for position, person_id in enumerate(uniquePerson)}
idxPerson_map

{'1751735745-c1643989583': 0,
 '1752392086-c3104569628': 1,
 '1753493770-c1380775139': 2,
 '1754048324-c736070236': 3,
 '1754145090-c2310307466': 4,
 '1754672828-c2349015974': 5,
 '1754827652-c2314084493': 6,
 '1754833316-c3122040624': 7,
 '1754995036-c728823023': 8,
 '1755325275-c3155994544': 9,
 '1756035022-c3272807092': 10,
 '1756103235-c3233594662': 11,
 '1756297534-c2349088767': 12,
 '1756369910-c12603453': 13,
 '1756489530-c3147269731': 14,
 '1756521094-c18695545': 15,
 '1756521094-c3481272764': 16,
 '1756889649-c504362427': 17,
 '1757109556-c883792163': 18,
 '1757117035-c22369034': 19,
 '1757511354-c2546478966': 20,
 '1757630174-c339565192': 21,
 '1757630174-c856033609': 22,
 '1757870258-c3129270003': 23,
 '1758067992-c33940784': 24,
 '1758172983-c2352944223': 25,
 '1758257938-c1289409911': 26,
 '1758268653-c538348163': 27,
 '1758280844-c2943665158': 28,
 '1758393388-c4080841929': 29,
 '1758400813-c26904393': 30,
 '1758515569-c2962824463': 31,
 '1758539895-c3088922024': 32,
 '17

In [42]:
# Display only: the company ids (second column) of the person->company
# relations parsed as int64. The result is not stored.
np.array(relation3[:,1],dtype=np.int64)

array([2322575463,  808239629, 5653163866, ..., 3072377370, 2350715681,
        607677865], dtype=int64)

In [43]:

# Re-index both relation tables with the dense ids.
# person->company: person index, company index.
relationPCPId = [idxPerson_map.get(person_id) for person_id in relation3[:, 0]]
relationPCCId = [idx_map.get(company_id) for company_id in relation3[:, 1].astype(np.int64)]

# company->company: source company index, target company index.
relationCCC1Id = [idx_map.get(company_id) for company_id in relation4[:, 0].astype(np.int64)]
relationCCC2Id = [idx_map.get(company_id) for company_id in relation4[:, 1].astype(np.int64)]

# Triples of (source index, target index, relation type), with a short preview.
relation33 = list(zip(relationPCPId, relationPCCId, relation3[:, 2]))
print(relation33[:10])

relation44 = list(zip(relationCCC1Id, relationCCC2Id, relation4[:, 2]))
print(relation44[:10])


[(1447, 12221, 'INVEST_H'), (2515, 5256, 'INVEST_H'), (1110, 15702, 'INVEST_H'), (2682, 16757, 'OWN'), (3950, 16492, 'SERVE'), (3364, 9439, 'SERVE'), (1541, 3021, 'INVEST_H'), (2225, 2899, 'INVEST_H'), (1192, 9422, 'INVEST_H'), (3109, 3810, 'SERVE')]
[(10796, 9515, 'INVEST_C'), (9542, 8256, 'INVEST_C'), (1480, 5609, 'INVEST_C'), (625, 6129, 'INVEST_C'), (5401, 1932, 'BRANCH'), (17297, 13989, 'BRANCH'), (17318, 11790, 'INVEST_C'), (13115, 12685, 'INVEST_C'), (1897, 8543, 'INVEST_C'), (6817, 8175, 'OWN_C')]


In [44]:
# Export both re-indexed edge lists as CSV files, without the index column.
# NOTE(review): the column name 'pic' looks like it was meant to be 'pid';
# kept unchanged because consumers of the CSV may rely on it.
person_company_edges = pd.DataFrame(relation33, columns=['pic','cid','relation'])
person_company_edges.to_csv('relation_person2company.csv', index=False)

company_company_edges = pd.DataFrame(relation44, columns=['cid1','cid2','relation'])
company_company_edges.to_csv('relation_company2company.csv', index=False)


In [45]:
def _pairs_by_relation(triples, relation_names):
    """Group (source, target, relation) triples by relation type.

    Returns one list of [source, target] pairs per entry of `relation_names`,
    in that order. Triples whose relation is not listed are ignored.
    """
    grouped = {name: [] for name in relation_names}
    for source, target, relation in triples:
        if relation in grouped:
            grouped[relation].append([source, target])
    return [grouped[name] for name in relation_names]


# Edge lists for the adjacency structure (c2c), one per relation type:
# rcc1, rcc2, rcc3 <-> ['BRANCH' 'INVEST_C' 'OWN_C']
rcc1, rcc2, rcc3 = _pairs_by_relation(relation44, ('BRANCH', 'INVEST_C', 'OWN_C'))
for edge_list in (rcc1, rcc2, rcc3):
    print(len(edge_list))


# Edge lists for the adjacency structure (p2c), one per relation type:
# rpc1, rpc2, rpc3 <-> ['INVEST_H' 'OWN' 'SERVE']
rpc1, rpc2, rpc3 = _pairs_by_relation(relation33, ('INVEST_H', 'OWN', 'SERVE'))
for edge_list in (rpc1, rpc2, rpc3):
    print(len(edge_list))


1656
11492
752
8650
6787
13918


In [46]:
# Turn each person->company pair list into an array and transpose it, so that
# row 0 holds the source indices and row 1 the target indices.
# NOTE(review): running this cell twice transposes the arrays back again.
rpc1, rpc2, rpc3 = (np.array(pairs).T for pairs in (rpc1, rpc2, rpc3))

In [47]:
# Turn each company->company pair list into an array and transpose it, so that
# row 0 holds the source indices and row 1 the target indices.
# NOTE(review): running this cell twice transposes the arrays back again.
rcc1, rcc2, rcc3 = (np.array(pairs).T for pairs in (rcc1, rcc2, rcc3))

In [38]:
# Preview the (source, target) index arrays of the INVEST_H edges, in the same
# tuple form that is passed to dgl.heterograph below.
tuple((rpc1[0],rpc1[1]))

(array([1447, 2515, 1110, ...,  238, 3736, 3444]),
 array([12221,  5256, 15702, ...,   705,  8813,   919]))

In [48]:

# Build the heterogeneous graph with node types "company" and "person".
# Each relation is added as a forward edge type plus a reverse edge type
# (same name with the suffix "ed") whose source and target arrays are swapped.
#
# num_nodes_dict pins the node counts explicitly. Without it DGL infers them
# from the largest id seen in the edges, so companies or persons with the
# highest indices and no edges would be dropped, and the graph would no longer
# line up with the saved feature/label arrays (one row per company).
G = dgl.heterograph(
    {
        ("company", "BRANCH", "company"): (rcc1[0], rcc1[1]),
        ("company", "BRANCHed", "company"): (rcc1[1], rcc1[0]),
        ("company", "INVEST_C", "company"): (rcc2[0], rcc2[1]),
        ("company", "INVEST_Ced", "company"): (rcc2[1], rcc2[0]),
        ("company", "OWN_C", "company"): (rcc3[0], rcc3[1]),
        ("company", "OWN_Ced", "company"): (rcc3[1], rcc3[0]),
        ("person", "INVEST_H", "company"): (rpc1[0], rpc1[1]),
        ("company", "INVEST_Hed", "person"): (rpc1[1], rpc1[0]),
        ("person", "OWN", "company"): (rpc2[0], rpc2[1]),
        ("company", "OWNed", "person"): (rpc2[1], rpc2[0]),
        ("person", "SERVE", "company"): (rpc3[0], rpc3[1]),
        ("company", "SERVEed", "person"): (rpc3[1], rpc3[0]),
    },
    num_nodes_dict={"company": labels.shape[0], "person": len(idxPerson_map)},
)
print(G)
# Persist the graph so that every model loads the same structure.
dgl.save_graphs('heteroGraph.bin',G)


Graph(num_nodes={'company': 17975, 'person': 4562},
      num_edges={('company', 'BRANCH', 'company'): 1656, ('company', 'BRANCHed', 'company'): 1656, ('company', 'INVEST_C', 'company'): 11492, ('company', 'INVEST_Ced', 'company'): 11492, ('company', 'INVEST_Hed', 'person'): 8650, ('company', 'OWN_C', 'company'): 752, ('company', 'OWN_Ced', 'company'): 752, ('company', 'OWNed', 'person'): 6787, ('company', 'SERVEed', 'person'): 13918, ('person', 'INVEST_H', 'company'): 8650, ('person', 'OWN', 'company'): 6787, ('person', 'SERVE', 'company'): 13918},
      metagraph=[('company', 'company', 'BRANCH'), ('company', 'company', 'BRANCHed'), ('company', 'company', 'INVEST_C'), ('company', 'company', 'INVEST_Ced'), ('company', 'company', 'OWN_C'), ('company', 'company', 'OWN_Ced'), ('company', 'person', 'INVEST_Hed'), ('company', 'person', 'OWNed'), ('company', 'person', 'SERVEed'), ('person', 'company', 'INVEST_H'), ('person', 'company', 'OWN'), ('person', 'company', 'SERVE')])


In [49]:
# Reload the saved graph file to check the round trip. As the recorded output
# shows, the result is a pair whose first element is the list of graphs.
G2 = dgl.load_graphs("heteroGraph.bin")
print(G2)
reloaded_graphs = G2[0]
reloaded_graphs[0].num_nodes('person')

([Graph(num_nodes={'company': 17975, 'person': 4562},
      num_edges={('company', 'BRANCH', 'company'): 1656, ('company', 'BRANCHed', 'company'): 1656, ('company', 'INVEST_C', 'company'): 11492, ('company', 'INVEST_Ced', 'company'): 11492, ('company', 'INVEST_Hed', 'person'): 8650, ('company', 'OWN_C', 'company'): 752, ('company', 'OWN_Ced', 'company'): 752, ('company', 'OWNed', 'person'): 6787, ('company', 'SERVEed', 'person'): 13918, ('person', 'INVEST_H', 'company'): 8650, ('person', 'OWN', 'company'): 6787, ('person', 'SERVE', 'company'): 13918},
      metagraph=[('company', 'company', 'BRANCH'), ('company', 'company', 'BRANCHed'), ('company', 'company', 'INVEST_C'), ('company', 'company', 'INVEST_Ced'), ('company', 'company', 'OWN_C'), ('company', 'company', 'OWN_Ced'), ('company', 'person', 'INVEST_Hed'), ('company', 'person', 'OWNed'), ('company', 'person', 'SERVEed'), ('person', 'company', 'INVEST_H'), ('person', 'company', 'OWN'), ('person', 'company', 'SERVE')])], {})


4562

In [24]:
# make sure different models use the same dataset partition
# The permutation is drawn from a fixed seed, so re-running this cell
# reproduces the same split instead of silently overwriting the saved file
# with a different one.
N_SEED_COMPANIES = 500  # number of seed companies (len(seedcompany) in the recorded run)
N_TRAIN = 400           # the first 400 shuffled positions train, the remaining 100 test
SPLIT_SEED = 42

perm=np.random.RandomState(SPLIT_SEED).permutation(N_SEED_COMPANIES)
train_idx=perm[:N_TRAIN]
test_idx=perm[N_TRAIN:]
np.savez('train_test_split.npz',train_idx=train_idx,test_idx=test_idx)