<h1>Prove the validity of community in graph G_3 in active_plaintiff_patterns</h1>
<p>设计随机试验：
	现在的graph其plaintiff的连接是基于其是否有合作过同一个律师而决定的，这样连下来的图经过community detection之后得到了几个plaintiff的cluster，我们可以先算一下目前数据里平均一个plaintiff要连接多少个lawyer，然后整理出lawyer的集合，让plaintiff随机(或者看看plaintiff与lawyer有没有明显的地域倾向，连接时可以挑选同一地方的lawyer)和集合中的lawyer做连接，然后重新计算出plaintiff的连接图并做community detection（确保community数量与要验证的plaintiff cluster数量一致），计算cluster的modularity和conductance，重复1000次，得到上述两个指标的均值和方差（此过程模拟了随机情况下plaintiff 和lawyer的连接），比较待验证cluster和随机过程得到的上述两种指标（add:搞清楚graph cart是什么），如果待验证的plaintiff cluster明显指标更好，就可以说明其存在的有效性</p>

In [80]:
import pandas as pd
import networkx as nx
import numpy as np
import random
import scipy.stats as stats
import itertools
import community as community_louvain
from sklearn.cluster import KMeans
import scipy.linalg as linalg
import sklearn.preprocessing
import scipy.sparse as sparse
import networkx.algorithms.community as nx_comm
from timeit import default_timer as timer

In [2]:
# comparative graph, has 7 communities
G_3 = nx.read_gexf("/Users/starice/OwnFiles/cityu/RA/case_study/case_study_result/networks/G_3.gexf")

In [10]:
all_cases = pd.read_csv('/Users/starice/OwnFiles/cityu/RA/case_study/data/total_extracted_result/all_cases.csv', encoding="utf-8")
# print(all_cases['case_id'].drop_duplicates())
all_cases = all_cases[all_cases['defendant'] != all_cases['lawyer']]
all_cases = all_cases[all_cases['lawyer']!="共同委托人"]

<h3>平均一个活跃plaintiff连接多少个lawyer</h3>
<p>plaintiff set: fps_200; lawyer_set: lawyers</p>

In [4]:
#获取所有一审案件
first_cases = all_cases[all_cases['procedure']=="一审"]
# print("一审案件数量： ", len(first_cases['case_id'].drop_duplicates()))

# 获取所有案件原告的节点度并排序
degree_1stplaintiffs = first_cases.groupby("plaintiff")['case_id'].unique().reset_index()
degree_1stplaintiffs['case_count'] = degree_1stplaintiffs['case_id'].apply(lambda r: len(r))
degree_1stplaintiffs.sort_values(by="case_count", inplace=True, ascending=False)
fps_200 = degree_1stplaintiffs[:200]
# fps_200
new_selected_1stcp = first_cases[first_cases['plaintiff'].isin(fps_200['plaintiff'])]

In [5]:
len(new_selected_1stcp['case_id'].drop_duplicates())

16070

In [6]:
temp = new_selected_1stcp.groupby("plaintiff")['lawyer'].nunique().reset_index()
print(temp['lawyer'].mean(), temp['lawyer'].std())
# 平均每个plaintiff和8个不同的lawyer合作

7.375 6.0678222894777845


In [7]:
lawyers = list(new_selected_1stcp['lawyer'].drop_duplicates())
plaintiffs = fps_200['plaintiff']

In [8]:
temp['lawyer'].describe()

count    200.000000
mean       7.375000
std        6.067822
min        0.000000
25%        3.000000
50%        6.000000
75%       11.000000
max       34.000000
Name: lawyer, dtype: float64

<h3>随机生成plaintiff和lawyer的连接</h3>

In [70]:
pdpl = pd.DataFrame(columns=["plaintiff", 'lawyer'])
for i in plaintiffs:
    tlawyers = random.sample(lawyers, 8)
    for j in tlawyers:
        pdpl = pdpl.append({"plaintiff": i, "lawyer": j}, ignore_index=True)

In [71]:
law_pdpl = pdpl.groupby('lawyer')['plaintiff'].unique().apply(tuple).reset_index()

In [72]:
law_pdpl.head()

Unnamed: 0,lawyer,plaintiff
0,丁刘,"(刘庆生, 贾伟)"
1,丁长富,"(何林松,)"
2,万方,"(张翼,)"
3,万红平,"(李娟,)"
4,万迎军,"(杜渺, 郭勇, 况力彬)"


In [73]:
G = nx.Graph()
G.add_nodes_from(plaintiffs)
for i in range(1, len(law_pdpl)+1):
    a = list(law_pdpl[i-1:i]['plaintiff'].values[0])
    if len(a) > 0:
        G.add_edges_from(list(itertools.combinations(a, 2)))

In [74]:
def randomGenerateGraph(): # 重新加地域信息（市）
    pdpl = pd.DataFrame(columns=["plaintiff", 'lawyer'])
    for i in plaintiffs:
        tlawyers = random.sample(lawyers, 8)
        for j in tlawyers:
            pdpl = pdpl.append({"plaintiff": i, "lawyer": j}, ignore_index=True)
    law_pdpl = pdpl.groupby('lawyer')['plaintiff'].unique().apply(tuple).reset_index()
    G= nx.Graph()
    G.add_nodes_from(plaintiffs)
    for i in range(1, len(law_pdpl)+1):
        a = list(law_pdpl[i-1:i]['plaintiff'].values[0])
        if len(a) > 0:
            G.add_edges_from(list(itertools.combinations(a, 2)))
    return G

In [75]:
# 看看有多少节点在G_3中不连通
# print('connected_components of graph: ',list(nx.connected_components(G_3))[0], len(list(nx.connected_components(G_3))[0]))#162 nodes
# print(G_3.nodes(), len(G_3.nodes()))#200 nodes

<h3>Spectral Clustering</h3>

In [76]:
# Spectral Clustering
# 补充提取律师的律所信息（先不做）
'''
https://www3.nd.edu/~kogge/courses/cse60742-Fall2018/Public/StudentWork/KernelPaperFinal/SCD-Sikdar-final.pdf
'''
#----------------------------------------------------------------------

def k_way_spectral(G, k):
    
    #去掉不连通的节点
    connected_nodes = list(list(nx.connected_components(G))[0])
    connected_graph = G.subgraph(connected_nodes)
    
    #再算一下随机连接的图里面最大连通的节点数有多少（可以再次证明abnormal）
    
    assert nx.is_connected(connected_graph), "the graph must be connnected"
    clusters = []
    if connected_graph.order() < k:
        clusters = list(connected_graph.nodes())
    else:
        L = nx.laplacian_matrix(connected_graph)
        # compute the first k + 1 eigenvectors
        _, eigenvecs = sparse.linalg.eigs(L.asfptype(), k=k+1, which='SM')
        eigenvecs = eigenvecs.real
        # discard the first trivial eigenvector
        eigenvecs = eigenvecs[:, 1:]
        # normalize each row by its L2 norm
        eigenvecs = sklearn.preprocessing.normalize(eigenvecs)
        # run K-means
        kmeans = KMeans(n_clusters=k).fit(eigenvecs)
        cluster_labels = kmeans.labels_
        clusters = [[] for _ in range(max(cluster_labels) + 1)]
        for node_id, cluster_id in zip(connected_graph.nodes(), cluster_labels):
            clusters[cluster_id].append(node_id)
    return clusters, connected_graph

In [77]:
# clusters
# Compute mean modularity and mean conductance for G_3(also std in addition)

def result_output(clusters, G):
    set_clusters = []
    for i in clusters:
        newG = G.subgraph(i)
        set_clusters.append(set(i))
#     print(set_clusters)
    conduct = np.mean([nx.conductance(G, cluster_i) for cluster_i in set_clusters])
    modula = nx_comm.modularity(G, set_clusters)
#     print(modula, conduct)
    return modula, conduct

In [90]:
#求G_3 community的modularity和conductance

clusters, connecG = k_way_spectral(G_3, 7)
a, b = result_output(clusters, connecG)
a, b

(0.5375050039655025, 0.19706497008904592)

In [81]:
# 生成随机图 -> 按照指定数量求partition -> 求分隔后整体社区的modularity和conductance -> 累计运行1000次和原有G_3做比较

start = timer()

modulas, conducs = [], [] #存起来
for i in range(1000):
    G = randomGenerateGraph()
    clusters, connecG = k_way_spectral(G, 7)
    a, b = result_output(clusters, connecG)
    modulas.append(a)
    conducs.append(b)
    
end = timer()
print("------time used = " + str(end - start) + " s")

print(np.mean(modulas), np.std(modulas), np.mean(conducs), np.std(conducs)) # 补充std

------time used = 1974.2248609490016 s
0.23068602460601745 0.017006714840932222 0.6264357045498951 0.01654524394600784


<h3>按照地理位置（具体到市）重新生成plaintiff和lawyer的链接</h3>

In [63]:
new_selected_1stcp
temp_ns1stcp = new_selected_1stcp[new_selected_1stcp['lawyer'].notna()]
city_lawyer = temp_ns1stcp.groupby(['city'])['lawyer'].apply(set).to_dict()
city_plaintiff = temp_ns1stcp.groupby(['plaintiff'])['city'].apply(set).to_dict()

In [64]:
np.random.choice([1, 2, 3, 4, 5, 6, 7, 8, 9], 8, False).tolist()

[9, 1, 2, 6, 3, 5, 8, 7]

In [65]:
def randGenGraphwithinLocation(): # 重新加地域信息（市）
    pdpl = pd.DataFrame(columns=["plaintiff", 'lawyer'])
    tlawyer = []
    for p in list(city_plaintiff.keys()):
        for c in city_plaintiff[p]:
            if c in city_lawyer.keys():
                tlawyer += list(city_lawyer[c])
        if len(tlawyer) > 0:
            lawyers = np.random.choice(tlawyer, 8, True).tolist() if len(tlawyer) < 8 else np.random.choice(tlawyer, 8, False).tolist()
            for j in lawyers:
                pdpl = pdpl.append({"plaintiff": p, "lawyer": j}, ignore_index=True)
    law_pdpl = pdpl.groupby('lawyer')['plaintiff'].unique().apply(tuple).reset_index()
    G = nx.Graph()
    G.add_nodes_from(plaintiffs)
    for i in range(1, len(law_pdpl)+1):
        a = list(law_pdpl[i-1:i]['plaintiff'].values[0])
        if len(a) > 0:
            G.add_edges_from(list(itertools.combinations(a, 2)))
    return G

In [82]:
# 生成随机图 -> 按照指定数量求partition -> 求分隔后整体社区的modularity和conductance -> 累计运行1000次和原有G_3做比较

start = timer()

modulas, conducs = [], [] #存起来
for i in range(1000):
    G = randGenGraphwithinLocation()
    clusters, connecG = k_way_spectral(G, 7)
    a, b = result_output(clusters, connecG)
    modulas.append(a)
    conducs.append(b)

end = timer()
print("------time used = " + str(end - start) + " s")
print(np.mean(modulas), np.std(modulas), np.mean(conducs), np.std(conducs)) # 补充std

------time used = 2163.4393698250024 s
0.1674507166579698 0.0163721276292351 0.6899982640587417 0.015281769101389692


<h2>Addition: Analyse the G_3 Community</h2>

In [94]:
partition_3 = community_louvain.best_partition(G_3)
partitions_3 = []
for i in range(50): #看一下前十个community
    partitions_3.append({k for k, v in partition_3.items() if v==i})

In [95]:
len(G_3.edges())

1412

<h4>统计cluster里律师的相关案件次数排序， 选择每个cluster内较高的律师，重新选择由高频律师主导的数据并且画图，比较前后数据集构成的community内的边差多少</h4>

In [97]:
partition_list = []
marginal_list = []
for i in partitions_3:
    if len(i) > 1: partition_list.append(list(i))
    if len(i) == 1: marginal_list.append(list(i)[0])

In [98]:
partition_list#louvain

[['薛洋',
  '张琦',
  '赵磊',
  '吴保利',
  '王旭',
  '白世桥',
  '李双琴',
  '薛文英',
  '王飞',
  '梁铭洲',
  '许智禄',
  '郑细海',
  '王少勇',
  '吕芝培',
  '徐桂锦',
  '战伟东',
  '刘阳',
  '刘占奎',
  '李政',
  '刘洋',
  '胡祥年',
  '丛李松',
  '孙梦达',
  '马超',
  '李洪岩',
  '徐向新',
  '杨照',
  '郑建芳',
  '矫仁辉',
  '周悟权',
  '刘俊梅',
  '李亮',
  '沈凯',
  '张波',
  '刘锐哲',
  '申元生'],
 ['孙桂兰',
  '周开礼',
  '邓德波',
  '常晓恒',
  '罗伟',
  '林海东',
  '王堂飞',
  '郁德专',
  '朱志鹏',
  '张佩',
  '贾龙',
  '胡玉宝',
  '晏勇',
  '贾伟',
  '彭雪莲',
  '罗生华',
  '叶润军',
  '沈亮',
  '蔡兴杨',
  '徐忠',
  '秦东',
  '李玉婵',
  '余定勇',
  '李军',
  '李娟',
  '余啟红',
  '何林松',
  '姚金东',
  '贾涛',
  '尹前林',
  '况力彬',
  '李战江',
  '崔毅',
  '胡鎏亮',
  '张伟',
  '强大应',
  '刘旺',
  '王伟华',
  '张望成',
  '彭海波',
  '张园',
  '徐水江',
  '李季洪',
  '覃玉东',
  '刘庆生',
  '程祥',
  '陈亚平',
  '李波',
  '张翼',
  '胡健',
  '杜文江',
  '胡勇',
  '王书培',
  '殷庆'],
 ['于福利',
  '陶然',
  '王威',
  '刘树文',
  '张宏海',
  '杨秀欣',
  '田秋生',
  '王志财',
  '李桐丞',
  '李志伟',
  '王瑜',
  '李家亨',
  '赵铁亮',
  '魏世录'],
 ['张业君',
  '向云福',
  '陶梓欣',
  '李剑林',
  '魏胜武',
  '陈君',
  '王新建',
  '高清燕',
  '刘宏伟',
  '吕学鹏',
  '田晓晓',


In [99]:
clusters#spectral clustering

[['白世桥',
  '沈凯',
  '申元生',
  '周悟权',
  '张波',
  '李政',
  '丛李松',
  '郑建芳',
  '战伟东',
  '郑细海',
  '刘洋',
  '徐桂锦',
  '刘占奎',
  '杨照',
  '赵磊',
  '吕芝培',
  '梁铭洲',
  '李亮',
  '李双琴',
  '胡祥年',
  '王旭',
  '许智禄',
  '张琦',
  '刘阳',
  '马超',
  '王少勇',
  '李洪岩',
  '孙梦达',
  '徐向新',
  '刘锐哲',
  '吴保利',
  '矫仁辉'],
 ['张正荣',
  '秦东',
  '况力彬',
  '晏勇',
  '蒋飞亮',
  '俞光新',
  '强大应',
  '周开礼',
  '谢志桂',
  '叶润军',
  '胡鎏亮',
  '李军',
  '余定勇',
  '覃玉东',
  '胡玉宝',
  '贾龙',
  '朱志鹏',
  '刘庆生',
  '王堂飞',
  '王伟华',
  '李玉婵',
  '李海林',
  '姚金东',
  '李战江',
  '李俊华',
  '阳秋旺',
  '尹前林',
  '张伟',
  '徐忠',
  '胡健',
  '崔毅',
  '邓德波',
  '王飞',
  '谢志胜',
  '许承凯',
  '徐水江',
  '郁德专',
  '张翼',
  '常晓恒',
  '贾伟',
  '陈亚平',
  '张望成',
  '贾涛',
  '张佩',
  '李娟',
  '林海东',
  '何林松',
  '杜文江',
  '胡勇',
  '殷庆',
  '彭海波',
  '李波',
  '程祥',
  '罗伟',
  '余啟红',
  '李季洪',
  '蔡兴杨',
  '彭雪莲',
  '孙桂兰',
  '罗生华',
  '王书培',
  '张园',
  '沈亮',
  '刘旺'],
 ['杨秀欣',
  '王玥明',
  '陶然',
  '魏胜武',
  '孟凡野',
  '魏世录',
  '张宏海',
  '田秋生',
  '于福利',
  '李家亨',
  '李志伟',
  '郭福顺',
  '王志财',
  '简晓华',
  '冯德林',
  '李桐丞',
  '田晓晓',
  '简晓祥',
  '赵铁亮

<h4>Compare two community groups result from louvain and spectral clustering</h4>

In [104]:
df = pd.DataFrame(columns=["group", "provinces", "success_rate", "object_money", "penalty"])
for i in partition_list:
#     print(i)
    temp_ns1stcp = first_cases[first_cases['plaintiff'].isin(i)]
#     print("province of this group is: ", temp_ns1stcp['province'].unique())
    temp_ns1stcp_cc = len(temp_ns1stcp['case_id'].unique())
    temp_ns1stcp_sc = len(temp_ns1stcp[temp_ns1stcp['is_success']=="TRUE"]['case_id'].unique())
#     print("success_rate of this group is: ", temp_ns1stcp_sc/temp_ns1stcp_cc, "\n")
    df = df.append({"group":i, "provinces":temp_ns1stcp['province'].unique(), 
                    "success_rate": temp_ns1stcp_sc/temp_ns1stcp_cc, 
                    "object_money": temp_ns1stcp['objectmoney'].mean(), 
                    "penalty": temp_ns1stcp[temp_ns1stcp['is_success']=="TRUE"]['penalty'].mean()}, 
                   ignore_index=True)

In [105]:
# add marginal nodes into df as a group
# print(marginal_list)
temp_ns1stcp = first_cases[first_cases['plaintiff'].isin(marginal_list)]
# print("province of this group is: ", temp_ns1stcp['province'].unique())
temp_ns1stcp_cc = len(temp_ns1stcp['case_id'].unique())
temp_ns1stcp_sc = len(temp_ns1stcp[temp_ns1stcp['is_success']=="TRUE"]['case_id'].unique())
# print("success_rate of this group is: ", temp_ns1stcp_sc/temp_ns1stcp_cc, "\n")
df = df.append({"group":marginal_list, 
                "provinces": temp_ns1stcp['province'].unique(), 
                "success_rate": temp_ns1stcp_sc/temp_ns1stcp_cc, 
                "object_money": temp_ns1stcp['objectmoney'].mean(), 
                "penalty": temp_ns1stcp[temp_ns1stcp['is_success']=="TRUE"]['penalty'].mean()}, 
               ignore_index=True)

In [106]:
display(df)

Unnamed: 0,group,provinces,success_rate,object_money,penalty
0,"[薛洋, 张琦, 赵磊, 吴保利, 王旭, 白世桥, 李双琴, 薛文英, 王飞, 梁铭洲, ...","[浙江省, 北京市, 广东省, 上海市, 河南省, 黑龙江省, 天津市, 山东省, 重庆市,...",0.946607,789.412832,6389.558996
1,"[孙桂兰, 周开礼, 邓德波, 常晓恒, 罗伟, 林海东, 王堂飞, 郁德专, 朱志鹏, 张...","[重庆市, 广西壮族自治区, 安徽省, 湖北省, 江苏省, 四川省, 陕西省, 浙江省, 吉...",0.88113,243.712757,3061.729798
2,"[于福利, 陶然, 王威, 刘树文, 张宏海, 杨秀欣, 田秋生, 王志财, 李桐丞, 李志...","[辽宁省, 江苏省, 天津市]",0.924584,48.738495,1058.75834
3,"[张业君, 向云福, 陶梓欣, 李剑林, 魏胜武, 陈君, 王新建, 高清燕, 刘宏伟, 吕...","[广东省, 天津市, 辽宁省, 河北省, 四川省, 湖北省, 广西壮族自治区, 河南省, 北...",0.929078,514.801934,4872.354807
4,"[李海林, 阳秋旺, 谢志胜, 李俊华, 许承凯, 谢志桂]","[广西壮族自治区, nan]",0.846154,274.150113,3443.229063
5,"[刘会, 韩进虎]",[广东省],0.921875,2234.871963,12070.8
6,"[张燕, 熊佳丽, 申亚坤, 郑志军, 程宝全, 于凤星, 李富蓉, 宇文义, 蒋飞亮, 俞...","[上海市, 北京市, 安徽省, 江苏省, 浙江省, 山东省, 河北省, 广东省]",0.898944,1946.455129,19554.524683
7,"[孙安民, 田蒙蒙, 陈天高, 谷战, 杜渺, 郭勇, 杨丽, 李魁伟, 赵佳斌, 代国海,...","[陕西省, 广东省, nan, 四川省, 云南省, 江苏省, 湖南省, 湖北省, 宁夏回族自...",0.666935,152.029765,1734.974739


In [107]:
df['group'].apply(lambda r: len(r))

0    36
1    54
2    14
3    36
4     6
5     2
6    16
7    36
Name: group, dtype: int64

In [108]:
df_spc = pd.DataFrame(columns=["group", "provinces", "success_rate", "object_money", "penalty"])
for i in clusters:
#     print(i)
    temp_ns1stcp = first_cases[first_cases['plaintiff'].isin(i)]
#     print("province of this group is: ", temp_ns1stcp['province'].unique())
    temp_ns1stcp_cc = len(temp_ns1stcp['case_id'].unique())
    temp_ns1stcp_sc = len(temp_ns1stcp[temp_ns1stcp['is_success']=="TRUE"]['case_id'].unique())
#     print("success_rate of this group is: ", temp_ns1stcp_sc/temp_ns1stcp_cc, "\n")
    df_spc = df_spc.append({"group":i, "provinces":temp_ns1stcp['province'].unique(), 
                    "success_rate": temp_ns1stcp_sc/temp_ns1stcp_cc, 
                    "object_money": temp_ns1stcp['objectmoney'].mean(), 
                    "penalty": temp_ns1stcp[temp_ns1stcp['is_success']=="TRUE"]['penalty'].mean()}, 
                   ignore_index=True)

In [109]:
df_spc

Unnamed: 0,group,provinces,success_rate,object_money,penalty
0,"[白世桥, 沈凯, 申元生, 周悟权, 张波, 李政, 丛李松, 郑建芳, 战伟东, 郑细海...","[浙江省, 北京市, 广东省, 上海市, 河南省, 黑龙江省, 天津市, 山东省, 重庆市,...",0.9518,878.979564,7005.227557
1,"[张正荣, 秦东, 况力彬, 晏勇, 蒋飞亮, 俞光新, 强大应, 周开礼, 谢志桂, 叶润...","[上海市, 重庆市, 安徽省, 广西壮族自治区, 湖北省, 江苏省, 浙江省, 四川省, 陕...",0.880352,253.431253,3121.548143
2,"[杨秀欣, 王玥明, 陶然, 魏胜武, 孟凡野, 魏世录, 张宏海, 田秋生, 于福利, 李...","[辽宁省, 天津市, 江苏省, 河北省, 北京市, 山东省, 上海市]",0.933721,179.037101,2450.829368
3,"[江金龙, 葛太玉, 张宝辉, 吕学鹏, 李剑林, 刘宏伟, 邹士伟, 胡一定, 王福群, ...","[广东省, 四川省, 湖北省, 广西壮族自治区, 北京市, 湖南省, 浙江省]",0.904895,729.098021,6262.52261
4,"[阎家明, 孙丁丁, 郑志军, 杨超, 申亚坤, 熊佳丽, 于凤星, 王会勇, 宇政义, 张...","[上海市, 北京市, 江苏省, 浙江省, 山东省, 河北省, 广东省]",0.899267,2154.884084,21751.144203
5,"[郭栋, 王新建, 郭夏天]","[广东省, 河南省]",0.827068,15.277059,312.676056
6,"[陈明江, 王福岭, 向云福, 杨林茂]",[广东省],0.972678,3180.404225,22248.809127


In [110]:
df_spc['group'].apply(lambda r: len(r))

0    32
1    64
2    32
3    14
4    13
5     3
6     4
Name: group, dtype: int64

<h4>Concat two types of community groups into one dataframe for further comparison</h4>

In [124]:
comgroups = pd.concat([df.add_prefix('lou_'), df_spc.add_prefix('spc_')], axis=1)

In [153]:
set(comgroups.loc[4, 'lou_group']).difference(set(comgroups.loc[1, 'spc_group']))
# After compare two groups, the first 3 groups in both louvain and spectral clustering 
# are similar, while the other 3 groups are different

set()

<h4>将律师信息添加进df和df_spc中</h4>

In [117]:
lawdf = pd.DataFrame(columns=["group", "lawyer", "case_count"])
for i in range(7):
    tempGroup = new_selected_1stcp[new_selected_1stcp['plaintiff'].isin(df['group'][i])]
    tempLawyers = list(tempGroup['lawyer'].drop_duplicates())
#     temp = new_selected_1stcp[new_selected_1stcp['lawyer'].isin(tempLawyers)]
    temp = tempGroup.groupby("lawyer")['case_id'].\
    nunique().reset_index().sort_values(by="case_id", ascending=False) # calculate case count for laywers within each community
    temp.rename(columns={"case_id": "case_count"}, inplace=True)
    temp = temp.assign(group=i)
    lawdf = lawdf.append(temp, ignore_index=True)
    
lawdf_spc = pd.DataFrame(columns=["group", "lawyer", "case_count"])
for i in range(7):
    tempGroup = new_selected_1stcp[new_selected_1stcp['plaintiff'].isin(df_spc['group'][i])]
    tempLawyers = list(tempGroup['lawyer'].drop_duplicates())
#     temp = new_selected_1stcp[new_selected_1stcp['lawyer'].isin(tempLawyers)]
    temp = tempGroup.groupby("lawyer")['case_id'].\
    nunique().reset_index().sort_values(by="case_id", ascending=False) # calculate case count for laywers within each community
    temp.rename(columns={"case_id": "case_count"}, inplace=True)
    temp = temp.assign(group=i)
    lawdf_spc = lawdf_spc.append(temp, ignore_index=True)

In [196]:
lawdf_spc[lawdf_spc['group']==0]

Unnamed: 0,group,lawyer,case_count
0,0,肖丽君,238
1,0,牛琨,180
2,0,万迎军,24
3,0,张朝阳,14
4,0,吴迪,14
...,...,...,...
149,0,张永权,1
150,0,张楠,1
151,0,张文斌,1
152,0,张国瑞,1


<h4>找一下G_3中community的子图</h4>

In [20]:
subgraphs = []
for i in range(7):
    subgraphs.append(G_3.subgraph(list(df['group'][i])))

<h4>找一下每一个group里面的高频律师</h4>

In [157]:
highFreqLaws = lawdf.groupby("group").head(5).reset_index(drop=True) #只取第一个lawyer的话图的连接太少
highFreqLawsSpc = lawdf_spc.groupby("group").head(5).reset_index(drop=True)

In [164]:
pd.concat([highFreqLaws.add_prefix("lou_"), highFreqLawsSpc.add_prefix("spc_")], axis=1) # lawyers in the first 4 groups are same

Unnamed: 0,lou_group,lou_lawyer,lou_case_count,spc_group,spc_lawyer,spc_case_count
0,0,肖丽君,239,0,肖丽君,238
1,0,牛琨,181,0,牛琨,180
2,0,徐洋,27,0,万迎军,24
3,0,万迎军,24,0,张朝阳,14
4,0,吴迪,14,0,吴迪,14
5,1,吴波,442,1,吴波,444
6,1,赵乾伟,342,1,赵乾伟,342
7,1,吴金梅,289,1,吴金梅,289
8,1,彭丹,221,1,彭丹,221
9,1,段理,126,1,段理,126


<h4>重新build一下以高频律师为基础的network</h4>

In [165]:
nnew_selected_1stcp = new_selected_1stcp[new_selected_1stcp['lawyer'].isin(list(highFreqLaws['lawyer']))]

In [166]:
newG = nx.Graph()
newG.add_nodes_from(plaintiffs)
law_nselected_1stcp = nnew_selected_1stcp.groupby('lawyer')['plaintiff'].unique().apply(tuple).reset_index()
for i in range(1, len(law_nselected_1stcp)+1):
    a = list(law_nselected_1stcp[i-1:i]['plaintiff'].values[0])
    if len(a) > 1:
        newG.add_edges_from(list(itertools.combinations(a, 2)))

In [25]:
partition_ng = community_louvain.best_partition(newG)
partitions_ng = []
new_groups = []
for i in range(50): #看一下前十个community
    partitions_ng.append({k for k, v in partition_ng.items() if v==i})
for i in partitions_ng:
    if len(i) > 1:
        new_groups.append(i)
        print(i)

{'姚金东', '孙桂兰', '周开礼', '何林松', '罗生华', '李战江', '陈亚平', '朱志鹏', '贾伟', '王书培', '程祥', '罗伟', '贾涛', '王堂飞', '常晓恒', '张佩', '彭雪莲', '徐水江', '秦东', '李娟', '刘旺', '林海东', '胡勇', '强大应', '王伟华', '邓德波', '余啟红', '蔡兴杨', '张翼', '晏勇', '尹前林', '张伟', '贾龙', '刘庆生', '张望成', '胡玉宝', '况力彬', '杜文江', '郁德专', '叶润军'}
{'许智禄', '张琦', '郑细海', '吴保利', '白世桥', '徐桂锦', '马超', '杨照', '丛李松', '王少勇', '赵磊', '申元生', '周悟权', '矫仁辉', '刘占奎', '刘锐哲', '刘洋', '沈凯', '胡祥年', '李双琴', '王旭', '王飞', '李亮', '孙梦达', '李洪岩', '战伟东', '刘阳'}
{'张明亮', '李剑林', '邹士伟', '王福群', '李成学', '杨林茂', '刘宏伟', '葛太玉', '江金龙', '陈君'}
{'覃玉东', '张园', '李波', '李玉婵', '李军', '余定勇', '沈亮'}
{'孙丁丁', '郑志军', '张燕', '杨超', '于凤星'}
{'谢志胜', '李海林', '阳秋旺', '李俊华', '谢志桂', '许承凯'}
{'于福利', '李家亨', '杨秀欣', '薛洋', '赵铁亮', '王瑜', '王威', '田秋生', '李桐丞', '陶然', '张宏海', '刘俊梅', '李志伟', '薛文英', '刘树文', '魏世录', '王志财'}


In [26]:
new_subgraphs = []
for i in range(len(new_groups)):
    new_subgraphs.append(newG.subgraph(new_groups[i]))

<h4>比较前后两个community内部边的差</h4>

In [27]:
# 每个group前五个律师可以主导形成community
print(sum([len(i.edges()) for i in subgraphs]), sum([len(i.edges()) for i in new_subgraphs]))

1285 1044


<h4>去掉某一个group的高频律师重新build一下network</h4>

In [28]:
#去掉group0的高频律师
print(list(highFreqLaws['lawyer'][:5]))
nnew_selected_1stcp = new_selected_1stcp[~(new_selected_1stcp['lawyer'].isin(list(highFreqLaws['lawyer'][:5])))]

newG = nx.Graph()
newG.add_nodes_from(plaintiffs)
law_nselected_1stcp = nnew_selected_1stcp.groupby('lawyer')['plaintiff'].unique().apply(tuple).reset_index()
for i in range(1, len(law_nselected_1stcp)+1):
    a = list(law_nselected_1stcp[i-1:i]['plaintiff'].values[0])
    if len(a) > 1:
        newG.add_edges_from(list(itertools.combinations(a, 2)))
group0_sub = newG.subgraph(df['group'][0])
print(len(group0_sub.edges()), len(G_3.subgraph(df['group'][0]).edges()))

['肖丽君', '牛琨', '徐洋', '万迎军', '吴迪']
59 346


In [29]:
#group1
highFreqLaws['lawyer'][5:10]
nnew_selected_1stcp = new_selected_1stcp[~(new_selected_1stcp['lawyer'].isin(list(highFreqLaws['lawyer'][5:10])))]
newG = nx.Graph()
newG.add_nodes_from(plaintiffs)
law_nselected_1stcp = nnew_selected_1stcp.groupby('lawyer')['plaintiff'].unique().apply(tuple).reset_index()
for i in range(1, len(law_nselected_1stcp)+1):
    a = list(law_nselected_1stcp[i-1:i]['plaintiff'].values[0])
    if len(a) > 1:
        newG.add_edges_from(list(itertools.combinations(a, 2)))
group1_sub = newG.subgraph(df['group'][1])
print(len(group1_sub.edges()), len(G_3.subgraph(df['group'][1]).edges()))

22 26


In [194]:
resultDf = pd.DataFrame(columns=[
    "dominant_lawyers", 
    "new_com_edge", 
    "ori_com_edge", 
    "ori_group_member", 
    "new_group_member", 
    "dominant_proportion", 
    "ori_conductance", 
    "new_conductance"])

new_clusters = []
newG = nx.Graph()
newG.add_nodes_from(plaintiffs)

for i in range(7):
#     highFreqLaws['lawyer'][i*5:i*5+5]
    new_cluster = []

    # select group data
    nnew_selected_1stcp = new_selected_1stcp[new_selected_1stcp['plaintiff'].isin(df_spc['group'][i])]
    # remove dominant lawyer from that data
    nnew_selected_1stcp = nnew_selected_1stcp[~(nnew_selected_1stcp['lawyer'].isin(list(highFreqLawsSpc['lawyer'][i*5:i*5+5])))]
    
    law_nselected_1stcp = nnew_selected_1stcp.groupby('lawyer')['plaintiff'].unique().apply(tuple).reset_index()
    for j in range(1, len(law_nselected_1stcp)+1):
        a = list(law_nselected_1stcp[j-1:j]['plaintiff'].values[0])
        if len(a) > 1:
            new_cluster += a
            newG.add_edges_from(list(itertools.combinations(a, 2)))
            
    new_clusters.append(list(set(new_cluster)))
    group_sub = newG.subgraph(new_clusters[i])
    
    resultDf = resultDf.append({
        "dominant_lawyers": list(highFreqLawsSpc['lawyer'][i*5:i*5+5]), 
        "new_com_edge": len(group_sub.edges()), 
        "ori_com_edge": len(G_3.subgraph(df_spc['group'][i]).edges()), 
        "ori_group_member": len(df_spc['group'][i]), 
        "new_group_member": len(new_clusters[i]), 
        "dominant_proportion": 1 - (len(group_sub.edges()) / len(G_3.subgraph(df_spc['group'][i]).edges())), 
    }, ignore_index=True)
    
resultDf['ori_conductance'] = [nx.conductance(G_3, cluster_i) for cluster_i in df_spc['group']]
resultDf['new_conductance'] = [nx.conductance(newG, cluster_i) if len(cluster_i) > 0 else np.nan for cluster_i in new_clusters]
resultDf
# why new conductance all zero???

Unnamed: 0,dominant_lawyers,new_com_edge,ori_com_edge,ori_group_member,new_group_member,dominant_proportion,ori_conductance,new_conductance
0,"[肖丽君, 牛琨, 万迎军, 张朝阳, 吴迪]",59,312,32,27,0.810897,0.122363,0.0
1,"[吴波, 赵乾伟, 吴金梅, 彭丹, 段理]",278,741,64,57,0.624831,0.04031,0.0
2,"[王志扬, 顾雪微, 陈卉, 孟丹妮, 齐芳梅]",140,158,32,31,0.113924,0.122222,0.0
3,"[莫观培, 梁梭, 杨玉盟, 孙依丰, 钟永标]",33,53,14,14,0.377358,0.242857,0.0
4,"[王虹, 杨超, 张燕, 滕卫兴, 王瀛]",18,22,13,12,0.181818,0.254237,0.0
5,"[曾娜, 蔡泽宇, 袁媛, 叶丽明, 吕远霞]",0,3,3,0,1.0,0.142857,
6,"[利继梅, 崔峰, 谢泽烽, 黄道义, 杨玉盟]",2,3,4,3,0.333333,0.454545,0.0


In [189]:
nx.conductance(newG, new_clusters[1])

0.0

In [190]:
nx.write_gexf(newG, "/Users/starice/OwnFiles/cityu/RA/case_study/case_study_result/networks/lawyerG.gexf")

In [None]:
# 设置dominant阈值，找到不同group的dominant lawyers，find intersection of them去观察有没有律师流窜（活跃律师）