<h1>Descriptive and analytical information about cases that involve lawyers and were filed by the 200 most active plaintiffs</h1>

In [2]:
#coding:utf-8
import networkx as nx
from networkx.algorithms import bipartite
import pandas as pd
import numpy as np
from IPython.display import display, HTML
import matplotlib.pyplot as plt
import os
import cn2an
import chinese2digits as c2d
import operator
import pickle
import plotly.express as px
import plotly
import plotly.graph_objects as go
import plotly as py
import plotly.offline as offline
offline.init_notebook_mode(connected=True)
from urllib.request import urlopen
import json
import requests
from plotly.subplots import make_subplots
from collections import Counter

In [3]:
# Load the extracted case table.
# NOTE(review): the path is machine-specific; point it at your local copy of all_cases.csv.
all_cases = pd.read_csv('/Users/starice/OwnFiles/cityu/RA/case_study/data/total_extracted_result/all_cases.csv', encoding="utf-8")
# Drop rows whose extracted lawyer equals the defendant (an extraction artefact).
# .copy() makes the result an independent frame, so columns added to it later
# do not trigger SettingWithCopyWarning.
# The former `all_cases['case_id'].drop_duplicates()` statement was removed:
# its result was discarded, so it never changed `all_cases`.
all_cases = all_cases[all_cases['defendant'] != all_cases['lawyer']].copy()
# Prefer a token supplied through the MAPBOX_ACCESS_TOKEN environment variable;
# the literal is kept only as a backward-compatible fallback.
# NOTE(review): credentials should not live in a notebook — consider removing the fallback.
mapbox_access_token = os.environ.get(
    "MAPBOX_ACCESS_TOKEN",
    "pk.eyJ1Ijoic3RhcmljZSIsImEiOiJjazN6Y2s5dTUxY2R6M2xxcHllbXk4YWFzIn0.lActFqLzqRWGn7dqr4BShw",
)
px.set_mapbox_access_token(mapbox_access_token)

In [4]:
# Root folder of the extracted results plus the names used for its sub-directories.
base_url = "/Users/starice/Desktop/total_extracted_result/"
# 'type1' .. 'type4'
pre_dir = [f"type{k}" for k in range(1, 5)]
# '2014' .. '2020'
dir_name = [str(k) for k in range(2014, 2021)]
# '1' .. '12' (presumably month folders — TODO confirm)
dir_sname = [str(k) for k in range(1, 13)]

<h2>看一下律师的整体分布</h2>

In [5]:
# Yearly overview: distinct lawyers vs. distinct cases, and their ratio.
temp_cases = all_cases.dropna(subset=['lawyer'])

# Distinct lawyers per year (rows without a lawyer are excluded).
lawyers_by_year = (
    temp_cases
    .groupby('year')['lawyer']
    .nunique()
    .reset_index()
    .rename(columns={"lawyer": "lawyer_count"})
)
# Distinct cases per year, counted over ALL rows (with or without a lawyer).
cases_by_year = (
    all_cases
    .groupby('year')['case_id']
    .nunique()
    .reset_index()
    .rename(columns={"case_id": "case_count"})
)

# 'year' is the only column the two tables share, so it is the join key.
lawyer_cases = lawyers_by_year.merge(cases_by_year, on="year")
lawyer_cases['lawyer_proportion'] = lawyer_cases['lawyer_count'] / lawyer_cases['case_count']
lawyer_cases

Unnamed: 0,year,lawyer_count,case_count,lawyer_proportion
0,2014,250,729,0.342936
1,2015,868,2820,0.307801
2,2016,1389,5596,0.248213
3,2017,2431,13879,0.175157
4,2018,1665,8241,0.202039
5,2019,1470,7543,0.194883
6,2020,128,280,0.457143


In [8]:
# Grouped bars: distinct lawyers and distinct cases per year.
fig = px.bar(lawyer_cases, x='year', y=['lawyer_count', 'case_count'])

# Overlay the lawyers-per-case ratio as a line on the same (log-scaled) axis.
proportion_line = go.Scatter(
    x=lawyer_cases['year'],
    y=lawyer_cases['lawyer_proportion'],
    mode="lines",
    name="lawyer_proportion",
    showlegend=True,
)
fig.add_trace(proportion_line)

fig.update_layout(barmode="group", title="Temporal Distribution of Lawyer Amount")
# Log scale keeps the sub-1 ratio and the counts readable on one axis.
fig.update_yaxes(type="log")

fig.write_html("/Users/starice/OwnFiles/cityu/RA/case_study/case_study_result/figures/plaintiffs/html/temp_dist_lawyer.html")
fig.show()

In [9]:
# Select all first-instance ("一审") cases.
first_cases = all_cases[all_cases['procedure'] == "一审"]
print("一审案件数量： ", len(first_cases['case_id'].drop_duplicates()))

# Node degree of every plaintiff = number of distinct first-instance cases,
# sorted from most to least active.
degree_1stplaintiffs = first_cases.groupby("plaintiff")['case_id'].unique().reset_index()
degree_1stplaintiffs['case_count'] = degree_1stplaintiffs['case_id'].apply(len)
degree_1stplaintiffs = degree_1stplaintiffs.sort_values(by="case_count", ascending=False)

# Take the 200 most active plaintiffs. Placeholder names are excluded BEFORE
# slicing: they are dropped again downstream, so slicing first would leave
# fewer than 200 usable plaintiffs whenever a placeholder ranks in the top 200.
# NOTE(review): these values look like anonymised plaintiff names — confirm.
placeholder_plaintiffs = ['XX', 'xx', '**']
fps_200 = degree_1stplaintiffs[
    ~degree_1stplaintiffs['plaintiff'].isin(placeholder_plaintiffs)
].head(200)

一审案件数量：  32295


In [10]:
# First-instance rows that belong to one of the selected top plaintiffs.
top_plaintiff_rows = first_cases[first_cases['plaintiff'].isin(fps_200['plaintiff'])]

# Remove rows whose plaintiff is one of the placeholder names.
is_placeholder = top_plaintiff_rows['plaintiff'].isin(['XX', 'xx', '**'])
top_plaintiff_rows = top_plaintiff_rows.drop(top_plaintiff_rows[is_placeholder].index)

# Distinct cases per (plaintiff, lawyer, province, city) combination.
new_selected_1stcases = (
    top_plaintiff_rows
    .groupby(['plaintiff', 'lawyer', 'province', 'city'])['case_id']
    .nunique()
    .reset_index()
)
# (plaintiff, lawyer) tuple, used later as the graph edge.
new_selected_1stcases['relationship'] = new_selected_1stcases[['plaintiff', 'lawyer']].apply(tuple, axis=1)
new_selected_1stcases = new_selected_1stcases.rename(columns={"case_id": "case_count"})
new_selected_1stcases.head()

Unnamed: 0,plaintiff,lawyer,province,city,case_count,relationship
0,丛李松,刘新武,黑龙江省,哈尔滨市,1,"(丛李松, 刘新武)"
1,丛李松,刘金祥,黑龙江省,哈尔滨市,4,"(丛李松, 刘金祥)"
2,丛李松,吴迪,黑龙江省,哈尔滨市,1,"(丛李松, 吴迪)"
3,丛李松,张雪,黑龙江省,哈尔滨市,2,"(丛李松, 张雪)"
4,丛李松,徐秋敏,黑龙江省,哈尔滨市,7,"(丛李松, 徐秋敏)"


In [11]:
# Build the plaintiff–lawyer network: plaintiffs are tagged bipartite=0,
# lawyers bipartite=1, and each edge carries province, city and a case-count weight.
# NOTE(review): nodes are keyed by the bare name, so a name that occurs both as a
# plaintiff and as a lawyer becomes a single node whose `bipartite` attribute is
# overwritten to 1.
pl_graph = nx.Graph()
pl_graph.add_nodes_from(list(new_selected_1stcases['plaintiff'].drop_duplicates()), bipartite=0)
pl_graph.add_nodes_from(list(new_selected_1stcases['lawyer'].dropna()), bipartite=1)

relationship = list(new_selected_1stcases['relationship'])
case_counts = list(new_selected_1stcases['case_count'])
cities = list(new_selected_1stcases['city'])
provinces = list(new_selected_1stcases['province'])

# The table has one row per (plaintiff, lawyer, province, city), so the same
# (plaintiff, lawyer) pair can appear in several rows. Accumulate the weight
# across those rows instead of letting the last row overwrite the earlier ones.
# province/city keep the value of the last row seen, as before.
# NOTE(review): summing assumes a case_id belongs to a single province/city — confirm.
for (src, dst), prov, cty, n_cases in zip(relationship, provinces, cities, case_counts):
    if pl_graph.has_edge(src, dst):
        edge_attrs = pl_graph[src][dst]
        edge_attrs['province'] = prov
        edge_attrs['city'] = cty
        edge_attrs['weight'] += n_cases
    else:
        pl_graph.add_edge(src, dst, province=prov, city=cty, weight=n_cases)

In [13]:
# Save the graph to a text file as a space-delimited edge list.
# data=['weight'] writes only the 'weight' edge attribute after the two node names;
# the province/city attributes are not written to this file.
nx.write_edgelist(pl_graph, "/Users/starice/OwnFiles/cityu/RA/case_study/case_study_result/networks/plaintiff_lawyer.txt", delimiter=' ', data=['weight'])

In [14]:
# Save the graph (including all node/edge attributes) to a pickle file.
# nx.write_gpickle was deprecated in NetworkX 2.6 and removed in 3.0; it was a
# thin wrapper around pickle.dump, so calling pickle directly writes the same
# file format and works on every NetworkX version.
with open("/Users/starice/OwnFiles/cityu/RA/case_study/case_study_result/networks/plaintiff_lawyer.gpickle", "wb") as gpickle_file:
    pickle.dump(pl_graph, gpickle_file, pickle.HIGHEST_PROTOCOL)

In [15]:
# Reload the pickled plaintiff–lawyer graph.
# nx.read_gpickle was deprecated in NetworkX 2.6 and removed in 3.0; pickle.load
# reads the same file. Only unpickle files you created yourself — unpickling
# untrusted data can execute arbitrary code.
with open("/Users/starice/OwnFiles/cityu/RA/case_study/case_study_result/networks/plaintiff_lawyer.gpickle", "rb") as gpickle_file:
    graph = pickle.load(gpickle_file)
# Show the neighbours (lawyers) and edge attributes of one plaintiff as a sanity check.
graph['丛李松']

AtlasView({'刘新武': {'province': '黑龙江省', 'city': '哈尔滨市', 'weight': 1}, '刘金祥': {'province': '黑龙江省', 'city': '哈尔滨市', 'weight': 4}, '吴迪': {'province': '黑龙江省', 'city': '哈尔滨市', 'weight': 1}, '张雪': {'province': '黑龙江省', 'city': '哈尔滨市', 'weight': 2}, '徐秋敏': {'province': '黑龙江省', 'city': '哈尔滨市', 'weight': 7}, '李新宏': {'province': '黑龙江省', 'city': '哈尔滨市', 'weight': 1}, '李艳丽': {'province': '黑龙江省', 'city': '哈尔滨市', 'weight': 3}, '梁伟': {'province': '黑龙江省', 'city': '哈尔滨市', 'weight': 1}, '董彬': {'province': '黑龙江省', 'city': '哈尔滨市', 'weight': 1}, '赵宏文': {'province': '黑龙江省', 'city': '哈尔滨市', 'weight': 2}, '赵艳凤': {'province': '黑龙江省', 'city': '哈尔滨市', 'weight': 1}, '金玉莹': {'province': '黑龙江省', 'city': '哈尔滨市', 'weight': 2}, '闫玉香': {'province': '黑龙江省', 'city': '哈尔滨市', 'weight': 1}, '高振娟': {'province': '黑龙江省', 'city': '哈尔滨市', 'weight': 1}, '高晓秋': {'province': '黑龙江省', 'city': '哈尔滨市', 'weight': 2}})

In [17]:
# Do first-instance cases in which a lawyer is present have a higher success
# rate (and higher penalties)?

# Flag rows that have a lawyer. notna() treats every missing-value marker
# (NaN, None) as "no lawyer"; the previous identity test `x is not np.nan`
# only recognised the np.nan singleton itself.
all_cases['is_lawyer_exist'] = all_cases['lawyer'].notna()
temp_first_cases = all_cases[all_cases['procedure'] == "一审"]

# Distinct cases per group (with / without lawyer).
lawyer_case_cc = (
    temp_first_cases
    .groupby('is_lawyer_exist')['case_id']
    .nunique()
    .reset_index()
    .rename(columns={"case_id": "case_count"})
)
# Distinct successful cases per group.
lawyer_case_sc = (
    temp_first_cases[temp_first_cases['is_success'] == "TRUE"]
    .groupby('is_lawyer_exist')['case_id']
    .nunique()
    .reset_index()
    .rename(columns={"case_id": "success_count"})
)

lawyer_case = lawyer_case_cc.merge(lawyer_case_sc, on='is_lawyer_exist', how="left")
# A group without any successful case has no row in lawyer_case_sc, so the left
# merge leaves NaN there. Fill it AFTER the merge (filling before the merge, as
# was done previously, cannot reach these values).
lawyer_case['success_count'] = lawyer_case['success_count'].fillna(0)
# TODO: the mean penalty per group was sketched here but left disabled.
lawyer_case['success_rate'] = lawyer_case['success_count'] / lawyer_case['case_count']
lawyer_case.head()

Unnamed: 0,is_lawyer_exist,case_count,success_count,success_rate
0,False,20186,17938,0.888636
1,True,12109,9713,0.802131


In [19]:
# Box plot of the penalty in successful first-instance cases, split by whether
# a lawyer is present.
# Zeros are turned into NaN and dropped so that only cases with a positive
# penalty value remain (the y axis is log-scaled, which cannot show 0).
# `pd.np` is deprecated and removed in pandas 2.0 — use numpy (`np`) directly.
penalised_wins = (
    temp_first_cases[temp_first_cases['is_success'] == "TRUE"]
    .replace(0, np.nan)
    .dropna(subset=['penalty'])
    .sort_values(by='year')
)
fig = px.box(
    penalised_wins,
    x="is_lawyer_exist",
    y="penalty",
    title="Penalty Difference with Lawyer"
)
fig.update_yaxes(type="log")
fig.write_html("/Users/starice/OwnFiles/cityu/RA/case_study/case_study_result/figures/plaintiffs/html/penalty_have_lawyer.html")
fig.show()
# Observation: cases with a lawyer generally carry higher penalties than cases without one.


The pandas.np module is deprecated and will be removed from pandas in a future version. Import numpy directly instead



In [20]:
# Relationship between the `objectmoney` value of a case and the presence of a lawyer.
# Zeros are turned into NaN and dropped so that only cases with a positive
# value remain (the y axis is log-scaled, which cannot show 0).
# `pd.np` is deprecated and removed in pandas 2.0 — use numpy (`np`) directly.
cases_with_objectmoney = (
    temp_first_cases
    .replace(0, np.nan)
    .dropna(subset=['objectmoney'])
    .sort_values(by='year')
)
fig = px.box(
    cases_with_objectmoney,
    x="is_lawyer_exist",
    y="objectmoney",
    title="Object Money Difference with Lawyer"
)
fig.update_yaxes(type="log")
fig.write_html("/Users/starice/OwnFiles/cityu/RA/case_study/case_study_result/figures/plaintiffs/html/objm_have_lawyer.html")

fig.show()


The pandas.np module is deprecated and will be removed from pandas in a future version. Import numpy directly instead



In [27]:
# Do cases with a lawyer have a higher `objectmoney` value?
# Fit a logistic regression that predicts lawyer presence from objectmoney alone
# and inspect the classification report.
from sklearn.linear_model import LogisticRegressionCV
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Keep only first-instance rows with a non-zero, non-missing objectmoney.
logistic_om_lw = temp_first_cases.replace(0, np.nan).dropna(subset=['objectmoney'])[['case_id', 'objectmoney', 'is_lawyer_exist']]

X = logistic_om_lw[['objectmoney']].values
y = logistic_om_lw['is_lawyer_exist'].values

# Fix the split seed so the report below is reproducible across runs
# (previously the split was re-drawn randomly on every execution).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=10)

# Standardise the single feature, then fit a 5-fold cross-validated logistic regression.
pipe = make_pipeline(StandardScaler(), LogisticRegressionCV(cv=5, random_state=10))
pipe.fit(X_train, y_train)

y_pred = pipe.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

       False       0.61      0.92      0.73      3891
        True       0.75      0.27      0.40      3218

    accuracy                           0.63      7109
   macro avg       0.68      0.60      0.57      7109
weighted avg       0.67      0.63      0.58      7109



In [17]:
# y: win rate, penalty (lawyers have little effect), whether the court is willing to make the defendant pay a penalty
# For communities of plaintiffs and lawyers: are their penalties higher, are the claim amounts larger, what is their win rate? Do the defendants they target share any characteristics?

In [18]:
# louvain bipartite graph community detection (unfinished!)
# Open question: how can the clustering result be displayed in Python?
import leidenalg as la
import matplotlib.pyplot as plt
import igraph as ig

In [19]:
# Convert the NetworkX plaintiff–lawyer graph into an igraph Graph and check
# whether it is bipartite.
# NOTE(review): the recorded output is False. Presumably some names occur both
# as plaintiff and as lawyer (nodes are keyed by bare name), or a plaintiff is
# listed as their own lawyer — TODO confirm before running bipartite community detection.
G = ig.Graph.from_networkx(graph)
G.is_bipartite()

False