In [None]:
from pymongo import MongoClient
import pandas as pd
import numpy as np

# Let DataFrame displays show up to 500 characters per cell before truncating,
# so the long text columns handled in this notebook stay readable.
pd.set_option('max_colwidth', 500)

In [None]:
import matplotlib.pyplot as plt
import matplotlib 

# Apply the 'ggplot' style sheet to every figure drawn after this point.
matplotlib.style.use('ggplot')

In [None]:
# Connect to the MongoDB server and select the `lawCase` database.
# NOTE(review): host and port are hard-coded; consider moving them into
# configuration so the notebook can run against another server.
conn = MongoClient('172.19.241.248', 20000)
db = conn.lawCase

In [None]:
# Collect one [document id, cause-of-action code] pair per record of the
# `paragraph` collection.
set1 = db.paragraph

# Only these two fields are read, so ask the server for just them instead of
# transferring every complete document.
statute_fields = {'fullTextId': 1, 'codeOfCauseOfAction': 1}
statutes = [
    [item['fullTextId'], item['codeOfCauseOfAction']]
    for item in set1.find({}, statute_fields)
]

In [None]:
# Turn the collected [id, code] pairs into a two-column frame.
statutes_df = pd.DataFrame(statutes, columns=['id', 'statute_code'])
statutes_df

In [None]:
def _segment_token(doc, field):
    """Return ``doc[field]['token']``, or None when it is unavailable.

    None is returned when ``field`` is absent from the document, when its
    value is None, or when the value has no ``'token'`` entry.
    """
    segment = doc.get(field)
    if segment is not None and 'token' in segment:
        return segment['token']
    return None


# Segments copied out of every `AJsegment` document, in output column order.
SEGMENT_FIELDS = ('plaintiffAlleges', 'defendantArgued', 'factFound')

# One row per document: its id followed by the token value of each segment.
set2 = db.AJsegment
seg_data = [
    [item['fulltextid']] + [_segment_token(item, field) for field in SEGMENT_FIELDS]
    for item in set2.find()
]
len(seg_data)

In [None]:
# One row per segment document: its id plus the plaintiff, defendant and
# fact-finding token values (None where a segment had none).
seg_df = pd.DataFrame(seg_data, columns=['id', 'plaintiff', 'defendant', 'fact'])
seg_df

In [None]:
# Join segments with statute codes on the document id; the inner join keeps
# only the ids that appear in both frames.
all_info = pd.merge(seg_df, statutes_df, on='id', how='inner')
all_info

In [None]:
def statute_col_helper(x):
    """Normalise one statute-code cell, mapping missing or empty values to None.

    Parameters
    ----------
    x : a sized value (e.g. str or list), None, or float NaN.

    Returns
    -------
    ``x`` unchanged when it is non-empty, otherwise None.
    """
    # None and NaN have no len(); treat them as missing instead of raising TypeError.
    if x is None or (isinstance(x, float) and x != x):
        return None
    return x if len(x) > 0 else None
# Replace empty statute codes with None so they are treated as missing values.
all_info['statute_code'] = all_info['statute_code'].apply(statute_col_helper)


In [None]:
import re
# Patterns that strip the introductory clause of a judgment segment.  Each one
# consumes everything on the line up to and including the keyword and the
# full-width comma or colon that follows it.  The text is space-separated
# (see the sample below), hence the `\s*` between the keyword's characters.
p_re = re.compile(r'.*诉\s*称\s*，\s*|.*诉\s*称\s*：\s*')
# This used to be `.*辩称\s*，|：\s*`; the alternation split the pattern so that
# its second branch matched every bare full-width colon anywhere in the text.
d_re = re.compile(r'.*辩\s*称\s*，\s*|.*辩\s*称\s*：\s*')
f_re = re.compile(r'.*审理\s*查明\s*，\s*|.*审理\s*查明\s*：\s*|.*本院.*事实.*?：\s*')


# Quick check of the fact-finding pattern on one sample paragraph.
text = '''
依据 本院 审核 认定 的 证据 及 当事人 陈述 查明 的 事实 如下 ： 2011 年 10 月 13 日 ， 李 某某 向 沈 某某 借款 100000 元 ， 并 出具 借条 一份 ， 内容 为 ： “ 今 借到 沈 某某 人民币 拾 万元 整 ， 此 据 借款人 ： 李 某某 。 还款 日期 2012 年 4 ． 13 日 ” 并 注明 住址 及 身份证 号码 。 2012 年 10 月 22 日 ， 李 某某 向 沈 某某 借款 20000 元 ， 并 出具 借条 一份 ， 内容 为 ： “ 今 借到 沈 某某 人民币 贰 万元 整 ， 此 据 借款人 ： 李 某某 。 此...
'''
text = f_re.sub('', text)
print(text)

In [None]:
def fact_col_helper(row):
    """Derive the cleaned text for one row holding 'fact' and 'plaintiff' values.

    Returns
    -------
    * None when 'fact' is null;
    * 'fact' with every `f_re` match removed and surrounding whitespace
      stripped, when `f_re` matches it;
    * otherwise 'plaintiff' with every `p_re` match removed and stripped,
      when 'plaintiff' is not null and `p_re` matches it;
    * None in all remaining cases.
    """
    fact = row['fact']
    if pd.isnull(fact):
        return None

    if f_re.search(fact):
        return f_re.sub('', fact).strip()

    # The fact segment lacks the expected lead-in: fall back to the plaintiff's claim.
    plaintiff = row['plaintiff']
    if pd.notnull(plaintiff) and p_re.search(plaintiff):
        return p_re.sub('', plaintiff).strip()

    return None

In [None]:
# Work on an independent copy of the first 1000 rows: a column is added to
# this sample later, and doing that on a bare `head()` slice triggers pandas'
# SettingWithCopyWarning.
example = all_info.head(1000).copy()

In [None]:
import time
# Time the row-wise extraction on the 1000-row sample.
start = time.time()
# fact_col_helper reads more than one column of a row, so it is applied
# row-wise (axis=1) rather than to the 'fact' column alone.
example['fact_1'] = example.apply(fact_col_helper, axis=1)
end = time.time()
print(end-start)  # elapsed seconds
# Show the raw and the cleaned text side by side.
example.loc[:,['fact','fact_1']]

In [None]:
# Compute the cleaned `text` column for every row of the merged frame.
all_info['text'] = all_info.apply(fact_col_helper, axis=1)

In [None]:
# Keep only the document id, the cleaned text and the statute code.
data_df = all_info.loc[:,['id', 'text', 'statute_code']]
# data_df.to_csv('data/data.csv', index=0)

In [None]:
# Drop, in place, every row in which any column is missing.
data_df.dropna(how='any',inplace=True)

In [None]:
data_df['statute_code'].value_counts()

In [None]:
data_df

In [None]:
import pickle
# Load the pickled mapping that is used below to turn statute codes into labels.
# NOTE(review): pickle.load can execute arbitrary code; only load this file
# when it comes from a trusted source.
with open('data/code_map.dict', 'rb') as fp:
    code_dict = pickle.load(fp)

In [None]:
# Look every statute code up in `code_dict`; codes that are not keys of the
# mapping get None.
# NOTE(review): `label()` further down looks up `int(x)` while this lookup uses
# the code as stored - confirm which key type `code_dict` actually has.
data_df['label'] = data_df['statute_code'].apply(code_dict.get)

In [None]:
data_df

In [None]:
# Select the rows whose statute code lies in ['9130', '9300').  The bounds are
# strings, so the comparison is lexicographic rather than numeric.
tmp = data_df[(data_df['statute_code'] >= '9130') & (data_df['statute_code'] < '9300')]
len(tmp)

In [None]:
# Count the rows per statute code within the selected range.
grouped = tmp.groupby('statute_code')
# Use the group-by's own `size()`: passing `np.size` to `agg` relies on a
# callable-to-method translation that recent pandas releases deprecate.
t = grouped['text'].size()

In [None]:
t

In [None]:
data_df.groupby('label').count()

In [None]:
# Load the statute code table from CSV.  Note that this rebinds `statutes`,
# which earlier in the notebook held the list of rows read from MongoDB.
statutes = pd.read_csv('data/statute_code.csv')
statutes

In [None]:
# Captures the four-digit code that sits immediately before "-9000".
level = re.compile(r'(\d{4})-9000')


def code_helper(x):
    """Extract the four-digit code directly preceding ``-9000`` in ``x``.

    Returns the captured digits as a string, or None when ``x`` does not
    contain the pattern or is not a string (for example the NaN that pandas
    uses for an empty CSV cell).
    """
    # A regex search on a non-string value would raise TypeError.
    if not isinstance(x, str):
        return None
    found = level.search(x)
    return found.group(1) if found else None
# Store the code extracted from each row's 'tree' value in a new 'std' column.
statutes['std'] = statutes['tree'].apply(code_helper)

In [None]:
statutes.groupby('std').count()

In [None]:
# Rows whose extracted 'std' code is '9130'.
statutes_9130 = statutes[statutes['std']=='9130']
statutes_9130

In [None]:
statutes[statutes['std']=='9130']

In [None]:
def label(x):
    """Return the `code_dict` entry for ``int(x)``, or None when there is none.

    Any error raised by ``int(x)`` for a non-numeric value propagates to the
    caller.
    """
    key = int(x)
    return code_dict.get(key)