In [1]:
import jieba
import json
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import TfidfTransformer


In [2]:
# Load the user dictionary at ../resources/law_lexicon.txt into jieba so that
# later calls to jieba.cut() can use its entries.
jieba.load_userdict("../resources/law_lexicon.txt")
# NOTE(review): absolute, machine-specific path -- the notebook cannot run on
# another machine without editing this line.
DATA_PATH = '/Users/mac/Desktop/law_raw_data_mini/criminal_0215.json'
# Stop-word file read by load_stop_word_list().
STOPWORD_PATH = '../resources/stop_word.txt'
# Upper bound used by the data-loading loop and by the random spot-check cell.
RUN_LIMIT = 200000

# One word-count dict per processed record; preprocessing() appends to it.
BOWs = []
# Placeholder; reassigned from load_stop_word_list() before the data is loaded.
stop_words = set()

Building prefix dict from the default dictionary ...
DEBUG:jieba:Building prefix dict from the default dictionary ...
Dumping model to file cache /var/folders/bc/vmq9t0692mb03xmqfsm130fm0000gn/T/jieba.cache
DEBUG:jieba:Dumping model to file cache /var/folders/bc/vmq9t0692mb03xmqfsm130fm0000gn/T/jieba.cache
Loading model cost 1.782 seconds.
DEBUG:jieba:Loading model cost 1.782 seconds.
Prefix dict has been built succesfully.
DEBUG:jieba:Prefix dict has been built succesfully.


In [3]:
def preprocessing(line):
    """Turn one parsed record into a bag of words and append it to ``BOWs``.

    The text is taken from the record's ``'court_idea'`` field, segmented
    with ``segment_to_words``, filtered with ``remove_stop_words`` and
    counted with ``build_BOW``.

    A record whose ``'court_idea'`` is missing or empty is treated as empty
    text instead of passing ``None`` to the segmenter. An (empty) bag is
    still appended, so ``BOWs`` keeps exactly one entry per record handed to
    this function.

    :param line: mapping for one record (the result of ``json.loads``).
    :return: None; the result is appended to the module-level ``BOWs``.
    """
    # `or ''` covers both a missing key (None) and an explicit null/empty value.
    content = line.get('court_idea') or ''
    tokens = remove_stop_words(segment_to_words(content))
    BOWs.append(build_BOW(tokens))


def build_BOW(word_list):
    """Count how many times each word occurs.

    :param word_list: any iterable of hashable tokens.
    :return: dict mapping each distinct token to its number of occurrences.
    """
    counts = {}
    for token in word_list:
        if token in counts:
            counts[token] += 1
        else:
            counts[token] = 1
    return counts

def segment_to_words(sent):
    """Segment ``sent`` with ``jieba.cut`` and return its result unchanged.

    :param sent: the text to segment.
    :return: whatever ``jieba.cut`` returns for ``sent``.
    """
    return jieba.cut(sent)

def remove_stop_words(word_list):
    """Drop every word whose UTF-8 encoding is in the global ``stop_words``.

    :param word_list: iterable of text tokens (each must support
        ``.encode('utf8')``).
    :return: the result of the built-in ``filter`` over ``word_list``.
    """
    def _is_not_stop_word(word):
        # Membership is tested on the UTF-8 encoded form of the token.
        return word.encode('utf8') not in stop_words

    return filter(_is_not_stop_word, word_list)

def load_stop_word_list(word_path=None):
    """Read the stop-word file and return its entries as a set.

    Each line of the file contributes one entry, with surrounding whitespace
    (including the line terminator) removed.

    :param word_path: path of the stop-word file; defaults to the
        module-level ``STOPWORD_PATH`` when omitted.
    :return: set of stop words, one per line of the file.
    """
    if word_path is None:
        word_path = STOPWORD_PATH
    stop_words = set()
    with open(word_path, 'r') as f:
        for line in f:
            # strip() removes the trailing newline together with any padding.
            # Slicing off the last character first would truncate the final
            # word whenever the file does not end with a newline.
            # NOTICE: if the stop-word list is replaced by one in a different
            # format, this parsing may need a small adjustment.
            stop_words.add(line.strip())
    return stop_words

def print_top_words(model, feature_names, n_top_words):
    """Print the highest-weighted words of every topic in ``model``.

    For each row of ``model.components_`` one line of the form
    ``"Topic #<idx>: w1 w2 ..."`` is printed, followed by a blank line.

    :param model: object exposing ``components_``, an iterable of per-topic
        weight arrays that support ``argsort()``.
    :param feature_names: sequence mapping a column index to its word.
    :param n_top_words: how many words to list per topic.
    :return: the line printed for the LAST topic only, or ``""`` when the
        model has no topics.
        NOTE(review): callers store this as "keywords"; confirm that only
        the last topic's line is really what is wanted.
    """
    # Initialised up front so a model without topics returns "" instead of
    # failing on an unbound local.
    message = ""
    for topic_idx, topic in enumerate(model.components_):
        # Indices of the n_top_words largest weights, largest first.
        top_indices = topic.argsort()[:-n_top_words - 1:-1]
        message = "Topic #%d: " % topic_idx + " ".join(
            [feature_names[i] for i in top_indices])
        print(message)
        # print("") emits a blank line under both the print statement and
        # the print function.
        print("")
    print("")
    return message

In [4]:
# The stop-word set must be in place first: remove_stop_words() reads the
# module-level `stop_words` while preprocessing() runs.
stop_words = load_stop_word_list()

# Number of records successfully parsed and handed to preprocessing().
position = 0
with open(DATA_PATH, 'r') as data_file:
    # Iterating the file object stops at end of file. readline() returns ''
    # there rather than raising EOFError, so a `while True` loop that
    # swallows the resulting JSON error never terminates on a short file.
    for raw_line in data_file:
        # Stop once exactly RUN_LIMIT records have been loaded.
        if position >= RUN_LIMIT:
            break
        try:
            line = json.loads(raw_line)
        except ValueError:
            # Best effort: skip blank or malformed lines. Only the JSON
            # parsing is guarded, so errors raised inside preprocessing()
            # are no longer hidden.
            continue
        preprocessing(line)
        position += 1

In [6]:
def build_model(n_topics=5, n_top_words=20, max_iter=200):
    """Fit an LDA topic model on the bags of words collected in ``BOWs``.

    The word-count dicts are vectorised with ``DictVectorizer`` and the raw
    counts are fed to ``LatentDirichletAllocation`` (online learning, fixed
    ``random_state`` so runs are repeatable). The top words of every topic
    are printed via ``print_top_words``.

    :param n_topics: number of topics to fit (default 5).
    :param n_top_words: number of words printed per topic (default 20).
    :param max_iter: maximum number of LDA iterations (default 200).
    :return: ``(keywords, doc_topic_matrix)`` where ``keywords`` is the value
        returned by ``print_top_words`` and ``doc_topic_matrix`` is
        ``lda.transform`` applied to the training data (one row per entry
        of ``BOWs``).
    """
    vectorizer = DictVectorizer()
    training_data = vectorizer.fit_transform(BOWs)

    # NOTE(review): newer scikit-learn releases rename `n_topics` to
    # `n_components` and `get_feature_names()` to `get_feature_names_out()`;
    # the spellings below match the version this notebook was written for.
    lda = LatentDirichletAllocation(n_topics=n_topics, max_iter=max_iter,
                                    learning_method='online',
                                    learning_offset=50.,
                                    random_state=0)
    lda.fit(training_data)
    keywords = print_top_words(lda, vectorizer.get_feature_names(), n_top_words)
    doc_topic_matrix = lda.transform(training_data)

    return keywords, doc_topic_matrix

In [None]:
# Fit the topic model on everything collected in BOWs. `keywords` is the
# string returned by print_top_words(); `maxtrix` is the result of
# lda.transform() on the training data and is indexed by the spot-check cell.
keywords, maxtrix = build_model()

In [40]:
import random
import linecache

# Number of random documents to print for a manual sanity check.
CHECK_LIMIT = 10

# Only sample documents that actually have a row in the topic matrix.
n_checkable = min(RUN_LIMIT, len(maxtrix))
n_checks = CHECK_LIMIT if n_checkable > 0 else 0

for _ in range(n_checks):
    # `r` is a 1-based line number, as expected by linecache.getline().
    r = random.randrange(1, n_checkable + 1)
    try:
        # getline() returns '' when the line cannot be read; that fails JSON
        # parsing and the sample is skipped, like any malformed line.
        line = json.loads(linecache.getline(DATA_PATH, r))
    except ValueError:
        continue

    # File line r corresponds to matrix row r - 1 (rows are 0-based).
    # NOTE(review): this assumes no earlier line was skipped while loading;
    # skipped lines shift the rows relative to the file.
    top_distribution = maxtrix[r - 1]
    # Index of the most probable topic (first one on ties).
    index = int(top_distribution.argmax())

    # Parenthesised single-argument prints behave the same under the print
    # statement and the print function.
    print('topic %d' % index)
    print('')
    print(line.get('court_idea'))
    print('')

topic 2

本院认为，被告人李丁伙同他人持械故意伤害他人身体，致一人轻伤，其行为已构成故意伤害罪，应依法予以惩处。公诉机关指控的犯罪事实清楚，证据确实充分，指控罪名成立。多名证人证言、被告人的供述及被害人的陈述予以相互印证，证实李丁参与持械殴打被害人，故其未参与殴打被害人的辩解本院不予采纳。被害人对本案的发生存在一定过错，可以酌情对被告人从轻处罚。为保护公民人身权利不受侵犯，根据《中华人民共和国刑法》第二百三十四条第一款、第二十五条第一款之规定，

topic 0

综上，本院认为：被告人毛某捏造并通过互联网传播虚伪事实，损害他人商业信誉，情节严重，其行为已构成损害商业信誉罪，应予惩处。公诉机关的指控成立。鉴于被告人毛某犯罪后能自动投案，可酌情从轻处罚；同时结合本案的具体情况，可予酌情从轻处罚。公诉机关提出对被告人毛某判处有期徒刑六个月至一年的量刑建议，可予采纳。依照《中华人民共和国刑法》第二百二十一条、第六十四条之规定，

topic 2

本院认为，被告人梅某以营利为目的开设赌场，抽头获利人民币50000余元，其行为已构成开设赌场罪，公诉机关指控的罪名成立。关于被告人梅某及其辩护人提出的被告人梅某仅获利1万余元的意见。本院经查认为，被告人梅某于2013年清明前后开设赌场，实际开设时间50余天，抽头获利5万余元的事实，有被告人梅某在公安侦查阶段的供述，证人胡某、朱某甲、杨某、余某、王某、应某某、刘某、陈某乙的证言予以证实，上述证人均经常在梅某赌场赌博，对梅某抽头获利的证言应予以采信，故梅某开设赌场抽头获利5万余元的事实清楚、证据充分，被告人梅某及其辩护人就此提出的意见，本院不予采纳。被告人梅某有劣迹，酌情从重处罚。被告人自愿认罪，酌情从轻处罚。被告人的违法所得应予以追缴。依照《中华人民共和国刑法》第三百零三条第二款、第六十四条之规定，

topic 2

本院认为，被告人闫某作为广西省北海市1040工程传销组织的领导者和组织者，扰乱经济社会秩序，其行为已构成组织、领导传销活动罪，公诉机关指控的罪名及事实成立，本院予以确认。案发后被告人闫某自动到案，如实供述犯罪事实，系自首，可从轻处罚。依照《中华人民共和国刑法》第二百二十四条、第五十二条、第五十三条、第六十七条、第七十二条、第七十三条之规定，

topic 2

本院认为，被告人李某某违反狩猎法规，在禁猎区、