In [17]:
import pandas as pd
import numpy as np

# tf-idf
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer

# word2vec
from gensim.models.word2vec import Word2Vec
from gensim.models.word2vec import LineSentence

# scatter plot
from mpl_toolkits.mplot3d import Axes3D
from sklearn.manifold import TSNE
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# 解决matplotlib以及seaborn中文字方块的问题
from pylab import mpl
import seaborn as sns

# 3D plot
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.pyplot as plt

# k-means
from sklearn.cluster import KMeans
from sklearn import cluster
from sklearn import metrics

# 分词方程： seperater
import jieba
import re
import jieba.posseg as pseg

In [149]:
icd9v2 = pd.read_csv(r'~\Desktop\icd9_cm3\icd9_cm3.csv',encoding = 'gb18030')
icd9v3 = icd9v2[icd9v2['detail_name'] != '(null)']
icd9v3 = icd9v3.reset_index(drop=True)
icd9v3['detail_name'].describe()

count       10216
unique       9658
top       子宫楔形切除术
freq            3
Name: detail_name, dtype: object

### 制作字典
- 目标：合并原先被错误分词的词语，去掉没有意义的高频次
- 资源来源
    - 北京大学云中心的“提交-字典数据”
    - [王晔](wangye@wondersgroup.com)提供的“组织、部位与治疗方式.xlsx与新词探查工具

In [6]:
# 已知字典导入
body_part = pd.read_excel(r'C:\Users\MYTh_\Desktop\组织、部位与治疗方式.xlsx',encoding = 'gb18030')
body_part2 = pd.read_excel(r'C:\Users\MYTh_\Desktop\提交-词典数据.xlsx',encoding = 'gb18030',sheetname = '手术及治疗')

# 未知字典制作：通过正反向最大词向量在原数据中发现新词
fs = pd.read_csv(r'~\Desktop\1.csv',encoding = 'utf-8', header = [-1])
bs = pd.read_csv(r'~\Desktop\2.csv',encoding = 'utf-8', header = [-1])

In [35]:
fsbox = []

for word in fs[0]:
    temp = word.split(' ')
    for i in range(len(temp)):
#        if len(temp[i]) == 1 or len(temp[i]) > 6:   #只选取字符长度 < = 5 的术式名称
#            continue
#        else:
            fsbox.append(temp[i])

bsbox = []

for word in bs[0]:
    temp = word.split(' ')
    for i in range(len(temp)):
        bsbox.append(temp[i])     

In [153]:
#字典建立
dictionary = []

#导入第一本字典
for i in range(len(body_part.columns)):
    temp = body_part[body_part.columns[i]].dropna()    
    for j in range(len(temp)):
        dictionary.append(temp[j])

#导入第二本字典
for i in range(len(body_part2['词语'])):
    if len(str(body_part2['词语'][i])) <  5:
        dictionary.append(len(body_part2['词语'][i]))
    else:
        continue

# 导入正向查找字典
for word in fsbox:
    dictionary.append(word)
    
# 导入反向查找字典
for word in bsbox:
    dictionary.append(word)

np.savetxt(r'C:\Users\MYTh_\Desktop\wyfc_2015\dictionary.txt', dictionary, fmt='%s',encoding = 'utf-8')

In [44]:
# 不同结构的字典，为了后续的分析而建立
list_dic = []

for word in dictionary:
    list_dic.append(re.findall('[\u4E00-\u9FA5A-Za-z]+',str(word)))   

In [19]:
# 得到并保存去掉不需要字符且没有被分割的手术名称

for i in range(len(icd9v3.detail_name)):      
    hans = re.findall('[\u4E00-\u9FA5A-Za-z]+',icd9v3.detail_name[i])
    combine = ''.join(hans)
    icd9v3.loc[i, 'clean_dn'] = combine

np.savetxt(r'C:\Users\MYTh_\Desktop\wyfc_2015\test.txt', icd9v3.clean_dn, fmt='%s',encoding = 'utf-8')

In [118]:
# 分词方程： seperater

for word in dic:
    jieba.suggest_freq(word, True)   #导入字典

def seperater(param):
        
    hans_before = "".join(re.findall('[\u4E00-\u9FA5A-Za-z]+',param))
    hans_after = re.sub(u"(术+)|(的+)|(病+)","",hans_before)
    after_seg = ''.join(hans_after)
    
    seg_list = jieba.cut(after_seg, cut_all=False)
    temp = " ".join(seg_list)

    return temp;

In [119]:
# 执行分词并保存
for i in range(len(icd9v3)):
    icd9v3.loc[i, 'tokens'] = seperater(icd9v3.detail_name[i])
    
np.savetxt(r'C:\Users\MYTh_\Desktop\icd9_cm3\segment.txt', icd9v3.tokens.values, fmt='%s',encoding = 'utf-8')   

### 特征词提取
一般情况下，名词和动词比其他词性的词重要。另外，词语包括的字数越多，包含的信息量越大。论文中定义了一种基于词性和词长度的特征词权重计算公式，即
$Weight{w_{id}} = \lambda Weight_{pos}(w_{id})+(1-\lambda)Weight_{len}(w_{id})$
                
式中：$Weight(w_{id})$表示词语$w_i$在文本$d$中的权重，$Weight_{pos}(w_{id})$表示$$w_i在文本d中的词性权重，$Weight_{len}(w_{id})$表示$w_i$在文本d中的长度权重，$\lambda$和$(1-\lambda)$为加权系数，$\lambda$取经验值0.6。$Weight_{pos}(w_{id})$和$Weight_{len}(w_{id})$的具体计算公式为



In [133]:
# 权重分词 
posdf = pd.DataFrame()

import jieba.posseg as pseg

for i in range(len(icd9v3.tokens)):
    words = pseg.cut(icd9v3.tokens[i])
    for word1,j in zip(words,range(33)):
        posdf.loc[i,j] = '%s'%word1

In [140]:
jieba.suggest_freq('引导下', True)
words = pseg.cut('引导下')
for word in words:
    print ('%s'  % word)

引导下/x


In [134]:
token_pos = posdf[[0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,31,32]]

### 词向量模型训练

In [122]:
# 词向量训练
model = Word2Vec(sentences = LineSentence(r'~\Desktop\icd9_cm3\segment.txt'),
                 size = 500, 
                 min_count = 1,
                 sg=1)  
model.save('op_model.model')

  "C extension not loaded, training will be slow. "


In [130]:
# 高频词查找

freq_count = []

for w in model.wv.vocab:
        freq_count.append([w,model.wv.vocab[w].count])
token_freq = pd.DataFrame(data = freq_count, columns = ['token_name','frequency'])
np.savetxt(r'C:\Users\MYTh_\Desktop\icd9_cm3\high_freq.txt', token_freq.sort_values(by = ["frequency"],ascending = False), fmt='%s',encoding = 'utf-8')
token_freq.sort_values(by = ["frequency"],ascending = False).head(5)

In [55]:
hf_writer = pd.ExcelWriter(r'C:\Users\MYTh_\Desktop\icd9_cm3\high_freq.xlsx')
token_freq.sort_values(by = ["frequency"],ascending = False).to_excel(hf_writer,'Sheet1')
hf_writer.save()

In [123]:
# 训练范围：全部词
# 求出每个手术名称的词向量均值
vector_list = []

for i in range(len(icd9v3.tokens)):
    vector_list.append(np.mean(model[list(icd9v3.tokens[i].split(' '))], axis = 0))

  after removing the cwd from sys.path.


In [71]:
# 训练范围：匹配词
# 求出每个手术名称的词向量均值
vector_list = []

for i in range(len(matched_tokens)):
    vector_list.append(np.mean(model[list(matched_tokens[i].split(' '))], axis = 0))

  """


In [124]:
# k均值聚类
# 聚类数量：默认4000；如果需要测试匹配词需要调低
kmeans = cluster.KMeans(n_clusters = 4000 ,max_iter = 2000)
kmeans.fit(vector_list)
labels= kmeans.predict(vector_list)

In [152]:
# 标记所有手术名称并保存
icd9v3["label"] = labels
icd9v3_sorted = icd9v3.sort_values(by = ['label'], ascending = (True))
labeled_icd9 = icd9v3_sorted[['detail_code','detail_name','label']]
writer = pd.ExcelWriter(r'C:\Users\MYTh_\Desktop\icd9_cm3\labeled_ICD9-CM3_test.xlsx')
labeled_icd9.to_excel(writer,'Sheet1')
writer.save()

In [148]:
print("匹配成功的分词有：" + str(len(matched_tokens)))
print("匹配失败的分词有：" + str(len(unmatched_tokens)))

匹配成功的分词有：7546
匹配失败的分词有：2670


### 测试代码
1. 匹配词分类查看
2. 匹配词计数

In [81]:
# 匹配词单独分类文件保存

matched = pd.DataFrame()
matched['sp_names'] = matched_tokens
matched["label"] = labels

matched = matched.sort_values(by = ['label'], ascending = (True))

for i in range(len(matched)):
    matched.loc[i,'detail_name'] = "".join(matched.sp_names[i].split(' '))

writer = pd.ExcelWriter(r'C:\Users\MYTh_\Desktop\icd9_cm3\match-labeled_ICD9-CM3_test.xlsx')
matched.to_excel(writer,'Sheet1')
writer.save()

Unnamed: 0,sp_names,label,detail_name
4692,女性 去势 术,0,女性去势术
1248,咽鼓管 通气 术,0,咽鼓管通气术
1250,咽鼓管 通气 术,0,咽鼓管通气术
4858,无痛 刮宫 术,0,无痛刮宫术
5166,下颌 磨削 术,0,下颌磨削术


In [147]:
# 匹配/不匹配词计数

matched_tokens = []
unmatched_tokens= []

for i in range(len(icd9v3.tokens)):
    temp = icd9v3.tokens[i].split(' ')
    for word in temp:
        if word in dictionary:
            matched_tokens.append(icd9v3.tokens[i])
            break
        else:
            unmatched_tokens.append(icd9v3.tokens[i])
            break  
            
print("匹配成功的分词有：" + str(len(matched_tokens)))
print("匹配失败的分词有：" + str(len(unmatched_tokens)))