## 1.读入数据

In [1]:
import pandas as pd
import numpy as np
import re
from scipy.spatial.distance import cosine
from sklearn.metrics import pairwise_distances
from scipy import sparse


In [2]:
table = pd.read_excel('poem_v2.xlsx')
word_list_df = pd.read_excel('wordlist_v2.xlsx')

#选择繁体
table = table.drop('simple',axis=1) 
word_list_df = word_list_df.drop('simple',axis=1)

#hash，加快检索速度
word_list = {x:0 for x in word_list_df.to_dict('list')['word']} 
display(table)

Unnamed: 0,ID,Poem_id,line_number,content
0,1,4371,-100,##餞唐永昌( 一作餞唐郎中洛陽令)
1,2,4371,-1,$$沈佺期
2,3,4371,1,洛陽舊有( 一作出) 神明宰
3,4,4371,2,輦轂由來天地中
4,5,4371,3,餘邑政成何足貴
...,...,...,...,...
46272,46273,39205,-1,$$李舜弦
46273,46274,39205,1,飲筵中散酒微醒
46274,46275,39205,2,濛濛雨草瑤階溼
46275,46276,39205,3,鐘曉愁吟獨倚屏


## 2.分词

In [3]:
max_length = word_list_df.word.map(len).max()
def find_chinese(file):
    pattern = re.compile(r'[^\u4e00-\u9fa5]')
    chinese = re.sub(pattern, '', file)
    return chinese

def cut(string, word_list, max_length):
    # TODO
    res = []
    string = find_chinese(string)
    while len(string):
        s = string[0:max_length]
        for i in range(len(s)-1):
            if s in word_list:
                break
            s = s[0:-1] #'前向一位' 
        if s != '' :
            res.append(s) 
        string = string[len(s):] #'消除匹配项'
    return res
table['words'] = table['content'].apply(lambda x: ' '.join(cut(x, word_list, max_length)))
display(table)

Unnamed: 0,ID,Poem_id,line_number,content,words
0,1,4371,-100,##餞唐永昌( 一作餞唐郎中洛陽令),餞 唐 永昌 一作 餞 唐 郎中 洛陽 令
1,2,4371,-1,$$沈佺期,沈 佺 期
2,3,4371,1,洛陽舊有( 一作出) 神明宰,洛陽 舊有 一作 出 神明 宰
3,4,4371,2,輦轂由來天地中,輦轂 由來 天地 中
4,5,4371,3,餘邑政成何足貴,餘 邑 政成 何足 貴
...,...,...,...,...,...
46272,46273,39205,-1,$$李舜弦,李 舜弦
46273,46274,39205,1,飲筵中散酒微醒,飲 筵 中 散 酒 微 醒
46274,46275,39205,2,濛濛雨草瑤階溼,濛濛 雨 草 瑤階 溼
46275,46276,39205,3,鐘曉愁吟獨倚屏,鐘 曉 愁吟 獨倚 屏


In [4]:
# 按照空格分开，stack
split_words = table['words'].str.split(' ', expand=True).stack().rename('word').reset_index()
new_data = pd.merge(table['Poem_id'], split_words, left_index=True, right_on='level_0')
display(new_data)

Unnamed: 0,Poem_id,level_0,level_1,word
0,4371,0,0,餞
1,4371,0,1,唐
2,4371,0,2,永昌
3,4371,0,3,一作
4,4371,0,4,餞
...,...,...,...,...
198756,39205,46275,4,屏
198757,39205,46276,0,盡日
198758,39205,46276,1,池邊
198759,39205,46276,2,釣


## 3.1基于词-文档向量cosine距离的近义词挖掘

In [5]:
cnt = new_data.groupby(['Poem_id','word']).size() #单个Poem里出现的次数
total = new_data.groupby('Poem_id').size()  #单个Poem的总字数
df = new_data
df = pd.merge(df,cnt.rename('cnt'),on=['Poem_id','word'])
df = pd.merge(df,total.rename('total'),on='Poem_id')
df['tf'] = df['cnt'] / df['total']
display(df)

Unnamed: 0,Poem_id,level_0,level_1,word,cnt,total,tf
0,4371,0,0,餞,2,33,0.060606
1,4371,0,4,餞,2,33,0.060606
2,4371,0,1,唐,2,33,0.060606
3,4371,0,5,唐,2,33,0.060606
4,4371,0,2,永昌,1,33,0.030303
...,...,...,...,...,...,...,...
198756,39205,46275,4,屏,1,26,0.038462
198757,39205,46276,0,盡日,1,26,0.038462
198758,39205,46276,1,池邊,1,26,0.038462
198759,39205,46276,2,釣,1,26,0.038462


In [6]:
df = df.drop(['level_0','level_1'],axis=1).drop_duplicates() #消除Poem_id,word都重复的（该两个词在同一个诗里）
df['appeared'] = 1
appear = df.drop(['cnt','total'],axis=1).drop_duplicates().groupby('word').appeared.count() 
word_freq = appear.to_dict() #词在多少文章里出现

In [7]:
df['total_occurance'] = df['word'].map(word_freq)
df['idf'] = np.log2(len(new_data.Poem_id.unique()) / df['total_occurance'])
df['tf-idf'] = df['tf'] * df['idf']
display(df)
df = df.drop(['cnt','total','tf','idf','appeared'],axis=1)
df.reset_index(inplace=True,drop=True)

Unnamed: 0,Poem_id,word,cnt,total,tf,appeared,total_occurance,idf,tf-idf
0,4371,餞,2,33,0.060606,1,17,8.824304,0.534806
2,4371,唐,2,33,0.060606,1,185,5.380385,0.326084
4,4371,永昌,1,33,0.030303,1,3,11.326804,0.343236
5,4371,一作,2,33,0.060606,1,1711,2.171142,0.131584
7,4371,郎中,1,33,0.030303,1,54,7.156879,0.216875
...,...,...,...,...,...,...,...,...,...
198756,39205,屏,1,26,0.038462,1,11,9.452335,0.363551
198757,39205,盡日,1,26,0.038462,1,68,6.824304,0.262473
198758,39205,池邊,1,26,0.038462,1,15,9.004876,0.346341
198759,39205,釣,1,26,0.038462,1,30,8.004876,0.307880


In [8]:
display(len(df['word'].unique()),len(df['Poem_id'].unique()))

20122

7706

In [9]:
terms_df = df[df['total_occurance']>10].drop('total_occurance',axis=1)
terms_df = terms_df.set_index(['word','Poem_id']).unstack().fillna(0)
display(df,terms_df)

Unnamed: 0,Poem_id,word,total_occurance,tf-idf
0,4371,餞,17,0.534806
1,4371,唐,185,0.326084
2,4371,永昌,3,0.343236
3,4371,一作,1711,0.131584
4,4371,郎中,54,0.216875
...,...,...,...,...
192013,39205,屏,11,0.363551
192014,39205,盡日,68,0.262473
192015,39205,池邊,15,0.346341
192016,39205,釣,30,0.307880


Unnamed: 0_level_0,tf-idf,tf-idf,tf-idf,tf-idf,tf-idf,tf-idf,tf-idf,tf-idf,tf-idf,tf-idf,tf-idf,tf-idf,tf-idf,tf-idf,tf-idf,tf-idf,tf-idf,tf-idf,tf-idf,tf-idf,tf-idf
Poem_id,4371,4373,4394,4403,4408,4417,4418,4429,4431,4432,...,38901,38902,38905,38907,38910,39040,39092,39200,39204,39205
word,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
一,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,...,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
一一,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,...,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
一事,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,...,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
一人,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,...,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
一作,0.131584,0.0,0.0,0.0,0.0,0.0,0.0,0.103388,0.0,0.108557,...,0.083505,0.0,0.086846,0.086846,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
齋,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,...,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
齡,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,...,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
龍,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,...,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
龍門,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,...,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0


In [10]:
lst = [x[1] for x in terms_df.columns]
poem2idx = dict(zip(lst,range(len(lst))))
word2idx = dict(zip(terms_df.index,range(len(terms_df.index))))
idx2poem = dict(zip(range(len(lst)),lst))
idx2word = dict(zip(range(len(terms_df.index)),terms_df.index))
word_freq = df[['word','total_occurance']]
word_freq = word_freq.drop_duplicates()
word2freq = word_freq.set_index('word',drop=True).to_dict()['total_occurance']

In [11]:
terms_mat = terms_df.to_numpy()
sTerms_mat = sparse.lil_matrix(terms_mat)
word_rel = sparse.lil_matrix(1-pairwise_distances(sTerms_mat, metric="cosine"))
doc_rel = sparse.lil_matrix(1-pairwise_distances(sTerms_mat.T, metric="cosine"))

In [12]:
display(word_rel,doc_rel)

<2713x2713 sparse matrix of type '<class 'numpy.float64'>'
	with 1338659 stored elements in List of Lists format>

<7706x7706 sparse matrix of type '<class 'numpy.float64'>'
	with 21592446 stored elements in List of Lists format>

## 3.2基于上下文的近义词挖掘

In [13]:
word_dict = new_data.groupby(['word']).apply(lambda x: set(x.Poem_id)).to_dict()
def get_cross_merge(x):
    context = new_data[new_data.Poem_id.isin(word_dict[x.word])]
    m = pd.merge(pd.DataFrame({'word':[x.word]}),context,how='cross')
    return m
wc=new_data[['word']].drop_duplicates()
display(wc)
wc = wc.apply(get_cross_merge,axis=1).to_frame().reset_index()
wc = pd.concat(wc[0].tolist()).drop(['level_0','level_1'],axis=1).rename(columns={'word_x':'word','word_y':'context'})
wc

Unnamed: 0,word
0,餞
1,唐
2,永昌
3,一作
6,郎中
...,...
198656,髣佛
198673,竹錫
198712,青城
198718,物象


Unnamed: 0,word,Poem_id,context
0,餞,4371,餞
1,餞,4371,唐
2,餞,4371,永昌
3,餞,4371,一作
4,餞,4371,餞
...,...,...,...
21,禁花,39204,濃
22,禁花,39204,樹
23,禁花,39204,禁花
24,禁花,39204,開


In [14]:
word_dict

{'一': {4469,
  4470,
  4471,
  4472,
  4473,
  4751,
  4917,
  4965,
  5298,
  5312,
  5318,
  5363,
  5369,
  5370,
  5597,
  5791,
  5897,
  5902,
  5941,
  5944,
  5958,
  6434,
  6438,
  6443,
  6448,
  6454,
  6582,
  6900,
  6901,
  6911,
  6919,
  6920,
  6921,
  7557,
  7636,
  7689,
  7782,
  7810,
  7823,
  7824,
  7831,
  7910,
  8054,
  8057,
  8120,
  8156,
  8180,
  8241,
  8639,
  8647,
  8656,
  8718,
  8814,
  8940,
  8964,
  9969,
  10019,
  10024,
  10067,
  10346,
  10836,
  10842,
  10878,
  10892,
  10897,
  10911,
  10949,
  10950,
  10961,
  11088,
  11093,
  11256,
  11456,
  11546,
  11819,
  11973,
  12173,
  12182,
  12187,
  12197,
  12200,
  12459,
  12521,
  12542,
  12613,
  12653,
  12843,
  12855,
  12879,
  12881,
  12894,
  12899,
  12914,
  12921,
  13064,
  13203,
  13204,
  13220,
  13235,
  13406,
  13454,
  13458,
  13479,
  13708,
  13731,
  13752,
  13805,
  14070,
  14071,
  14200,
  14380,
  14423,
  14438,
  14449,
  14451,
  14460,
  14500

### 3.2.1 TF值

In [15]:
def get_tf(x):
    context = new_data[new_data.Poem_id.isin(word_dict[x.word])]
    tf = context['word'].value_counts().reset_index().rename(columns={'index':'word','word':'TF'})
    m = pd.merge(WORDLIST,tf,how='left').fillna(0)
    return m.TF.tolist()
WORDLIST = pd.DataFrame(new_data[['word']].drop_duplicates('word')).reset_index(drop=True)
display(WORDLIST)
WORDLIST['total'] = WORDLIST['word'].map(word2freq)
tf = WORDLIST[WORDLIST.total>10].apply(get_tf,axis=1).to_frame().reset_index()
tf.drop('index',inplace=True,axis=1)
tf.rename(columns={0:'TF'})
tf

Unnamed: 0,word
0,餞
1,唐
2,永昌
3,一作
4,郎中
...,...
20117,髣佛
20118,竹錫
20119,青城
20120,物象


Unnamed: 0,0
0,"[18.0, 4.0, 3.0, 8.0, 1.0, 2.0, 1.0, 1.0, 1.0,..."
1,"[4.0, 191.0, 3.0, 52.0, 2.0, 2.0, 4.0, 6.0, 1...."
2,"[5.0, 36.0, 1.0, 2474.0, 10.0, 21.0, 18.0, 14...."
3,"[2.0, 3.0, 1.0, 13.0, 54.0, 4.0, 4.0, 1.0, 1.0..."
4,"[2.0, 2.0, 1.0, 23.0, 3.0, 69.0, 2.0, 1.0, 1.0..."
...,...
2708,"[0.0, 0.0, 0.0, 12.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
2709,"[0.0, 1.0, 0.0, 6.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2710,"[0.0, 0.0, 0.0, 4.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2711,"[0.0, 0.0, 0.0, 68.0, 1.0, 0.0, 0.0, 1.0, 0.0,..."


### 3.2.3 IDF

In [16]:
wc2 = pd.DataFrame(wc.groupby('context').word.nunique().rename('DF')).reset_index()
wc2['IDF'] = np.log2(len(new_data.index) / wc2['DF'])
display(wc2)

Unnamed: 0,context,DF,IDF
0,一,5391,5.204338
1,一一,275,9.497387
2,一丈,68,11.513212
3,一上,132,10.556281
4,一世,92,11.077113
...,...,...,...
20117,龜山,97,11.000762
20118,龜文,27,12.845788
20119,龜榼,26,12.900235
20120,龜茲,55,11.819315


In [17]:
idf = wc2['IDF'].tolist()
idf

[5.204337985665881,
 9.497387371484958,
 11.51321233864664,
 10.556281060538526,
 11.077113223839966,
 11.81931546637232,
 13.60067517989698,
 13.015712679175824,
 12.900235461755887,
 9.133069629813981,
 11.124941748930581,
 9.497387371484958,
 11.956818990122255,
 10.611990493124814,
 11.928249837925485,
 9.86596555967114,
 4.3354996891080635,
 11.015712679175824,
 11.278747085009616,
 9.606321743038121,
 10.49215072311881,
 11.333888639202078,
 13.352747666453395,
 11.872754725333781,
 9.928249837925485,
 8.69378458428846,
 10.556281060538526,
 10.845787677733512,
 9.073198173836584,
 11.243123175278896,
 12.900235461755887,
 12.046086328219342,
 10.430750178454668,
 11.928249837925485,
 11.225635748550054,
 9.502643096936453,
 9.96405055935333,
 7.599266985504171,
 11.51321233864664,
 10.646478869510105,
 10.886429662230857,
 10.287792224612625,
 9.872754725333781,
 10.092880539698283,
 9.730310460313575,
 12.077113223839966,
 11.49215072311881,
 9.914174652713761,
 10.243123175278

### 3.2.3 TF-IDF

In [18]:
display(tf,idf)

Unnamed: 0,0
0,"[18.0, 4.0, 3.0, 8.0, 1.0, 2.0, 1.0, 1.0, 1.0,..."
1,"[4.0, 191.0, 3.0, 52.0, 2.0, 2.0, 4.0, 6.0, 1...."
2,"[5.0, 36.0, 1.0, 2474.0, 10.0, 21.0, 18.0, 14...."
3,"[2.0, 3.0, 1.0, 13.0, 54.0, 4.0, 4.0, 1.0, 1.0..."
4,"[2.0, 2.0, 1.0, 23.0, 3.0, 69.0, 2.0, 1.0, 1.0..."
...,...
2708,"[0.0, 0.0, 0.0, 12.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
2709,"[0.0, 1.0, 0.0, 6.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2710,"[0.0, 0.0, 0.0, 4.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2711,"[0.0, 0.0, 0.0, 68.0, 1.0, 0.0, 0.0, 1.0, 0.0,..."


[5.204337985665881,
 9.497387371484958,
 11.51321233864664,
 10.556281060538526,
 11.077113223839966,
 11.81931546637232,
 13.60067517989698,
 13.015712679175824,
 12.900235461755887,
 9.133069629813981,
 11.124941748930581,
 9.497387371484958,
 11.956818990122255,
 10.611990493124814,
 11.928249837925485,
 9.86596555967114,
 4.3354996891080635,
 11.015712679175824,
 11.278747085009616,
 9.606321743038121,
 10.49215072311881,
 11.333888639202078,
 13.352747666453395,
 11.872754725333781,
 9.928249837925485,
 8.69378458428846,
 10.556281060538526,
 10.845787677733512,
 9.073198173836584,
 11.243123175278896,
 12.900235461755887,
 12.046086328219342,
 10.430750178454668,
 11.928249837925485,
 11.225635748550054,
 9.502643096936453,
 9.96405055935333,
 7.599266985504171,
 11.51321233864664,
 10.646478869510105,
 10.886429662230857,
 10.287792224612625,
 9.872754725333781,
 10.092880539698283,
 9.730310460313575,
 12.077113223839966,
 11.49215072311881,
 9.914174652713761,
 10.243123175278

In [19]:
def mul(x):
    return [i * j for i,j in zip(x,idf)]
    
TFIDF = tf[0].apply(mul)
display(TFIDF)

0       [93.67808374198586, 37.98954948593983, 34.5396...
1       [20.817351942663525, 1814.000987953627, 34.539...
2       [26.021689928329405, 341.9059453734585, 11.513...
3       [10.408675971331762, 28.492162114454874, 11.51...
4       [10.408675971331762, 18.994774742969916, 11.51...
                              ...                        
2708    [0.0, 0.0, 0.0, 126.67537272646231, 0.0, 0.0, ...
2709    [0.0, 9.497387371484958, 0.0, 63.3376863632311...
2710    [0.0, 0.0, 0.0, 42.2251242421541, 0.0, 0.0, 0....
2711    [0.0, 0.0, 0.0, 717.8271121166198, 11.07711322...
2712    [0.0, 0.0, 0.0, 21.11256212107705, 0.0, 0.0, 0...
Name: 0, Length: 2713, dtype: object

### 3.3.4 转换成二维表、矩阵

In [20]:
TFIDF

0       [93.67808374198586, 37.98954948593983, 34.5396...
1       [20.817351942663525, 1814.000987953627, 34.539...
2       [26.021689928329405, 341.9059453734585, 11.513...
3       [10.408675971331762, 28.492162114454874, 11.51...
4       [10.408675971331762, 18.994774742969916, 11.51...
                              ...                        
2708    [0.0, 0.0, 0.0, 126.67537272646231, 0.0, 0.0, ...
2709    [0.0, 9.497387371484958, 0.0, 63.3376863632311...
2710    [0.0, 0.0, 0.0, 42.2251242421541, 0.0, 0.0, 0....
2711    [0.0, 0.0, 0.0, 717.8271121166198, 11.07711322...
2712    [0.0, 0.0, 0.0, 21.11256212107705, 0.0, 0.0, 0...
Name: 0, Length: 2713, dtype: object

In [22]:
M = TFIDF.to_numpy()
arr = []
for i in M:
    arr.append(i)
M = np.array(arr)
np.savetxt("tfidf.csv", M, delimiter=",")
M

array([[  93.67808374,   37.98954949,   34.53963702, ...,    0.        ,
           0.        ,    0.        ],
       [  20.81735194, 1814.00098795,   34.53963702, ...,    0.        ,
           0.        ,    0.        ],
       [  26.02168993,  341.90594537,   11.51321234, ...,    0.        ,
           0.        ,    0.        ],
       ...,
       [   0.        ,    0.        ,    0.        , ...,    0.        ,
           0.        ,    0.        ],
       [   0.        ,    0.        ,    0.        , ...,    0.        ,
           0.        ,    0.        ],
       [   0.        ,    0.        ,    0.        , ...,    0.        ,
           0.        ,    0.        ]])

In [23]:
sparse_M = sparse.lil_matrix(M)
rel = sparse.lil_matrix(1-pairwise_distances(sparse_M, metric="cosine"))
rel

<2713x2713 sparse matrix of type '<class 'numpy.float64'>'
	with 7360369 stored elements in List of Lists format>

In [24]:
rel.todense()

matrix([[1.        , 0.19871826, 0.37815449, ..., 0.08696722, 0.09894964,
         0.07150482],
        [0.19871826, 1.        , 0.31935075, ..., 0.11807408, 0.18956892,
         0.066039  ],
        [0.37815449, 0.31935075, 1.        , ..., 0.21939945, 0.40894026,
         0.11751335],
        ...,
        [0.08696722, 0.11807408, 0.21939945, ..., 1.        , 0.08959695,
         0.04738415],
        [0.09894964, 0.18956892, 0.40894026, ..., 0.08959695, 1.        ,
         0.04802765],
        [0.07150482, 0.066039  , 0.11751335, ..., 0.04738415, 0.04802765,
         1.        ]])

## 4.观察数据

### 4.1 词向量

In [25]:
display(word2idx['山月'])
word_rel.toarray()[780]

780

array([0.00723809, 0.        , 0.        , ..., 0.        , 0.        ,
       0.        ])

In [26]:
lst = word_rel[780].toarray().tolist()[0]
lst = [(x,y) for x,y in zip(range(len(lst)),lst)]
lst.sort(reverse=True,key=lambda x:x[1])
lst

[(780, 1.0),
 (918, 0.14188846365742513),
 (1383, 0.1381022567629029),
 (1290, 0.11110398727679272),
 (318, 0.10635337570415959),
 (1267, 0.09282717015978881),
 (2557, 0.09001383122034712),
 (1103, 0.08702173336585473),
 (1003, 0.08678214972615816),
 (2658, 0.08364810471299189),
 (486, 0.08029939638433381),
 (126, 0.07416172616770766),
 (1541, 0.07357463669657849),
 (120, 0.07280553053812477),
 (532, 0.07256667901125535),
 (579, 0.07168282774495394),
 (1371, 0.06692891877823037),
 (334, 0.06609299764182008),
 (1002, 0.06537332841838306),
 (2318, 0.06266714462798761),
 (584, 0.06255265307993607),
 (970, 0.05932557609444267),
 (508, 0.05693354974655984),
 (128, 0.05542781088282134),
 (936, 0.055384561659935994),
 (828, 0.054226245695902286),
 (1799, 0.05374469279726668),
 (2552, 0.052442846559781486),
 (959, 0.051471742798645415),
 (1164, 0.0512212604356751),
 (1458, 0.050878359781962756),
 (1963, 0.05086027296167983),
 (878, 0.050797770413531396),
 (601, 0.05048755499258961),
 (1218, 0.

In [27]:
terms_df.loc[idx2word[239]].sort_values(ascending=False)

        Poem_id
tf-idf  23260      0.328971
        31588      0.286062
        31578      0.274143
        38702      0.274143
        19980      0.263177
                     ...   
        19927      0.000000
        19926      0.000000
        19925      0.000000
        19924      0.000000
        39205      0.000000
Name: 來, Length: 7706, dtype: float64

In [28]:
keywords = ['明月','山月']

In [29]:
def get_similar(keyword):
    res = {}
    num = word2idx[keyword]
    lst = word_rel[num].toarray().tolist()[0]
    lst = [(x,y) for x,y in zip(range(len(lst)),lst)] #word_id, cov(num,word_id)
    lst.sort(reverse=True,key=lambda x:x[1]) 
    for i in lst[:5]: #选前五个最接近的字
        res[idx2word[i[0]]]=[]
        tmp = terms_df.loc[idx2word[i[0]]].sort_values(ascending=False)
        for idx,j in zip(range(5),tmp.head(5).index): # (Poem_id,tf-idf)
            res[idx2word[i[0]]].append((j[1],tmp[idx]))
    return res

In [30]:
poems_by_keywords = {}
for i in keywords:
    poems_by_keywords.update(get_similar(i))
print(poems_by_keywords)

{'明月': [(31695, 0.7098005016225041), (13811, 0.5323503762168781), (20300, 0.5323503762168781), (7027, 0.3992627821626586), (28205, 0.3992627821626586)], '夜': [(23261, 0.538088479908745), (36000, 0.4384424651108292), (19804, 0.4384424651108292), (14366, 0.4384424651108292), (11813, 0.4384424651108292)], '王昌': [(5893, 0.4120515220618254), (5898, 0.4120515220618254), (5892, 0.38915977083616843), (30770, 0.3686776776342648), (5896, 0.3686776776342648)], '齡': [(5898, 0.3931145753037452), (5893, 0.3931145753037452), (5892, 0.37127487667575937), (5896, 0.35173409369282466), (5901, 0.35173409369282466)], '思君': [(17333, 0.5560196971777795), (31695, 0.5251297140012362), (15882, 0.49749130800117114), (28324, 0.49749130800117114), (22861, 0.450111183429631)], '山月': [(7074, 0.8009849350016051), (5350, 0.5418427501481445), (5901, 0.48480667118518195), (27481, 0.4605663376259229), (24895, 0.4386346072627837)], '水流': [(37216, 0.5355536205059969), (4903, 0.4791795551895761), (31967, 0.41383688857281575

In [31]:
lst = []
for i in poems_by_keywords.keys():
    for j in poems_by_keywords[i]:
        lst .append(j+(i,))
lst.sort(reverse=True,key=lambda x:x[1]) 
print(lst)
lst = [(x[0],x[2]) for x in lst]
lst

[(7074, 0.8009849350016051, '山月'), (27481, 0.7452334852022253, '冰'), (31695, 0.7098005016225041, '明月'), (17333, 0.5560196971777795, '思君'), (5350, 0.5418427501481445, '山月'), (23261, 0.538088479908745, '夜'), (37216, 0.5355536205059969, '水流'), (13811, 0.5323503762168781, '明月'), (20300, 0.5323503762168781, '明月'), (31695, 0.5251297140012362, '思君'), (15882, 0.49749130800117114, '思君'), (28324, 0.49749130800117114, '思君'), (17698, 0.4908844194704418, '楚水'), (33811, 0.4908844194704418, '楚水'), (5901, 0.4908844194704418, '楚水'), (5901, 0.48480667118518195, '山月'), (22215, 0.48079579690466145, '冰'), (4903, 0.4791795551895761, '水流'), (27481, 0.4605663376259229, '山月'), (22861, 0.450111183429631, '思君'), (22337, 0.4441335223780188, '楚水'), (24895, 0.4386346072627837, '山月'), (36000, 0.4384424651108292, '夜'), (19804, 0.4384424651108292, '夜'), (14366, 0.4384424651108292, '夜'), (11813, 0.4384424651108292, '夜'), (31967, 0.41383688857281575, '水流'), (23412, 0.41383688857281575, '水流'), (32003, 0.41383688857281575

[(7074, '山月'),
 (27481, '冰'),
 (31695, '明月'),
 (17333, '思君'),
 (5350, '山月'),
 (23261, '夜'),
 (37216, '水流'),
 (13811, '明月'),
 (20300, '明月'),
 (31695, '思君'),
 (15882, '思君'),
 (28324, '思君'),
 (17698, '楚水'),
 (33811, '楚水'),
 (5901, '楚水'),
 (5901, '山月'),
 (22215, '冰'),
 (4903, '水流'),
 (27481, '山月'),
 (22861, '思君'),
 (22337, '楚水'),
 (24895, '山月'),
 (36000, '夜'),
 (19804, '夜'),
 (14366, '夜'),
 (11813, '夜'),
 (31967, '水流'),
 (23412, '水流'),
 (32003, '水流'),
 (5893, '王昌'),
 (5898, '王昌'),
 (10906, '楚水'),
 (7027, '明月'),
 (28205, '明月'),
 (5898, '齡'),
 (5893, '齡'),
 (17698, '冰'),
 (5892, '王昌'),
 (8123, '冰'),
 (5892, '齡'),
 (30770, '王昌'),
 (5896, '王昌'),
 (17672, '冰'),
 (5896, '齡'),
 (5901, '齡')]

In [32]:
display(word_rel.mean(),word_rel.sum(),word_rel.nonzero()[0].shape)
word_rel.sum()/word_rel.nonzero()[0].shape[0]

0.004973575359645252

36607.34989629676

(1338659,)

0.027346284525257558

In [33]:
x = pd.DataFrame(word_rel.todense().reshape(-1,1))

In [34]:
x = x[x[0] != 1]
x = x[x[0] != 0]

In [35]:
np.percentile(x,[25,50,99.95])

array([0.01250317, 0.02027443, 0.25525469])

In [36]:
np.percentile(df['tf-idf'],[25,50,75,95])

array([0.19724543, 0.27265596, 0.37520316, 0.52167192])

### 4.2 上下文

In [37]:
x = pd.DataFrame(rel.todense().reshape(-1,1))

In [38]:
np.percentile(x,[50,95,99.95])

array([0.1951849 , 0.32100221, 0.68915382])

In [39]:
np.percentile(df['tf-idf'],[25,50,75,95])

array([0.19724543, 0.27265596, 0.37520316, 0.52167192])

## 储存变量

In [40]:
import xml.dom.minidom
import xml.sax
DOMTree = xml.dom.minidom.parse("tang300.xml")
collection = DOMTree.documentElement
 
# 在集合中获取所有诗句
poems = collection.getElementsByTagName("作业用唐诗")

class Poem():
    def __init__(self,_id, _t, _a = "", _c = ()):
        self.id = _id
        self.title = _t
        self.author = _a
        self.content = _c
        
    def __str__(self):
        msg = f"{self.author}《{self.title}》\n\n"
        for i in self.content:
            msg += i+"\n\n"
        return msg
        
poem_dict = {}
author_dict = {}

for line in poems:
    f = lambda x : line.getElementsByTagName(x)[0].childNodes[0].data
    poem_id, line_number, contance = int(f('Poem_id')), int(f('line_number')), f('contance').strip("$#")
    # Title
    if line_number == -100:
        poem_dict[poem_id] = Poem(poem_id,contance)
    # Author
    elif line_number == -1:
        poem_dict[poem_id].author = contance
        if contance in author_dict.keys():
            author_dict[contance].append(poem_dict[poem_id])
        else:
            author_dict[contance]=[poem_dict[poem_id]]
    # Content
    elif 1 <= line_number <= 10:
        poem_dict[poem_id].content += (contance,)

In [41]:
import pickle
# Saving the objects:
with open('objs.pkl', 'wb') as f:  
    pickle.dump([rel,word2freq,word_rel,doc_rel,poem2idx,word2idx,idx2poem,idx2word,terms_df,poem_dict,author_dict], f)