## 1.1 对poem.xlsx中的诗歌进行前向最大匹配分词，词表在wordlist.xlsx中已经给出

注意：有些字符可能不在词表中（out-of-vocabulary），可以保留这些字符，也可以直接去掉

In [1]:
# Load the input workbooks. If pandas reports a missing Excel engine,
# install xlrd or openpyxl as the error message suggests.
import pandas as pd
import numpy as np

POEM_FILE = 'poem.xlsx'
WORDLIST_FILE = 'wordlist.xlsx'

table = pd.read_excel(POEM_FILE)
word_list_df = pd.read_excel(WORDLIST_FILE)

In [2]:
# Length of the longest entry in the vocabulary (max_length)
word_lengths = word_list_df['word'].map(len)
max_length = word_lengths.max()
print(f"max_length={max_length}")

max_length=3


In [3]:
# Forward maximum matching segmentation
# string: the text to segment
# word_list: the vocabulary
# max_length: length of the longest word in the vocabulary
# The vocabulary is stored as dict keys so membership tests are hash lookups.
word_list = dict.fromkeys(word_list_df['word'].tolist(), 0)
def cut(string, word_list, max_length):
    """Segment ``string`` with forward maximum matching.

    Starting at the left edge, the longest candidate of at most ``max_length``
    characters is tried first and shortened one character at a time from the
    right until it is found in ``word_list`` or only one character is left.
    A single remaining character is accepted even when it is not in the
    vocabulary, so out-of-vocabulary characters are kept as one-character
    tokens.

    Args:
        string: Text to segment.
        word_list: Container supporting ``in`` for vocabulary lookups
            (a dict or set keeps each lookup constant-time).
        max_length: Length of the longest vocabulary entry. Values below 1
            are treated as 1 so that every step consumes at least one
            character.

    Returns:
        list[str]: Tokens in order of appearance. Tokens that are exactly one
        of ``'#'``, ``'$'``, ``'('``, ``')'`` or ``' '`` are dropped.
    """
    # A window shorter than one character would produce an empty candidate
    # and the scan would never advance, so clamp it.
    window = max(1, max_length)
    skipped = {'#', '$', '(', ')', ' '}

    res = []
    pos, end = 0, len(string)
    # Walk the text with an index instead of re-slicing the remainder after
    # every token, which keeps the scan linear in the length of the text.
    while pos < end:
        s = string[pos:pos + window]
        # Shrink from the right until the candidate is a known word or a
        # single character.
        while len(s) > 1 and s not in word_list:
            s = s[:-1]
        if s not in skipped:
            res.append(s)
        pos += len(s)
    return res

In [4]:
# Run the segmentation on every line (this may take a few minutes)
def _join_tokens(line):
    """Segment one content line and join its tokens with single spaces."""
    return ' '.join(cut(line, word_list, max_length))

table['words'] = table['content'].map(_join_tokens)
table

Unnamed: 0,ID,Poem_id,line_number,content,words
0,1,4371,-100,##餞唐永昌( 一作餞唐郎中洛陽令),餞 唐 永昌 一作 餞 唐 郎中 洛陽 令
1,2,4371,-1,$$沈佺期,沈 佺 期
2,3,4371,1,洛陽舊有( 一作出) 神明宰,洛陽 舊有 一作 出 神明 宰
3,4,4371,2,輦轂由來天地中,輦轂 由來 天地 中
4,5,4371,3,餘邑政成何足貴,餘 邑 政成 何足 貴
...,...,...,...,...,...
46272,46273,39205,-1,$$李舜弦,李 舜弦
46273,46274,39205,1,飲筵中散酒微醒,飲 筵 中 散 酒 微 醒
46274,46275,39205,2,濛濛雨草瑤階溼,濛濛 雨 草 瑤階 溼
46275,46276,39205,3,鐘曉愁吟獨倚屏,鐘 曉 愁吟 獨倚 屏


## 1.2 统计每个词的TF-IDF值

注意：本次作业实现最基础版本的TF-IDF计算即可，不必实现其他变种

In [5]:
# Split each space-joined line into tokens: expand into a wide frame, stack it
# into one row per token, and expose the (row label, token position) index as
# the columns level_0 / level_1.
tokens_wide = table['words'].str.split(' ', expand=True)
split_words = tokens_wide.stack().rename('word').reset_index()
# Attach the Poem_id of the row each token came from (matched through level_0).
new_data = pd.merge(table['Poem_id'], split_words, left_index=True, right_on='level_0')
new_data

Unnamed: 0,Poem_id,level_0,level_1,word
0,4371,0,0,餞
1,4371,0,1,唐
2,4371,0,2,永昌
3,4371,0,3,一作
4,4371,0,4,餞
...,...,...,...,...
200779,39205,46275,4,屏
200780,39205,46276,0,盡日
200781,39205,46276,1,池邊
200782,39205,46276,2,釣


In [6]:
# Number of rows in new_data for each distinct word, i.e. how many times the
# word occurs across all poems.
total_cnt = new_data.groupby('word').size()

In [7]:
# Attach each word's overall occurrence count to every token row and discard
# the positional columns level_0 / level_1.
df = (
    new_data
    .merge(total_cnt.rename('total_cnt'), on='word')
    .drop(columns=['level_0', 'level_1'])
)
# tf = occurrences of the word over all poems / number of non-null token rows
df['tf'] = df['total_cnt'] / new_data['word'].count()
df

Unnamed: 0,Poem_id,word,total_cnt,tf
0,4371,餞,18,0.000090
1,4371,餞,18,0.000090
2,4473,餞,18,0.000090
3,4569,餞,18,0.000090
4,15110,餞,18,0.000090
...,...,...,...,...
200779,39040,髣佛,1,0.000005
200780,39092,竹錫,1,0.000005
200781,39204,青城,1,0.000005
200782,39204,物象,1,0.000005


In [8]:
# Inverse document frequency with each Poem_id treated as one document.
# The document counts are derived from new_data, and only the word/total_cnt/tf
# columns of df are read, so re-running this cell reproduces the same table
# instead of failing on columns removed by an earlier run.
n_docs = len(new_data['Poem_id'].unique())

# One row per distinct (Poem_id, word) pair, so the group size is the number of
# poems in which the word occurs.
doc_freq = new_data[['Poem_id', 'word']].drop_duplicates().groupby('word').size()
idf = np.log2(n_docs / doc_freq)  # idf = log2(number of poems / poems containing the word)

# Collapse to one row per word, keeping the order of first appearance.
word_stats = df[['word', 'total_cnt', 'tf']].drop_duplicates().reset_index(drop=True)
df = word_stats.merge(idf.rename('idf'), on='word', how='left')
df['tf-idf'] = df['tf'] * df['idf']
df

Unnamed: 0,word,total_cnt,tf,idf,tf-idf
0,餞,18,0.000090,8.824304,0.000791
1,唐,191,0.000951,5.380385,0.005118
2,永昌,3,0.000015,11.326804,0.000169
3,一作,2493,0.012416,2.163574,0.026864
4,郎中,55,0.000274,7.130407,0.001953
...,...,...,...,...,...
20144,髣佛,1,0.000005,12.911766,0.000064
20145,竹錫,1,0.000005,12.911766,0.000064
20146,青城,1,0.000005,12.911766,0.000064
20147,物象,1,0.000005,12.911766,0.000064


In [9]:
# Final result: the TF-IDF score alongside its word
tf_idf = df.loc[:, ['tf-idf', 'word']]
tf_idf

Unnamed: 0,tf-idf,word
0,0.000791,餞
1,0.005118,唐
2,0.000169,永昌
3,0.026864,一作
4,0.001953,郎中
...,...,...
20144,0.000064,髣佛
20145,0.000064,竹錫
20146,0.000064,青城
20147,0.000064,物象
