In [1]:
# 计算句子的文本特征

In [1]:
%matplotlib inline
from __future__ import division
from pandas import Series, DataFrame
import pandas as pd
from numpy.random import randn
import numpy as np
# pd.options.display.max_rows = 12
np.set_printoptions(precision=4, suppress=True)
import matplotlib.pyplot as plt
plt.rc('figure', figsize=(12, 4))
pd.set_option('display.max_columns', None) #输出所有列

def set_ch():
    from pylab import mpl
    #中文字体问题已经解决，记录在evernote中
    #mpl.rcParams['font.sans-serif'] = ['SimHei'] # 指定默认字体
    mpl.rcParams['axes.unicode_minus'] = False # 解决保存图像是负号'-'显示为方块的问题
set_ch()
from datetime import datetime,timedelta
import matplotlib
matplotlib.style.use('ggplot')
import flatten_json
import re
import time
import math
from gensim import corpora
from gensim import models
import nltk
import Levenshtein
import readability
import syllables_en

In [2]:
textList = list(pd.read_csv('data/audio_for_scoring_RUI.csv').text)

In [3]:
def preProcessing(tl):
    """
    Pre Processing the text.    
    Parameters
    ----------
    tl : list
        eg
        ['The plane is on the table.',
         'The bike is next to the toy box.',...]
         
    Returns
    -------
    textList_tokenized: list
        tokenized sentence
    word_corpus: list
        word vectors to represent the sentence
    wordList: list
        word list, all the word have occured 
    df_myCorpu: DataFrame
        word dataframe
    """
    
    replace_pattern = [
    (r'let\'s', 'let us'),
    (r'won\'t', 'will not'),
    (r'can\'t', 'cannot'),
    (r'i\'m', 'i am'),
    (r'isn\'t', 'is not'),
    (r'aren\'t', 'are not'),
    (r'(\w+)\'ll', '\g<1> will'),
    (r'(\w+)n\'t', '\g<1> not'),
    (r'(\w+)\'ve', '\g<1> have'),
    (r'(\w+)\'s', '\g<1> is'),
    (r'(\w+)\'re', '\g<1> are'),
    (r'(\w+)\'d', '\g<1> would'),
    ]

    # 规则化——通过正则替换缩略词
    def replace(text):
        newTextList = []
        for (pattern, repl) in replace_pattern:
            text_sub = re.sub(pattern,repl,text)
            if text_sub != text :
                final_text = text_sub
                break
            else :
                final_text = text_sub
        newTextList.append(final_text)
        return (newTextList)
    
    # 替换textList中所有缩略词（一个句子中最多存在两个缩略词）
    # newTextList, 第一次缩略词替换
    # newTextList2, 第二次缩略词替换
    newTextList =[]
    for i in range(len(tl)):
        newTextList.append(replace(tl[i].lower()))
    newTextList2 =[]
    for i in range(len(newTextList)):
        newTextList2.append(replace(newTextList[i][0]))
        
    # 过滤标点符号和非单词字符
    newTextList3 = []
    for i in range(len(newTextList2)):
        new_re = re.sub('[,.?...!:;\'\[\]\(\)]','',newTextList2[i][0])
        new_re = re.sub('<\d*>','',new_re)
        new_re = re.sub('-',' ',new_re)
        newTextList3.append(new_re)
        
    #分词
    textList_tokenized = []
    for i in range(len(newTextList3)):
        textList_tokenized.append(nltk.word_tokenize(newTextList3[i]))
        
    # 建立字典
    # word_dict 存储当前词典
    # textList_tokenized转换为词向量
    word_dict = corpora.Dictionary(textList_tokenized)
    word_corpus = [word_dict.doc2bow(i) for i in textList_tokenized]
    
    # df_myCorpus 存放所有出现过的单词
    # wordList 便于计算
    wordList = []
    for i in range(len(word_dict)):
        wordList.append(word_dict[i])
    df_myCorpus = DataFrame(wordList)
    df_myCorpus.columns = ['word']
    
    return (textList_tokenized, word_corpus, wordList, df_myCorpus, newTextList3)

In [4]:
textList_tokenized, word_corpus, wordList, df_myCorpus, newTextList = preProcessing(textList)

In [5]:
class calc_termFeature():
    """
    calculate the term features()
    """
    def __init__(self, textList_tokenized, word_corpus, wordList, df_myCorpu):
        self.textList_tokenized = textList_tokenized
        self.word_corpus = word_corpus
        self.wordList = wordList
        self.df_myCorpus = df_myCorpus

    def phone(self):
        # df_cmu，存储cmu pronuncing dictionary中的word 和 phone的标注
        # df_myCorpus_na，存储CMU字典中没有标注的单词，之后使用levenshtein来替换
        cmu_dic = nltk.corpus.cmudict.entries()
        cmu_word = []
        cmu_phone = []
        for i in range(len(cmu_dic)):
            cmu_word.append(str(cmu_dic[i][0]))
            cmu_phone.append(cmu_dic[i][1])
        df_cmu = DataFrame()
        df_cmu['word'] = cmu_word
        df_cmu['phone'] = cmu_phone
        df_myCorpus_phone = pd.merge(self.df_myCorpus, df_cmu, how='left', on='word')
        df_myCorpus_na = df_myCorpus_phone[df_myCorpus_phone.phone.isnull() == True]
        word_need2be_replace = list(df_myCorpus_phone[df_myCorpus_phone.phone.isnull() == True].word)
        best_replace_word = []
        for i in range(len(word_need2be_replace)):
            best_dis = len(word_need2be_replace[i])+5
            best_word = word_need2be_replace[i]
            for j in range(len(cmu_word)):
                dis = Levenshtein.distance(str(word_need2be_replace[i]), cmu_word[j])
                if dis < best_dis:
                    best_dis = dis
                    best_word = cmu_word[j]
            best_replace_word.append(best_word)
        # drop掉原df_myCorpus_phone没有标记的单词
        dropList = list(df_myCorpus_na.index)
        for idx in dropList:
            df_myCorpus_phone = df_myCorpus_phone.drop([idx],axis=0)
        # cmu_dict 存储cmu的
        cmuDic = {}
        for i in range(len(cmu_word)):
            cmuDic[cmu_word[i]] = cmu_phone[i]
        # 新建一个字典，字典的key为新的替代词语，值为新的替代词语在mrc中的指标
        word_need2be_replaceDic = {}
        for word in best_replace_word:
            word_need2be_replaceDic[word] = cmuDic[word]
        df_replaced_word = DataFrame()
        word = list(df_myCorpus_na.word)
        values = word_need2be_replaceDic.values()
        df_replaced_word['word'] = word
        df_replaced_word['phone'] = values
        df_phone = pd.concat([df_myCorpus_phone, df_replaced_word]).reset_index().drop(['index'],axis=1)
        return df_phone

    def num_of_phones(self):
        # 计算phone个数
        df_nphs = DataFrame()
        df_phone = self.phone()
        phone_list = []
        for i in range(len(df_phone)):
            phone_list.append(len(df_phone.phone[i])) 
        df_nphs['word'] = df_phone.word    
        df_nphs['nphone'] = phone_list
        return df_nphs

    def num_of_sylables(self):
        # 计算音节个数
        df_nsyl = DataFrame()
        def count_syllables(words):
            # 方法1
            syllableCount = 0
            for word in words:
                syllableCount += syllables_en.count(word)
            return syllableCount
        
        syl_list = []
        for i in range(len(self.wordList)):
            syl_list.append(count_syllables(wordList[i]))
        df_nsyl['word'] = wordList
        df_nsyl['nsyl'] = syl_list
        return df_nsyl

    def tf_idf(self):
        # tf-idf体现了一个词在不同文本中的区分度
        # 每个单词在不同的句子中tf-idf不一样，因此计算其平均tf-idf
        # word_tfidf_dict存储了每个单词在这一堆文本中的tf-idf
        tfidf  = models.TfidfModel(self.word_corpus)
        corpus_tfidf = tfidf[self.word_corpus]
        word_tfidf_dict = {}
        for sentence in corpus_tfidf:
            for word in sentence:
                word_tfidf_dict[word[0]] = word[1]
        corpus_tfidf = tfidf[corpus_tfidf]
        all_tfidf = []
        for j in range(len(wordList)):
            sum_tfidf = 0
            num_tfidf = 0
            for doc in corpus_tfidf:
                for i in range(len(doc)):
                    if doc[i][0] == j:
                        sum_tfidf += doc[i][1]
                        num_tfidf += 1
            all_tfidf.append(sum_tfidf / num_tfidf)
        df_tfidf = DataFrame()
        df_tfidf['word'] = self.wordList
        df_tfidf['mean_tfidf'] = all_tfidf
        return df_tfidf

    def idf(self):
        # 计算词语的信息量
        # idf_li：每个单词的idf
        # doc_li：每个单词出现的文档数
        idf_li = []
        doc_li = []
        for word in self.wordList:
            doc_num = 0
            n = len(self.textList_tokenized)
            for sentence in self.textList_tokenized:
                if word in sentence:
                    doc_num +=1
            doc_li.append(doc_num)
            idf_li.append(math.log((1+n)/(1+doc_num)))
        df_idf = DataFrame()
        df_idf['word'] = self.wordList
        df_idf['idf'] = idf_li
        return df_idf
        
    def POS_tag(self):
        # 利用nltk进行词性标注
        df_pos = DataFrame(nltk.pos_tag(wordList))
        df_pos.columns = ['word', 'posTag']
        return df_pos

    def termFrequency_BNCSE(self):
        # 计算单词在British National Corpus（BNC）spoken english中的词频
        f_bncSpk = open('corpus/BNC_SPOKEN_wordlist.txt')
        fileContent_bncSpk = f_bncSpk.readlines()
        fileContent_bncSpk = fileContent_bncSpk[3:]
        bncSpk_dict = {}
        for rec in fileContent_bncSpk:
            new_re = rec.split('\t')
            bncSpk_dict[new_re[2]] = int(new_re[1])
        df_bncSpk = pd.DataFrame.from_dict(bncSpk_dict, orient='index')
        df_bncSpk = df_bncSpk.reset_index()
        df_bncSpk.columns = ['word', 'bncSpkFreq']
        return df_bncSpk

    def num_of_word(self):
        # 计算单词长度
        df_nlet = DataFrame()
        nlet = []
        for i in range(len(self.wordList)):
            nlet.append(len(self.wordList[i]))
        df_nlet['word'] = self.wordList
        df_nlet['nlet'] = nlet
        return df_nlet

    def mean_posi(self):
        # 词语的位置信息
        mean_posi = []
        for word in self.wordList:
            word_posi = []
            for sentence in self.textList_tokenized:
                for i in range(len(sentence)):
                    if sentence[i] == word:
                        word_posi.append(i+1)
            mean_posi.append(sum(word_posi)/len(word_posi))
        df_posi = DataFrame()
        df_posi['word'] = self.wordList
        df_posi['mean_posi'] = mean_posi     
        return df_posi
    
    def features(self, senFea=False):
        # senFea默认为False，计算所有词语特征；否则的话计算句子需要的特征
        # all term features
        if senFea==True:
            df_phone = self.phone()
            df_nsyl = self.num_of_sylables()
            df_idf = self.idf()
            df_bncSpk = self.termFrequency_BNCSE()
            df_nlet = self.num_of_word()
            df_result = pd.merge(df_phone,df_nsyl,how='left',on='word')
            df_result = pd.merge(df_result,df_idf,how='left',on='word')
            df_result = pd.merge(df_result,df_bncSpk,how='left',on='word')
            df_result = pd.merge(df_result,df_nlet,how='left',on='word')
        else:
            df_phone = self.phone()
            df_nphs = self.num_of_phones()
            df_nsyl = self.num_of_sylables()
            df_idf = self.idf()
            df_pos = self.POS_tag()
            df_bncSpk = self.termFrequency_BNCSE()
            df_nlet = self.num_of_word()
            df_posi = self.mean_posi()
            df_result = pd.merge(df_phone,df_nphs,how='left',on='word')
            df_result = pd.merge(df_result,df_nsyl,how='left',on='word')
            df_result = pd.merge(df_result,df_idf,how='left',on='word')
            df_result = pd.merge(df_result,df_pos,how='left',on='word')
            df_result = pd.merge(df_result,df_bncSpk,how='left',on='word')
            df_result = pd.merge(df_result,df_nlet,how='left',on='word')
            df_result = pd.merge(df_result,df_posi,how='left',on='word')
        return df_result

In [6]:
ct = calc_termFeature(textList_tokenized, word_corpus, wordList, df_myCorpus)

In [11]:
res = ct.features(senFea=True)

In [8]:
class calc_sentenceFeatures():
    '''
    calculate sentence features
    '''
    df_sentence = DataFrame()
    def __init__(self, df_term, textList_tokenized, newTextList, word_corpus):
        self.df_term = df_term
        self.textList_tokenized = textList_tokenized
        self.newTextList = newTextList
        self.word_corpus = word_corpus
        # 建立term的词典
        self.term_dic = {}
        for i in range(len(df_term)):
            self.term_dic[df_term.word[i]] = df_term.values[i][1:]

    def baseOnTermFeatures(self):
        # 计算句子特征（排除posTag和discourse_field）
        sentS_score = []
        for sentence in textList_tokenized:
            sentList = []
            for word in sentence:
                sentList.append(self.term_dic[word])
            df_middle = pd.DataFrame(sentList)
            df_middle.columns = self.df_term.columns[1:]
            sent_score = []
            # 长度，计算句子mean单词长度
            sent_score.extend(list(df_middle[['nlet']].mean()))
            # 音素数，音节数，计算句子的sum，mean
            sent_score.extend(list(df_middle[['nsyl']].sum()))
            sent_score.extend(list(df_middle[['nsyl']].mean()))
            # 词频，没用mean(极端值造成严重影响)
            sent_score.extend(list(df_middle[['bncSpkFreq']].median()))  
            # 句子的平均idf
            sent_score.extend(list(df_middle[['idf']].mean()))
            sentS_score.append(sent_score)
            
        # 处理
        df_baseFeatures = DataFrame(sentS_score)
        df_baseFeatures.columns = ['mean_nlet','sum_nsyl','mean_nsyl','median_bncSpkFreq','mean_idf']
        # 将index设为句子
        df_baseFeatures.index = self.newTextList
        # 句子中的单词数
        nterm = []
        for i in range(len(self.textList_tokenized)):
            nterm.append(len(self.textList_tokenized[i]))
        df_baseFeatures['nterm'] = nterm
        # 除去仅一个单词的句子
        #df_baseFeatures = df_baseFeatures[df_baseFeatures.nterm != 1]
        # 去重
        #df_baseFeatures = df_baseFeatures.drop_duplicates()
        # 轴向转换
        df_baseFeatures['median_bncSpkFreq'] = df_baseFeatures.median_bncSpkFreq.values * (-1)
        return df_baseFeatures
    
    def type_token_ratio(self):
        ratio_li = []
        for sent in self.word_corpus:
            types = len(sent)
            tokens = 0
            for word in sent:
                tokens = tokens + word[1]
            ratio = types / tokens
            ratio_li.append(ratio)
        df_ttr = DataFrame(ratio_li)
        df_ttr.index = newTextList
        df_ttr.columns = ['type_token_ratio']
        return df_ttr
    
    def read_formula(self):
        read_li = []
        for text in self.newTextList:
            read_dic = {}
            read = readability.Readability(text)
            read_dic['ARI'] = read.ARI()
            read_dic['FleschReadingEase'] = read.FleschReadingEase()
            read_dic['FleschKincaidGradeLevel'] = read.FleschKincaidGradeLevel()
            read_dic['GunningFogIndex'] = read.GunningFogIndex()
            read_dic['SMOGIndex'] = read.SMOGIndex()
            read_dic['ColemanLiauIndex'] = read.ColemanLiauIndex()
            read_dic['LIX'] = read.LIX()
            read_dic['RIX'] = read.RIX()
            read_li.append(read_dic)
        df_formula = DataFrame(read_li)
        df_formula.index = newTextList
        return df_formula
    
    def all_features(self):
        df_baseFeatures = self.baseOnTermFeatures()
        df_ttr = self.type_token_ratio()
        df_formula = self.read_formula()
        df_sentence = pd.concat([df_baseFeatures, df_ttr], axis=1)
        df_sentence = pd.concat([df_sentence, df_formula], axis=1)
        return df_sentence

In [12]:
cs = calc_sentenceFeatures(res, textList_tokenized, newTextList, word_corpus)

In [14]:
cs.all_features()

Unnamed: 0,mean_nlet,sum_nsyl,mean_nsyl,median_bncSpkFreq,mean_idf,nterm,type_token_ratio,ARI,ColemanLiauIndex,FleschKincaidGradeLevel,FleschReadingEase,GunningFogIndex,LIX,RIX,SMOGIndex
year,4.000000,2,2.000000,-4396.0,6.137607,1,1.000000,-2.090000,-22.2400,-3.4000,121.2200,0.4000,1.000000,0.0,3.000000
race,4.000000,1,1.000000,-113.0,6.543072,1,1.000000,-2.090000,-22.2400,-3.4000,121.2200,0.4000,1.000000,0.0,3.000000
town,4.000000,1,1.000000,-639.0,7.236219,1,1.000000,-2.090000,-22.2400,-3.4000,121.2200,0.4000,1.000000,0.0,3.000000
crowd,5.000000,1,1.000000,-49.0,7.929367,1,1.000000,2.620000,-16.3500,-3.4000,121.2200,0.4000,1.000000,0.0,3.000000
stand,5.000000,1,1.000000,-655.0,6.725394,1,1.000000,2.620000,-16.3500,-3.4000,121.2200,0.4000,1.000000,0.0,3.000000
exciting,8.000000,2,2.000000,-99.0,6.948537,1,1.000000,16.750000,1.3200,20.2000,-47.9800,40.4000,101.000000,1.0,8.477226
just,4.000000,1,1.000000,-15078.0,7.082069,1,1.000000,-2.090000,-22.2400,-3.4000,121.2200,0.4000,1.000000,0.0,3.000000
finish,6.000000,2,2.000000,-328.0,6.830754,1,1.000000,7.330000,-10.4600,8.4000,36.6200,0.4000,1.000000,0.0,3.000000
winner,6.000000,1,1.000000,-41.0,7.641684,1,1.000000,7.330000,-10.4600,8.4000,36.6200,0.4000,1.000000,0.0,3.000000
behind,6.000000,1,1.000000,-508.0,6.255390,1,1.000000,7.330000,-10.4600,8.4000,36.6200,0.4000,1.000000,0.0,3.000000
