In [160]:
from tqdm import tqdm
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
import pandas as pd
import pickle as pk
from collections import Counter
import os
import pickle
import jieba
import re
from collections import Counter

# Bdc其实是一种有监督的词袋模型
class Featuers:
    
    def __init__(self, k=80, ngram_range=(1, 3), _min=0.0005, _max=0.95, label=None, Bdc=True,
                 role='agent', debug=True, _maxfeature=50000):
        """初始化计算过程中的参数
        Parameters
        ----------
        k : int >= 1, default=80
            将稀疏矩阵转化成一般矩阵的速率
        ngram_range: tuple(min_n, max_n), default=(1,2)
            ngram的上下界,从文档中选择包含n个词的token,min_n<=n<=max_n

        label : string or int or None, default=None
            string or int表示的计算的是二分类的BDC值(依赖于你的类别标注),None表示计算的是多分类的BDC值
        Bdc : boolean, default=True
            True:采用论文中的公式计算二分类Bdc值,False:采用新公式计算BDC值
        role : string in ['agent', 'all', 'user'], default=agent
            'agent':只采用客服的对话,'all':采用所有的对话,'user':采用用户的对话
        debug:  boolean, default=True
            True:采用debug摸式,在debug模式下不允许从保存的中间文件中读取模型的中间结果
        """
        self.k, self.ngram = k, ngram_range
        self._min, self._max, self._maxfeature = _min, _max, _maxfeature
        self.label, self.Bdc, self.role, self.debug = label, Bdc, role, debug


    def getDF(self, data, labels):
        """ 将原始语料集矩阵表示
        Parameters
        ----------
        data : list like [string,...]
            语料集,每一单元是一篇文档.['This is the first document.','This is the second second document.',\
            'And the third one.','Is this the first document?',]
        labels : list or numpy.array, like [int,...]
            语料集对应的类别标注
        Returns
        -------
        df : pd.DataFrame
            token矩阵, index is token_id(int64), columns is label(string)
        vocab : dict
        token_id到token的映射词典, 形如{token_id:token,...}
        """
        vec = CountVectorizer(ngram_range=self.ngram, min_df=self._min, max_df=self._max)
        data = vec.fit_transform(data)
        vocab = {j: i for i, j in vec.vocabulary_.items()} # id2token
        
        _label = np.unique(labels)
        labels_token = {} # <dict>{str:[list]}
        k = self.k # 稀疏矩阵转化的速率
        for i in tqdm(range(0, len(labels), k)):
            if i+k >= len(labels):
                # 最后 不足k的部分
                temp = labels[i:]
                temp_data = data[i:]
            else:
                temp = labels[i:i+k]
                temp_data = data[i:i+k]
#             print(temp, temp_data)
            for _ in _label:
                labels_token[_] = labels_token.get(_, 0)
                labels_token[_] += temp_data[np.array(temp) == _].toarray().sum(axis=0)
        del(i, _, data, vec, labels, _label) # 防止内存泄露
        return pd.DataFrame(labels_token), vocab
    

    def calBdc(self, data, labels):
        """ 将原始语料集矩阵表示
        Parameters
        ----------
        data : list like [string,...]
            语料集,每一单元是一篇文档.['This is the first document.','This is the second second document.',\
            'And the third one.','Is this the first document?',]
        labels : list or numpy.array, like [int,...]
            语料集对应的类别标注
        Returns
        -------
        df : pd.DataFrame
            word_bdc矩阵, index is tokens, columns is ['TF','BDC']
        """
        if not os.path.exists('setting'):
            os.makedirs('setting')
        # if not self.debug and os.path.exists('setting/{}data_vocab.pk'.format(self.role)):
        #     with open('setting/{}data_vocab.pk'.format(self.role), 'rb') as fr:
        #         df, vocab = pk.load(fr)
        # else:
        df, vocab = self.getDF(data, labels)
        # with open('setting/{}data_vocab.pk'.format(self.role), 'wb') as fw:
        #     pk.dump([df, vocab], fw)
        labels_counter = Counter(labels)
        label_list = [labels_counter[i] for i in df.columns]
        # print('待计算bdc值的数据label为：', list(df.columns))
        # 扩展， 如果单纯的计算bdc可以将下列判断部分注释
        if self.label and self.Bdc: # 二分类bdc值
            label_list = [labels_counter[self.label], sum(labels_counter.values())]
            label_list[-1] -= label_list[0]
            df['negative'] = df.sum(axis=1) - df[self.label]
            df = df[[self.label, 'negative']]
        
        elif self.label and not self.Bdc: # 使用新公式计算二分类bdc值
            x = sum(labels_counter.values()) - labels_counter[self.label]
            label_list = [x if i != df.columns.index(self.label)\
             else j for i, j in enumerate(label_list)]
            
        assert len(label_list) == len(df.columns)

        # 计算Bdc
        temp_df = (df/label_list).apply(lambda x: x/sum(x), axis=1).applymap(lambda x: 0 if x==0 else x*np.log2(x))
        df['TF'] = df.sum(axis=1)
        df['BDC'] = round(temp_df.sum(axis=1)/np.log2(len(label_list)), 4) + 1
        df['Tokens'] = [vocab[i] for i in df.index]
        df.set_index(['Tokens'], inplace=True)
        # df.to_csv('setting/{}_{}.csv'.format(self.role, self.label))
        return df, vocab
# 停用词
def get_stopwords():
    with open('./stopword.csv', 'r', encoding='utf-8') as f:
        stopwords = f.read()
    return stopwords

def pickleLoad(path):
    obj = None
    with open(path, 'rb') as f:
        obj = pickle.load(f)
    return obj

def pickleDump(obj, path):
    with open(path, 'wb') as f:
        pickle.dump(obj, f)

In [52]:
# 添加自定义词典
jieba.load_userdict("./game_dict.txt")
i = 8
d = pd.read_csv('./topic_{}_url_comments.csv'.format(i), usecols=['comment', 'topic'])
corpus = d.comment.values.tolist()
labels = d.topic.values.tolist()

# 分词
comments_words = []
for comm in tqdm(corpus):
    seg_list = jieba.cut(comm)  # 默认是精确模式
    comments_words.append(' '.join(seg_list))


# 去停用词
stopwords = get_stopwords()
stopwords = stopwords + '�/n'
new_comments_words = []
for comm_words in tqdm(comments_words):
    new_comm_words = []
    for word in comm_words.strip().split():
        if word not in stopwords:
            new_comm_words.append(word)
    new_comments_words.append(' '.join(new_comm_words))

# # 利用sklearn countvectorizer 或取文档的bow表示(稀疏矩阵)
# from sklearn.feature_extraction.text import CountVectorizer

# vectorizer = CountVectorizer()
# X = vectorizer.fit_transform(new_comments_words) #(75000, 29428)
# # pickleDump(vectorizer, './tf_vectorizer.pickle')
# X.shape
Bdc = Featuers(ngram_range=[1, 1], _min=0.0005, _max=0.95)
df, vocab = Bdc.calBdc(new_comments_words, labels)
df = df.sort_values('BDC', ascending=False)
df.to_csv('./bdc_result_topic_{}_nojieba.csv'.format(i))

100%|██████████████████████████████████████████████████████████████████████████| 75000/75000 [00:24<00:00, 3008.95it/s]
100%|█████████████████████████████████████████████████████████████████████████| 75000/75000 [00:04<00:00, 18556.64it/s]
100%|███████████████████████████████████████████████████████████████████████████████| 938/938 [00:03<00:00, 310.77it/s]


In [56]:
# 出现次数最多的主题作为该词的主题，
df['word_topic'] = np.argmax(df[[0, 1, 2, 3, 4, 5, 6, 7]].values, axis=1)
# 选择游戏主题下的词
game_topic_word = df[df.word_topic == 1]
game_topic_word.to_csv('topic_{}_youxi_key_words.csv'.format(i))

In [314]:
game_topic_word = df[df.word_topic == 1]
game_topic_word

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,TF,BDC,word_topic
Tokens,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
守望先锋,18,33,0,2,0,0,0,1,54,0.5881,1
穿越火线,3,103,3,3,0,3,4,23,142,0.5600,1
不在话下,5,39,0,5,0,8,1,1,59,0.5278,1
飞车,10,28,0,0,0,3,2,7,50,0.4711,1
再配,9,12,0,0,11,3,3,1,39,0.4483,1
守望,15,17,0,2,0,0,3,2,39,0.4457,1
不玩,39,79,1,4,0,3,6,12,144,0.4380,1
打打,16,62,0,3,2,4,2,7,96,0.4322,1
全高,18,30,0,2,3,0,0,1,54,0.4322,1
小游戏,14,105,0,6,2,2,13,34,176,0.4181,1


In [315]:
game_topic_words = game_topic_word.index.values.tolist()

In [77]:
# 所有的评论信息
topic_comments = pd.read_csv('./topic_8_url_comments.csv'.format(i))
# 主题为游戏的评论
game_topic_comments = topic_comments[topic_comments.topic == 1]

In [251]:

# 筛出包含主题词的评论
game_word_config_df = pd.DataFrame(columns=['game_words', 'cpu_cnt', 'mem_cnt', 'gpu_cnt', 'comm_config'])

for game_word in game_topic_words:
    related_comments = game_topic_comments.apply(lambda x: True if  game_word in x['comment'] else False, axis=1)
    _temp_comments = game_topic_comments[related_comments]
    _temp_comments.fillna("", inplace=True)
    _temp_comments['comments_info'] = _temp_comments['colors'] + _temp_comments['size']
    _temp_comments['url_info'] = _temp_comments['attrs'] + _temp_comments['more_attrs']
    comments_info = _temp_comments['comments_info'].values.tolist()
    url_info = _temp_comments['url_info'].values.tolist()
    
    # 统计配置信息
    _CPUs = []
    _Memorys = []
    _GPUs = []
    for str1, str2 in zip(comments_info, url_info):
        _CPU, _Memory, _GPU = search_config(str1, str2)
        if _CPU is not None:
            _CPUs.append(_CPU)
        if _Memory is not None:
            _Memorys.append(_Memory)
        if _GPU is not None:
            _GPUs.append(_GPU)
    cpu = get_most_comm(_CPUs)
    mem = get_most_comm(_Memorys)
    gpu = get_most_comm(_GPUs)
    
    cpu_cnt = get_comm(_CPUs)
    mem_cnt = get_comm(_Memorys)
    gpu_cnt = get_comm(_GPUs)
    
    game_word_config_df.loc[game_word_config_df.shape[0]+1] = {'game_words':game_word, 'cpu_cnt':str(cpu_cnt), 'mem_cnt':str(mem_cnt), 'gpu_cnt':str(gpu_cnt), 
              'comm_config': ' '.join([cpu, mem, gpu])}

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  downcast=downcast, **kwargs)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # Remove the CWD from sys.path while we load stuff.


In [311]:
def calc_ent(x):
    """
        calculate shanno ent of x
    """
    x = np.array(x)
    x_value_list = set([x[i] for i in range(x.shape[0])])
#     print(x_value_list)
    ent = 0.0
    for x_value in x_value_list:
        p = float(x[x == x_value].shape[0]) / x.shape[0]
        logp = np.log2(p)
        ent -= p * logp

    return ent

# 筛出包含主题词的评论
game_word_config_df = pd.DataFrame(columns=['game_words', 'cpu_cnt', 'mem_cnt', 'gpu_cnt', 'comm_config'])

for game_word in game_topic_words:
    related_comments = game_topic_comments.apply(lambda x: True if  game_word in x['comment'] else False, axis=1)
    _temp_comments = game_topic_comments[related_comments]
    _temp_comments.fillna("", inplace=True)
    _temp_comments['comments_info'] = _temp_comments['colors'] + _temp_comments['size']
    _temp_comments['url_info'] = _temp_comments['attrs'] + _temp_comments['more_attrs']
    comments_info = _temp_comments['comments_info'].values.tolist()
    url_info = _temp_comments['url_info'].values.tolist()
    
    # 统计配置信息
    _CPUs = []
    _Memorys = []
    _GPUs = []
    for str1, str2 in zip(comments_info, url_info):
        _CPU, _Memory, _GPU = search_config(str1, str2)
        if _CPU is not None:
            _CPUs.append(_CPU)
        if _Memory is not None:
            _Memorys.append(_Memory)
        if _GPU is not None:
            _GPUs.append(_GPU)
    cpu = get_most_comm(_CPUs)
    mem = get_most_comm(_Memorys)
    gpu = get_most_comm(_GPUs)
    
    cpu_ent = calc_ent(_CPUs)
    mem_ent = calc_ent(_Memorys)
    gpu_ent = calc_ent(_GPUs)
    
    game_word_config_df.loc[game_word_config_df.shape[0]+1] = {'game_words':game_word, 
                                                               'cpu_cnt':str(cpu_ent), 
                                                               'mem_cnt':str(mem_ent), 
                                                               'gpu_cnt':str(gpu_ent), 
                                                               'comm_config': ' '.join([cpu, mem, gpu])}

In [278]:
game_word_config_df.to_csv('./game_words_comm_config_ent.csv', index=False)
game_word_config_df.sort_values('gpu_cnt',ascending=True).to_csv('./game_words_comm_config_sort_gpu.csv', index=False)