In [1]:
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix

# Load the news table and split each '/'-separated keyword string into a list.
news_item = pd.read_csv("ifeng_key.csv", encoding="utf8")
keywords = [entry.split('/') for entry in news_item["keywords"]]
del news_item     # the raw table is no longer needed once keywords are extracted

# One row per news item, one column per keyword position (short rows are padded).
key_df = pd.DataFrame(keywords)
del keywords      # free the intermediate list of lists

In [2]:
# Preview the first three keyword rows.
key_df.head(3)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,脱贫,攻坚,精准,扶贫,考核,攻坚战,习近平,2017,会议,贫困
1,中非,非洲,习近平,合作,中纳,纳方,发展,中国,根哥布,纳米比亚
2,金正恩,习近平,中朝,同志,两党,总书记,和平,委员长,发展,半岛


In [3]:
def transaction_encoder(df, sparse = False):
    """One-hot encode a frame of per-row items into a boolean transaction matrix.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per transaction; each cell holds an item or a missing value
        (None/NaN). Missing cells are ignored.
    sparse : bool, default False
        When True, return a ``scipy.sparse.csr_matrix`` instead of a dense
        ``numpy.ndarray``.

    Returns
    -------
    key_column : numpy.ndarray
        Sorted array of the distinct items found in ``df``; position ``j``
        labels column ``j`` of the encoded matrix.
    tr_array : numpy.ndarray or scipy.sparse.csr_matrix
        Boolean matrix of shape ``(len(df), len(key_column))`` where entry
        ``[i, j]`` is True when the i-th row of ``df`` contains ``key_column[j]``.
    """
    # np.unique already returns its result sorted, so no extra sort is needed.
    key_column = np.unique(df.unstack().dropna().values)
    key_col_map = {item: index for index, item in enumerate(key_column)}
    n_rows, n_cols = df.shape[0], key_column.shape[0]

    if sparse:
        indptr = [0]
        indices = []
        for _, row in df.iterrows():
            # Drop missing cells and repeated items so each (row, col) is stored once.
            for item in row.dropna().drop_duplicates():
                indices.append(key_col_map[item])
            indptr.append(len(indices))
        non_sparse_values = np.ones(len(indices), dtype=bool)
        # Pass the shape explicitly: inferring it from the indices would
        # truncate trailing columns that happen to be unused.
        tr_array = csr_matrix((non_sparse_values, indices, indptr),
                              shape=(n_rows, n_cols), dtype=bool)
    else:
        tr_array = np.zeros((n_rows, n_cols), dtype=bool)
        # Use the positional row number, not the index label yielded by
        # iterrows(), so frames with a non-default index are encoded correctly.
        for row_pos, (_, row) in enumerate(df.iterrows()):
            for item in row.dropna():
                tr_array[row_pos, key_col_map[item]] = True
    return key_column, tr_array

In [4]:
def _generate_new_combinations(old_combinations):
    # 将输入的项集扁平化并去重保存
    item_types_in_previous_step = np.unique(old_combinations.flatten())
    # 取出原候选项集中的每一个项
    for old_combination in old_combinations:
        max_combination = max(old_combination)    # 取出该项中包含的最大列序号
        for item in item_types_in_previous_step:  # 取出当前候选集每一个对应的列序号即关键词
            if item > max_combination:
                res = tuple(old_combination) + (item,)     # 组合形成新的候选项集
                yield res
                # TODO(SHLLL): 增加枝叶修剪

In [5]:
def _find_frequent_items(df, min_support=0.5, max_len=None):
    # 这里进行了一项内容，即支持度的过滤
    X = df.values     # 取Dataframe的值存取ndarray中
    ary_col_idx = np.arange(X.shape[1])     # 创建一个列索引序列
    support = (np.sum(X, axis=0) / float(X.shape[0]))      # 计算每个关键词出现的概率即支持度
    support_dict = {1: support[support > min_support]}     # 将大于最小支持度的支持度存入字典
    # 这里进行了支持度过滤即由候选集C1生成了频繁项集L1并存入字典中
    itemset_dict = {1: ary_col_idx[support > min_support].reshape(-1, 1)}  # 取出支持度对应的编号
    max_itemset = 1
    rows_count = X.shape[0]
    
    if max_len == None:
        max_len = float('inf')      # 设置max_len为无穷大
    
    while max_itemset and max_itemset < max_len:
        next_max_itemset = max_itemset + 1
        combin = _generate_new_combinations(itemset_dict[max_itemset])
        frequent_items = []
        frequent_items_support = []
        
        for c in combin:
            together = X[:, c].all(axis=1)
            support = together.sum() / rows_count    # 计算当前项的支持度
            if support >= min_support:    # 提取当前候选项集中的频繁项
                frequent_items.append(c)
                frequent_items_support.append(support)
        
        if frequent_items:    # 如果找到了频繁项则将对应的项和支持度存入字典
            itemset_dict[next_max_itemset] = np.array(frequent_items)
            support_dict[next_max_itemset] = np.array(frequent_items_support)
            max_itemset = next_max_itemset
        else:     # 如果没有频繁项则表示当前已无可供寻找的频繁项
            max_itemset = 0
    return itemset_dict, support_dict

In [6]:
def apriori(df, min_support=0.5, use_colnames=False, find_rules=False, max_len=None):
    """Return the frequent itemsets of a one-hot transaction frame as a table.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per transaction and one column per item.
    min_support : float, default 0.5
        Support threshold forwarded to ``_find_frequent_items``.
    use_colnames : bool, default False
        When True, itemsets are reported as lists of column labels of ``df``
        instead of column positions.
    find_rules : bool, default False
        When True, ``_find_asso_rules`` is also called; its result is not used.
    max_len : int or None, default None
        Largest itemset size, forwarded to ``_find_frequent_items``.

    Returns
    -------
    pandas.DataFrame
        Columns ``support`` and ``itemsets``, ordered by itemset size, with a
        fresh 0..n-1 index.
    """
    itemset_dict, support_dict = _find_frequent_items(df, min_support, max_len)
    if find_rules:
        _find_asso_rules(df, itemset_dict, support_dict)

    # One two-column frame (support, itemset) per itemset size, smallest first.
    per_size_frames = []
    for size in sorted(itemset_dict):
        support_col = pd.Series(support_dict[size])
        itemset_col = pd.Series(list(itemset_dict[size]))
        per_size_frames.append(pd.concat((support_col, itemset_col), axis = 1))

    fre_df = pd.concat(per_size_frames)
    fre_df.columns = ["support", "itemsets"]
    if use_colnames:
        # Translate column positions back into the column labels of df.
        position_to_label = {pos: label for (pos,), label in np.ndenumerate(df.columns)}
        fre_df["itemsets"] = fre_df["itemsets"].apply(
            lambda positions: [position_to_label[p] for p in positions]
        )
    return fre_df.reset_index(drop=True)

In [10]:
def _find_asso_rules(df, itemset_dict, support_dict):
    pass
    
#     for k in sorted(itemset_dict)[1:]:      # 从频繁二项集开始遍历

In [None]:
# Encode the keyword table as a boolean transaction matrix. The first result
# must be bound to `key_column` (it was previously misspelled `ckey_column`)
# because the next line uses it as the column labels.
key_column, key_tr_array = transaction_encoder(key_df)
df_tr = pd.DataFrame(key_tr_array, columns=key_column)
# df_tr.loc[:, ("中非", "非洲", "习近平")].head()

# Mine itemsets of up to 4 keywords that meet a minimum support of 0.03.
res_df = apriori(df_tr, min_support=0.03, find_rules=True, max_len=4)
# res_df

In [None]:
# Scratch experiment: concatenate two Series whose index entries are lists of
# different lengths (one element vs. two elements) and print the result.
# NOTE(review): whether pd.Index accepts nested lists like these appears to
# depend on the pandas version — confirm this cell still runs before keeping it.
ind = pd.Index([[1], [2], [3], [4], [5]])
ser = pd.Series([1, 2, 3, 4, 5], index=ind)
lis = []
lis.append(ser)
ind2 = pd.Index([[1,1], [2,2], [3,3], [4,4], [5,5]])
ser2 = pd.Series([1, 2, 3, 4, 5], index=ind2)
lis.append(ser2)
# Stack both Series end to end.
ss = pd.concat(lis)
print(ss)