In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

#import numpy as np # linear algebra
#import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

#import os
#for dirname, _, filenames in os.walk('/kaggle/input'):
#    for filename in filenames:
#        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# 首先对商品和人购买的情况进行统计，过滤出热门的商品或者是人，同时统计出商品被购买的次数（或者是人购买的次数）
from tqdm import tqdm
import csv

def statisitc_counter(path):
    """Count how often each customer and each article occurs in a CSV file.

    Column 1 of every row is used as the customer key and column 2 as the
    article key. Every row is counted, including the first one, so a header
    row (if the file has one) appears as an ordinary key with a count of 1.

    Args:
        path: Path of the CSV file to read.

    Returns:
        A tuple ``(articles_number_dict, customer_number_dict)`` mapping each
        article / customer key to the number of rows it appears in.
    """
    articles_number_dict = {}
    customer_number_dict = {}

    # The with-block closes the file even if a malformed row raises;
    # newline="" is the mode the csv module asks for.
    with open(path, newline="") as csv_file:
        for item in tqdm(csv.reader(csv_file)):
            customer = item[1]
            article = item[2]
            customer_number_dict[customer] = customer_number_dict.get(customer, 0) + 1
            articles_number_dict[article] = articles_number_dict.get(article, 0) + 1

    return articles_number_dict,customer_number_dict

In [None]:
# The full pass over the transactions file takes a little over three minutes.

articles_number_dict,customer_number_dict = statisitc_counter("../input/h-and-m-personalized-fashion-recommendations/transactions_train.csv")

In [None]:
# Turn both count dicts into lists of (key, count) pairs, highest count first.
sorted_articles_number_dict = sorted(articles_number_dict.items(), key=lambda x: x[1], reverse=True)
sorted_customer_number_dict = sorted(customer_number_dict.items(), key=lambda x: x[1], reverse=True)

In [None]:
# Author's recorded result: 104548 distinct purchased articles
# and 1362282 distinct purchasing customers.
# NOTE(review): statisitc_counter counts every row, so a header row would be included in both totals.

print(len(sorted_articles_number_dict))
print(len(sorted_customer_number_dict))

In [None]:
# Prepare the data for the line charts: split the sorted (key, count) pairs
# into a tuple of keys and a parallel tuple of counts.

articles_list,articles_number_list = zip(*sorted_articles_number_dict)
customer_list,customer_number_list = zip(*sorted_customer_number_dict)

In [None]:
# 建一个文件夹保存图片，以方便撰写报告
import os
try:
    os.mkdir("./figures")
except FileExistsError:
    # Only "already there" is expected; any other OSError (permissions,
    # missing parent, ...) now surfaces instead of being reported as "exist".
    print("exist")

In [None]:
# 先来绘制一下针对商品的被购买次数-排名位次折线图
# 这张图可以在Output的figures文件夹下找到
import matplotlib.pyplot as plt
from pylab import *   

# Rank position (x) against purchase count (y); the counts were sorted in
# descending order earlier, so the curve starts with the most purchased article.
x = range(len(articles_number_list))
y = articles_number_list
plt.plot(x, y,label="articles")
plt.title("Distribution of purchased articles")  # title
plt.xlabel(u"articles")  # x-axis label
plt.ylabel("purchased times")  # y-axis label
plt.savefig("./figures/Distribution of purchased articles.jpg")
plt.show()

In [None]:
# 然后是消费者消费的次数分布
import matplotlib.pyplot as plt
from pylab import *   

# Rank position (x) against number of purchases (y) per customer; the counts
# were sorted in descending order earlier.
x = range(len(customer_number_list))
y = customer_number_list
plt.plot(x, y,label="customer")
plt.title("Distribution of consume times")  # title
plt.xlabel(u"customer")  # x-axis label
plt.ylabel("consume times")  # y-axis label
plt.savefig("./figures/Distribution of consume times.jpg")
plt.show()

In [None]:
# 可以看到，其基本是满足长尾分布的

In [None]:
# 然后，我们来统计一下前%k的商品或者消费者的相关次数

def show_top_proportion_k(target_list,total_proportion = 10,step = 10):
    """Print the value found at evenly spaced rank percentiles of a sequence.

    One line is printed for every percentile from 0% up to (but excluding)
    ``total_proportion``%, in increments of ``1 / step`` percent. With the
    defaults that is 0.0%, 0.1%, ..., 9.9% (100 lines). The sequence is indexed
    as given, so "top" is only meaningful when the caller passes it sorted in
    descending order.

    Args:
        target_list: Indexable sequence of values.
        total_proportion: Upper bound of the percentile range, in percent.
        step: Number of samples per percentage point.
    """
    length = len(target_list)
    if length == 0:
        # Nothing to sample; indexing position 0 would raise IndexError.
        return

    for i in range(0,total_proportion * step):
        # Integer floor division gives the same index as int(length * i / (step * 100)).
        index = length * i // (step * 100)
        print("number of top {}% is:{}".format(i/step,target_list[index]))

In [None]:
# 看看商品被购买数量的分布情况
# Purchase counts of the articles at each sampled rank percentile.
show_top_proportion_k(articles_number_list)

In [None]:
# 看看用户购买数量的分布情况
# Purchase counts of the customers at each sampled rank percentile.
show_top_proportion_k(customer_number_list)

In [None]:
# 之后，我们分别计算占所有被购买商品50%的商品数量以及购买了50%商品的人的数量，并且将它们作为数据的高频部分

def count_proportion(target_list,proportion = 0.5):
    """Find where the running total first exceeds a share of the grand total.

    Args:
        target_list: Sequence of numbers, walked from the first element on.
        proportion: Share of the grand total that has to be exceeded.

    Returns:
        The zero-based index of the element at which the running total becomes
        strictly greater than ``proportion`` times the grand total, or ``None``
        when that never happens (for example for an empty sequence).
    """
    grand_total = 0
    for value in target_list:
        grand_total += value
    threshold = grand_total * proportion

    running_total = 0
    for position, value in enumerate(target_list):
        running_total += value
        if running_total > threshold:
            return position
    return None

In [None]:
# Index of the article at which 50% of all purchases is exceeded, and index of
# the customer at which 10% of all purchases is exceeded.
articles_count = count_proportion(articles_number_list)
customer_count = count_proportion(customer_number_list,proportion = 0.1)


# Customers are far more numerous than articles, so a smaller share (10%) is
# used for them; the aim is to keep both selections at about ten thousand or fewer.
print("占商品购买次数50%的商品index是{}，占比{}".format(articles_count,articles_count/len(articles_number_list)))
print("占用户购买次数10%的用户index是{}，占比{}".format(customer_count,customer_count/len(customer_number_list)))

In [None]:
# Ids of the most purchased articles / most active customers.
# NOTE(review): 6800 and 10946 are hard-coded cut-offs — presumably taken from
# the articles_count / customer_count values printed in the previous cell;
# confirm they still match whenever the input data changes.
filtered_articles_set = set(articles_list[0:6800])
filtered_customer_set = set(customer_list[0:10946])

In [None]:
try:
    os.mkdir("./filtered_files")
except FileExistsError:
    # Only "already there" is expected; other OSErrors should not be hidden.
    print("exist")

In [None]:
# Copy the rows of the selected articles / customers into ./filtered_files.
# Each file is opened in a with-block: the writers used to keep their files
# open in global variables, so buffered rows were not guaranteed to be on disk
# when later cells read the filtered CSVs back.
with open("../input/h-and-m-personalized-fashion-recommendations/articles.csv", newline="") as articles_file, \
        open("./filtered_files/filtered_articles.csv","w", newline="") as filtered_articles_file:
    articles_csv_reader = csv.reader(articles_file)
    filtered_articles_csv_writer = csv.writer(filtered_articles_file)
    for item in tqdm(articles_csv_reader):
        if item[0] in filtered_articles_set:
            filtered_articles_csv_writer.writerow(item)

with open("../input/h-and-m-personalized-fashion-recommendations/customers.csv", newline="") as customers_file, \
        open("./filtered_files/filtered_customer.csv","w", newline="") as filtered_customer_file:
    customer_csv_reader = csv.reader(customers_file)
    filtered_customer_csv_writer = csv.writer(filtered_customer_file)
    for item in tqdm(customer_csv_reader):
        if item[0] in filtered_customer_set:
            filtered_customer_csv_writer.writerow(item)

In [None]:
# 在使用关联规则挖掘之前，我们先来进行一些统计性的分析

# 例如说，统计一下每一种特征出现的频率和它们在原有数据当中出现的频率，并且对比之

In [None]:
# 首先来构造一个统计属性-index的函数

def create_attri_index_dict(path):
    """Build name->index and index->name lookups from the first row of a CSV file.

    Args:
        path: Path of the CSV file whose first row holds the column names.

    Returns:
        A tuple ``(attri_index_dict, index_attri_dict)``: column name to
        zero-based position, and position to column name. Both are empty when
        the file has no rows.
    """
    with open(path, newline="") as csv_file:
        # Only the first row is needed; an empty file yields an empty header
        # instead of leaving the header variable unbound.
        head = next(csv.reader(csv_file), [])

    attri_index_dict = {attri: i for i, attri in enumerate(head)}
    index_attri_dict = dict(enumerate(head))

    return attri_index_dict,index_attri_dict

# Column-name lookups for the two metadata files; printed so the available
# attribute names are visible in the notebook output.
article_attri_index_dict,article_index_attri_dict = create_attri_index_dict("../input/h-and-m-personalized-fashion-recommendations/articles.csv")
customer_attri_index_dict,customer_index_attri_dict = create_attri_index_dict("../input/h-and-m-personalized-fashion-recommendations/customers.csv")

print(article_attri_index_dict)
print(customer_attri_index_dict)

In [None]:
import numpy as np

def find_high_attri(items,k= 5):
    """Count values and keep the k most frequent, lumping the rest into "others".

    Args:
        items: Iterable of hashable values.
        k: How many of the most frequent values to keep individually. When
            fewer than ``k`` distinct values exist, all of them are kept.

    Returns:
        A dict whose first key is ``"others"`` (summed count of every value
        outside the top k, 0 if there is none), followed by the kept values
        mapped to their counts in descending order of count. Values with equal
        counts keep the order in which they were first seen.
    """
    from collections import Counter  # local import keeps the block self-contained

    # most_common() orders by count, descending; ties stay in first-seen order.
    ranked = Counter(tqdm(items)).most_common()

    # Slicing (instead of indexing 0..k-1) tolerates fewer than k distinct values.
    top_k = k if k > 0 else 0
    new_statistic_dict = {"others":0}
    for value, count in ranked[:top_k]:
        new_statistic_dict[value] = count
    for _, count in ranked[top_k:]:
        new_statistic_dict["others"] += count

    return new_statistic_dict

def get_attri_items(csv_path,attri_index_dict,attri_name):
    """Read one column of a CSV file into a list.

    Every row is read, so the first row (a header, if the file has one)
    contributes its cell as well.

    Args:
        csv_path: Path of the CSV file.
        attri_index_dict: Mapping from column name to zero-based column index.
        attri_name: Name of the column to extract.

    Returns:
        List with the value of that column for every row, in file order.

    Raises:
        KeyError: If ``attri_name`` is not in ``attri_index_dict``.
    """
    # Resolve the column first so an unknown name fails before the file is opened.
    index = attri_index_dict[attri_name]
    with open(csv_path, newline="") as csv_file:
        return [line[index] for line in tqdm(csv.reader(csv_file))]

def create_bar(csv_path,attri_dict,attri_name,save_path,name,k=5,dpi=2000):
    """Draw, save and show a bar chart of the most frequent values of a CSV column.

    The column is read with ``get_attri_items`` and summarised with
    ``find_high_attri``; the kept values are drawn in descending order of
    count, followed by an "others" bar when its count is non-zero.

    Args:
        csv_path: Path of the CSV file to read.
        attri_dict: Mapping from column name to zero-based column index.
        attri_name: Name of the column to chart.
        save_path: File the figure is written to.
        name: Title of the chart.
        k: Number of most frequent values shown individually.
        dpi: Resolution passed to ``savefig``; the default keeps the previous
            hard-coded value.
    """
    items = get_attri_items(csv_path,attri_dict,attri_name)
    high_attri_dict = find_high_attri(items,k)

    others_number = high_attri_dict.pop("others")

    sorted_items = sorted(high_attri_dict.items(), key=lambda x: x[1], reverse=True)
    if others_number != 0:
        sorted_items.append(("others",others_number))

    # zip(*[]) cannot be unpacked into two names, so handle "no bars" explicitly.
    cell,pvalue = zip(*sorted_items) if sorted_items else ((), ())

    fig = plt.figure()
    plt.bar(cell,pvalue,0.4,color="steelblue")

    # Write each bar's value just above it.
    for a,b in zip(cell,pvalue):
        plt.text(a,b,'%.2f'%b,ha='center',va='bottom',fontsize=7)
    # NOTE(review): the plotted values are occurrence counts from
    # find_high_attri; the 'p value' label is kept as it was — confirm wording.
    plt.ylabel('p value')
    # The `name` argument used to be ignored, leaving every chart untitled.
    plt.title(name)

    plt.savefig(save_path,dpi = dpi)
    plt.show()
    # Release the figure so charts created in a loop do not pile up in memory.
    plt.close(fig)
    

In [None]:
# Check which filtered CSV files were written.
os.listdir("./filtered_files")

In [None]:
# 这里主要对商品类型、商品的product group、商品颜色、index name进行分析展示

try:
    # makedirs creates "comparation" and "articles" in one go; with two mkdir
    # calls in one try, an existing parent caused the child to be skipped.
    os.makedirs("./figures/comparation/articles")
except FileExistsError:
    print("exist")

# Column layout of the two metadata files, kept here for reference:
# articles.csv:
#   {'article_id': 0, 'product_code': 1, 'prod_name': 2, 'product_type_no': 3, 'product_type_name': 4,
#    'product_group_name': 5, 'graphical_appearance_no': 6, 'graphical_appearance_name': 7,
#    'colour_group_code': 8, 'colour_group_name': 9, 'perceived_colour_value_id': 10,
#    'perceived_colour_value_name': 11, 'perceived_colour_master_id': 12,
#    'perceived_colour_master_name': 13, 'department_no': 14, 'department_name': 15, 'index_code': 16,
#    'index_name': 17, 'index_group_no': 18, 'index_group_name': 19, 'section_no': 20,
#    'section_name': 21, 'garment_group_no': 22, 'garment_group_name': 23, 'detail_desc': 24}
# customers.csv:
#   {'customer_id': 0, 'FN': 1, 'Active': 2, 'club_member_status': 3, 'fashion_news_frequency': 4,
#    'age': 5, 'postal_code': 6}

# Attributes charted below: product_type_name, colour_group_name,
# index_group_name, product_group_name. First: product type, all articles.
create_bar("../input/h-and-m-personalized-fashion-recommendations/articles.csv",
           article_attri_index_dict,"product_type_name",
           "./figures/comparation/articles/total_article_product_type.jpg",
           "distribution of product type",
           k=5)

In [None]:
# Same chart for the filtered (most purchased) articles only.
create_bar("./filtered_files/filtered_articles.csv",
           article_attri_index_dict,"product_type_name",
           "./figures/comparation/articles/filtered_article_product_type.jpg",
           "distribution of product type",
           k=5)

In [None]:
# 从类型的分布上来讲，感觉它们没有明显的区别

In [None]:
# Attributes charted: product_type_name, colour_group_name, index_group_name, product_group_name
# Next: colour group, all articles.
create_bar("../input/h-and-m-personalized-fashion-recommendations/articles.csv",
           article_attri_index_dict,"colour_group_name",
           "./figures/comparation/articles/total_article_colour_group_name.jpg",
           "distribution of color type",
           k=5)

In [None]:
# Colour group, filtered (most purchased) articles only.
create_bar("./filtered_files/filtered_articles.csv",
           article_attri_index_dict,"colour_group_name",
           "./figures/comparation/articles/filtered_article_colour_group_name.jpg",
           "distribution of color type",
           k=5)

从上述的颜色对比来看，我们可以发现其实黑白更加受到欢迎。例如，黑色在热销的商品当中的占比是高于它在所有商品当中占比的。

In [None]:
# Attributes charted: product_type_name, colour_group_name, index_group_name, product_group_name
# Next: index group, all articles.
create_bar("../input/h-and-m-personalized-fashion-recommendations/articles.csv",
           article_attri_index_dict,"index_group_name",
           "./figures/comparation/articles/total_article_index_group_name.jpg",
           "distribution of index type",
           k=5)

In [None]:
# Index group, filtered (most purchased) articles only.
create_bar("./filtered_files/filtered_articles.csv",
           article_attri_index_dict,"index_group_name",
           "./figures/comparation/articles/filtered_article_index_group_name.jpg",
           "distribution of index type",
           k=5)

可以看到，index的分布也有比较大的变化。

In [None]:
# Attributes charted: product_type_name, colour_group_name, index_group_name, product_group_name
# Next: product group, all articles.
# The chart name was copied from the product-type cell; it now names the
# attribute that is actually charted.
create_bar("../input/h-and-m-personalized-fashion-recommendations/articles.csv",
           article_attri_index_dict,"product_group_name",
           "./figures/comparation/articles/total_article_product_group_name.jpg",
           "distribution of product group",
           k=5)

In [None]:
# Product group, filtered (most purchased) articles only.
# The chart name was copied from the product-type cell; it now names the
# attribute that is actually charted.
create_bar("./filtered_files/filtered_articles.csv",
           article_attri_index_dict,"product_group_name",
           "./figures/comparation/articles/filtered_article_product_group_name.jpg",
           "distribution of product group",
           k=5)

product group也有较大的差别

In [None]:
# 接下来对customer的情况进行统计和分析

In [None]:
# Display the customer column names that are available for the analysis below.
customer_attri_index_dict

In [None]:
try:
    # makedirs also creates missing parent folders.
    os.makedirs("./figures/comparation/customers")
except FileExistsError:
    # Only "already there" is expected; other OSErrors should not be hidden.
    print("exist")

In [None]:
# Attributes charted for customers: club_member_status, fashion_news_frequency
# First: club_member_status, all customers.
create_bar("../input/h-and-m-personalized-fashion-recommendations/customers.csv",
           customer_attri_index_dict,"club_member_status",
           "./figures/comparation/customers/total_customers_club_member_status.jpg",
           "distribution of club status",
           k=3)

In [None]:

# club_member_status, filtered (most active) customers only.
create_bar("./filtered_files/filtered_customer.csv",
           customer_attri_index_dict,"club_member_status",
           "./figures/comparation/customers/filtered_customers_club_member_status.jpg",
           "distribution of club status",
           k=3)

In [None]:
# 可以看到，购买行为较为活跃的人当中，还是active的人最多

In [None]:
# Attributes charted for customers: club_member_status, fashion_news_frequency
# Next: fashion_news_frequency, all customers (the old comment still said
# club_member_status, and the chart name was misspelled).
create_bar("../input/h-and-m-personalized-fashion-recommendations/customers.csv",
           customer_attri_index_dict,"fashion_news_frequency",
           "./figures/comparation/customers/total_customers_fashion_news_frequency.jpg",
           "distribution of fashion news frequency",
           k=4)

In [None]:

# fashion_news_frequency, filtered (most active) customers only.
# NOTE(review): k=2 here versus k=4 for the full file — presumably because the
# filtered file has fewer distinct values and find_high_attri indexes the top
# k entries directly; confirm before changing k.
create_bar("./filtered_files/filtered_customer.csv",
           customer_attri_index_dict,"fashion_news_frequency",
           "./figures/comparation/customers/filtered_customers_fashion_news_frequency.jpg",
           "distribution of fashion_news_frequency",
           k=2)

In [None]:
# 以上是商品和用户的特征行为，下面来进行频繁项集挖掘

In [None]:
!pip install efficient-apriori

In [None]:
from efficient_apriori import apriori 



def mining(path,min_support = 0.2,min_confidence = 1):
    """Run Apriori over the rows of a CSV file, treating each row as one transaction.

    The author's note on the defaults: confidence matters little here, the
    results are meant to be filtered by support only.

    Args:
        path: Path of the CSV file; every row becomes a tuple of its cells.
        min_support: Passed through to ``apriori``.
        min_confidence: Passed through to ``apriori``.

    Returns:
        The ``(itemsets, rules)`` pair returned by ``apriori``.
    """
    # Read everything up front so the file is closed before mining starts.
    with open(path, newline="") as csv_file:
        data = [tuple(item) for item in csv.reader(csv_file)]

    itemsets, rules = apriori(data,min_support=min_support,min_confidence=min_confidence)

    return itemsets,rules

In [None]:
# Start by mining frequent itemsets over the attribute rows of the filtered articles.

itemsets,rules = mining("./filtered_files/filtered_articles.csv")
print(rules)
print(itemsets)

In [None]:
# 以上的项集非常不好解读，我们采取一些过滤规则来筛选一下，主要还是筛除不好解读的数字

In [None]:
def item_filter(items_dict):
    """Strip hard-to-read members from itemsets and keep those with two or more left.

    From every itemset (a key of ``items_dict``) the members that parse as a
    number, and the non-numeric members of two characters or fewer, are
    removed. Itemsets with fewer than two remaining members are dropped.

    Args:
        items_dict: Mapping from itemset (iterable of values) to its count.

    Returns:
        Dict mapping the reduced itemset (a tuple) to the count of the
        original itemset. When several itemsets reduce to the same tuple, the
        one visited last wins.
    """
    filtered_items = {}
    items = items_dict.keys()

    # NOTE(review): this guard looks at the number of itemsets in the whole
    # dict, not at the size of the current itemset — `len(item)` may have been
    # intended; confirm. Behaviour is kept; since the check does not depend on
    # the loop variable it is evaluated once, up front.
    if len(items) < 3:
        return filtered_items

    for item in tqdm(items):
        temp_item = []
        for value in item:
            try:
                float(value)
            except (TypeError, ValueError):
                # Not a number: keep it unless it is a very short token.
                if len(value) > 2:
                    temp_item.append(value)
        if len(temp_item) < 2:
            continue
        filtered_items[tuple(temp_item)] = items_dict[item]
    return filtered_items

In [None]:
# Flatten `itemsets` (a dict whose values are themselves dicts of
# itemset -> count) into one dict. update() merges in place instead of
# rebuilding the whole dict on every iteration; on duplicate keys the later
# value still wins.
print(itemsets.keys())
item_dict = {}
for key in itemsets.keys():
    item_dict.update(itemsets[key])

In [None]:
# Drop numeric / very short members so the itemsets are easier to read.
filtered_items = item_filter(item_dict)

In [None]:
# Display the readable itemsets.
filtered_items

In [None]:
# Next: explore which articles tend to be bought together.

def create_user_items(path = "../input/h-and-m-personalized-fashion-recommendations/transactions_train.csv"):
    """Group the article column of a transactions CSV by customer.

    Column 1 of each row is the customer key and column 2 the article. Every
    row is used, so a header row (if present) becomes an ordinary entry.

    Args:
        path: Path of the transactions CSV file.

    Returns:
        Dict mapping each customer key to the list of its articles in file
        order, repeated purchases included.
    """
    user_items_dict = {}
    with open(path, newline="") as csv_file:
        for item in tqdm(csv.reader(csv_file)):
            user_items_dict.setdefault(item[1], []).append(item[2])

    return user_items_dict

In [None]:
# Purchase list per customer, built from the full transactions file.
user_items_dict = create_user_items()

In [None]:
# Number of distinct customer keys.
print(len(user_items_dict.keys()))

In [None]:
# Distribution of purchase-list lengths: how many customers have a list of
# each length, ordered from the most common length to the rarest.

length_dict = {}

for items in user_items_dict.values():
    length = len(items)
    length_dict[length] = length_dict.get(length, 0) + 1
length_distribution = sorted(length_dict.items(), key=lambda x: x[1], reverse=True)

In [None]:
# (length, number of customers) pairs, most common length first.
print(length_distribution)

In [None]:
# Keep only the purchase lists of customers with at least 200 purchases;
# shorter lists are left out of the mining step.
filtered_items = [
    purchased
    for purchased in user_items_dict.values()
    if len(purchased) >= 200
]

print(len(filtered_items))

In [None]:
# Mine frequent itemsets over the purchase lists of the heavy buyers.
itemsets, rules = apriori(filtered_items,min_support=0.01,min_confidence=0.5)

In [None]:
# Which keys exist in the result, and how many entries are stored under key 3.
print(itemsets.keys())
print(len(itemsets[3]))

In [None]:
# Display the entries stored under key 3.
itemsets[3]

In [None]:
# Number of entries stored under key 2.
len(itemsets[2])

In [None]:
# Load the attributes of every article into memory:
#   header_index       - column names of articles.csv without the first (id) column
#   article_infor_dict - article id -> list of the remaining cells of its row
article_infor_dict = {}
header_index = []

with open("../input/h-and-m-personalized-fashion-recommendations/articles.csv", newline="") as articles_file:
    csv_reader = csv.reader(articles_file)
    # The first row is the header; it is no longer stored as if it were an article.
    header_row = next(csv_reader, None)
    if header_row is not None:
        header_index = header_row[1:]
    for item in tqdm(csv_reader):
        article_infor_dict[item[0]] = item[1:]

In [None]:
def analyze_one_item(item):
    """Describe one article as a list of "column:value" strings.

    Reads the module-level ``article_infor_dict`` (article id -> attribute
    values) and ``header_index`` (the matching column names).

    Args:
        item: Article id to look up.

    Returns:
        One ``"<column>:<value>"`` string per entry of ``header_index``, in
        that order.
    """
    infor = article_infor_dict[item]
    return [attri + ":" + infor[i] for i, attri in enumerate(header_index)]

In [None]:
# Try the lookup on one article id.
print(analyze_one_item('0706016002'))

In [None]:
# 之后来分析一下所指内容不同的item的情况

def judge(infor1,infor2):
    """Return True when both attribute lists have the same first entry, else False."""
    return True if infor1[0] == infor2[0] else False
            
def analyze_items(items):
    """Describe every article of an itemset and tell whether all match the first one.

    Args:
        items: Iterable of article ids.

    Returns:
        A tuple ``(items_infor, all_same)``: the ``analyze_one_item`` result
        for each id in order, and whether ``judge`` accepted every description
        against that of the first id (``True`` for an empty input).
    """
    items_infor = []
    all_same = True

    # Each article is described exactly once (it used to be looked up up to
    # three times). An explicit loop is used rather than all(): the notebook
    # star-imports pylab, which may rebind that name.
    for item in items:
        infor = analyze_one_item(item)
        items_infor.append(infor)
        if all_same and not judge(infor, items_infor[0]):
            all_same = False

    return items_infor,all_same

In [None]:
# Try it on three article ids that share the same leading digits.
items_infor,all_same = analyze_items(('0806388001', '0806388002', '0806388003'))

In [None]:
# Display whether the three articles agree on their first attribute.
all_same

In [None]:
# Number of entries stored under key 2 before filtering.
print(len(itemsets[2]))

In [None]:
# Keep the descriptions of the itemsets under key 2 whose articles do not all
# share the same first attribute.
filtered_itemsets_infor = [
    infor
    for infor, same in map(analyze_items, itemsets[2].keys())
    if not same
]

In [None]:
# How many itemsets under key 2 remain.
len(filtered_itemsets_infor)

In [None]:
# Same selection for the itemsets under key 3: keep those whose articles do
# not all share the same first attribute.
filtered_3itemsets_infor = [
    infor
    for infor, same in map(analyze_items, itemsets[3].keys())
    if not same
]

In [None]:
# How many itemsets under key 3 remain.
len(filtered_3itemsets_infor)

In [None]:
# Print each remaining itemset attribute by attribute: zip(*item) lines up the
# same attribute of all its articles on one output line.
for item in filtered_3itemsets_infor:
    for line in zip(*item):
        print(line)
    print("\n\n")

In [None]:
# 下面来分析不同种类客户的购买习惯，这里主要还是进行统计性的分析
# 我们首先根据高频用户的特征，将高频的用户找出来，按照三个因素进行划分：
# 展示的键包括：club_member_status \ fashion_news_frequency \ 年龄
# 年龄，这个地方因为是个数字，所以需要提前划分一下，划分成青年、中年和老年
# 其中，年龄小于30为青年，30-60为中年，60以上为老年

In [None]:
def get_split_user_set(split_standard,path):
    """Group the first column of a CSV file by a label computed from each row.

    Every row is used, so a header row (if present) is labelled and grouped
    like any other row.

    Args:
        split_standard: Callable that takes one row (list of cells) and returns
            a hashable group label.
        path: Path of the CSV file.

    Returns:
        Dict mapping each label to the set of first-column values of the rows
        that received it.
    """
    split_result_dict = {}

    with open(path, newline="") as csv_file:
        for line in tqdm(csv.reader(csv_file)):
            split_result_dict.setdefault(split_standard(line), set()).add(line[0])
    return split_result_dict
def club_member_status_split_func(line):
    """Return cell 3 of a row (club_member_status in the customers.csv header shown earlier)."""
    return line[3]
def fashion_news_frequency_split_func(line):
    """Return cell 4 of a row (fashion_news_frequency in the customers.csv header shown earlier)."""
    return line[4]
def age_split_func(line):
    """Classify a row into an age band based on cell 5.

    Args:
        line: One row (list of cells); cell 5 is parsed as an integer age.

    Returns:
        ``"y"`` for ages below 30, ``"m"`` for 30 to 59, ``"o"`` for 60 and
        above. Rows whose age is missing or not an integer (a header row, for
        example) are treated as age 20 and therefore land in ``"y"``.
    """
    try:
        age = int(line[5])
    except (IndexError, TypeError, ValueError):
        # Missing or unparsable age: fall back to 20, as before, without
        # swallowing unrelated exceptions.
        age = 20
    if age < 30:
        return "y"
    if age < 60:
        return "m"
    return "o"

In [None]:
# Customer ids grouped by club member status.
club_split_result = get_split_user_set(club_member_status_split_func,"../input/h-and-m-personalized-fashion-recommendations/customers.csv")

In [None]:
# Look up, per customer, the set of articles they bought, so the articles of
# each customer group can be collected afterwards.
def get_user_items_dict(path):
    """Map every customer in a transactions CSV to the set of articles they bought.

    Column 1 of each row is the customer key and column 2 the article. Every
    row is used, so a header row (if present) becomes an ordinary entry.

    Args:
        path: Path of the transactions CSV file.

    Returns:
        Dict mapping each customer key to the set of its distinct articles.
    """
    user_items_dict = {}
    with open(path, newline="") as csv_file:
        for line in tqdm(csv.reader(csv_file)):
            # Key on the customer column. The membership test used to look at
            # column 0, which never matched a key, so each customer's set was
            # replaced by a one-element set on every row.
            user_items_dict.setdefault(line[1], set()).add(line[2])
    return user_items_dict
# NOTE: this rebinds user_items_dict; the earlier cell stored a list per customer under the same name.
user_items_dict = get_user_items_dict("../input/h-and-m-personalized-fashion-recommendations/transactions_train.csv")

In [None]:
try:
    # makedirs creates "./person" and "./person/club" together; with two mkdir
    # calls in one try, an existing "./person" caused "club" to be skipped.
    os.makedirs("./person/club")
except FileExistsError:
    print("exist")

In [None]:
def write_person(origin_articles,user_items_dict,split_diction,file_path):
    """Write, per customer group, the article rows bought by that group.

    For every key of ``split_diction`` a file ``<file_path>/<key>.csv`` is
    created containing each row of ``origin_articles`` whose first cell is an
    article bought by at least one customer of that group. The number of
    distinct articles per group is printed, in key order.

    Args:
        origin_articles: Path of the articles CSV file to copy rows from.
        user_items_dict: Mapping from customer id to an iterable of article ids.
        split_diction: Mapping from group label to an iterable of customer ids.
        file_path: Existing directory that receives one CSV file per group.
    """
    from contextlib import ExitStack  # local import keeps the block self-contained

    key_list = list(split_diction.keys())

    # Union of the articles bought by the customers of each group; customers
    # without any recorded purchase are skipped.
    article_set_for_each_type_dict = {}
    for key in key_list:
        bought = set()
        for person in split_diction[key]:
            if person in user_items_dict:
                bought.update(user_items_dict[person])
        article_set_for_each_type_dict[key] = bought
        print(len(bought))

    # ExitStack closes (and thereby flushes) every output file and the input
    # file when the block ends, even if a row raises.
    with ExitStack() as stack:
        csv_writers = {}
        for key in key_list:
            out_file = stack.enter_context(open(file_path + "/%s.csv" % key,"w", newline=""))
            csv_writers[key] = csv.writer(out_file)

        articles_file = stack.enter_context(open(origin_articles, newline=""))
        for line in tqdm(csv.reader(articles_file)):
            for key in key_list:
                if line[0] in article_set_for_each_type_dict[key]:
                    csv_writers[key].writerow(line)

In [None]:
# Drop the groups that are not analysed: the one produced by the header row,
# "LEFT CLUB", and the empty label. pop(key, None) keeps the cell re-runnable —
# a second run (or a missing key) no longer raises KeyError.
print(club_split_result.keys())
for unwanted_key in ("club_member_status", "LEFT CLUB", ""):
    club_split_result.pop(unwanted_key, None)
print(club_split_result.keys())

In [None]:
# Spot check: is this particular customer id among the keys?
# (The len() result on the first line is not displayed, because it is not the last expression of the cell.)
len(user_items_dict.keys())
print("197a52d35209d799a9a1670a35868276a30b9dbcffb2202209691228aa1e8339" in user_items_dict.keys())

In [None]:
# One CSV of purchased articles per remaining club status, under ./person/club.
write_person("../input/h-and-m-personalized-fashion-recommendations/articles.csv"
             ,user_items_dict,club_split_result,
             "./person/club")

In [None]:
# Customer ids grouped by fashion news frequency.
fashion_split_result = get_split_user_set(fashion_news_frequency_split_func,"../input/h-and-m-personalized-fashion-recommendations/customers.csv")

In [None]:
# Drop the groups that are not analysed: the one produced by the header row,
# the empty label and both spellings of "none". pop(key, None) keeps the cell
# re-runnable — a second run (or a missing key) no longer raises KeyError.
print(fashion_split_result.keys())
for unwanted_key in ("fashion_news_frequency", "", "NONE", "None"):
    fashion_split_result.pop(unwanted_key, None)
print(fashion_split_result.keys())

In [None]:
try:
    # makedirs also creates "./person" if it is missing.
    os.makedirs("./person/fashion")
except FileExistsError:
    # Only "already there" is expected; other OSErrors should not be hidden.
    print("exist")

In [None]:
# One CSV of purchased articles per remaining news-frequency group, under ./person/fashion.
write_person("../input/h-and-m-personalized-fashion-recommendations/articles.csv"
             ,user_items_dict,fashion_split_result,
             "./person/fashion")

In [None]:
# Customer ids grouped by age band ("y", "m", "o").
age_split_result = get_split_user_set(age_split_func,"../input/h-and-m-personalized-fashion-recommendations/customers.csv")

In [None]:
try:
    # makedirs also creates "./person" if it is missing.
    os.makedirs("./person/age")
except FileExistsError:
    # Only "already there" is expected; other OSErrors should not be hidden.
    print("exist")

In [None]:
# One CSV of purchased articles per age band, under ./person/age.
write_person("../input/h-and-m-personalized-fashion-recommendations/articles.csv"
             ,user_items_dict,age_split_result,
             "./person/age")

In [None]:
# Check which club-status files were written.
print(os.listdir("./person/club"))

In [None]:
# Reader over the ACTIVE group's article rows; it is consumed two cells below.
# NOTE(review): the underlying file object is never closed explicitly.
csv_reader = csv.reader(open("./person/club/ACTIVE.csv"))

In [None]:
# Collects the rows read from ACTIVE.csv.
temp_line = []

In [None]:
# Pull every remaining row from the reader into temp_line.
temp_line.extend(csv_reader)

In [None]:
# Number of rows read from ACTIVE.csv.
print(len(temp_line))

In [None]:
# Create one figure folder per customer grouping. Each folder is handled on
# its own: with all mkdir calls in a single try, the first existing folder
# prevented the remaining ones from being created.
for sub_dir in ("club", "fashion", "age"):
    try:
        os.makedirs("./figures/person_comparation/" + sub_dir)
    except FileExistsError:
        print("exist")

In [None]:
# For every customer grouping, every group file written under ./person and
# every article attribute: draw the attribute distribution of the articles
# bought by that group. The target path is printed before each chart.
person_classification_type = ["fashion","club","age"]
article_classification_type = ["product_type_name","colour_group_name","index_group_name","product_group_name"]
for p_type in person_classification_type:
    for attri in os.listdir("./person/%s"%p_type):
        group_name = attri.replace(".csv","")
        for a_type in article_classification_type:
            figure_path = "./figures/person_comparation/%s/%s-%s.jpg"%(p_type,group_name,a_type)
            print(figure_path)
            create_bar("./person/%s/%s"%(p_type,attri),
                       article_attri_index_dict,a_type,
                       figure_path,
                       "distribution of %s"%a_type,
                       k=5)

In [None]:
!ls

In [None]:
!zip -r figures.zip figures

In [None]:
import os
import zipfile


def make_zip(source_dir, output_filename):
    """Pack every file below a directory into a zip archive.

    Archive names are the file paths with the parent directory of
    ``source_dir`` cut off, so ``"./figures"`` yields entries such as
    ``figures/a.jpg``. Each file name is printed as it is added, followed by
    an empty line per visited directory.

    Args:
        source_dir: Directory whose files are archived (walked recursively).
        output_filename: Path of the zip file to create (overwritten if present).
    """
    pre_len = len(os.path.dirname(source_dir))
    # The with-block closes the archive even when adding a file fails.
    with zipfile.ZipFile(output_filename, 'w') as zipf:
        for parent, dirnames, filenames in os.walk(source_dir):
            for filename in filenames:
                print(filename)
                pathfile = os.path.join(parent, filename)
                arcname = pathfile[pre_len:].strip(os.path.sep)  # path relative to the parent folder
                zipf.write(pathfile, arcname)
            print()


In [None]:
# Bundle all generated figures into one archive for download.
make_zip("./figures","./all-figures.zip")