In [35]:
import pandas as pd
from collections import Counter
import re
import matplotlib.pyplot as plt
import numpy as np

plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False

comp_color = {
    '魔坦刺': '#cfcf95',
    '坦刺': '#858561',
    '扶桑法刺': '#9e71ad',
    '扶桑群雄刺': '#968795',
    '扶桑群雄法刺': '#6603fc',
    '扶桑刺': '#6603fc',
    '魔种': '#a35956',
    '扶桑男刺': '#bfae91',
    '扶桑法': '#8000ff',

    '七射': '#14ffa5',
    '坦射': '#ccff14',
    '攻辅射': '#42d7f5',
    '弟弟射': '#fceb30',
    '攻辅蛋': '#ffac1c',

    '九战': '#ff0000',
    '稷下战': '#ff5e5e',
    '轻战守约': '#ff8e5e',
    '魏战': '#ff0084',
    '长城战': '#f8ff6b',

    '蜀国': '#00cc18',
    '弟弟蜀': '#a1eb34',

    '法奶': '#0093f5',
    '弟妹法': '#7900fa',
    '坦法': '#ff5900'
    }

comp_to_brief = {
    '扶桑法刺': '扶桑刺',
    '扶桑刺': '扶桑刺',
    '扶桑群雄刺': '扶桑刺',
    '扶桑群雄法刺': '扶桑刺',
    '扶桑男刺': '扶桑刺',
    '魔种(无天赋)' :'魔种',
    '魔种(有天赋)' :'魔种',
    '封神攻辅射': '攻辅射',
    '吴国攻辅射': '攻辅射',
    '稷下攻辅射': '攻辅射',
    '尧天攻辅射': '攻辅射',
    '长城攻辅射': '攻辅射'
}

rank_to_score = [12, 9, 7, 5, 3, 2, 1, 0]

class Config:
    use_brief_comp = True


pd.set_option('display.max_columns', None)

In [36]:
def preprocess_data(df):
    df = df.dropna()
    df['分数'] = df['排名'].apply(lambda x: rank_to_score[int(x) - 1])
    if Config.use_brief_comp:
        df['详细阵容'] = df['阵容']
        df['阵容'] = df['详细阵容'].apply(lambda x: comp_to_brief.get(x, x))
    df['回合数'] = df['胜'] + df['负']
    return df

def check_data_integrity(df):
    groups = df.groupby(['周数', '分组', '局数'])
    for g_name, g_df in groups:
        assert g_df['排名'].tolist() == [1, 2, 3, 4, 5, 6, 7, 8], g_df['排名'].tolist()
        assert g_df['回合数'].is_monotonic_decreasing, g_df
        tmp_l = g_df['回合数'].tolist()
        assert tmp_l[0] == tmp_l[1], g_df['回合数']
        assert len(g_df['俱乐部'].unique()) == len(g_df['俱乐部']), len(g_df['俱乐部'].unique())

    # 全局BP
    groups = df.groupby(['周数', '分组', '俱乐部'])
    for g_name, g_df in groups:
        assert len(g_df['ban位'].unique()) == len(g_df['ban位']), (g_name, len(g_df['ban位'].unique()))

    return df

def filter_df(df):
    # return df
    return df[df['周数'] >= 5].reset_index()

df = pd.read_excel('result.xlsx')
df = preprocess_data(df)
df = check_data_integrity(df)
df = filter_df(df)

In [37]:
def get_comp_statistics(df):
    def get_most_ban(x):
        c = Counter(x['ban位'])
        return [(i, c[i] / len(x)) for i, count in c.most_common()]

    groups = df.groupby('阵容')
    mean_rank = groups['排名'].mean()
    var_rank = groups['排名'].var()
    sum_occur = groups['排名'].count()
    mean_score = groups['分数'].mean()
    top1_cnt = groups['排名'].apply(lambda x: (x == 1).sum())
    top1_rate = top1_cnt / sum_occur
    top3_cnt = groups['排名'].apply(lambda x: (x <= 3).sum())
    top3_rate = top3_cnt / sum_occur
    most_ban = groups.apply(get_most_ban)
    res = pd.concat([mean_score, mean_rank, var_rank, sum_occur, top1_cnt, top1_rate, top3_cnt, top3_rate, most_ban], axis=1)
    res.columns = ('平均分数', '平均排名', '排名方差', '出场次数', '登顶次数', '登顶率', '前三次数', '前三率', '阵容选用')
    res = res.sort_values(['平均分数', '平均排名', '登顶率', '前三率'], ascending=[False, True, False, False])
    res = res.fillna('-')
    return res

def get_ban_statistics(df):
    def get_most_comp(x):
        c = Counter(x['阵容'])
        return [(i, c[i]) for i, count in c.most_common()]
    groups = df.groupby('ban位')
    mean_rank = groups['排名'].mean()
    var_rank = groups['排名'].var()
    mean_score = groups['分数'].mean()
    sum_occur = groups['排名'].count()
    top1_cnt = groups['排名'].apply(lambda x: (x == 1).sum())
    top1_rate = top1_cnt / sum_occur
    top3_cnt = groups['排名'].apply(lambda x: (x <= 3).sum())
    top3_rate = top3_cnt / sum_occur
    most_comp = groups.apply(get_most_comp)
    res = pd.concat([mean_score, mean_rank, var_rank, sum_occur, top1_cnt, top1_rate, top3_cnt, top3_rate, most_comp], axis=1)
    res.columns = ('平均分数', '平均排名', '排名方差', '出场次数', '登顶次数', '登顶率', '前三次数', '前三率', '阵容选用')
    res = res.sort_values(['平均分数', '平均排名', '登顶率', '前三率'], ascending=[False, True, False, False])
    return res


def get_team_statistics(df):
    def get_most_comp(x):
        c = Counter(x['阵容'])
        return [(i, c[i]) for i, count in c.most_common()]
    groups = df.groupby('俱乐部')
    mean_rank = groups['排名'].mean()
    var_rank = groups['排名'].var()
    mean_score = groups['分数'].mean()
    sum_occur = groups['排名'].count()
    top1_cnt = groups['排名'].apply(lambda x: (x == 1).sum())
    top1_rate = top1_cnt / sum_occur
    top3_cnt = groups['排名'].apply(lambda x: (x <= 3).sum())
    top3_rate = top3_cnt / sum_occur
    most_comp = groups.apply(get_most_comp)
    res = pd.concat([mean_score, mean_rank, var_rank, sum_occur, top1_cnt, top1_rate, top3_cnt, top3_rate, most_comp], axis=1)
    res.columns = ('平均分数', '平均排名', '排名方差', '出场次数', '登顶次数', '登顶率', '前三次数', '前三率', '阵容选用')
    res = res.sort_values(['平均分数', '平均排名', '登顶率', '前三率'], ascending=[False, True, False, False])
    return res

def get_player_statistics(df):
    def get_most_comp(x):
        c = Counter(x['阵容'])
        return [(i, c[i]) for i, count in c.most_common()]
    groups = df.groupby('选手')
    mean_rank = groups['排名'].mean()
    var_rank = groups['排名'].var()
    mean_score = groups['分数'].mean()
    sum_occur = groups['排名'].count()
    top1_cnt = groups['排名'].apply(lambda x: (x == 1).sum())
    top1_rate = top1_cnt / sum_occur
    top3_cnt = groups['排名'].apply(lambda x: (x <= 3).sum())
    top3_rate = top3_cnt / sum_occur
    most_comp = groups.apply(get_most_comp)
    res = pd.concat([mean_score, mean_rank, var_rank, sum_occur, top1_cnt, top1_rate, top3_cnt, top3_rate, most_comp], axis=1)
    res.columns = ('平均分数', '平均排名', '排名方差', '出场次数', '登顶次数', '登顶率', '前三次数', '前三率', '阵容选用')
    res = res.sort_values(['平均分数', '平均排名', '登顶率', '前三率'], ascending=[False, True, False, False])
    return res

# display(get_comp_statistics(df).head())
# display(get_ban_statistics(df).head())
# display(get_team_statistics(df).head())
# display(get_player_statistics(df).head())

In [38]:
def report_mvp_comp(comp_stats):
    MIN_OCCURENCE = 5
    metric_weights = {
        '平均分数': [15, False],
        '平均排名': [5, True],
        '排名方差': [3, True],
        '出场次数': [5, False],
        '登顶率': [5, False],
        '前三率': [3, False]
    }
    comp_stats = comp_stats[comp_stats['出场次数'] >= MIN_OCCURENCE]
    raw_ranks = []
    weights = []
    for metric, value in metric_weights.items():
        weight, ascending = value
        metric_rank = comp_stats[metric].rank(ascending=ascending, method='min').astype(int)
        raw_ranks.append(metric_rank)
        weights.append(weight)
    comp_rank = sum([r * w for r, w in zip(raw_ranks, weights)]).sort_values()
    # print(comp_rank)
    mvp_comp = comp_rank.index[0]
    # print('本期MVP阵容:', mvp_comp)
    mvp_comp_metric_ranks = pd.Series({r.name: r[mvp_comp] for r in raw_ranks})
    mvp_comp_metric_values = comp_stats.loc[mvp_comp][mvp_comp_metric_ranks.index].T
    res = pd.concat([mvp_comp_metric_values, mvp_comp_metric_ranks], axis=1)
    res.columns = [mvp_comp, '全阵容对比排名']
    return res
comp_stats = get_comp_statistics(df)
display(report_mvp_comp(comp_stats))

def report_mvp_player(player_stats):
    MIN_OCCURENCE = 3
    metric_weights = {
        '平均分数': [15, False],
        '平均排名': [10, True],
        '排名方差': [5, True],
        '登顶率': [10, False],
        '前三率': [5, False],
        '出场次数': [0, False],
        '阵容选用': [0, False]
    }
    player_stats = player_stats[player_stats['出场次数'] >= MIN_OCCURENCE]
    raw_ranks = []
    weights = []
    for metric, value in metric_weights.items():
        weight, ascending = value
        metric_rank = player_stats[metric].rank(ascending=ascending, method='min').astype(int)
        raw_ranks.append(metric_rank)
        weights.append(weight)
    player_rank = sum([r * w for r, w in zip(raw_ranks, weights)]).sort_values()
    mvp_player = player_rank.index[0]
    # print('本期MVP选手:', mvp_player)
    mvp_player_metric_ranks = pd.Series({r.name: r[mvp_player] for r in raw_ranks})
    mvp_player_metric_values = player_stats.loc[mvp_player][mvp_player_metric_ranks.index].T
    res = pd.concat([mvp_player_metric_values, mvp_player_metric_ranks], axis=1)
    res.columns = [mvp_player, '全选手对比排名']
    res['全选手对比排名'] = (res['全选手对比排名'] / weights * weights).fillna('/')
    return res

player_stats = get_player_statistics(df)
display(report_mvp_player(player_stats))

def report_mvp_team(team_stats):
    metric_weights = {
        '平均分数': [15, False],
        '平均排名': [10, True],
        '排名方差': [5, True],
        '登顶率': [10, False],
        '前三率': [5, False],
        '出场次数': [0, False],
        '阵容选用': [0, False]
    }
    raw_ranks = []
    weights = []
    for metric, value in metric_weights.items():
        weight, ascending = value
        metric_rank = team_stats[metric].rank(ascending=ascending, method='min').astype(int)
        raw_ranks.append(metric_rank)
        weights.append(weight)
    team_rank = sum([r * w for r, w in zip(raw_ranks, weights)]).sort_values()
    mvp_team = team_rank.index[0]
    # print('本期MVP选手:', mvp_team)
    mvp_team_metric_ranks = pd.Series({r.name: r[mvp_team] for r in raw_ranks})
    mvp_team_metric_values = team_stats.loc[mvp_team][mvp_team_metric_ranks.index].T
    res = pd.concat([mvp_team_metric_values, mvp_team_metric_ranks], axis=1)
    res.columns = [mvp_team, '全队伍对比排名']
    res['全队伍对比排名'] = (res['全队伍对比排名'] / weights * weights).fillna('/')
    return res

team_stats = get_team_statistics(df)
display(report_mvp_team(team_stats))

def report_interesting_data(df):
    b_df = df.copy()
    b_df['选手'] = b_df['选手'].astype(str)
    b_df['俱乐部'] = b_df['俱乐部'].astype(str)
    b_df['选手'] = b_df[['俱乐部', '选手']].agg('.'.join, axis=1)
    b_df['局数'] = b_df.apply(lambda x: f'第{x["周数"]}周{x.loc["分组"]}第{x.loc["局数"]}局', axis=1)
    # 最长游戏回合
    longest_game = b_df.loc[b_df['回合数'] == b_df['回合数'].max()][['选手', '详细阵容', '回合数', '局数', '排名']]
    print(f'最长游戏:\n{longest_game}\n')
    # 最短游戏回合
    first_twos = b_df.loc[b_df['排名'] <= 2]
    shortest_game = first_twos.loc[first_twos['回合数'] == first_twos['回合数'].min()][['选手', '详细阵容', '回合数', '局数', '排名']]
    print(f'最短游戏:\n{shortest_game}\n')
    print(b_df.iloc[b_df['质量'].idxmax()])
    highest_price = b_df.iloc[b_df['质量'].idxmax()][['选手', '详细阵容', '质量', '局数', '排名']]
    print(f'最高质量:\n{highest_price}\n')
    lowest_price = b_df.iloc[b_df['质量'].idxmin()][['选手', '详细阵容', '质量', '局数', '排名']]
    print(f'最低质量:\n{lowest_price}\n')
    highest_damage = b_df.iloc[b_df['输出'].idxmax()][['选手', '详细阵容', '输出', '局数', '排名']]
    print(f'最高输出:\n{highest_damage}\n')
    lowest_damage = b_df.iloc[b_df['输出'].idxmin()][['选手', '详细阵容', '输出', '局数', '排名']]
    print(f'最低输出:\n{lowest_damage}\n')

report_interesting_data(df)

Unnamed: 0,七射,全阵容对比排名
平均分数,5.83333,2
平均排名,4.04167,2
排名方差,6.30254,7
出场次数,24.0,4
登顶率,0.25,1
前三率,0.5,2


Unnamed: 0,情书,全选手对比排名
平均分数,10,1
平均排名,1.66667,1
排名方差,0.333333,1
登顶率,0.333333,6
前三率,1,1
出场次数,3,/
阵容选用,"[(扶桑刺, 3)]",/


Unnamed: 0,KSSC,全队伍对比排名
平均分数,7.8,1
平均排名,2.9,1
排名方差,4.98889,12
登顶率,0.3,1
前三率,0.7,1
出场次数,10,/
阵容选用,"[(扶桑刺, 4), (攻辅射, 4), (魔坦刺, 1), (弟弟蜀, 1)]",/


最长游戏:
                 选手   详细阵容  回合数         局数  排名
56        XROCK.水长东   扶桑法刺   36  第5周中分组第3局   1
57         WB.TS.刺儿  封神攻辅射   36  第5周中分组第3局   2
112           XQ.奕将    弟弟射   36  第5周高分组第5局   1
113  重庆QGScholar.鸵鸟    稷下战   36  第5周高分组第5局   2

最短游戏:
                选手 详细阵容  回合数         局数  排名
88  重庆QGScholar.大牛  弟弟蜀   24  第5周高分组第2局   1
89       KS.YTG.安然  稷下战   24  第5周高分组第2局   2

index               473
排名                    2
俱乐部         重庆QGScholar
选手       重庆QGScholar.鸵鸟
ban位                  稷
阵容                  扶桑刺
质量                  168
输出                  132
胜                    24
负                     9
周数                    6
分组                  高分组
局数            第6周高分组第5局
游戏版本               21-0
分数                    9
详细阵容               扶桑法刺
回合数                  33
Name: 233, dtype: object
最高质量:
选手      重庆QGScholar.鸵鸟
详细阵容              扶桑法刺
质量                 168
局数           第6周高分组第5局
排名                   2
Name: 233, dtype: object

最低质量:
选手          CW.九戒
详细阵容      魔种(有天

In [39]:
def plot_ban_to_comp(ban_stats, filepath=None):
    fig, axs = plt.subplots(1, len(ban_stats), figsize=(30, 4))
    for ax, ban in zip(axs, ban_stats.itertuples()):
        labels, ys = [m[0] for m in ban[-1]], [m[1] for m in ban[-1]]
        colors = [comp_color[re.sub(r'\([^)]*\)', '', c).strip()] for c in labels]
        explode = [0.015] * len(ys)
        def value(val):
            return f'{val:.2f}%'
        ax.pie(ys, labels=labels, colors=colors, explode=explode, autopct=value, shadow=False, pctdistance=0.6)
        ax.set_title(f'{ban[0]} (均分{ban[1]:.2f},出场{ban[4]})', fontsize=15, x=0.5, y=1.01)

    fig.tight_layout()
    if filepath:
        fig.savefig(filepath, dpi=250)
    else:
        plt.show(fig)
    plt.close(fig)

def plot_comp_data(comp_stats, filepath=None):
    fig, ax = plt.subplots(figsize=(6, 4))
    y = comp_stats['平均排名']
    x = comp_stats['出场次数']
    names = comp_stats.index
    ax.scatter(x, y)
    ax.set_title('阵容数据')
    ax.set_ylabel('平均排名')
    ax.set_xlabel('出场次数')
    ax.set_xlim(left=0, right=max(x) + 1)
    ax.set_ylim(bottom=1, top=8)
    ax.hlines(y=4.5, xmin=0, xmax=max(x) + 1, colors='orange', linestyles='--', lw=2, label='理论均值')
    ax.legend(loc="lower right")
    for i, txt in enumerate(names):
        ax.annotate(txt, (x[i] + 0.25, y[i]))
    fig.gca().invert_yaxis()
    if filepath:
        fig.savefig(filepath, dpi=250)
    else:
        plt.show(fig)
    plt.close(fig)

comp_stats = get_comp_statistics(df)
plot_comp_data(comp_stats, "data/comp_data.png")
# plot_comp_data(comp_stats, None)
ban_stats = get_ban_statistics(df)
plot_ban_to_comp(ban_stats, "data/ban_to_comp.png")
# plot_ban_to_comp(ban_stats, None)

In [40]:
weibo_csv_path = 'weibo_crawler/weibo/王者模拟战职业大师赛/5464294919.csv'
weibo_df = pd.read_csv(weibo_csv_path)
weibo_df = weibo_df.loc[weibo_df['正文'].str.contains('决赛圈')]
display(weibo_df.head(1))
print(weibo_df.shape)



Unnamed: 0,id,bid,正文,头条文章url,原始图片url,视频url,位置,日期,工具,点赞数,评论数,转发数,话题,@用户
3,4558135123513004,JoomkeJTm,王者荣耀超话 KPL超话 #王者模拟战# #王者模拟战职业大师赛#·秋季赛·常规赛第四周...,,https://wx4.sinaimg.cn/large/005XNCJ1gy1gjj3p5...,,,2020-10-09,搜狗高速浏览器,0,0,0,"王者模拟战,王者模拟战职业大师赛",


(25, 14)
