In [1]:
import datetime
import html
import json
import os
import re

import requests
import xlwt



In [2]:
# Global HTTP headers shared by the danmaku-history requests below.
# Starts empty; the __main__ cell stores the login cookie under the 'cookie' key.
headers = {}

In [3]:
def get_cid(bv, timeout=10):
    """Look up the cid of every part of a video from its BV id.

    A video may be split into several parts, so a list is returned with one
    entry per part.

    Args:
        bv: BV id of the video, e.g. ``'BV1fT4y1u7mi'``.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        A list of dicts, each with the keys ``'cid'`` and ``'part_name'``
        (taken from the ``cid`` and ``part`` fields of the response).

    Raises:
        requests.HTTPError: If the server answers with an HTTP error status.
        ValueError: If the response carries no ``data`` field, i.e. the
            lookup itself failed (unknown BV id, rejected request, ...).
    """
    cid_url = f'https://api.bilibili.com/x/player/pagelist?bvid={bv}'
    # Without a timeout a stalled connection would block the notebook forever.
    res = requests.get(cid_url, timeout=timeout)
    res.raise_for_status()
    res_dict = res.json()
    part_list = res_dict.get('data')
    if part_list is None:
        # Fail with the server's own explanation instead of an opaque KeyError.
        raise ValueError(
            f'pagelist lookup failed for {bv!r}: '
            f'code={res_dict.get("code")!r}, message={res_dict.get("message")!r}'
        )
    return [
        {'cid': part.get('cid'), 'part_name': part.get('part')}
        for part in part_list
    ]

In [4]:
# 获取日期列表
def _get_one_month_date_list(cid, month):
    date_list_url = f'https://api.bilibili.com/x/v2/dm/history/index?type=1&oid={cid}&month={month}'
    res = requests.get(date_list_url, headers=headers)
    res_dict = json.loads(res.text)
    date_list = res_dict.get('data')
    return date_list



In [5]:

def get_date_history(cid_data_list, max_empty_months=1):
    """Collect, for every video part, the dates that have danmaku history.

    Starting at the current month and walking backwards one month at a time,
    the index of each month is fetched and appended (newest date first) to
    the part's ``'date_list'``.

    Args:
        cid_data_list: List of dicts as returned by ``get_cid``; each must
            have a ``'cid'`` key. The dicts are updated in place.
        max_empty_months: Number of consecutive months without any entry
            after which the backwards scan stops. The default of 1 stops at
            the first empty month; a larger value lets the scan bridge gaps
            in the history. Must be at least 1.

    Returns:
        A list with the same dicts, each now guaranteed to carry a
        ``'date_list'`` key (an empty list when no history was found).
    """
    date_history_list = []
    for cid_item in cid_data_list:
        # Always create the key so later cells can iterate it even when
        # nothing was found for this part.
        date_list = cid_item.setdefault('date_list', [])
        today = datetime.date.today()
        year, month = today.year, today.month
        consecutive_empty = 0
        while consecutive_empty < max_empty_months:
            # Fetch the index for one month.
            one_month_date_list = _get_one_month_date_list(cid_item['cid'], f'{year}-{month:>02}')
            if one_month_date_list:
                consecutive_empty = 0
                # Reverse each month so the overall list runs newest to oldest.
                date_list.extend(reversed(one_month_date_list))
            else:
                consecutive_empty += 1
            # Step back to the previous month, rolling over the year in January.
            if month == 1:
                year, month = year - 1, 12
            else:
                month -= 1
        date_history_list.append(cid_item)
    return date_history_list




In [6]:
# 下载弹幕xml文件
def _get_dan_mu_xml(cid, date):
    dan_mu_url = f'https://api.bilibili.com/x/v2/dm/history?type=1&oid={cid}&date={date}'
    res = requests.get(dan_mu_url, headers=headers)
    dan_mu_xml = res.content.decode('utf8')
    return dan_mu_xml



In [7]:
# 解析提取弹幕文件

def _parse_dan_mu(dan_mu_xml):
    reg = re.compile('<d p="([\s\S]*?)">([\s\S]+?)</d>')
    find_result = reg.findall(dan_mu_xml)
    dan_mu_list = []
    for line in find_result:
        p, dan_mu = line
        time_stamp = int(p.split(',')[4])
        date_array = datetime.datetime.fromtimestamp(time_stamp)
        send_time = date_array.strftime('%Y-%m-%d %H:%M:%S')
        dan_mu_list.append((send_time, dan_mu))
    return dan_mu_list



In [8]:
def get_all_dan_mu(date_history_list, bv):
    """Download every day's danmaku for each video part and save it to disk.

    One UTF-8 text file is written per part into the current directory,
    named ``{bv}_{part_name}.txt`` (or ``{bv}.txt`` when the part has no
    name). Each line is ``send_time<;>text``. The first danmaku of every
    day is also printed as a progress indicator.

    Args:
        date_history_list: List of dicts as produced by ``get_date_history``;
            each needs a ``'cid'`` and may carry ``'part_name'`` and
            ``'date_list'``.
        bv: BV id of the video, used as the file name prefix.
    """
    for item in date_history_list:
        # A part may come without a name, so fall back to the bare BV id.
        part_name = item.get('part_name')
        filename = bv
        if part_name:
            # Part titles are free text; replace characters that are not
            # allowed in file names (a '/' would otherwise be read as a directory).
            safe_part_name = re.sub(r'[\\/:*?"<>|\r\n]', '_', part_name)
            filename = f'{bv}_{safe_part_name}'
        with open(f'{filename}.txt', 'w', encoding='utf8') as f:
            # Tolerate items for which no history dates were collected.
            for date in item.get('date_list') or []:
                dan_mu_xml = _get_dan_mu_xml(item['cid'], date)
                dan_mu_list = _parse_dan_mu(dan_mu_xml)
                if not dan_mu_list:
                    # Nothing parsed for this day; indexing [0] would raise.
                    continue
                # Print only the first danmaku of each day as a progress indicator.
                print(dan_mu_list[0])
                for dan_mu_item in dan_mu_list:
                    # '<;>' separates the send time from the danmaku text.
                    f.write('<;>'.join(dan_mu_item) + '\n')
                    



In [9]:
if __name__ == '__main__':
    bv = 'BV1fT4y1u7mi'
    # Danmaku history is only served to logged-in users, so the requests must
    # carry the browser's cookies. Log in to Bilibili in a browser, copy the
    # cookie string and export it as the BILIBILI_COOKIE environment variable.
    # The cookie is a login credential and must never be stored in the notebook;
    # the session that used to be hardcoded here should be treated as leaked
    # and logged out.
    cookie_str = os.environ.get('BILIBILI_COOKIE', '')
    if not cookie_str:
        raise RuntimeError(
            'Set the BILIBILI_COOKIE environment variable to your Bilibili login cookie string.'
        )
    headers['cookie'] = cookie_str
    # Resolve the BV id to cids; a video may consist of several parts.
    cid_data_list = get_cid(bv)
    # Collect every date that has danmaku history.
    date_history_list = get_date_history(cid_data_list)
    # Download and save the danmaku of each of those dates.
    get_all_dan_mu(date_history_list, bv)


('2020-06-27 23:29:30', '允公允能，日新月异')
('2020-06-26 23:42:54', '仙交牛逼！')
('2020-06-25 23:54:04', '南开南开 难以离开')
('2020-06-24 23:34:43', '北疆？我石河子大学欢迎各位梦想家。')
('2020-06-23 23:48:22', '欢迎去优秀的北理，冲呀！')
('2020-06-22 23:57:16', '西交冲啊！西交冲啊！')
('2020-06-21 22:52:03', '海纳百川取则行远')
('2020-06-20 23:40:42', '欢迎报考浙江大学软件学院')
('2020-06-19 23:58:14', '泪目')
('2020-06-18 23:27:14', '欢迎报考中国政法大学！！！')
('2020-06-17 23:44:32', '西交大等我')
('2020-06-16 23:50:17', '前程似锦！！！')
('2020-06-15 23:11:20', '欢迎报考中国政法大学！')
('2020-06-14 23:43:07', '清华我来啦')
('2020-06-13 23:47:19', '清华等我')
('2020-06-12 23:40:14', '勤勇忠信')
('2020-06-11 20:03:21', '这个声音好绝')
('2020-06-10 23:45:44', '山大没排面')
('2020-06-09 23:58:39', '财大nb 财大nb 财大nb')
('2020-06-08 23:59:20', '我北排面！')
('2020-06-07 23:59:54', '哈工大')
('2020-06-06 23:53:35', '南开我会去的')
('2020-06-05 23:01:43', '南开')
('2020-06-04 23:59:59', '大工大工虽然我不是大工的')
('2020-06-03 23:59:23', '欢迎报考 THU')
('2020-06-02 23:54:04', '2021二校门见！')
('2020-06-01 23:58:59', '南开nb')
('2020-05-31 23:59:47', '欢迎报考北京大学')
(