In [None]:
import pandas as pd
import sqlite3
import re
import matplotlib.pyplot as plt

from datetime import datetime
from peewee import *


In [None]:
# Peewee database handle for the local SQLite file 'nhc-data.db'.
# The pragma asks SQLite to use write-ahead-log ('wal') journaling.
sqlite_db = SqliteDatabase('nhc-data.db', pragmas={'journal_mode': 'wal'})

class BaseModel(Model):
    """Base class for this notebook's peewee models; binds them to ``sqlite_db``."""
    class Meta:
        # Database used by every model that derives from BaseModel.
        database = sqlite_db

class YqtbDataModel(BaseModel):
    """Row of the ``yqtb_data`` table: case counts for one date, source, province and city.

    ``save_item`` looks rows up by (gmt_date, province, city, source) and then
    fills in ``diagnosed_cnt`` and/or ``carrier_cnt``.
    """

    class Meta:
        db_table = 'yqtb_data'
    
    id = PrimaryKeyField()
    # NOTE(review): declared as an integer column, but the process_* functions in
    # this notebook store '%Y-%m-%d' strings here and print_graph calls
    # .split('-') on the value it reads back — confirm the intended column type.
    gmt_date = IntegerField()
    source = TextField()  # '境外' or '本土', as written by the process_* functions
    province = TextField()
    city = TextField()  # save_item falls back to the province name when an item has no 'city'
    diagnosed_cnt = IntegerField()  # filled from first-paragraph counts
    carrier_cnt = IntegerField()  # filled from fifth-paragraph counts; presumably asymptomatic carriers — TODO confirm

In [None]:
# Province-level region names that province_base_parse searches for in report
# text. Order matters: parsed items are returned in this order.
province_list = [
    '北京', '天津', '河北', '山西', '内蒙古',
    '辽宁', '吉林', '黑龙江',
    '上海', '江苏', '浙江', '安徽', '福建', '江西', '山东',
    '河南', '湖北', '湖南', '广东', '广西', '海南',
    '重庆', '四川', '贵州', '云南', '西藏',
    '陕西', '甘肃', '青海', '宁夏', '新疆',
    '台湾', '香港', '澳门',
]

In [None]:
def get_con():
    """Open and return a new ``sqlite3`` connection to ``nhc-data.db``."""
    return sqlite3.connect('nhc-data.db')

In [None]:
def province_base_parse(core_str, provinces=None):
    """Extract per-province counts written as ``<province><digits>例``.

    Args:
        core_str: Text to scan, e.g. ``'上海3例，广东12例'``.
        provinces: Optional iterable of province names to look for. Defaults
            to the module-level ``province_list``.

    Returns:
        A list of ``{'province': str, 'value': int}`` dicts, one per province
        found. Items follow the order of ``provinces`` (not the order in the
        text), and only the first occurrence of each province is used.
    """
    if provinces is None:
        provinces = province_list

    item_list = []
    for province in provinces:
        # Raw string: '\d' inside a plain literal is an invalid escape sequence.
        # The name is escaped so caller-supplied names cannot alter the pattern.
        pm = re.search(re.escape(province) + r'(\d+)例', core_str)
        if pm is None:
            continue
        item_list.append({
            'province': province,
            'value': int(pm.group(1))
        })
    return item_list

In [None]:
def comma_base_parse(source, core_str):
    """Parse a '，'-separated list of ``<province><digits>...`` entries.

    Args:
        source: Label copied into every item's ``'source'`` key.
        core_str: Text such as ``'上海12例，北京3例'``.

    Returns:
        A list of dicts with keys ``'source'``, ``'province'`` (the text before
        the first run of digits) and ``'diagnosed'`` (that run of digits as an
        ``int``), in the order the entries appear.

    Raises:
        ValueError: If an entry contains no digits (this includes an empty
            ``core_str``).
    """
    item_list = []
    for word in core_str.split('，'):
        # The prefix is non-greedy so the whole count lands in group 2; a greedy
        # '.*' would swallow the leading digits into the province name.
        pm = re.search(r'^(.*?)(\d+)', word)
        if pm is None:
            raise ValueError('section: {} word: {}'.format(core_str, word))
        item_list.append({
            'source': source,
            'province': pm.group(1),
            'diagnosed': int(pm.group(2)),
        })
    return item_list

In [None]:
def semicolon_base_parse(core_str):
    """Parse a '；'-separated list whose entries start with ``<province><digits>``.

    Within each entry only the text before the first '，' is considered, so
    any breakdown that follows the leading count is ignored.

    Args:
        core_str: Text such as ``'上海5例，其中浦东3例；北京2例'``.

    Returns:
        A list of ``{'province': str, 'value': int}`` dicts in entry order.
        Blank entries (e.g. from a trailing '；' or an empty ``core_str``) are
        skipped.

    Raises:
        ValueError: If a non-blank entry has no digits before its first '，'.
    """
    item_list = []
    for segment in core_str.split('；'):
        head = segment.split('，')[0]
        if not head.strip():
            # Nothing to parse; previously this crashed on None.group().
            continue
        digits = re.search(r'\d+', head)
        if digits is None:
            raise ValueError('section: {} word: {}'.format(core_str, head))
        item_list.append({
            # Province is whatever precedes the first run of digits.
            'province': head[:digits.start()],
            'value': int(digits.group())
        })
    return item_list

In [None]:
def save_item(item):
    """Insert or update the ``YqtbDataModel`` row described by ``item``.

    The row is looked up by (gmt_date, province, city, source); ``city``
    defaults to the province when the item has no ``'city'`` key. A missing row
    is created with those four values. ``diagnosed_cnt`` and ``carrier_cnt``
    are written only when the item carries a non-None value for them, so an
    existing count is never cleared.

    Args:
        item: Dict with ``'gmt_date'``, ``'province'``, ``'source'`` and
            optionally ``'city'``, ``'diagnosed_cnt'``, ``'carrier_cnt'``.
    """
    province = item.get('province')
    identity = {
        'gmt_date': item.get('gmt_date'),
        'province': province,
        'city': item.get('city', province),
        'source': item.get('source'),
    }

    record = YqtbDataModel.get_or_none(
        YqtbDataModel.gmt_date == identity['gmt_date'],
        YqtbDataModel.province == identity['province'],
        YqtbDataModel.city == identity['city'],
        YqtbDataModel.source == identity['source'],
    )
    if record is None:
        record = YqtbDataModel()
        for field_name, field_value in identity.items():
            setattr(record, field_name, field_value)

    for count_field in ('diagnosed_cnt', 'carrier_cnt'):
        count = item.get(count_field)
        if count is not None:
            setattr(record, count_field, count)
    record.save()


In [None]:
def process_row(row):
    """Parse one scraped report row and store the counts it contains.

    The report date is built from the month/day in the title plus the year
    taken from ``row['date_date']``. The first and fifth non-blank lines of
    the content are then handed to ``process_paragraph_1`` and
    ``process_paragraph_5``.

    Args:
        row: Mapping (e.g. a pandas row) with string fields ``'link_data'``
            (report title), ``'date_date'`` (hyphen-separated, year first) and
            ``'content'`` (report body with '\\n' line breaks).

    Raises:
        ValueError: If the title does not match the expected pattern, its date
            cannot be parsed, or the content has fewer than five non-blank
            lines.
    """
    title = row['link_data']
    match = re.search('截至(.*)24时新型冠状病毒肺炎疫情最新情况', title)
    if match is None:
        # Fail here with context instead of passing gmt_date=None downstream,
        # where it surfaced as an AttributeError on None.
        raise ValueError('unrecognised report title: {}'.format(title))

    # The title only carries month and day; the year comes from date_date.
    # NOTE(review): a report for 31 December published in January would get the
    # publication year — confirm whether the data contains such rows.
    year = row['date_date'].split('-')[0]
    gmt_date = datetime.strptime(year + match.group(1), '%Y%m月%d日')

    line_list = [line for line in row['content'].split('\n') if line.strip() != '']
    if len(line_list) < 5:
        raise ValueError(
            'expected at least 5 non-blank lines, got {} (title: {})'.format(len(line_list), title))

    # Only paragraphs 1 and 5 are consumed.
    process_paragraph_1(line_list[0], gmt_date)
    process_paragraph_5(line_list[4], gmt_date)

In [None]:

def process_paragraph_1(paragraph_1, gmt_date):
    """Split the first paragraph into its imported and local parts and process both.

    The imported part runs from '境外输入病例' up to '本土病例'; the local part is
    the text between '本土病例' and the last '。'. Both parts are extracted
    before either handler runs.

    Args:
        paragraph_1: Text of the report's first paragraph.
        gmt_date: Report date, passed through to the section handlers.
    """
    section_patterns = ('(境外输入病例.*)本土病例', '本土病例(.*)。')
    remote_text, local_text = [
        re.search(pattern, paragraph_1).group(1) for pattern in section_patterns
    ]
    process_section_remote(remote_text, gmt_date)
    process_section_local(local_text, gmt_date)

In [None]:
def process_paragraph_5(paragraph_5, gmt_date):
    """Store per-province ``carrier_cnt`` values found in the fifth paragraph.

    Collects every full-width parenthesised group made up only of CJK
    characters, digits and the punctuation '，、；|', and parses the second
    such group with ``province_base_parse``. Each parsed item is saved with
    source '本土'. Nothing is stored when fewer than two groups are found.

    Args:
        paragraph_5: Text of the report's fifth paragraph.
        gmt_date: ``datetime`` of the report; stored as 'YYYY-MM-DD'.
    """
    # Raw string: '\d' in a plain literal is an invalid escape sequence. The re
    # module itself understands the \uXXXX escapes, so the pattern is unchanged.
    match_list = re.findall(r'（([\u4e00-\u9fa5|，、；\d]+)）', paragraph_5)
    if len(match_list) < 2:
        return

    for item in province_base_parse(match_list[1]):
        item['source'] = '本土'
        item['gmt_date'] = gmt_date.strftime('%Y-%m-%d')
        item['carrier_cnt'] = item['value']
        save_item(item)

In [None]:
def process_section_remote(section_remote, gmt_date):
    core_str = re.search('.*（(.*)）.*无症状感染者转为确诊病例', section_remote).group(1)
    item_list = province_base_parse(core_str)
    for item in item_list:
        item['source'] = '境外'
        item['gmt_date'] = gmt_date.strftime('%Y-%m-%d')
        item['diagnosed_cnt'] = item['value']
        save_item(item)



In [None]:
def process_section_local(section_local, gmt_date):
    core_str = re.search('.*（(.*)）.*无症状感染者转为确诊病例', section_local).group(1)
    item_list = semicolon_base_parse(core_str)
    for item in item_list:
        item['gmt_date'] = gmt_date.strftime('%Y-%m-%d')
        item['source'] = '本土'
        item['diagnosed_cnt'] = item['value']
        save_item(item)


In [None]:
# Load the scraped reports and run every row through process_row, which
# parses the report text and saves the counts via save_item.
o_data = pd.read_csv('nhc-data.csv', encoding='utf-8')
for index, row in o_data.iterrows():
    process_row(row)
# To debug a single report, run these two lines instead of the loop above:
# first_row = o_data.iloc[0]
# process_row(first_row)

In [None]:
def print_graph():
    """Plot the stored local ('本土') counts for Shanghai, ordered by date.

    Shows three line charts in turn: ``diagnosed_cnt``, ``carrier_cnt`` and
    their sum. Rows are those with source '本土', province '上海' and city
    '上海'.
    """
    query = (
        YqtbDataModel.select()
        .where(
            YqtbDataModel.source == '本土',
            YqtbDataModel.province == '上海',
            YqtbDataModel.city == '上海',
        )
        .order_by(YqtbDataModel.gmt_date)
    )

    dates = []
    diagnosed = []
    carriers = []
    totals = []
    for entity in query:
        # Use the full stored date as the label: string x values are treated
        # as categories, so a day-of-month label would merge points from
        # different months onto the same x position.
        dates.append(str(entity.gmt_date))
        diagnosed.append(entity.diagnosed_cnt)
        carriers.append(entity.carrier_cnt)
        # save_item only fills the counts it is given, so either may be None;
        # count a missing value as 0 in the total instead of raising TypeError.
        totals.append((entity.diagnosed_cnt or 0) + (entity.carrier_cnt or 0))

    charts = (
        ('Shanghai (local): diagnosed_cnt', diagnosed),
        ('Shanghai (local): carrier_cnt', carriers),
        ('Shanghai (local): diagnosed_cnt + carrier_cnt', totals),
    )
    for title, series in charts:
        plt.plot(dates, series)
        plt.title(title)
        plt.xlabel('gmt_date')
        plt.xticks(rotation=90)
        plt.show()


print_graph()