In [2]:
import pandas as pd
import seaborn as sns
from tqdm import tqdm
from typing import Optional
import datetime

In [3]:
df = pd.read_csv('./result/character.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 168122 entries, 0 to 168121
Data columns (total 6 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   id           168122 non-null  int64 
 1   name         168121 non-null  object
 2   name_cn      168121 non-null  object
 3   other_names  110541 non-null  object
 4   birthday     25188 non-null   object
 5   gender       137594 non-null  object
dtypes: int64(1), object(5)
memory usage: 7.7+ MB


In [4]:
df = df[df['birthday'].notnull() & df['gender'].notnull()]
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 23509 entries, 0 to 167951
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   id           23509 non-null  int64 
 1   name         23509 non-null  object
 2   name_cn      23509 non-null  object
 3   other_names  20538 non-null  object
 4   birthday     23509 non-null  object
 5   gender       23509 non-null  object
dtypes: int64(1), object(5)
memory usage: 1.3+ MB


In [5]:
pd.set_option('display.max_rows',None)
df['gender'].value_counts()

gender
女                            13551
男                             9693
♀                               28
雄                               27
不明                              18
雌                               17
♂                               16
女性                              12
未知                              10
？                               10
无                                7
男（性自认）                           7
公                                6
男性                               5
男（α）                             5
男→女                              4
男（Ω）                             3
？？                               3
女（？）                             3
母                                3
女[漫画] / 男(变身前)→女(变身后)[动画]        3
女（女性人造人）                         2
女？                               2
男/女                              2
代永                               2
不详                               2
无性别                              2
女（雌？）                            2
不定           

In [7]:
def map_gender(gender:str):
    result = None
    match gender:
        case "男":
            result = "M"
        case "女":
            result = "F"
        case "雄" | "雌" | "♀" | "♂":
            # 有furry!
            result = None
        case _:
            result = None
    return result

df['gender'] = df['gender'].astype('str').map(map_gender)
df['gender'].value_counts()
print("info")
df.info()

info
<class 'pandas.core.frame.DataFrame'>
Index: 23509 entries, 0 to 167951
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   id           23509 non-null  int64 
 1   name         23509 non-null  object
 2   name_cn      23509 non-null  object
 3   other_names  20538 non-null  object
 4   birthday     23509 non-null  object
 5   gender       0 non-null      object
dtypes: int64(1), object(5)
memory usage: 1.3+ MB


In [8]:
df = df[df['birthday'].notnull() & df['gender'].notnull() & (df['gender']=='F')]
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 0 entries
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   id           0 non-null      int64 
 1   name         0 non-null      object
 2   name_cn      0 non-null      object
 3   other_names  0 non-null      object
 4   birthday     0 non-null      object
 5   gender       0 non-null      object
dtypes: int64(1), object(5)
memory usage: 0.0+ bytes


In [9]:
df['birthday'].value_counts()

Series([], Name: count, dtype: int64)

In [10]:
def birthday2year(birthday:str):
    result = None
    try:
        # yyyy-mm-dd
        result = datetime.datetime.strptime(birthday, '%Y-%m-%d').year
    except ValueError as e:
        try:
            # yyyy年mm月dd日
            result = datetime.datetime.strptime(birthday, '%Y年%m月%d日').year
        except ValueError as e:
            result = None
    return result

def birthday2month(birthday:str):
    result = None
    # 其他不处理了
    try:
        # yyyy-mm-dd
        result = datetime.datetime.strptime(birthday, '%Y-%m-%d').month
    except ValueError as e:
        try:
            # ????-mm-dd
            result = datetime.datetime.strptime(birthday, '%????-%m-%d').month
        except ValueError as e:
            try:
                # mm-dd
                result = datetime.datetime.strptime(birthday, '%m-%d').month
            except ValueError as e:
                try:
                    # yyyy年mm月dd日
                    result = datetime.datetime.strptime(birthday, '%Y年%m月%d日').month
                except ValueError as e:
                    try:
                        # ????年mm月dd日
                        result = datetime.datetime.strptime(birthday, '????年%m月%d日').month
                    except ValueError as e:
                        try:
                            # mm月dd日
                            result = datetime.datetime.strptime(birthday, '%m月%d日').month
                        except ValueError as e:
                            try:
                                # mm/dd
                                result = datetime.datetime.strptime(birthday, '%m/%d').month
                            except ValueError as e:
                                result = None
    return result

def birthday2day(birthday:str):
    result = None
    try:
        # yyyy-mm-dd
        result = datetime.datetime.strptime(birthday, '%Y-%m-%d').day
    except ValueError as e:
        try:
            # ????-mm-dd
            result = datetime.datetime.strptime(birthday, '%????-%m-%d').day
        except ValueError as e:
            try:
                # mm-dd
                result = datetime.datetime.strptime(birthday, '%m-%d').day
            except ValueError as e:
                try:
                    # yyyy年mm月dd日
                    result = datetime.datetime.strptime(birthday, '%Y年%m月%d日').day
                except ValueError as e:
                    try:
                        # ????年mm月dd日
                        result = datetime.datetime.strptime(birthday, '????年%m月%d日').day
                    except ValueError as e:
                        try:
                            # mm月dd日
                            result = datetime.datetime.strptime(birthday, '%m月%d日').day
                        except ValueError as e:
                            try:
                                # mm/dd
                                result = datetime.datetime.strptime(birthday, '%m/%d').day
                            except ValueError as e:
                                result = None
    return result

In [11]:
df['year'] = df['birthday'].map(birthday2year).astype('Int64')
df['month'] = df['birthday'].map(birthday2month).astype('Int64')
df['day'] = df['birthday'].map(birthday2day).astype('Int64')
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 0 entries
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   id           0 non-null      int64 
 1   name         0 non-null      object
 2   name_cn      0 non-null      object
 3   other_names  0 non-null      object
 4   birthday     0 non-null      object
 5   gender       0 non-null      object
 6   year         0 non-null      Int64 
 7   month        0 non-null      Int64 
 8   day          0 non-null      Int64 
dtypes: Int64(3), int64(1), object(5)
memory usage: 0.0+ bytes


In [12]:
df = df[df['month'].notnull() & df['day'].notnull()]
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 0 entries
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   id           0 non-null      int64 
 1   name         0 non-null      object
 2   name_cn      0 non-null      object
 3   other_names  0 non-null      object
 4   birthday     0 non-null      object
 5   gender       0 non-null      object
 6   year         0 non-null      Int64 
 7   month        0 non-null      Int64 
 8   day          0 non-null      Int64 
dtypes: Int64(3), int64(1), object(5)
memory usage: 0.0+ bytes


In [13]:
df.to_csv('./result/character_cleaned.csv', index=False)   