In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import os

In [2]:
# info of patients with ibd
file_path = os.path.join(os.path.abspath('..'), 'data', 'patients_ibd.csv')

patients_ibd = pd.read_csv(file_path)
patients_ibd.head()

Unnamed: 0,subject_id,gender,anchor_age,anchor_year,anchor_year_group,dod
0,10098672,M,61,2140,2011 - 2013,
1,10303503,F,23,2144,2008 - 2010,
2,10312715,M,39,2176,2008 - 2010,
3,10318500,F,46,2194,2011 - 2013,
4,10410021,M,49,2135,2011 - 2013,


In [9]:
# 划分年龄段
def category_from_age(age: int) -> str:
    if age < 18:
        return '<18'
    elif age < 35:
        return '18-35'
    elif age < 55:
        return '36-55'
    else:
        return '>55'

patients_ibd['age_group'] = patients_ibd.anchor_age.apply(category_from_age)
patients_ibd.head(10)

Unnamed: 0,subject_id,gender,anchor_age,anchor_year,anchor_year_group,dod,age_group
0,10098672,M,61,2140,2011 - 2013,,>55
1,10303503,F,23,2144,2008 - 2010,,18-35
2,10312715,M,39,2176,2008 - 2010,,36-55
3,10318500,F,46,2194,2011 - 2013,,36-55
4,10410021,M,49,2135,2011 - 2013,,36-55
5,10456993,F,32,2186,2008 - 2010,,18-35
6,10509294,M,39,2131,2008 - 2010,,36-55
7,10578743,M,71,2165,2008 - 2010,2176-09-19,>55
8,10599735,M,60,2119,2011 - 2013,,>55
9,10645926,F,44,2189,2008 - 2010,,36-55


In [20]:
# 死亡人数 存活人数
live_patients = patients_ibd[~pd.notna(patients_ibd.dod)]
dead_patients = patients_ibd[pd.notna(patients_ibd.dod)]

print('Number of live patients:', live_patients.shape[0])
print('Number of dead patients:', dead_patients.shape[0])
mortality_rate = dead_patients.shape[0] / (live_patients.shape[0] + dead_patients.shape[0])
print('Mortality rate:', round(mortality_rate, 6))

Number of live patients: 2013
Number of dead patients: 367
Mortality rate: 0.154202


In [22]:
def describe_each_attr(df: pd.DataFrame, not_needed=['subject_id', 'anchor_age', 'anchor_year', 'dod', 'days_since_anchor']) -> None:
    for col_name in df.columns:
        if col_name in not_needed: continue
        print(df[col_name].value_counts())
        x = df[col_name].value_counts()
        print(x / x.sum() * 100)
        print()

describe_each_attr(patients_ibd)

gender
F    1324
M    1056
Name: count, dtype: int64
gender
F    55.630252
M    44.369748
Name: count, dtype: float64

anchor_year_group
2008 - 2010    1332
2011 - 2013     712
2014 - 2016     336
Name: count, dtype: int64
anchor_year_group
2008 - 2010    55.966387
2011 - 2013    29.915966
2014 - 2016    14.117647
Name: count, dtype: float64

age_group
>55      1045
36-55     767
18-35     568
Name: count, dtype: int64
age_group
>55      43.907563
36-55    32.226891
18-35    23.865546
Name: count, dtype: float64



In [14]:
# 添加死亡患者的存活天数 
days_since_anchor = pd.to_datetime(dead_patients.dod, format='%Y-%m-%d') - pd.to_datetime(dead_patients.anchor_year, format='%Y')
dead_patients.loc[:, 'days_since_anchor'] = days_since_anchor

In [23]:
describe_each_attr(dead_patients)

gender
F    212
M    155
Name: count, dtype: int64
gender
F    57.765668
M    42.234332
Name: count, dtype: float64

anchor_year_group
2008 - 2010    223
2011 - 2013    103
2014 - 2016     41
Name: count, dtype: int64
anchor_year_group
2008 - 2010    60.762943
2011 - 2013    28.065395
2014 - 2016    11.171662
Name: count, dtype: float64

age_group
>55      312
36-55     46
18-35      9
Name: count, dtype: int64
age_group
>55      85.013624
36-55    12.534060
18-35     2.452316
Name: count, dtype: float64



In [None]:
# Group by ... from days_since_anchor
def groupby_a_from_b(df, a, b):
    return df.loc[:, [a, b]].groupby(a).mean()

print(groupby_a_from_b(dead_patients, 'gender', 'days_since_anchor'))
print(groupby_a_from_b(dead_patients, 'anchor_year_group', 'days_since_anchor'))
print(groupby_a_from_b(dead_patients, 'anchor_age', 'days_since_anchor'))

In [None]:
groupby_age_from_days_since_anchor = groupby_a_from_b(dead_patients, 'anchor_age', 'days_since_anchor')
x = groupby_age_from_days_since_anchor.index
y = groupby_age_from_days_since_anchor.days_since_anchor.dt.days

plt.figure(figsize=(10, 5), dpi=300)
plt.bar(x, y, width=1)
plt.ylabel('mean days since anchor', fontsize=12)
plt.xlabel('anchor age', fontsize=12)

plt.rcParams['font.family'] = 'Times New Roman'
plt.rcParams['font.serif'] = [r'C:\Windows\Fonts\Times New Roman.ttf']
plt.show()

In [None]:
# Matrix
X = np.matrix([
    [1, 2, 3],
    [2, 3, 4]
]).T
print(f'X:\n{X}\n')

W = np.matrix([
    [1],
    [2]
])
print(f'W:\n{W}\n')

print(f'Y=XW:\n{X@W}')