In [13]:
import numpy as np
import pandas as pd
import matplotlib,re,math
import matplotlib.pyplot as plt
from matplotlib.pyplot import MultipleLocator


%matplotlib inline
plt.rcParams['font.sans-serif'] = ['SimHei'] 
plt.rcParams['axes.unicode_minus'] = False 

In [14]:
# Main workbooks (tables 1-4). The first column of tables 1 and 2 is renamed
# to "ID" so both can later be joined on the same key.
data1 = pd.read_excel('表1-患者列表及临床信息.xlsx')
data1 = data1.rename(columns={data1.columns[0]: "ID"})

data2 = pd.read_excel('表2-患者影像信息血肿及水肿的体积及位置.xlsx')
data2 = data2.rename(columns={data2.columns[0]: "ID"})

data3 = pd.read_excel('表3-患者影像信息血肿及水肿的形状及灰度分布.xlsx')
data4 = pd.read_excel('表4-答案文件.xlsx')

# Auxiliary workbooks: the serial-number-vs-time lookup table and the
# per-patient time points.
data_f_1 = pd.read_excel('附表1-检索表格-流水号vs时间.xlsx')
data_f_time = pd.read_excel('时间点.xlsx')

In [15]:
# Extract the columns needed for question 1a.
data1_columns = ['ID', '数据集划分', '入院首次影像检查流水号', '发病到首次影像检查时间间隔']
data1_1a = data1.loc[:, data1_columns]

# Make sure the first column of table 2 is called "ID", then keep it together
# with every hematoma-volume column and every follow-up column.
# (The '首次检查流水号' column is deliberately not selected.)
data2 = data2.rename(columns={data2.columns[0]: "ID"})
wanted_prefixes = ('HM_volume', '随访')
data2_columns = ['ID'] + [name for name in data2.columns if name.startswith(wanted_prefixes)]
data2_1a = data2.loc[:, data2_columns]

In [16]:
# Outer-join the clinical columns with the imaging columns on the patient ID
# and save the combined table.
_1a = data1_1a.merge(data2_1a, on='ID', how='outer')
# CSV alternative (disabled):
# _1a.to_csv('1a数据.csv', index=False, encoding='utf-8')
_1a.to_excel('1a数据.xlsx', index=False)

In [17]:
# Convert every column after the first one (the ID column) of data_f_time
# from date/time values to Unix timestamps in whole seconds.
#
# The conversion uses timedelta arithmetic instead of `astype(int) // 10**9`:
#   * `astype(int)` maps to a platform-dependent integer width and, depending
#     on the pandas version, either raises on missing dates (NaT) or silently
#     turns them into a huge negative sentinel;
#   * floor-dividing a timedelta by one second yields NaN for NaT directly.
# Negative timestamps are still replaced by NaN, as before.
#
# NOTE(review): this cell overwrites data_f_time in place, so running it twice
# re-interprets the already-converted seconds as datetimes. Run it once per
# fresh load of the workbook.
columns_to_convert = data_f_time.columns[1:]

_UNIX_EPOCH = pd.Timestamp('1970-01-01')
_ONE_SECOND = pd.Timedelta(seconds=1)


def _to_epoch_seconds(column):
    """Return `column` parsed as datetimes and expressed as seconds since 1970-01-01.

    Missing dates and timestamps before the epoch come back as NaN.
    """
    seconds = (pd.to_datetime(column) - _UNIX_EPOCH) // _ONE_SECOND
    return seconds.mask(seconds < 0)


data_f_time[columns_to_convert] = data_f_time[columns_to_convert].apply(_to_epoch_seconds)

In [18]:
# Column names shared by data_f_time and _1a.
common_columns = data_f_time.columns.intersection(_1a.columns)

# Replace the shared serial-number columns of _1a with the timestamp columns
# of data_f_time, matching rows on the 'ID' key.
# A plain `_1a[cols] = data_f_time[cols]` aligns on the row index instead, which
# silently attaches timestamps to the wrong patient whenever the two frames are
# not in the same row order (_1a is the result of an outer merge, which may
# reorder rows).
time_columns = common_columns.drop('ID', errors='ignore')
if not data_f_time['ID'].is_unique:
    raise ValueError("data_f_time contains duplicate IDs; cannot align timestamps by ID")
time_by_id = data_f_time.set_index('ID')
if not _1a['ID'].isin(time_by_id.index).any():
    raise ValueError("no ID of _1a was found in data_f_time; check that both use the same ID format")
# reindex() orders the timestamp rows like _1a (IDs without a match become NaN);
# to_numpy() makes the assignment positional so pandas does not re-align it.
_1a[time_columns] = time_by_id.reindex(_1a['ID'])[time_columns].to_numpy()

In [19]:
# Inspect the column names of data_f_time.
data_f_time.columns

Index(['ID', '入院首次影像检查流水号', '随访1流水号', '随访2流水号', '随访3流水号', '随访4流水号', '随访5流水号',
       '随访6流水号', '随访7流水号', '随访8流水号'],
      dtype='object')

In [20]:
# Inspect the column names of the merged 1a table.
_1a.columns

Index(['ID', '数据集划分', '入院首次影像检查流水号', '发病到首次影像检查时间间隔', 'HM_volume', '随访1流水号',
       'HM_volume.1', '随访2流水号', 'HM_volume.2', '随访3流水号', 'HM_volume.3',
       '随访4流水号', 'HM_volume.4', '随访5流水号', 'HM_volume.5', '随访6流水号',
       'HM_volume.6', '随访7流水号', 'HM_volume.7', '随访8流水号', 'HM_volume.8'],
      dtype='object')

In [21]:
# Prepare (time, volume) data for the scatter plot / curve fitting.
result_list = []
flow_cols = [col for col in _1a.columns if col.endswith('流水号')]
hm_cols = [col for col in _1a.columns if col.startswith('HM_')]
print(flow_cols, hm_cols)

# Re-express every follow-up timestamp as seconds elapsed since onset:
#   elapsed(follow-up 1) = onset-to-first-scan interval * 3600 + (t1 - t0)
#   elapsed(follow-up n) = elapsed(follow-up n-1) + (t_n - t_(n-1))
# The interval column is multiplied by 3600 to turn it into seconds
# (presumably it is recorded in hours).
#
# Missing timestamps are NaN and NaN propagates through the arithmetic, so a
# missing follow-up - and every later follow-up of that patient - stays NaN.
# This is what the earlier row-by-row loop effectively produced: its
# `j == float(np.nan)` test could never be True (NaN never compares equal to
# anything), so its "inherit the previous record" branch was dead code, and
# that branch would have raised IndexError / misaligned rows had it ever run.
#
# All elapsed values are computed from the raw timestamps first and written
# back afterwards, because each step needs the previous raw column.
onset_seconds = _1a['发病到首次影像检查时间间隔'] * 3600
cumulative_value_lists = []
elapsed = onset_seconds
for earlier_col, later_col in zip(flow_cols[:-1], flow_cols[1:]):
    elapsed = elapsed + (_1a[later_col] - _1a[earlier_col])
    cumulative_value_lists.append(elapsed.tolist())

# Write the elapsed times back, one assignment per follow-up column.
for follow_up_col, elapsed_values in zip(flow_cols[1:], cumulative_value_lists):
    _1a[follow_up_col] = np.array(elapsed_values)

# The first scan happens exactly the onset-to-first-scan interval after onset.
_1a['入院首次影像检查流水号'] = onset_seconds
# From here on every serial-number column of _1a holds seconds since onset.

# Overall scatter plot (disabled):
# # list of (elapsed seconds, volume)
# for i, j in zip(flow_cols, hm_cols):
#     for l, m in zip(_1a[i], _1a[j]):
#         if l <= 48 * 3600:  # keep only data within the first 48 hours
#             result_list.append((l, m))
#
# # drop pairs containing NaN before plotting
# result_list = [item for item in result_list if not any(math.isnan(value) for value in item)]
#
# x_values = [item[0] for item in result_list]
# y_values = [item[1] for item in result_list]
#
# plt.scatter(x_values, y_values)
# plt.title('1a')
# plt.xlabel('timestamp')
# plt.ylabel('HM_volume')
# plt.show()

# Interleave the columns as time, volume, time, volume, ...
selected_cols = [col for pair in zip(flow_cols, hm_cols) for col in pair]
selected_df = _1a[selected_cols]

['入院首次影像检查流水号', '随访1流水号', '随访2流水号', '随访3流水号', '随访4流水号', '随访5流水号', '随访6流水号', '随访7流水号', '随访8流水号'] ['HM_volume', 'HM_volume.1', 'HM_volume.2', 'HM_volume.3', 'HM_volume.4', 'HM_volume.5', 'HM_volume.6', 'HM_volume.7', 'HM_volume.8']


In [22]:
# Display the interleaved time / HM_volume table.
selected_df

Unnamed: 0,入院首次影像检查流水号,HM_volume,随访1流水号,HM_volume.1,随访2流水号,HM_volume.2,随访3流水号,HM_volume.3,随访4流水号,HM_volume.4,随访5流水号,HM_volume.5,随访6流水号,HM_volume.6,随访7流水号,HM_volume.7,随访8流水号,HM_volume.8
0,9000.0,69714,29766.0,74902.0,475590.0,70952.0,935055.0,62831.0,1531938.0,44029.0,,,,,,,,
1,10800.0,47500,53724.0,52271.0,249199.0,47748.0,1612859.0,13055.0,4029091.0,20.0,,,,,,,,
2,7200.0,86396,34281.0,106042.0,142545.0,103263.0,,,,,,,,,,,,
3,3600.0,45498,61107.0,39877.0,301843.0,16622.0,819004.0,8441.0,,,,,,,,,,
4,18000.0,14832,95283.0,24472.0,352595.0,25477.0,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
155,19800.0,32559,89575.0,35138.0,452217.0,34241.0,1055942.0,14377.0,,,,,,,,,,
156,1800.0,18150,4920.0,37650.0,25620.0,140688.0,72240.0,123926.0,,,,,,,,,,
157,10080.0,27969,66180.0,27071.0,256680.0,24119.0,1111140.0,3647.0,,,,,,,,,,
158,10800.0,53154,95520.0,126642.0,,,,,,,,,,,,,,


In [23]:
# Save the interleaved time / HM_volume table to an Excel workbook (without the row index).
selected_df.to_excel('1a数据_已替换时间戳_已矫正.xlsx', index=False)

In [36]:
from scipy import optimize
def trig_func(x, a, b, c):
    """Trigonometric model ``a*sin(x) + b*cos(x) + c``.

    Parameters
    ----------
    x : scalar or array-like
        Point(s) at which to evaluate the model.
    a, b, c : float
        Sine amplitude, cosine amplitude and constant offset.

    Returns
    -------
    Same shape as ``x`` (NumPy broadcasting applies).
    """
    sine_term = a * np.sin(x)
    cosine_term = b * np.cos(x)
    return sine_term + cosine_term + c


# Number of leading rows (patients) of selected_df to fit. The notebook only
# ever fitted the first row, so the default keeps the printed output unchanged;
# raise it to fit more patients.
N_ROWS_TO_FIT = 1
# The cubic polynomial has four coefficients (and the trigonometric model
# three), so rows with fewer valid points than this are skipped instead of
# producing an ill-conditioned fit or a curve_fit error.
MIN_POINTS_TO_FIT = 4

for index, row in selected_df.head(N_ROWS_TO_FIT).iterrows():
    # The row alternates time, volume, time, volume, ...: split it into the two
    # series first and only then drop incomplete pairs. Stripping NaNs from the
    # flattened row before de-interleaving would shift x against y whenever
    # only one half of a pair is missing.
    times = row.iloc[0::2].to_numpy(dtype=float)
    volumes = row.iloc[1::2].to_numpy(dtype=float)
    valid = ~(np.isnan(times) | np.isnan(volumes))

    x = times[valid] / (3600 * 24)  # seconds -> days
    y = volumes[valid] / 1000       # volume scaled down by a factor of 1000

    if len(x) < MIN_POINTS_TO_FIT:
        print(f'row {index}: only {len(x)} valid points, skipping fit')
        continue

    a1 = np.polyfit(x, y, 1)  # linear
    p1 = np.poly1d(a1)
    a2 = np.polyfit(x, y, 2)  # quadratic
    p2 = np.poly1d(a2)
    a3 = np.polyfit(x, y, 3)  # cubic
    p3 = np.poly1d(a3)
    # Fit the trigonometric model a*sin(x) + b*cos(x) + c.
    params, _ = optimize.curve_fit(trig_func, x, y)

    print(p1)
    print(p2)
    print(p3)
    print(params)

 
-1.523 x + 75
         2
-0.1092 x + 0.3411 x + 72.22
          3          2
0.002701 x - 0.1805 x + 0.7726 x + 72.04
[16.64098354 -5.30949225 74.71773271]


Collecting scipy
  Obtaining dependency information for scipy from https://files.pythonhosted.org/packages/0e/a0/8606a7eef659f3d5f79d9efb92eed3ed1243178f4ae962614e1b202935a6/scipy-1.11.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata
  Downloading scipy-1.11.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (59 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.1/59.1 kB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
Downloading scipy-1.11.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (36.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m36.3/36.3 MB[0m [31m49.0 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: scipy
Successfully installed scipy-1.11.2
[0m