In [None]:
# -*- coding: utf-8 -*-
"""
2025高教社杯数学建模竞赛 C题 代码实现
作者：Qwen
日期：2025年9月6日
"""

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

# 设置中文字体（可选）
import matplotlib

fm = matplotlib.font_manager.fontManager
fm.addfont("./仿宋_GB2312.TTF")
fm.addfont("./times.ttf")
print(fm)
# 设置中文字体和负号正常显示
plt.rcParams["font.sans-serif"] = ["FangSong_GB2312", "times"]
plt.rcParams["axes.unicode_minus"] = False

# ========================
# 1. 读取并合并数据
# ========================

# 读取男胎数据（多个sheet或重复表头）
male_files = ['附件 - 男胎检测数据.xlsx'] * 3  # 实际上文件被分成了多个sheet-like块
male_dfs = []
for file in male_files:
    df = pd.read_excel(file, engine='openpyxl', header=None)
    # 找到真正的表头行
    header_row = df[df.eq('序号').any(axis=1)].index[0]
    df = pd.read_excel(file, engine='openpyxl', header=header_row)
    male_dfs.append(df)

# 合并所有男胎数据
male_df = pd.concat(male_dfs, ignore_index=True)
male_df.drop_duplicates(subset=['序号'], keep='first', inplace=True)

# 读取女胎数据
female_files = ['附件 - 女胎检测数据.xlsx'] * 3
female_dfs = []
for file in female_files:
    df = pd.read_excel(file, engine='openpyxl', header=None)
    header_row = df[df.eq('序号').any(axis=1)].index[0]
    df = pd.read_excel(file, engine='openpyxl', header=header_row)
    female_dfs.append(df)

female_df = pd.concat(female_dfs, ignore_index=True)
female_df.drop_duplicates(subset=['序号'], keep='first', inplace=True)

print("男胎数据形状:", male_df.shape)
print("女胎数据形状:", female_df.shape)

# 添加性别标签
male_df['性别'] = '男胎'
female_df['性别'] = '女胎'

# 合并所有数据
data = pd.concat([male_df, female_df], ignore_index=True)

# ========================
# 2. 数据清洗与特征工程
# ========================

# 处理孕周列：如 "12w+3" → 12 + 3/7 ≈ 12.43
# def parse_gestational_week(week_str):
#     if pd.isna(week_str):
#         return np.nan
#     try:
#         if 'w+' in week_str:
#             w, d = week_str.split('w+')
#             return float(w) + float(d)/7
#         elif 'W+' in week_str:
#             w, d = week_str.split('W+')
#             return float(w) + float(d)/7
#         elif 'w' in week_str:
#             return float(week_str.replace('w', ''))
#         else:
#             return float(week_str)
#     except:
#         return np.nan

data['孕周_数值'] = data['检测孕周'].apply(lambda x:(int(x[::-1][1:][::-1])*7 if len(x.split('+'))==1 else int(x.split('+')[0][::-1][1:][::-1])*7+int(x.split('+')[1]))/7)

# # 处理末次月经和检测日期（统一为datetime）
# def parse_date(date_str):
#     if pd.isna(date_str):
#         return np.nan
#     try:
#         if isinstance(date_str, str):
#             return pd.to_datetime(date_str)
#         elif isinstance(date_str, (int, float)):
#             # Excel日期格式转换（从1900年开始）
#             return pd.to_datetime('1900-01-01') + pd.Timedelta(days=int(date_str)-2)
#     except:
#         return np.nan

data['末次月经_日期'] = data['末次月经'].apply(lambda x:pd.Timestamp(x))
data['检测日期_日期'] = data['检测日期'].apply(lambda x:pd.Timestamp(year=int(x/10000),month=int(int(x/100)%100),day=int(x%100)) if isinstance(x,int) else x)



# 计算实际孕周（天数差）
# data['实际孕周_天'] = (data['检测日期_日期'] - data['末次月经_日期']).dt.days
data['实际孕周_天'] = data[['末次月经_日期','检测日期_日期']].apply(lambda x:np.abs((x['检测日期_日期']-x['末次月经_日期']).days))

data['实际孕周_周'] = data['实际孕周_天'].apply(lambda x:x/7.0)

# 使用“检测孕周”为主，校验一致性
data['孕周'] = data['孕周_数值'].fillna(data['实际孕周_周'])

# 清理 BMI
data['BMI'] = pd.to_numeric(data['孕妇BMI'], errors='coerce')

# 提取男胎数据（用于问题1-3）
male_data = data[data['性别'] == '男胎'].copy()
male_data = male_data.dropna(subset=['Y染色体浓度', '孕周', 'BMI'])

# 转换 Y染色体浓度 为数值
male_data['Y染色体浓度'] = pd.to_numeric(male_data['Y染色体浓度'], errors='coerce')

# 过滤异常值
male_data = male_data[(male_data['Y染色体浓度'] >= 0) & (male_data['Y染色体浓度'] <= 0.2)]  # 浓度≤20%
male_data = male_data[(male_data['孕周'] >= 10) & (male_data['孕周'] <= 25)]
male_data = male_data[male_data['BMI'] >= 20]

print("清洗后男胎数据量:", len(male_data))

<matplotlib.font_manager.FontManager object at 0x78e4b273acf0>
男胎数据形状: (1082, 31)
女胎数据形状: (605, 31)


DateParseError: Unable to parse datetime string:  

In [7]:
male_data

Unnamed: 0,序号,孕妇代码,年龄,身高,体重,末次月经,IVF妊娠,检测日期,检测抽血次数,检测孕周,...,唯一比对的读段数,Unnamed: 20,Unnamed: 21,孕周_数值,末次月经_日期,检测日期_日期,实际孕周_天,实际孕周_周,孕周,BMI
0,1,A001,31,160.0,72.00,2023-02-01 00:00:00,自然受孕,20230429,1,11w+6,...,,,,11.857143,NaT,2023-04-29,,,11.857143,28.125000
1,2,A001,31,160.0,73.00,2023-02-01 00:00:00,自然受孕,20230531,2,15w+6,...,,,,15.857143,NaT,2023-05-31,,,15.857143,28.515625
2,3,A001,31,160.0,73.00,2023-02-01 00:00:00,自然受孕,20230625,3,20w+1,...,,,,20.142857,NaT,2023-06-25,,,20.142857,28.515625
3,4,A001,31,160.0,74.00,2023-02-01 00:00:00,自然受孕,20230716,4,22w+6,...,,,,22.857143,NaT,2023-07-16,,,22.857143,28.906250
4,5,A002,32,149.0,74.00,2023-11-09 00:00:00,自然受孕,20240219,1,13w+6,...,,,,13.857143,NaT,2024-02-19,,,13.857143,33.331832
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1077,1078,A266,30,159.0,83.35,2022-12-29,自然受孕,2023-05-02 00:00:00,4,17w+5,...,,,,17.714286,2022-12-29,2023-05-02,124.0,17.714286,17.714286,32.969881
1078,1079,A267,28,155.0,73.76,2023-02-25,自然受孕,2023-05-17 00:00:00,1,11w+4,...,,,,11.571429,2023-02-25,2023-05-17,81.0,11.571429,11.571429,30.703133
1079,1080,A267,28,155.0,74.06,2023-02-25,自然受孕,2023-05-24 00:00:00,2,12w+4,...,,,,12.571429,2023-02-25,2023-05-24,88.0,12.571429,12.571429,30.825814
1080,1081,A267,28,155.0,74.74,2023-02-25,自然受孕,2023-05-31 00:00:00,3,13w+4,...,,,,13.571429,2023-02-25,2023-05-31,95.0,13.571429,13.571429,31.107551


In [10]:
# ========================
# 问题1：Y染色体浓度 ~ 孕周 + BMI
# ========================
from scipy import stats
import statsmodels.api as sm
import statsmodels.formula.api as smf

# 对数变换Y浓度（更符合正态）
male_data['log_Y'] = np.log1p(male_data['Y染色体浓度'] * 100)  # ×100避免太小

# 方法1：多元线性回归
model1 = smf.ols('log_Y ~ 孕周 + BMI + 孕周:BMI', data=male_data).fit()
print("\n=== 问题1：多元线性回归结果 ===")
print(model1.summary())

# 方法2：广义可加模型（GAM）——允许非线性
from pygam import LinearGAM, s, f

gam = LinearGAM(s('孕周') + s('BMI'), fit_intercept=True)
# gam = LinearGAM(s(0, name='孕周') + s(1, name='BMI'), fit_intercept=True)
X_gam = male_data[['孕周', 'BMI']].values
y_gam = male_data['log_Y'].values
gam.fit(X_gam, y_gam)

print("\n=== GAM 模型解释方差 (R²):", gam.score(X_gam, y_gam))

# 可视化
fig, axes = plt.subplots(1, 3, figsize=(18, 5))

# 散点图：Y浓度 vs 孕周，按BMI分色
sns.scatterplot(data=male_data, x='孕周', y='Y染色体浓度', hue='BMI', palette='viridis', ax=axes[0])
axes[0].set_title('Y染色体浓度 vs 孕周（按BMI着色）')
axes[0].set_ylabel('Y染色体浓度')

# 孕周与log_Y回归线
axes[1].scatter(male_data['孕周'], male_data['log_Y'], alpha=0.6)
axes[1].plot(male_data['孕周'], model1.fittedvalues, color='red', label='OLS拟合')
axes[1].set_xlabel('孕周')
axes[1].set_ylabel('log(Y浓度)')
axes[1].set_title('线性回归拟合')
axes[1].legend()

# GAM 预测曲面（简化为2D）
XX = gam.generate_X_grid()
axes[2].plot(XX[:, 0], gam.predict(XX), color='blue', label='GAM预测（孕周主效应）')
axes[2].scatter(male_data['孕周'], gam.predict(X_gam), alpha=0.4, color='gray')
axes[2].set_xlabel('孕周')
axes[2].set_ylabel('预测 log(Y浓度)')
axes[2].set_title('GAM 拟合曲线')
axes[2].legend()

plt.tight_layout()
plt.show()

# 相关性矩阵
corr_cols = ['Y染色体浓度', '孕周', 'BMI']
corr = male_data[corr_cols].corr()
sns.heatmap(corr, annot=True, cmap='coolwarm', center=0)
plt.title('Y浓度、孕周、BMI相关性热力图')
plt.show()


=== 问题1：多元线性回归结果 ===
                            OLS Regression Results                            
Dep. Variable:                  log_Y   R-squared:                       0.043
Model:                            OLS   Adj. R-squared:                  0.040
Method:                 Least Squares   F-statistic:                     15.86
Date:                Sat, 06 Sep 2025   Prob (F-statistic):           4.33e-10
Time:                        15:15:02   Log-Likelihood:                -503.05
No. Observations:                1064   AIC:                             1014.
Df Residuals:                    1060   BIC:                             1034.
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      2.9709      0.5

TypeError: '>=' not supported between instances of 'str' and 'int'