# 2.项目总数统计和预测

## 2.1 纯统计

In [None]:
import matplotlib.pyplot as plt

# Count participants per (Year, NOC); a country absent in a given year gets 0.
participation_by_year_country = athletes.groupby(['Year', 'NOC']).size().unstack(fill_value=0)

# Create the figure and axes once and hand the axes to pandas. Without `ax=`,
# DataFrame.plot() opens a figure of its own, and a figure created beforehand
# with plt.figure() is left behind as an empty extra figure.
fig, ax = plt.subplots(figsize=(10, 6))
participation_by_year_country.plot(kind='line', marker='o', ax=ax)

# Title, axis labels and tick labels, each with an explicit font size.
plt.title('Total Participation per Year by Country', fontsize=10)
plt.xlabel('Year', fontsize=9)
plt.ylabel('Number of Participants', fontsize=9)
plt.xticks(rotation=45, fontsize=8)
plt.yticks(fontsize=8)

# Legend is anchored outside the axes, to the right.
# NOTE(review): fontsize=1.3 makes the entries practically unreadable;
# presumably chosen so that one entry per NOC still fits - confirm.
plt.legend(title='Country (NOC)', fontsize=1.3, title_fontsize=10, bbox_to_anchor=(1.05, 1), loc='upper left')

plt.tight_layout()
plt.show()


## 2.2 数据清洗
- 去除最后三个年份（Year 的最后三行）参赛人数全为 0 的国家列
- 将东道主当届的数据点置为 NaN
- 将与上一届参赛人数相差超过阈值（100）的离群点置为 NaN
- 对 NaN 进行线性插值填充

In [None]:
# Count participants per (Year, NOC); a country absent in a given year gets 0.
participation_by_year_country = athletes.groupby(['Year', 'NOC']).size().unstack(fill_value=0)

# Drop the countries (columns) whose last three rows are all 0.
# astype(float) serves two purposes: it yields an independent frame, so the
# cell assignments below do not write into a selection of the original
# (no SettingWithCopyWarning), and it lets the integer counts hold NaN
# without relying on pandas' deprecated silent int -> float upcast.
has_recent_entries = participation_by_year_country.iloc[-3:].sum(axis=0) != 0
participation_by_year_country_q1_clean = (
    participation_by_year_country.loc[:, has_recent_entries].astype(float)
)

# Blank out every (Year, NOC) pair listed in `hosts` that exists in the table.
for year, noc in zip(hosts['Year'], hosts['NOC']):
    if year in participation_by_year_country_q1_clean.index and noc in participation_by_year_country_q1_clean.columns:
        participation_by_year_country_q1_clean.at[year, noc] = float('nan')

# Largest absolute change allowed between two consecutive rows.
threshold = 100

# Flag outliers column by column. The scan is deliberately sequential:
# once a value has been blanked, the following row is compared against NaN,
# the comparison is False, and that following row is kept. The same applies
# to the row right after a blanked host entry.
for country in participation_by_year_country_q1_clean.columns:
    counts = participation_by_year_country_q1_clean[country].to_numpy(copy=True)
    for pos in range(1, len(counts)):
        if abs(counts[pos] - counts[pos - 1]) > threshold:
            counts[pos] = float('nan')
    participation_by_year_country_q1_clean[country] = counts

# Fill the NaN markers by linear interpolation along the year axis.
# NOTE(review): with the default limit_direction, NaNs located before a
# column's first valid value are not filled - confirm that downstream cells
# tolerate leading NaNs.
participation_by_year_country_q1_clean = participation_by_year_country_q1_clean.interpolate(method='linear', axis=0)



In [None]:
# Line chart of the cleaned table, one line per country.
# Create the figure and axes once and hand the axes to pandas. Without `ax=`,
# DataFrame.plot() opens a figure of its own, and a figure created beforehand
# with plt.figure() is left behind as an empty extra figure.
fig, ax = plt.subplots(figsize=(10, 6))
participation_by_year_country_q1_clean.plot(kind='line', marker='o', ax=ax)

# Title, axis labels and tick labels, each with an explicit font size.
plt.title('Total Participation per Year by Country', fontsize=10)
plt.xlabel('Year', fontsize=9)
plt.ylabel('Number of Participants', fontsize=9)
plt.xticks(rotation=45, fontsize=8)
plt.yticks(fontsize=8)

# Legend is anchored outside the axes, to the right.
# NOTE(review): fontsize=1.3 makes the entries practically unreadable;
# presumably chosen so that one entry per NOC still fits - confirm.
plt.legend(title='Country (NOC)', fontsize=1.3, title_fontsize=10, bbox_to_anchor=(1.05, 1), loc='upper left')

plt.tight_layout()
plt.show()

In [None]:
# Cleaned participation curve for a single country (USA).
plt.figure(figsize=(10, 6))
usa_axes = participation_by_year_country_q1_clean['USA'].plot(kind='line', marker='o', figsize=(10, 6))

# Decorate the axes the series was drawn on.
usa_axes.set_title('Total Participation per Year by Country')
usa_axes.set_xlabel('Year')
usa_axes.set_ylabel('Number of Participants')
plt.xticks(rotation=45)

# Legend is anchored outside the axes, to the right.
usa_axes.legend(title='Country (NOC)', bbox_to_anchor=(1.05, 1), loc='upper left')

plt.tight_layout()
plt.show()

## 2.3 非线性回归
- 采用加权多项式回归（默认二次）
- 样本权重随年份线性递增，更关注末尾年份数据的信息

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression

# Weighted polynomial regression of the cleaned USA series on the year.
usa_counts = participation_by_year_country_q1_clean['USA']

# Regressor: the year index as a single-column 2-D array; target: the counts.
years = participation_by_year_country_q1_clean.index.values.reshape(-1, 1)
participants = usa_counts.values

# Sample weights rise linearly from 1 (first row) to 10 (last row), so the
# most recent rows pull hardest on the fit.
weights = np.linspace(1, 10, len(years))

# Degree of the fitted polynomial and the matching feature expansion.
degree = 2
poly = PolynomialFeatures(degree=degree)
years_poly = poly.fit_transform(years)

# Weighted least-squares fit on the expanded features.
model = LinearRegression()
model.fit(years_poly, participants, sample_weight=weights)

# Fitted values on the observed years.
predicted_participants = model.predict(years_poly)

# Extrapolate to the five calendar years following the last observed one.
last_year = years[-1][0]
horizon = range(last_year + 1, last_year + 6)
future_years = np.array(horizon).reshape(-1, 1)
future_years_poly = poly.transform(future_years)
future_predicted_participants = model.predict(future_years_poly)

# One figure: observed series, fitted curve, extrapolated points.
plt.figure(figsize=(10, 6))
usa_counts.plot(kind='line', marker='o', figsize=(10, 6))
plt.plot(
    years,
    predicted_participants,
    color='red',
    linestyle='--',
    label=f'Polynomial Regression (Degree {degree})',
)
plt.plot(
    future_years,
    future_predicted_participants,
    color='green',
    marker='x',
    linestyle='-',
    label='Predictions',
)

# Title, axis labels and a legend anchored outside the axes.
plt.title('Total Participation per Year by Country (with Weighted Polynomial Regression)')
plt.xlabel('Year')
plt.ylabel('Number of Participants')
plt.xticks(rotation=45)
plt.legend(title='Country (NOC)', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()
plt.show()


## 2.4 马尔可夫链模型（状态由参赛人数直接离散化得到，不含隐状态）

In [None]:
# Time-weighted Markov-chain forecast for a single country (USA), based on
# the cleaned participation table.
country_table = participation_by_year_country_q1_clean['USA']

years = participation_by_year_country_q1_clean.index.values
participants = country_table.values

# Step 1: discretise participation into states.
num_states = 5

# Bin edges are evenly spaced percentiles of the last five observations only.
recent_years_participants = participants[-5:]
state_bins = np.percentile(recent_years_participants, np.linspace(0, 100, num_states + 1))

# digitize() yields 0 below the first edge and num_states + 1 at or above the
# last edge; after the shift by one, clip() folds both into the outer states.
states = np.clip(np.digitize(participants, state_bins) - 1, 0, num_states - 1)

# Step 2: time weights - later rows get weights closer to decay_factor ** 1.
decay_factor = 0.9
weights = np.array([decay_factor ** (len(years) - i) for i in range(len(years))])

# Weighted transition counts: the move states[i] -> states[i + 1] adds
# weights[i]. np.add.at accumulates correctly when the same (from, to) pair
# occurs more than once.
transition_matrix = np.zeros((num_states, num_states))
np.add.at(transition_matrix, (states[:-1], states[1:]), weights[:-1])

# Additive smoothing, then row-wise normalisation. Smoothing makes every row
# sum strictly positive, so no zero-row guard is needed, and a state that was
# never left ends up with a uniform row.
smooth_constant = 1e-4
transition_matrix += smooth_constant
transition_matrix = transition_matrix / transition_matrix.sum(axis=1, keepdims=True)

# Step 3: forecast from the state of the last observed row.
current_state = states[-1]

# Out-of-place division: indexing returns a view of the matrix row, which
# must not be modified.
future_state_probs = transition_matrix[current_state]
future_state_probs = future_state_probs / np.sum(future_state_probs)

# The next state is sampled, not taken as the most likely one.
# NOTE(review): no seed is set in this cell, so the forecast can differ
# between runs unless the generator is seeded elsewhere.
predicted_future_state = np.random.choice(range(num_states), p=future_state_probs)

# Map the state back to a count: midpoint of the state's bin.
predicted_participation = (state_bins[predicted_future_state] + state_bins[predicted_future_state + 1]) / 2

print(f"Predicted participation for next year: {predicted_participation}")

# Plot the observed series and the forecast point.
plt.figure(figsize=(10, 6))
country_table.plot(kind='line', marker='o', figsize=(10, 6))
plt.plot(years[-1] + 1, predicted_participation, 'go', label='Predicted Participation', markersize=10)

# Title, axis labels and a legend anchored outside the axes.
plt.title('Total Participation per Year by Country (with Markov Model Prediction)')
plt.xlabel('Year')
plt.ylabel('Number of Participants')
plt.xticks(rotation=45)
plt.legend(title='Country (NOC)', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()
plt.show()


In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Time-weighted Markov-chain forecast for the countries with the largest
# total participation, all drawn into one figure.
num_states = 5  # number of discrete participation states
decay_factor = 0.9  # time-weighting factor
smooth_constant = 1e-4  # additive smoothing for the transition matrix

# The 15 countries with the largest column sums in the cleaned table.
top_countries = participation_by_year_country_q1_clean.sum().sort_values(ascending=False).head(15).index

# The year axis and the time weights are the same for every country, so they
# are computed once. Later rows get weights closer to decay_factor ** 1.
years = participation_by_year_country_q1_clean.index.values
weights = np.array([decay_factor ** (len(years) - i) for i in range(len(years))])

# One shared figure for all countries.
plt.figure(figsize=(12, 8))

for index, country in enumerate(top_countries):
    country_table = participation_by_year_country_q1_clean[country]
    participants = country_table.values

    # Bin edges are evenly spaced percentiles of the last five observations.
    recent_years_participants = participants[-5:]
    state_bins = np.percentile(recent_years_participants, np.linspace(0, 100, num_states + 1))

    # digitize() yields 0 below the first edge and num_states + 1 at or above
    # the last edge; after the shift, clip() folds both into the outer states.
    states = np.clip(np.digitize(participants, state_bins) - 1, 0, num_states - 1)

    # Weighted transition counts: the move states[i] -> states[i + 1] adds
    # weights[i]. np.add.at accumulates correctly for repeated pairs.
    transition_matrix = np.zeros((num_states, num_states))
    np.add.at(transition_matrix, (states[:-1], states[1:]), weights[:-1])

    # Additive smoothing, then row-wise normalisation. Smoothing makes every
    # row sum strictly positive, so no zero-row guard is needed.
    transition_matrix += smooth_constant
    transition_matrix = transition_matrix / transition_matrix.sum(axis=1, keepdims=True)

    # Forecast from the state of the last observed row. Out-of-place
    # division: indexing returns a view of the matrix row.
    current_state = states[-1]
    future_state_probs = transition_matrix[current_state]
    future_state_probs = future_state_probs / np.sum(future_state_probs)

    # The next state is sampled, not taken as the most likely one.
    # NOTE(review): no seed is set in this cell, so forecasts can differ
    # between runs unless the generator is seeded elsewhere.
    predicted_future_state = np.random.choice(range(num_states), p=future_state_probs)

    # Map the state back to a count: midpoint of the state's bin.
    predicted_participation = (state_bins[predicted_future_state] + state_bins[predicted_future_state + 1]) / 2

    # One distinct hue per country. The colormap is called with a float in
    # [0, 1): an integer argument is read as a lookup-table index, and
    # indices past the end of the table all map to the same colour, which
    # made the last countries indistinguishable.
    line_color = plt.cm.hsv(index / len(top_countries))
    plt.plot(years, participants, marker='o', linestyle='-', markersize=6, color=line_color)

    # Forecast marker, drawn 4 units to the right of the last observed year.
    plt.plot(years[-1] + 4, predicted_participation, '*', label=f'{country} ', markersize=10, color=line_color)

# Title, axis labels and a legend anchored outside the axes.
plt.title('Total Participation per Year by Country (with Markov Model Prediction)')
plt.xlabel('Year')
plt.ylabel('Number of Participants')
plt.xticks(rotation=45)
plt.legend(title='Country (NOC)', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()
plt.show()


### 多图输出

In [None]:
# NOTE: this whole cell is commented out (disabled). It is the per-country
# variant of the Markov forecast: one figure per column of the cleaned table.
# # Step 1: define the state space, using recent data for the state bins
# num_states = 5  # split participation into 5 states
# decay_factor = 0.9  # time-weighting factor
# smooth_constant = 1e-4  # smoothing constant
# 
# # Loop over every country and forecast
# for country in participation_by_year_country_q1_clean.columns:
#     country_table = participation_by_year_country_q1_clean[country]
# 
#     # Years and participant counts for this country
#     years = participation_by_year_country_q1_clean.index.values
#     participants = country_table.values
# 
#     # Use the most recent data to compute the state bins
#     recent_years_participants = participants[-5:]  # last 5 rows only
#     state_bins = np.percentile(recent_years_participants, np.linspace(0, 100, num_states + 1))  # percentile-based edges
#     states = np.digitize(participants, state_bins) - 1  # map counts to state indices
# 
#     # Keep state indices within 0 .. num_states - 1
#     states = np.clip(states, 0, num_states - 1)
# 
#     # Time weighting: more recent rows get larger weights
#     weights = np.array([decay_factor ** (len(years) - i) for i in range(len(years))])
# 
#     # Weighted state-transition matrix
#     transition_matrix = np.zeros((num_states, num_states))
# 
#     for i in range(len(states) - 1):
#         current_state = states[i]
#         next_state = states[i + 1]
#         
#         # Weighted transition count
#         weight = weights[i]
#         transition_matrix[current_state, next_state] += weight
# 
#     # Convert transition counts to probabilities
#     row_sums = transition_matrix.sum(axis=1, keepdims=True)
#     row_sums[row_sums == 0] = 1  # set zero row sums to 1 to avoid division by zero
# 
#     # Smooth the transition matrix so that no probability is zero
#     transition_matrix += smooth_constant
#     transition_matrix = transition_matrix / transition_matrix.sum(axis=1, keepdims=True)  # renormalise
# 
#     # Step 3: forecast the future state
#     current_state = states[-1]  # current state = state of the last row
# 
#     # Probabilities of the next state
#     future_state_probs = transition_matrix[current_state]
# 
#     # Replace any NaN in the probabilities
#     future_state_probs = np.nan_to_num(future_state_probs, nan=1.0)  # NaN -> 1 so the probabilities stay valid
# 
#     # Make sure the probabilities sum to 1
#     future_state_probs /= np.sum(future_state_probs)
# 
#     # Sample the next state from the probabilities
#     predicted_future_state = np.random.choice(range(num_states), p=future_state_probs)
# 
#     # Map the predicted state back to a participation value (bin midpoint)
#     predicted_participation = (state_bins[predicted_future_state] + state_bins[predicted_future_state + 1]) / 2
# 
#     # Print the predicted participation
#     print(f"Predicted participation for next year ({country}): {predicted_participation}")
# 
#     # Visualise the data
#     plt.figure(figsize=(10, 6))
# 
#     # Plot the original data
#     country_table.plot(kind='line', marker='o', figsize=(10, 6))
# 
#     # Show the predicted participation
#     plt.plot(years[-1] + 4, predicted_participation, 'go', label=f'Predicted Participation ({country})', markersize=10)
# 
#     # Title and labels
#     plt.title(f'Total Participation per Year by Country ({country}) (with Markov Model Prediction)')
#     plt.xlabel('Year')
#     plt.ylabel('Number of Participants')
#     plt.xticks(rotation=45)
#     plt.legend(title='Country (NOC)', bbox_to_anchor=(1.05, 1), loc='upper left')
#     plt.tight_layout()
#     plt.show()
