# 2.项目总数统计和预测

## 2.1纯统计

In [None]:
import matplotlib.pyplot as plt

# Count athlete rows per (Year, NOC); pairs with no rows are filled with 0.
participation_by_year_country = athletes.groupby(['Year', 'NOC']).size().unstack(fill_value=0)

# Draw one line per NOC. DataFrame.plot() opens a figure of its own unless it
# is handed an Axes, so calling plt.figure() first would leave an empty extra
# figure behind; create the Axes once and pass it in instead.
fig, ax = plt.subplots(figsize=(10, 6))
participation_by_year_country.plot(kind='line', marker='o', ax=ax)

# Title, axis labels and tick labels use small fonts.
plt.title('Total Participation per Year by Country', fontsize=10)
plt.xlabel('Year', fontsize=9)
plt.ylabel('Number of Participants', fontsize=9)
plt.xticks(rotation=45, fontsize=8)
plt.yticks(fontsize=8)

# The legend is anchored outside the axes, to the right.
# NOTE(review): fontsize=1.3 is presumably meant to squeeze one entry per NOC
# into the figure - confirm it is intentional.
plt.legend(title='Country (NOC)', fontsize=1.3, title_fontsize=10, bbox_to_anchor=(1.05, 1), loc='upper left')

plt.tight_layout()
plt.show()


## 2.2 数据清洗
- 去除最后三届（Year 最后三行）参赛人数全为 0 的国家列
- 将东道主年份对应的数据点置为 NaN
- 将离群点置为 NaN：与上一届人数之差的绝对值超过阈值 100
- 对所有 NaN 做线性插值填充

In [None]:
# Count athlete rows per (Year, NOC); pairs with no rows are filled with 0.
participation_by_year_country = athletes.groupby(['Year', 'NOC']).size().unstack(fill_value=0)

# Drop every NOC whose last three rows sum to 0. The result is converted to
# float on an independent copy: the cells blanked below must hold NaN, and
# writing into a copy cannot touch participation_by_year_country or trigger
# chained-assignment warnings.
active_in_last_three = participation_by_year_country.iloc[-3:].sum(axis=0) != 0
participation_by_year_country_q1_clean = participation_by_year_country.loc[:, active_in_last_three].astype(float)

# Blank out each (Year, NOC) pair listed in hosts, when it exists in the table.
for year, noc in zip(hosts['Year'], hosts['NOC']):
    if year in participation_by_year_country_q1_clean.index and noc in participation_by_year_country_q1_clean.columns:
        participation_by_year_country_q1_clean.at[year, noc] = float('nan')

# Largest absolute change allowed between two consecutive rows of one NOC.
threshold = 100

# Walk each NOC's series in row order and blank every value that differs from
# the row before it by more than the threshold. The walk is sequential on
# purpose: a value is compared with the already-cleaned previous value, and a
# comparison against NaN is False, so the row after a blanked cell is kept.
for country in participation_by_year_country_q1_clean.columns:
    counts = participation_by_year_country_q1_clean[country].to_numpy(copy=True)
    for position in range(1, len(counts)):
        if abs(counts[position] - counts[position - 1]) > threshold:
            counts[position] = float('nan')
    participation_by_year_country_q1_clean[country] = counts

# Fill the blanked cells by linear interpolation down each column.
# limit_direction='both' also fills a NaN in the first row(s), which the
# default forward-only direction would leave in place.
participation_by_year_country_q1_clean = participation_by_year_country_q1_clean.interpolate(
    method='linear', axis=0, limit_direction='both'
)



In [None]:
# Line chart of the cleaned table, one line per NOC. DataFrame.plot() opens a
# figure of its own unless it is handed an Axes, so calling plt.figure() first
# would leave an empty extra figure behind; create the Axes once and pass it in.
fig, ax = plt.subplots(figsize=(10, 6))
participation_by_year_country_q1_clean.plot(kind='line', marker='o', ax=ax)

# Title, axis labels and tick labels use small fonts.
plt.title('Total Participation per Year by Country', fontsize=10)
plt.xlabel('Year', fontsize=9)
plt.ylabel('Number of Participants', fontsize=9)
plt.xticks(rotation=45, fontsize=8)
plt.yticks(fontsize=8)

# The legend is anchored outside the axes, to the right.
# NOTE(review): fontsize=1.3 is presumably meant to squeeze one entry per NOC
# into the figure - confirm it is intentional.
plt.legend(title='Country (NOC)', fontsize=1.3, title_fontsize=10, bbox_to_anchor=(1.05, 1), loc='upper left')

plt.tight_layout()
plt.show()

In [None]:
# Single-country view: the cleaned 'USA' column drawn as a marked line.
plt.figure(figsize=(10, 6))
participation_by_year_country_q1_clean['USA'].plot.line(marker='o', figsize=(10, 6))

# Axis text first, then the title.
plt.xlabel('Year')
plt.ylabel('Number of Participants')
plt.title('Total Participation per Year by Country')

# Slanted year ticks; legend anchored outside the axes on the right.
plt.legend(title='Country (NOC)', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.xticks(rotation=45)

plt.tight_layout()
plt.show()

## 2.3 非线性回归
- 采用二次多项式回归（对年份做多项式特征展开后用线性回归拟合）
- 通过随年份线性递增的样本权重，更关注末尾年份数据的信息

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression

# Weighted polynomial regression of the cleaned 'USA' series on the year.
# Inputs: the index of the cleaned table as a column vector, and the USA counts.
years = participation_by_year_country_q1_clean.index.to_numpy().reshape(-1, 1)
participants = participation_by_year_country_q1_clean['USA'].to_numpy()

# Sample weights rise linearly from 1 (first row) to 10 (last row), so later
# rows pull harder on the fit.
weights = np.linspace(1, 10, num=years.shape[0])

# Polynomial degree of the fitted curve.
degree = 2
poly = PolynomialFeatures(degree=degree)

# Expand the year column into polynomial features and fit with the weights.
years_poly = poly.fit_transform(years)
model = LinearRegression()
model.fit(years_poly, participants, sample_weight=weights)

# Fitted values on the observed years.
predicted_participants = model.predict(years_poly)

# Forecast for the five integer years that follow the last observed year.
last_year = years[-1, 0]
future_years = np.arange(last_year + 1, last_year + 6).reshape(-1, 1)
future_years_poly = poly.transform(future_years)
future_predicted_participants = model.predict(future_years_poly)

# One figure with three layers: observed series, fitted curve, forecast.
plt.figure(figsize=(10, 6))
participation_by_year_country_q1_clean['USA'].plot(kind='line', marker='o', figsize=(10, 6))
plt.plot(years, predicted_participants, color='red', linestyle='--', label=f'Polynomial Regression (Degree {degree})')
plt.plot(future_years, future_predicted_participants, color='green', marker='x', linestyle='-', label='Predictions')

# Labels, slanted ticks, legend outside the axes.
plt.xlabel('Year')
plt.ylabel('Number of Participants')
plt.title('Total Participation per Year by Country (with Weighted Polynomial Regression)')
plt.xticks(rotation=45)
plt.legend(title='Country (NOC)', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()
plt.show()


## 2.4 马尔可夫链模型（状态由参赛人数直接划分，可观测）

In [None]:
# Markov-chain forecast of the next participation level for one country.
# The states are bins of the participant count, so they are directly observed.
country_table = participation_by_year_country_q1_clean['USA']

years = participation_by_year_country_q1_clean.index.values
participants = country_table.values

# Step 1: discretise the counts into num_states states. The bin edges are the
# evenly spaced percentiles of the last five values only, so the states
# describe the recent range; older values outside it are clipped into the
# first or last state.
num_states = 5
recent_years_participants = participants[-5:]
state_bins = np.percentile(recent_years_participants, np.linspace(0, 100, num_states + 1))
states = np.clip(np.digitize(participants, state_bins) - 1, 0, num_states - 1)

# Step 2: time weights. Row i gets decay_factor ** (len(years) - i), so later
# rows weigh more.
decay_factor = 0.9
weights = decay_factor ** (len(years) - np.arange(len(years)))

# Weighted transition counts: each consecutive pair of states adds the weight
# of its first row (zip stops at the shorter states[:-1], dropping the last
# weight, which has no following state).
transition_matrix = np.zeros((num_states, num_states))
for current_state, next_state, weight in zip(states[:-1], states[1:], weights):
    transition_matrix[current_state, next_state] += weight

# Additive smoothing, then row-normalise into probabilities. Because every
# entry is positive after smoothing, no row sum can be zero and no NaN can
# appear; a state that was never left becomes a uniform row.
smooth_constant = 1e-4
transition_matrix += smooth_constant
transition_matrix = transition_matrix / transition_matrix.sum(axis=1, keepdims=True)

# Step 3: sample the next state from the row of the last observed state.
current_state = states[-1]
future_state_probs = transition_matrix[current_state] / transition_matrix[current_state].sum()

# NOTE(review): this is a single random draw; unless a seed was set earlier in
# the notebook, repeated runs can give different predictions.
predicted_future_state = np.random.choice(range(num_states), p=future_state_probs)

# Map the sampled state back to a count: the midpoint of its bin.
predicted_participation = (state_bins[predicted_future_state] + state_bins[predicted_future_state + 1]) / 2

print(f"Predicted participation for next year: {predicted_participation}")

# Plot the observed series and the predicted point one step after the last year.
plt.figure(figsize=(10, 6))
country_table.plot(kind='line', marker='o', figsize=(10, 6))
plt.plot(years[-1] + 1, predicted_participation, 'go', label='Predicted Participation', markersize=10)

plt.title('Total Participation per Year by Country (with Markov Model Prediction)')
plt.xlabel('Year')
plt.ylabel('Number of Participants')
plt.xticks(rotation=45)
plt.legend(title='Country (NOC)', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()
plt.show()


In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Markov-chain forecast for the NOCs with the largest total participation.
num_states = 5  # number of count bins used as states
decay_factor = 0.9  # per-row decay of the transition weights
smooth_constant = 1e-4  # additive smoothing applied to the transition counts

# The 15 NOCs with the largest column sums in the cleaned table.
top_countries = participation_by_year_country_q1_clean.sum().sort_values(ascending=False).head(15).index

# Years and time weights are the same for every country, so build them once.
# Row i gets decay_factor ** (len(years) - i): later rows weigh more.
years = participation_by_year_country_q1_clean.index.values
weights = decay_factor ** (len(years) - np.arange(len(years)))

# One shared figure for all countries.
plt.figure(figsize=(12, 8))

for index, country in enumerate(top_countries):
    country_table = participation_by_year_country_q1_clean[country]
    participants = country_table.values

    # Bin edges are the evenly spaced percentiles of the last five values;
    # values outside that range are clipped into the first or last state.
    recent_years_participants = participants[-5:]
    state_bins = np.percentile(recent_years_participants, np.linspace(0, 100, num_states + 1))
    states = np.clip(np.digitize(participants, state_bins) - 1, 0, num_states - 1)

    # Weighted transition counts between consecutive states (zip stops at the
    # shorter states[:-1], dropping the last weight).
    transition_matrix = np.zeros((num_states, num_states))
    for current_state, next_state, weight in zip(states[:-1], states[1:], weights):
        transition_matrix[current_state, next_state] += weight

    # Smooth and row-normalise. Every entry is positive after smoothing, so
    # no row sum is zero and no NaN can appear.
    transition_matrix += smooth_constant
    transition_matrix = transition_matrix / transition_matrix.sum(axis=1, keepdims=True)

    # Sample the next state from the row of the last observed state.
    # NOTE(review): random draw; unseeded unless a seed was set earlier.
    current_state = states[-1]
    future_state_probs = transition_matrix[current_state] / transition_matrix[current_state].sum()
    predicted_future_state = np.random.choice(range(num_states), p=future_state_probs)

    # Map the sampled state back to a count: the midpoint of its bin.
    predicted_participation = (state_bins[predicted_future_state] + state_bins[predicted_future_state + 1]) / 2

    # One distinct hue per country. The colormap is called with a fraction in
    # [0, 1): integer arguments are read as indices into its 256-entry table,
    # so index * 20 ran past the end for the last countries and gave them the
    # same colour as the first one.
    line_color = plt.cm.hsv(index / len(top_countries))
    plt.plot(years, participants, marker='o', linestyle='-', markersize=6, color=line_color)

    # Predicted point, drawn 4 units after the last year on the x axis.
    plt.plot(years[-1] + 4, predicted_participation, '*', label=f'{country} ', markersize=10, color=line_color)

plt.title('Total Participation per Year by Country (with Markov Model Prediction)')
plt.xlabel('Year')
plt.ylabel('Number of Participants')
plt.xticks(rotation=45)
plt.legend(title='Country (NOC)', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()
plt.show()


### 多图输出

In [None]:
# # 步骤一：定义状态空间，使用最近数据进行状态划分
# num_states = 5  # 将参与人数划分为5个状态
# decay_factor = 0.9  # 时间加权因子
# smooth_constant = 1e-4  # 平滑常数
# 
# # 遍历每个国家，进行预测
# for country in participation_by_year_country_q1_clean.columns:
#     country_table = participation_by_year_country_q1_clean[country]
# 
#     # 获取每个国家的年份和参与人数
#     years = participation_by_year_country_q1_clean.index.values
#     participants = country_table.values
# 
#     # 使用最近的数据来计算状态划分
#     recent_years_participants = participants[-5:]  # 只考虑最近5年的数据
#     state_bins = np.percentile(recent_years_participants, np.linspace(0, 100, num_states + 1))  # 根据百分位数划分
#     states = np.digitize(participants, state_bins) - 1  # 将参与人数映射为状态索引
# 
#     # 修正：确保状态值在0到num_states-1之间
#     states = np.clip(states, 0, num_states - 1)
# 
#     # 引入时间加权，给较近年份更高权重
#     weights = np.array([decay_factor ** (len(years) - i) for i in range(len(years))])
# 
#     # 计算加权状态转移矩阵
#     transition_matrix = np.zeros((num_states, num_states))
# 
#     for i in range(len(states) - 1):
#         current_state = states[i]
#         next_state = states[i + 1]
#         
#         # 加权转移次数
#         weight = weights[i]
#         transition_matrix[current_state, next_state] += weight
# 
#     # 将转移次数转化为概率
#     row_sums = transition_matrix.sum(axis=1, keepdims=True)
#     row_sums[row_sums == 0] = 1  # 将零行的总和设置为1，以避免除零错误
# 
#     # 对转移矩阵进行平滑处理，防止出现零概率
#     transition_matrix += smooth_constant
#     transition_matrix = transition_matrix / transition_matrix.sum(axis=1, keepdims=True)  # 重新归一化
# 
#     # 步骤三：预测未来状态
#     current_state = states[-1]  # 假设当前状态是最后一年的状态
# 
#     # 预测下一个状态
#     future_state_probs = transition_matrix[current_state]
# 
#     # 如果概率包含 NaN 或零概率，进行处理
#     future_state_probs = np.nan_to_num(future_state_probs, nan=1.0)  # 将 NaN 替换为 1，确保概率有效
# 
#     # 确保概率和为1
#     future_state_probs /= np.sum(future_state_probs)
# 
#     # 使用概率选择下一个状态
#     predicted_future_state = np.random.choice(range(num_states), p=future_state_probs)
# 
#     # 将预测的状态映射回参与人数区间
#     predicted_participation = (state_bins[predicted_future_state] + state_bins[predicted_future_state + 1]) / 2
# 
#     # 打印预测的参与人数
#     print(f"Predicted participation for next year ({country}): {predicted_participation}")
# 
#     # 可视化数据
#     plt.figure(figsize=(10, 6))
# 
#     # 绘制原始数据
#     country_table.plot(kind='line', marker='o', figsize=(10, 6))
# 
#     # 显示预测的参与人数
#     plt.plot(years[-1] + 4, predicted_participation, 'go', label=f'Predicted Participation ({country})', markersize=10)
# 
#     # 添加标题和标签
#     plt.title(f'Total Participation per Year by Country ({country}) (with Markov Model Prediction)')
#     plt.xlabel('Year')
#     plt.ylabel('Number of Participants')
#     plt.xticks(rotation=45)
#     plt.legend(title='Country (NOC)', bbox_to_anchor=(1.05, 1), loc='upper left')
#     plt.tight_layout()
#     plt.show()


## 归一化后的聚类

In [None]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt

# 1. Number of rows per NOC in athletes_with_medal.
noc_counts_medal = athletes_with_medal['NOC'].value_counts()

# Reduce the 'USA' count by 3000, floored at 0.
# NOTE(review): presumably this damps USA as an outlier before min-max
# scaling - confirm the intended amount.
noc_counts_medal['USA'] = max(noc_counts_medal['USA'] - 3000, 0)

# 2. Number of distinct sports per NOC.
sport_counts_medal_point = athletes_with_medal.groupby('NOC')['Sport'].nunique()

# 3. Reorder the row counts so both series follow the same NOC order.
noc_sport_counts = noc_counts_medal.loc[sport_counts_medal_point.index]

# Min-max scale each feature to [0, 1] independently.
scaler = MinMaxScaler()
noc_sport_counts_normalized = scaler.fit_transform(noc_sport_counts.values.reshape(-1, 1))
sport_counts_normalized = scaler.fit_transform(sport_counts_medal_point.values.reshape(-1, 1))

# Two-column feature matrix: (scaled row count, scaled sport count).
X = np.hstack([noc_sport_counts_normalized, sport_counts_normalized])

# 4. K-means with a fixed seed.
n_clusters = 3
kmeans = KMeans(n_clusters=n_clusters, random_state=42)
kmeans.fit(X)

# Cluster index of every NOC.
labels = kmeans.labels_

# 5. Scatter plot coloured by cluster.
plt.figure(figsize=(10, 6))
plt.scatter(noc_sport_counts_normalized, sport_counts_normalized, c=labels, cmap='viridis')

# Write each NOC code next to its point in the colour of its cluster. The
# scatter maps labels 0..n_clusters-1 onto the full colormap, so the text must
# divide by n_clusters - 1 (dividing by n_clusters gave mismatched colours).
for noc, count, sport, label in zip(sport_counts_medal_point.index, noc_sport_counts_normalized.flatten(), sport_counts_normalized.flatten(), labels):
    plt.text(count, sport, noc, fontsize=9, ha='right', color=plt.cm.viridis(label / (n_clusters - 1)))

plt.xlabel('Normalized NOC Count')
plt.ylabel('Normalized Unique Sports Count')
plt.title('K-Means Clustering: Normalized NOC Count vs Normalized Unique Sports Count')

plt.show()


In [None]:
# Lookup table from a country / team display name to a three-letter code.
# Several names share one code (e.g. 'Chinese Taipei', 'Formosa' and 'Taiwan'
# all map to 'TPE'; 'Australasia' maps to 'AUS'), so the mapping is many-to-one.
# NOTE(review): the values mix code styles - some look like ISO 3166 alpha-3
# codes (e.g. 'PRT', 'HRV', 'CHL') where the IOC code differs (POR, CRO, CHI).
# Confirm they match the NOC values used in the athletes table.
country_to_noc = {
    'Great Britain': 'GBR',
    'France': 'FRA',
    'Denmark': 'DEN',
    'United States': 'USA',
    'Canada': 'CAN',
    'Hungary': 'HUN',
    'Switzerland': 'SUI',
    'Australia': 'AUS',
    'Italy': 'ITA',
    'Belgium': 'BEL',
    'Sweden': 'SWE',
    'Norway': 'NOR',
    'Netherlands': 'NED',
    'Austria': 'AUT',
    'Finland': 'FIN',
    'Japan': 'JPN',
    'Poland': 'POL',
    'New Zealand': 'NZL',
    'Mexico': 'MEX',
    'Greece': 'GRE',
    'Romania': 'ROU',
    'Brazil': 'BRA',
    'Spain': 'ESP',
    'Argentina': 'ARG',
    'South Africa': 'RSA',
    'India': 'IND',
    'Turkey': 'TUR',
    'South Korea': 'KOR',
    'Cuba': 'CUB',
    'Germany': 'GER',
    'Jamaica': 'JAM',
    'Iran': 'IRI',
    'Bulgaria': 'BUL',
    'Portugal': 'PRT',
    'Czechoslovakia': 'TCH',
    'Ireland': 'IRL',
    'Kenya': 'KEN',
    'Yugoslavia': 'YUG',
    'Mongolia': 'MNG',
    'Ethiopia': 'ETH',
    'Thailand': 'THA',
    'Egypt': 'EGY',
    'China': 'CHN',
    'North Korea': 'PRK',
    'Colombia': 'COL',
    'Estonia': 'EST',
    'Morocco': 'MAR',
    'Trinidad and Tobago': 'TTO',
    'Chinese Taipei': 'TPE',
    'Nigeria': 'NGA',
    'Venezuela': 'VEN',
    'Bahamas': 'BAH',
    'Indonesia': 'INA',
    'Pakistan': 'PAK',
    'Tunisia': 'TUN',
    'Philippines': 'PHI',
    'Puerto Rico': 'PUR',
    'Lithuania': 'LTU',
    'Croatia': 'HRV',
    'Slovenia': 'SVN',
    'Slovakia': 'SVK',
    'Ukraine': 'UKR',
    'Algeria': 'ALG',
    'Kazakhstan': 'KAZ',
    'Israel': 'ISR',
    'Uzbekistan': 'UZB',
    'Chile': 'CHL',
    'Soviet Union': 'URS',
    'Czech Republic': 'CZE',
    'Latvia': 'LAT',
    'Azerbaijan': 'AZE',
    'Georgia': 'GEO',
    'Belarus': 'BLR',
    'Malaysia': 'MAS',
    'Uganda': 'UGA',
    'Dominican Republic': 'DOM',
    'Uruguay': 'URU',
    'Armenia': 'ARM',
    'Qatar': 'QAT',
    'Russia': 'RUS',
    'Cameroon': 'CMR',
    'Peru': 'PER',
    'Serbia': 'SRB',
    'East Germany': 'GDR',
    'West Germany': 'FRG',
    'Moldova': 'MDA',
    'Hong Kong': 'HKG',
    'Syria': 'SYR',
    'Ivory Coast': 'CIV',
    'Iceland': 'ISL',
    'Ghana': 'GHA',
    'Tajikistan': 'TJK',
    'Vietnam': 'VNM',
    'Bahrain': 'BRN',
    'Ecuador': 'ECU',
    'Kyrgyzstan': 'KGZ',
    'Singapore': 'SGP',
    'Grenada': 'GRD',
    'Kosovo': 'KSV',
    'Panama': 'PAN',
    'Mixed team': 'MIX',
    'Luxembourg': 'LUX',
    'Jordan': 'JOR',
    'Botswana': 'BWA',
    'Fiji': 'FJI',
    'Kuwait': 'KUW',
    'Zambia': 'ZAM',
    'Namibia': 'NAM',
    'Saudi Arabia': 'KSA',
    'Lebanon': 'LBN',
    'Zimbabwe': 'ZIM',
    'Costa Rica': 'CRC',
    'FR Yugoslavia': 'YUG',
    'Russian Empire': 'RUS',
    'Guatemala': 'GTM',
    'Afghanistan': 'AFG',
    'Cyprus': 'CYP',
    'Burundi': 'BDI',
    'Mozambique': 'MOZ',
    'Bohemia': 'BOH',
    'Australasia': 'AUS',
    'Haiti': 'HTI',
    'United Team of Germany': 'GDR',
    'Bermuda': 'BER',
    'Niger': 'NER',
    'United Arab Emirates': 'ARE',
    'Suriname': 'SUR',
    'Guyana': 'GUY',
    'Ceylon': 'CEY',
    'Formosa': 'TPE',
    'Iraq': 'IRQ',
    'British West Indies': 'BWI',
    'Unified Team': 'EUN',
    'Djibouti': 'DJI',
    'Virgin Islands': 'VIR',
    'Senegal': 'SEN',
    'Netherlands Antilles': 'ANT',
    'Taiwan': 'TPE',
    'Tanzania': 'TAN',
    'Independent Olympic Participants': 'IOP',
    'Macedonia': 'MKD',
    'Barbados': 'BRB',
    'Sri Lanka': 'LKA',
    'Tonga': 'TON',
    'Sudan': 'SDN',
    'Samoa': 'SAM',
    'Paraguay': 'PRY',
    'Eritrea': 'ERI',
    'Gabon': 'GAB',
    'Togo': 'TOG',
    'Mauritius': 'MRI',
    'Serbia and Montenegro': 'SCG',
    'Independent Olympic Athletes': 'IOA',
    'Montenegro': 'MNE',
    'ROC': 'ROC',
    'San Marino': 'SMR',
    'North Macedonia': 'MKD',
    'Turkmenistan': 'TKM',
    'Burkina Faso': 'BFA',
    'Saint Lucia': 'LCA',
    'Dominica': 'DMA',
    'Albania': 'ALB',
    'Cabo Verde': 'CPV',
    'Refugee Olympic Team': 'ROT'
}


In [None]:
# Turn the name -> code mapping into a two-column table, one row per entry.
# A value that is a list is flattened into a single ', '-separated string.
df = pd.DataFrame({
    'key': list(country_to_noc),
    'value': [
        ', '.join(code) if isinstance(code, list) else code
        for code in country_to_noc.values()
    ],
})

# Write the table to disk without the row index, then report completion.
df.to_csv('NOC_dict.csv', index=False)

print("CSV文件已保存")

## 2.2 未获奖国家参赛趋势

In [None]:
import matplotlib.pyplot as plt

# Line chart of participation_by_year_country_without_medal, one line per
# column. DataFrame.plot() opens a figure of its own unless it is handed an
# Axes, so calling plt.figure() first would leave an empty extra figure
# behind; create the Axes once and pass it in.
fig, ax = plt.subplots(figsize=(10, 6))
participation_by_year_country_without_medal.plot(kind='line', marker='o', ax=ax)

# Title, axis labels and tick labels use small fonts.
plt.title('Total Participation per Year by Country without Medal', fontsize=10)
plt.xlabel('Year', fontsize=9)
plt.ylabel('Number of Participants', fontsize=9)
plt.xticks(rotation=45, fontsize=8)
plt.yticks(fontsize=8)

# Four-column legend anchored outside the axes, to the right.
plt.legend(title='Country (NOC)', fontsize=8, title_fontsize=10, bbox_to_anchor=(1.05, 1), loc='upper left', ncol=4)

plt.tight_layout()
plt.show()


## 最后5年趋势

In [None]:
import matplotlib.pyplot as plt

# Same chart restricted to the last five rows of the table. DataFrame.plot()
# opens a figure of its own unless it is handed an Axes, so calling
# plt.figure() first would leave an empty extra figure behind; create the
# Axes once and pass it in.
fig, ax = plt.subplots(figsize=(10, 6))
participation_by_year_country_without_medal.tail(5).plot(kind='line', marker='o', ax=ax)

# Title, axis labels and tick labels use small fonts.
plt.title('Total Participation per Year by Country without Medal', fontsize=10)
plt.xlabel('Year', fontsize=9)
plt.ylabel('Number of Participants', fontsize=9)
plt.xticks(rotation=45, fontsize=8)
plt.yticks(fontsize=8)

# Four-column legend anchored outside the axes, to the right.
plt.legend(title='Country (NOC)', fontsize=8, title_fontsize=10, bbox_to_anchor=(1.05, 1), loc='upper left', ncol=4)

plt.tight_layout()
plt.show()


## 2.3 获取突然获奖前5届人员的趋势

In [None]:
# Bare expression: shows medals_by_year as the cell output. The object is
# built outside this excerpt.
medals_by_year

## 随机森林 -> 集成学习

In [None]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import confusion_matrix, roc_curve, auc
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.calibration import CalibratedClassifierCV

# Trains a soft-voting ensemble (random forest + SVC + logistic regression),
# calibrates its probabilities and reports accuracy, confusion matrix and ROC.
# tree_dataset must already be loaded as a DataFrame.

# Step 1: features (columns '1'..'5') and the target column.
X = tree_dataset[['1', '2', '3', '4', '5']]
y = tree_dataset['will have medal']

# Step 2: split first, so that nothing fitted below ever sees the test rows.
# With the same random_state the test rows are the same ones the split picks
# when applied to the scaled matrix.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 3: standardise with statistics from the training rows only, then apply
# the same transform to the test rows. X_scaled (all rows) is kept because the
# cell has always exposed that name.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)
X_scaled = scaler.transform(X)

# Hyperparameter grid for RandomForestClassifier
rf_param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}

# Hyperparameter grid for SVC
svc_param_grid = {
    'C': [0.1, 1, 10],
    'kernel': ['linear', 'rbf'],
    'gamma': ['scale', 'auto'],
}

# Hyperparameter grid for LogisticRegression
lr_param_grid = {
    'C': [0.1, 1, 10],
    'solver': ['liblinear', 'saga'],
}

# Step 4: grid search for RandomForestClassifier (5-fold CV on the training set)
rf_grid_search = GridSearchCV(RandomForestClassifier(random_state=42), rf_param_grid, cv=5, n_jobs=-1)
rf_grid_search.fit(X_train, y_train)
print("Best RandomForest Parameters:", rf_grid_search.best_params_)

# Step 5: grid search for SVC; probability=True is required for soft voting
svc_grid_search = GridSearchCV(SVC(probability=True, random_state=42), svc_param_grid, cv=5, n_jobs=-1)
svc_grid_search.fit(X_train, y_train)
print("Best SVC Parameters:", svc_grid_search.best_params_)

# Step 6: grid search for LogisticRegression
lr_grid_search = GridSearchCV(LogisticRegression(random_state=42), lr_param_grid, cv=5, n_jobs=-1)
lr_grid_search.fit(X_train, y_train)
print("Best LogisticRegression Parameters:", lr_grid_search.best_params_)

# Step 7: soft-voting ensemble of the three tuned classifiers
ensemble_model_tuned = VotingClassifier(estimators=[
    ('rf', rf_grid_search.best_estimator_),
    ('svc', svc_grid_search.best_estimator_),
    ('lr', lr_grid_search.best_estimator_)
], voting='soft')

# Step 8: fit the plain ensemble so ensemble_model_tuned is usable on its own
ensemble_model_tuned.fit(X_train, y_train)

# Calibrate with 5-fold cross-validation on the TRAINING data. Fitting the
# calibrator on (X_test, y_test) and then scoring on the same rows leaks the
# test labels into the model and inflates every metric reported below.
calibrated_model_tuned = CalibratedClassifierCV(ensemble_model_tuned, method='sigmoid', cv=5)
calibrated_model_tuned.fit(X_train, y_train)

# Step 9: accuracy on the training rows and on the untouched test rows
train_accuracy_tuned = calibrated_model_tuned.score(X_train, y_train)
test_accuracy_tuned = calibrated_model_tuned.score(X_test, y_test)

print(f"Tuned ensemble model training set accuracy: {train_accuracy_tuned:.4f}")
print(f"Tuned ensemble model testing set accuracy: {test_accuracy_tuned:.4f}")

# Step 10: confusion matrix on the test rows
y_pred = calibrated_model_tuned.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
print("Confusion matrix:")
print(cm)

# Step 11: confusion-matrix heat map
plt.figure(figsize=(6, 5))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=['No Medal', 'Medal'], yticklabels=['No Medal', 'Medal'])
plt.xlabel('Predicted')
plt.ylabel('True')
plt.title('Confusion Matrix')
plt.show()

# Step 12: ROC curve from the predicted probability of the second class
fpr, tpr, thresholds = roc_curve(y_test, calibrated_model_tuned.predict_proba(X_test)[:, 1])
roc_auc = auc(fpr, tpr)

plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = %0.2f)' % roc_auc)
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic')
plt.legend(loc="lower right")
plt.show()


In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Scores every column of participation_by_year_country_without_medal with
# calibrated_model_tuned and plots the rank-normalised probabilities.
# Run after the ensemble training cell: it needs calibrated_model_tuned and
# the StandardScaler bound to `scaler` there.

# Step 1: predicted probability of the second class, one entry per column
predicted_probabilities_all_columns = []

# Step 2: the last five values of each column form one feature row
for column in participation_by_year_country_without_medal.columns:
    last_5_values = participation_by_year_country_without_medal[column].tail(5).values.reshape(1, -1)  # shape (1, 5)
    print(last_5_values)

    # The model was trained on standardised features, so the raw counts must
    # go through the same scaler before prediction.
    probabilities = calibrated_model_tuned.predict_proba(scaler.transform(last_5_values))

    # Probability of the second class for the single row
    predicted_probabilities = probabilities[:, 1]
    predicted_probabilities_all_columns.append(predicted_probabilities[0])

print("Predicted Probabilities (before ranking normalization):", predicted_probabilities_all_columns)

# Step 3: replace each probability by its rank, spread evenly over [0, 1].
# ranked_probabilities[k] is the column position holding the k-th smallest
# probability.
ranked_probabilities = np.argsort(predicted_probabilities_all_columns)
normalized_ranked_probabilities = np.linspace(0, 1, len(predicted_probabilities_all_columns))

# Invert the permutation in one pass (rank of every column) instead of
# searching the sorted order once per column.
rank_of_column = np.empty(len(ranked_probabilities), dtype=int)
rank_of_column[ranked_probabilities] = np.arange(len(ranked_probabilities))
normalized_probabilities = normalized_ranked_probabilities[rank_of_column].tolist()

print("Normalized Ranked Probabilities:", normalized_probabilities)

# Step 4: bar chart, one bar per column
plt.figure(figsize=(10, 6))
plt.bar(participation_by_year_country_without_medal.columns, normalized_probabilities)
plt.xlabel('Country')
plt.ylabel('Normalized Ranked Predicted Probability')
plt.title('Normalized Ranked Predicted Probabilities for Medal')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()


## 多层感知机

In [None]:
import pandas as pd
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV

# Grid-searches an MLPClassifier on tree_dataset and reports its accuracy.
# tree_dataset must already be loaded as a DataFrame.

# Step 1: features (columns '1'..'5') and the target column
X = tree_dataset[['1', '2', '3', '4', '5']]
y = tree_dataset['will have medal']

# Step 2: split first, so the scaler statistics come from training rows only.
# With the same random_state the test rows are the same ones the split picks
# when applied to the scaled matrix.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 3: standardise using the training rows, then apply to the test rows.
# X_scaled (all rows) is kept because the cell has always exposed that name.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)
X_scaled = scaler.transform(X)

# Step 4: hyperparameter grid. 'momentum' is deliberately absent: it is only
# used by solver='sgd', and this search keeps the default solver, so a
# momentum axis would just repeat identical fits.
param_grid = {
    'hidden_layer_sizes': [(100,), (150,), (100, 100), (50, 50)],  # hidden-layer layouts
    'max_iter': [1000, 2000, 3000],  # iteration caps
    'learning_rate_init': [0.001, 0.0005, 0.01],  # initial learning rates
    'alpha': [0.0001, 0.001, 0.01],  # L2 regularisation strengths
    'early_stopping': [True],  # stop when the validation score stalls
    'random_state': [42]
}

# Step 5: exhaustive search with 3-fold cross-validation on the training set
grid_search = GridSearchCV(MLPClassifier(), param_grid, cv=3, n_jobs=-1, verbose=1)
grid_search.fit(X_train, y_train)

# Best parameter combination found
best_params = grid_search.best_params_
print(f"最佳超参数: {best_params}")

# Model refitted on the whole training set with the best parameters
best_model = grid_search.best_estimator_

# Step 6: accuracy on the training rows and on the held-out test rows
train_accuracy = best_model.score(X_train, y_train)
test_accuracy = best_model.score(X_test, y_test)

print(f"训练集准确率: {train_accuracy:.4f}")
print(f"测试集准确率: {test_accuracy:.4f}")


In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Scores every column of participation_by_year_country_without_medal with
# best_model (the tuned MLP) and plots the rank-normalised probabilities.
# Run after the MLP training cell: it needs best_model and the StandardScaler
# bound to `scaler` there.

# Step 1: predicted probability of the second class, one entry per column
predicted_probabilities_all_columns = []

# Step 2: the last five values of each column form one feature row
for column in participation_by_year_country_without_medal.columns:
    last_5_values = participation_by_year_country_without_medal[column].tail(5).values.reshape(1, -1)  # shape (1, 5)
    print(last_5_values)

    # The model was trained on standardised features, so the raw counts must
    # go through the same scaler before prediction.
    probabilities = best_model.predict_proba(scaler.transform(last_5_values))

    # Probability of the second class for the single row
    predicted_probabilities = probabilities[:, 1]
    predicted_probabilities_all_columns.append(predicted_probabilities[0])

print("Predicted Probabilities (before ranking normalization):", predicted_probabilities_all_columns)

# Step 3: replace each probability by its rank, spread evenly over [0, 1].
# ranked_probabilities[k] is the column position holding the k-th smallest
# probability.
ranked_probabilities = np.argsort(predicted_probabilities_all_columns)
normalized_ranked_probabilities = np.linspace(0, 1, len(predicted_probabilities_all_columns))

# Invert the permutation in one pass (rank of every column) instead of
# searching the sorted order once per column.
rank_of_column = np.empty(len(ranked_probabilities), dtype=int)
rank_of_column[ranked_probabilities] = np.arange(len(ranked_probabilities))
normalized_probabilities = normalized_ranked_probabilities[rank_of_column].tolist()

print("Normalized Ranked Probabilities:", normalized_probabilities)

# Step 4: bar chart, one bar per column
plt.figure(figsize=(10, 6))
plt.bar(participation_by_year_country_without_medal.columns, normalized_probabilities)
plt.xlabel('Country')
plt.ylabel('Normalized Ranked Predicted Probability')
plt.title('Normalized Ranked Predicted Probabilities for Medal')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()
