## 归一化

### （1）极值归一化

In [19]:
import pandas as pd

# 读取 GDP 数据
gdp_data_path = 'data1/人均GDP_Data.csv'
gdp_data = pd.read_csv(gdp_data_path)

# 定义归一化函数
def normalize_gdp(group):
    """
    对每组（每年）的 GDP 列进行归一化处理。
    归一化公式：(x - x_min) / (x_max - x_min)
    """
    group['Normalized_GDP'] = (group['GDP'] - group['GDP'].min()) / (group['GDP'].max() - group['GDP'].min())
    return group

# 按年份分组并应用归一化函数
normalized_gdp_data = gdp_data.groupby("Year").apply(normalize_gdp)

# 保存归一化后的数据到新的 CSV 文件
output_path = 'data1/人均GDP_Data_Normalized_1.csv'
normalized_gdp_data.to_csv(output_path, index=False)

# 显示保存路径
print(f"归一化后的数据已保存到: {output_path}")


归一化后的数据已保存到: data1/人均GDP_Data_Normalized_1.csv


  normalized_gdp_data = gdp_data.groupby("Year").apply(normalize_gdp)


### （2）百分比归一化

In [None]:
# import pandas as pd

# 读取 GDP 数据
gdp_data_path = 'data1/人均GDP_Data.csv'
gdp_data = pd.read_csv(gdp_data_path)

# 定义按全球百分比归一化函数
def normalize_gdp_by_percentage(group):
    """
    对每组（每年）的 GDP 数据计算百分比归一化
    计算公式：百分比 = (国家GDP / 全球GDP总和) * 100
    """
    total_gdp = group['GDP'].sum()  # 计算全球GDP总和
    group['Normalized_GDP_Percentage'] = (group['GDP'] / total_gdp) * 100  # 计算百分比
    return group

# 按年份分组并应用归一化函数
normalized_gdp_data = gdp_data.groupby("Year").apply(normalize_gdp_by_percentage)

# 保存归一化后的数据到新的 CSV 文件
output_path = 'data1/人均GDP_Data_Normalized_2.csv'
normalized_gdp_data.to_csv(output_path, index=False)

# 显示保存路径
print(f"按百分比归一化的GDP数据已保存到: {output_path}")


### 统计人口的

In [29]:
import pandas as pd

# 读取 population 数据
population_data_path = 'data1/Population_Data.csv'
population_data = pd.read_csv(population_data_path)

# 定义归一化函数
def normalize_population(group):
    
    group['Normalized_Population'] = (group['Population'] - group['Population'].min()) / (group['Population'].max() - group['Population'].min())
    return group
    
# 定义按全球百分比归一化函数
def normalize_population_by_percentage(group):
  
    total_population = group['Population'].sum()  # 计算全球population总和
    group['Normalized_Population_Percentage'] = (group['Population'] / total_population) * 100  # 计算百分比
    return group

# 按年份分组并应用归一化函数
normalized_population_data = population_data.groupby("Year").apply(normalize_population)

# 保存归一化后的数据到新的 CSV 文件
output_path = 'data1/Population_Data_Normalized_1.csv'
normalized_population_data.to_csv(output_path, index=False)

# 显示保存路径
print(f"归一化后的数据已保存到: {output_path}")

#------------------------------------------------------------------------
# 按年份分组并应用归一化函数
normalized_population_data = population_data.groupby("Year").apply(normalize_population_by_percentage)

# 保存归一化后的数据到新的 CSV 文件
output_path = 'data1/Population_Data_Normalized_2.csv'
normalized_population_data.to_csv(output_path, index=False)

# 显示保存路径
print(f"按百分比归一化的population数据已保存到: {output_path}")



  normalized_population_data = population_data.groupby("Year").apply(normalize_population)
  normalized_population_data = population_data.groupby("Year").apply(normalize_population_by_percentage)


归一化后的数据已保存到: data1/Population_Data_Normalized_1.csv
按百分比归一化的population数据已保存到: data1/Population_Data_Normalized_2.csv


## 汇总数据

In [69]:
import pandas as pd

# 加载数据文件路径
gdp_data_path = 'data1/GDP_Data_Normalized_1.csv'
population_data_path = 'data1/Population_Data_Normalized_1.csv'
per_capita_gdp_data_path = 'data1/人均GDP_Data_Normalized_1.csv'
athletes_data_path = 'data/summerOly_athletes.csv'

# 读取各个数据文件
gdp_data = pd.read_csv(gdp_data_path)
population_data = pd.read_csv(population_data_path)
per_capita_gdp_data = pd.read_csv(per_capita_gdp_data_path)
athletes_data = pd.read_csv(athletes_data_path)

# 重命名列以便合并
gdp_data.rename(columns={"Country Name": "Country", "GDP": "GDP_Value"}, inplace=True)
population_data.rename(columns={"Country Name": "Country"}, inplace=True)
per_capita_gdp_data.rename(columns={"Normalized_GDP": "Normalized_Per_Capita_GDP","Country or Area": "Country"}, inplace=True)

# 统计运动员数据
athlete_stats = athletes_data.groupby(["Team", "Year"]).agg(
    participants=("Name", "count"),         # 参赛人数
    sport_count=("Sport", "nunique"),      # 不同运动项目的数量
    event_count=("Event", "nunique")       # 不同比赛项目的数量
).reset_index()

# 合并GDP和人口数据
gdp_population_data = pd.merge(gdp_data, population_data, on=["Country", "Year"], how="inner")

# 合并人均GDP数据
merged_data = pd.merge(gdp_population_data, per_capita_gdp_data, on=["Country", "Year"], how="left")

# 合并运动员统计数据
final_data = pd.merge(merged_data, athlete_stats, left_on=["Country", "Year"], right_on=["Team", "Year"], how="left")

# 选择需要保留的列
columns_to_keep = ["Country", "Year", "Normalized_GDP", "Normalized_Population", "Normalized_Per_Capita_GDP",
                   "participants", "sport_count", "event_count"]
final_data = final_data[columns_to_keep]

# 删除包含缺失值的行
final_data.dropna(inplace=True)

# 保存最终数据到 CSV 文件
output_path = 'data1/Aggregated_Olympic_Data.csv'
final_data.to_csv(output_path, index=False)

# 打印保存路径
print(f"最终清理后的数据已保存到: {output_path}")

# 可选：打印数据的前几行
print(final_data.head())


最终清理后的数据已保存到: data1/Aggregated_Olympic_Data.csv
        Country  Year  Normalized_GDP  Normalized_Population  \
1875  Argentina  1972        0.009055               0.006426   
1876  Australia  1972        0.013571               0.003435   
1877    Austria  1972        0.005727               0.001966   
1879    Belgium  1972        0.009701               0.002531   
1880      Benin  1972        0.000101               0.000835   

      Normalized_Per_Capita_GDP  participants  sport_count  event_count  
1875                   0.096561         123.0         12.0         62.0  
1876                   0.283118         266.0         20.0        110.0  
1877                   0.184300         143.0         15.0         68.0  
1879                   0.242709         106.0         14.0         51.0  
1880                   0.005313           3.0          2.0          3.0  


In [78]:
import pandas as pd

# 加载数据文件路径
gdp_data_path = 'data1/GDP_Data_Normalized_1.csv'
population_data_path = 'data1/Population_Data_Normalized_1.csv'
per_capita_gdp_data_path = 'data1/人均GDP_Data_Normalized_1.csv'
athletes_data_path = 'data/summerOly_athletes.csv'

# 读取各个数据文件
gdp_data = pd.read_csv(gdp_data_path)
population_data = pd.read_csv(population_data_path)
per_capita_gdp_data = pd.read_csv(per_capita_gdp_data_path)
athletes_data = pd.read_csv(athletes_data_path)

# 重命名列以便合并
gdp_data.rename(columns={"Country Name": "Country", "GDP": "GDP_Value"}, inplace=True)
population_data.rename(columns={"Country Name": "Country"}, inplace=True)
per_capita_gdp_data.rename(columns={"Normalized_GDP": "Normalized_Per_Capita_GDP", "Country or Area": "Country"}, inplace=True)

# 统计运动员数据
athlete_stats = athletes_data.groupby(["Team", "Year"]).agg(
    participants=("Name", "count"),         # 参赛人数
    sport_count=("Sport", "nunique"),      # 不同运动项目的数量
    event_count=("Event", "nunique")       # 不同比赛项目的数量
).reset_index()

# 合并GDP和人口数据
gdp_population_data = pd.merge(gdp_data, population_data, on=["Country", "Year"], how="inner")

# 合并人均GDP数据
merged_data = pd.merge(gdp_population_data, per_capita_gdp_data, on=["Country", "Year"], how="left")

# 合并运动员统计数据
final_data = pd.merge(merged_data, athlete_stats, left_on=["Country", "Year"], right_on=["Team", "Year"], how="left")

# --------------------- 奖牌统计功能 ---------------------

# 统计每届奥运会每个国家的奖牌数量
medal_stats = athletes_data[~athletes_data['Medal'].isna()].groupby(["Team", "Year", "Medal"]).agg(
    medal_count=("Medal", "count")
).reset_index()

# 将数据转换为透视表形式，按国家和年份显示每种奖牌的数量
medal_pivot = medal_stats.pivot(index=["Team", "Year"], columns="Medal", values="medal_count").fillna(0).reset_index()

# 重命名列名以便理解
medal_pivot.rename(columns={"Gold": "Gold_Medals", "Silver": "Silver_Medals", "Bronze": "Bronze_Medals"}, inplace=True)

# 计算每个国家每届奥运会的奖牌总数
medal_pivot["Total_Medals"] = medal_pivot[["Gold_Medals", "Silver_Medals", "Bronze_Medals"]].sum(axis=1)

# 将奖牌数据与 final_data 合并
final_data = pd.merge(final_data, medal_pivot, left_on=["Country", "Year"], right_on=["Team", "Year"], how="left")

# 填充缺失值为 0（如果某些国家或年份没有奖牌信息）
final_data[["Gold_Medals", "Silver_Medals", "Bronze_Medals", "Total_Medals"]] = final_data[
    ["Gold_Medals", "Silver_Medals", "Bronze_Medals", "Total_Medals"]
].fillna(0)

# 删除多余列（如 Team 和其他多余列）
if "Team" in final_data.columns:
    final_data.drop(columns=["Team"], inplace=True)

# 选择需要保留的列
columns_to_keep = ["Country", "Year", "Normalized_GDP", "Normalized_Population", "Normalized_Per_Capita_GDP",
                   "participants", "sport_count", "event_count", "Gold_Medals", "Silver_Medals", 
                   "Bronze_Medals", "Total_Medals"]
final_data = final_data[columns_to_keep]

# 删除包含其他缺失值的行
final_data.dropna(inplace=True)

# 保存最终数据到 CSV 文件
output_path = 'data1/final_data_0.csv'
final_data.to_csv(output_path, index=False)

# 打印保存路径
print(f"最终清理后的数据已保存到: {output_path}")

# 可选：打印数据的前几行
print(final_data.head())


最终清理后的数据已保存到: data1/Aggregated_Olympic_Data_with_Medals.csv
        Country  Year  Normalized_GDP  Normalized_Population  \
1875  Argentina  1972        0.009055               0.006426   
1876  Australia  1972        0.013571               0.003435   
1877    Austria  1972        0.005727               0.001966   
1879    Belgium  1972        0.009701               0.002531   
1880      Benin  1972        0.000101               0.000835   

      Normalized_Per_Capita_GDP  participants  sport_count  event_count  \
1875                   0.096561         123.0         12.0         62.0   
1876                   0.283118         266.0         20.0        110.0   
1877                   0.184300         143.0         15.0         68.0   
1879                   0.242709         106.0         14.0         51.0   
1880                   0.005313           3.0          2.0          3.0   

      Gold_Medals  Silver_Medals  Bronze_Medals  Total_Medals  
1875          0.0            1.0        