In [1]:
import csv
import os
import numpy as np
import pandas as pd


In [2]:
dataset_path = '../data'
report_2015_datafile = os.path.join(dataset_path, '2015.csv')
report_2016_datafile = os.path.join(dataset_path, '2016.csv')

In [3]:
# 读入数据
def load_data(data_file):
    """
        读取数据文件，加载数据。
        返回列表，其中列表中的每个元素为一个元组，包括Country, Region, Happiness Rank和Happiness Score
    """
    data = []
    with open(data_file, 'r') as csvfile:
        data_reader = csv.DictReader(csvfile)
        for row in data_reader:
            # 取出每行数据的Country, Region, Happiness Rank和Hapiness Score，组合为一个元组放入数据列表中
            # 注意csv模块读入的数据全部为字符串类型
            data.append((row['Country'], row['Region'], 
                         row['Happiness Rank'], row['Happiness Score']))
    return data


report_2015_data = load_data(report_2015_datafile)
report_2016_data = load_data(report_2016_datafile)

In [4]:
print('2015年报告，前10条记录预览：')
print(report_2015_data[:10])

print('2016年报告，前10条记录预览：')
print(report_2016_data[:10])

2015年报告，前10条记录预览：
[('Switzerland', 'Western Europe', '1', '7.587'), ('Iceland', 'Western Europe', '2', '7.561'), ('Denmark', 'Western Europe', '3', '7.527'), ('Norway', 'Western Europe', '4', '7.522'), ('Canada', 'North America', '5', '7.427'), ('Finland', 'Western Europe', '6', '7.406'), ('Netherlands', 'Western Europe', '7', '7.378'), ('Sweden', 'Western Europe', '8', '7.364'), ('New Zealand', 'Australia and New Zealand', '9', '7.286'), ('Australia', 'Australia and New Zealand', '10', '7.284')]
2016年报告，前10条记录预览：
[('Denmark', 'Western Europe', '1', '7.526'), ('Switzerland', 'Western Europe', '2', '7.509'), ('Iceland', 'Western Europe', '3', '7.501'), ('Norway', 'Western Europe', '4', '7.498'), ('Finland', 'Western Europe', '5', '7.413'), ('Canada', 'North America', '6', '7.404'), ('Netherlands', 'Western Europe', '7', '7.339'), ('New Zealand', 'Australia and New Zealand', '8', '7.334'), ('Australia', 'Australia and New Zealand', '9', '7.313'), ('Sweden', 'Western Europe', '10', '7.291

In [9]:
# 注意列表推导式的使用
happiness_2015_scores = [float(item[3]) for item in report_2015_data]
happiness_2016_scores = [float(item[3]) for item in report_2016_data]

# 查看数据
print('2015年报告，前10条记录幸福指数：', happiness_2015_scores[:10])
print('2016年报告，前10条记录幸福指数：', happiness_2016_scores[:10])

('2015\xe5\xb9\xb4\xe6\x8a\xa5\xe5\x91\x8a\xef\xbc\x8c\xe5\x89\x8d10\xe6\x9d\xa1\xe8\xae\xb0\xe5\xbd\x95\xe5\xb9\xb8\xe7\xa6\x8f\xe6\x8c\x87\xe6\x95\xb0\xef\xbc\x9a', [7.587, 7.561, 7.527, 7.522, 7.427, 7.406, 7.378, 7.364, 7.286, 7.284])
('2016\xe5\xb9\xb4\xe6\x8a\xa5\xe5\x91\x8a\xef\xbc\x8c\xe5\x89\x8d10\xe6\x9d\xa1\xe8\xae\xb0\xe5\xbd\x95\xe5\xb9\xb8\xe7\xa6\x8f\xe6\x8c\x87\xe6\x95\xb0\xef\xbc\x9a', [7.526, 7.509, 7.501, 7.498, 7.413, 7.404, 7.339, 7.334, 7.313, 7.291])


In [11]:
hist_2015, hist_edge_2015 = np.histogram(happiness_2015_scores)
hist_2016, hist_edge_2016 = np.histogram(happiness_2016_scores)

print('2015年报告，幸福指数直方图分布：{}；直方图边界：{}。'.format(hist_2015, hist_edge_2015))
print('2016年报告，幸福指数直方图分布：{}；直方图边界：{}。'.format(hist_2016, hist_edge_2016))

2015年报告，幸福指数直方图分布：[ 3 10 13 25 28 12 26 14 12 15]；直方图边界：[ 2.839   3.3138  3.7886  4.2634  4.7382  5.213   5.6878  6.1626  6.6374
  7.1122  7.587 ]。
2016年报告，幸福指数直方图分布：[ 4 10 20 15 25 19 21 17 12 14]；直方图边界：[ 2.905   3.3671  3.8292  4.2913  4.7534  5.2155  5.6776  6.1397  6.6018
  7.0639  7.526 ]。


In [14]:
def get_region_happiness_scores(report_data):
    """
        获取区域包含国家的幸福指数
    """
    region_score_dict = {}
    for item in report_data:
        region = item[1]
        score = float(item[3])
        if region in region_score_dict:
            # 如果region_score_dict已经记录了该区域，则添加该区域的幸福指数到列表中
            region_score_dict[region].append(score)
        else:
            # 如果region_score_dict未记录该区域，则为该区域初始化一个空列表
            region_score_dict[region] = []
    return region_score_dict
     

region_2015_score_dict = get_region_happiness_scores(report_2015_data)
region_2016_score_dict = get_region_happiness_scores(report_2015_data)
# print(region_2015_score_dict)
# 遍历数据字典，以2015年为例
print('2015报告：')
for region, scores in region_2015_score_dict.items():
    print('{}：最大值{}，最小值{}，平均值{}，中间值{}'.format(region,
        np.max(scores), np.min(scores), np.mean(scores), np.median(scores)))

2015报告：
Eastern Asia：最大值5.987，最小值4.874，平均值5.4918，中间值5.474
Latin America and Caribbean：最大值7.187，最小值4.518，平均值6.09319047619，中间值6.13
Central and Eastern Europe：最大值6.003，最小值4.218，平均值5.29107142857，中间值5.249
North America：最大值7.119，最小值7.119，平均值7.119，中间值7.119
Middle East and Northern Africa：最大值6.901，最小值3.006，平均值5.30842105263，中间值5.192
Sub-Saharan Africa：最大值5.268，最小值2.839，平均值4.17012820513，中间值4.252
Southeastern Asia：最大值6.455，最小值3.819，平均值5.132375，中间值5.2165
Western Europe：最大值7.561，最小值4.857，平均值6.64475，中间值6.902
Southern Asia：最大值5.194，最小值3.575，平均值4.46883333333，中间值4.5395
Australia and New Zealand：最大值7.284，最小值7.284，平均值7.284，中间值7.284


In [15]:
# 将数据构建成字典，key是国家，value是其排名
# 扩展：字典推导式
country_2015_score_dict = {item[0] : int(item[2]) for item in report_2015_data}
country_2016_score_dict = {item[0] : int(item[2]) for item in report_2016_data}

# 2015年数据预览
print(country_2015_score_dict)

{'Canada': 5, 'Turkmenistan': 70, 'Lithuania': 56, 'Cambodia': 145, 'Ethiopia': 122, 'Sri Lanka': 132, 'Swaziland': 101, 'Argentina': 30, 'Bolivia': 51, 'Cameroon': 133, 'Burkina Faso': 152, 'Bahrain': 49, 'Saudi Arabia': 35, 'Slovenia': 55, 'Guatemala': 43, 'Zimbabwe': 115, 'Bosnia and Herzegovina': 96, 'Guinea': 150, 'Germany': 26, 'Spain': 36, 'Liberia': 116, 'Netherlands': 7, 'Jamaica': 65, 'Oman': 22, 'Tanzania': 146, 'Ivory Coast': 151, 'Gabon': 143, 'New Zealand': 9, 'Yemen': 136, 'Pakistan': 81, 'Albania': 95, 'United Arab Emirates': 20, 'Uruguay': 32, 'India': 117, 'Azerbaijan': 80, 'Madagascar': 147, 'Lesotho': 97, 'Congo (Brazzaville)': 139, 'Kenya': 125, 'South Korea': 47, 'Tajikistan': 106, 'Turkey': 76, 'Afghanistan': 153, 'Czech Republic': 31, 'Mongolia': 100, 'France': 29, 'Rwanda': 154, 'Slovakia': 45, 'Peru': 58, 'Laos': 99, 'Norway': 4, 'Malawi': 131, 'Benin': 155, 'Singapore': 24, 'Montenegro': 82, 'Togo': 158, 'China': 84, 'Armenia': 127, 'Dominican Republic': 98, 

In [16]:
# 将数据转换为Series
ser_2015 = pd.Series(country_2015_score_dict)
ser_2016 = pd.Series(country_2016_score_dict)

print('2015年，数据预览：')
print(ser_2015.head())

print('2016年，数据预览：')
print(ser_2016.head())

2015年，数据预览：
Afghanistan    153
Albania         95
Algeria         68
Angola         137
Argentina       30
dtype: int64
2016年，数据预览：
Afghanistan    154
Albania        109
Algeria         38
Angola         141
Argentina       26
dtype: int64


In [17]:
ser_change = ser_2016 - ser_2015
print('2015-2016排名变化：')
print(ser_change)

2015-2016排名变化：
Afghanistan                  1.0
Albania                     14.0
Algeria                    -30.0
Angola                       4.0
Argentina                   -4.0
Armenia                     -6.0
Australia                   -1.0
Austria                     -1.0
Azerbaijan                   1.0
Bahrain                     -7.0
Bangladesh                   1.0
Belarus                      2.0
Belgium                     -1.0
Belize                       NaN
Benin                       -2.0
Bhutan                       5.0
Bolivia                      8.0
Bosnia and Herzegovina      -9.0
Botswana                     9.0
Brazil                       1.0
Bulgaria                    -5.0
Burkina Faso                -7.0
Burundi                      0.0
Cambodia                    -5.0
Cameroon                   -19.0
Canada                       1.0
Central African Republic     NaN
Chad                        -5.0
Chile                       -3.0
China                       

In [19]:
# 查看上升最快的国家
print('2015-2016幸福指数上升最快的国家', ser_change.argmax())
# 查看下降最快的国家
print('2015-2016幸福指数下降最快的国家', ser_change.argmin())

Liberia
Algeria
