In [1]:
import numpy as np
import pandas as pd
from pandas import Series, DataFrame

In [2]:
# Read the OECD tourism data from `oecd_tourism.csv` into a data frame,
# keeping only the four columns used in the rest of this notebook.

tourism_filename = '../data/oecd_tourism.csv'
tourism_df = pd.read_csv(
    tourism_filename,
    usecols=['LOCATION', 'SUBJECT', 'TIME', 'Value'],
)

tourism_df.head()

Unnamed: 0,LOCATION,SUBJECT,TIME,Value
0,AUS,INT_REC,2008,31159.8
1,AUS,INT_REC,2009,29980.7
2,AUS,INT_REC,2010,35165.5
3,AUS,INT_REC,2011,38710.1
4,AUS,INT_REC,2012,38003.7


In [3]:
# Which five countries received the most tourist dollars, on average?
# Keep only the 'INT_REC' rows, average `Value` per LOCATION, and list the
# largest means first.
(
    tourism_df
    .loc[lambda df: df['SUBJECT'].eq('INT_REC')]
    .groupby('LOCATION')['Value']
    .agg('mean')
    .sort_values(ascending=False)
    .head(5)
)

LOCATION
USA    201613.500000
ESP     69655.817364
FRA     65063.335727
DEU     53408.570636
GBR     51752.090909
Name: Value, dtype: float64

In [4]:
# Which five countries' citizens spent the fewest tourist dollars, on average?
# Keep only the 'INT-EXP' rows (this subject code is spelled with a hyphen,
# unlike 'INT_REC'), average `Value` per LOCATION, and list the smallest
# means first.
(
    tourism_df
    .loc[lambda df: df['SUBJECT'].eq('INT-EXP')]
    .groupby('LOCATION')['Value']
    .agg('mean')
    .sort_values(ascending=True)
    .head(5)
)

LOCATION
MLT     387.801667
CRI     867.075000
LVA     919.545455
ISL    1072.819636
HRV    1115.628083
Name: Value, dtype: float64

In [5]:
# A separate CSV file, `oecd_locations.csv`, holds two columns and no header
# row: the three-letter location code used in the tourism file, and the full
# country name. Read it with explicit column names and use the location code
# as the index.

locations_filename = '../data/oecd_locations.csv'
locations_df = pd.read_csv(
    locations_filename,
    header=None,
    names=['LOCATION', 'NAME'],
    index_col='LOCATION',
)

locations_df.head()

Unnamed: 0_level_0,NAME
LOCATION,Unnamed: 1_level_1
AUS,Australia
AUT,Austria
BEL,Belgium
CAN,Canada
DNK,Denmark


In [6]:
# Join the two data frames into a new one. In the new data frame there is no
# `LOCATION` column (the location code is the index); instead there is a
# `NAME` column holding the full name of the country.
# The join is a left join on the index, so every row of `locations_df` is
# kept and tourism rows are attached by matching location code.

fullname_df = locations_df.join(
    tourism_df.set_index('LOCATION'),
    how='left',
)
fullname_df.head(5)

Unnamed: 0_level_0,NAME,SUBJECT,TIME,Value
LOCATION,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
AUS,Australia,INT_REC,2008,31159.8
AUS,Australia,INT_REC,2009,29980.7
AUS,Australia,INT_REC,2010,35165.5
AUS,Australia,INT_REC,2011,38710.1
AUS,Australia,INT_REC,2012,38003.7


In [7]:
# Repeat the "most tourist dollars received" query on the joined frame,
# grouping by full country name instead of location code. Only locations
# listed in `locations_df` can appear in this result.
(
    fullname_df
    .loc[lambda df: df['SUBJECT'].eq('INT_REC')]
    .groupby('NAME')['Value']
    .agg('mean')
    .sort_values(ascending=False)
    .head(5)
)

NAME
United States     201613.500000
France             65063.335727
Germany            53408.570636
United Kingdom     51752.090909
Italy              44930.211545
Name: Value, dtype: float64

In [8]:
# Five country names with the smallest mean `Value` among the 'INT_REC' rows
# of the joined frame (ascending sort, first five).
# NOTE(review): the earlier "least" query on `tourism_df` filtered on
# 'INT-EXP', while this one filters on 'INT_REC' -- confirm which subject
# was intended here.
(
    fullname_df
    .loc[lambda df: df['SUBJECT'].eq('INT_REC')]
    .groupby('NAME')['Value']
    .agg('mean')
    .sort_values(ascending=True)
    .head(5)
)

NAME
Finland    4700.236273
Brazil     6321.476083
Israel     6542.383250
Hungary    7299.353000
Denmark    9398.957636
Name: Value, dtype: float64

In [9]:
# Show the dimensions of the locations frame as (rows, columns).
locations_df.shape

(16, 1)