### Set up environment and import data

In [6]:
import pandas as pd
import numpy as np

In [10]:
url = "./TravelPac 2009-2019 Labelled.xlsx"
df_travel = pd.read_excel(url, sheet_name=1)

### Clean data

#### Check for missing values

In [4]:
missing_data = df_travel.isnull()
for column in missing_data.columns.values.tolist():
    print(column)
    print (missing_data[column].value_counts())
    print("")

Year
False    396196
Name: Year, dtype: int64

quarter
False    396196
Name: quarter, dtype: int64

ukos
False    396196
Name: ukos, dtype: int64

mode
False    396196
Name: mode, dtype: int64

country
False    396196
Name: country, dtype: int64

purpose
False    396196
Name: purpose, dtype: int64

package
False    396196
Name: package, dtype: int64

Age
False    396196
Name: Age, dtype: int64

Sex
False    394656
True       1540
Name: Sex, dtype: int64

duration
False    396196
Name: duration, dtype: int64

visits
False    383331
True      12865
Name: visits, dtype: int64

nights
False    383334
True      12862
Name: nights, dtype: int64

expend
False    396159
True         37
Name: expend, dtype: int64

sample
False    395627
True        569
Name: sample, dtype: int64



No missing data in key fields (Year, quarter, ukos, country)

Data missing in sample field, assume sample size of 1 where missing.

In [15]:
df_travel["sample"].replace(np.nan,1, inplace = True)
null_sample_data = df_travel.loc[df_travel['sample'].isnull()]
null_sample_data.head()

Unnamed: 0,Year,quarter,ukos,mode,country,purpose,package,Age,Sex,duration,visits,nights,expend,sample


#### Rename quarter field

In [16]:
df_travel.loc[df_travel["quarter"] == "Jan-Mar","quarter"] = 1
df_travel.loc[df_travel["quarter"] == "Apr-Jun","quarter"] = 2
df_travel.loc[df_travel["quarter"] == "Jul-Sep","quarter"] = 3
df_travel.loc[df_travel["quarter"] == "Oct-Dec","quarter"] = 4
df_travel["quarter"] = df_travel["quarter"].astype(int)
df_travel.dtypes

Year          int64
quarter       int64
ukos         object
mode         object
country      object
purpose      object
package      object
Age          object
Sex          object
duration     object
visits      float64
nights      float64
expend      float64
sample      float64
dtype: object

## Total By Country

#### Strip for key country data

In [22]:
df_travel_country = df_travel[["ukos","country","sample"]]
df_travel_country.head()

Unnamed: 0,ukos,country,sample
0,UK residents,Austria,3.0
1,UK residents,Austria,3.0
2,UK residents,Austria,3.0
3,UK residents,Austria,2.0
4,UK residents,Austria,1.0


#### Remove non-residents

In [23]:
df_travel_country.drop(df_travel_country.loc[df_travel_country["ukos"] != "UK residents"].index, inplace=True)
df_travel_country.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


Unnamed: 0,ukos,country,sample
0,UK residents,Austria,3.0
1,UK residents,Austria,3.0
2,UK residents,Austria,3.0
3,UK residents,Austria,2.0
4,UK residents,Austria,1.0


In [24]:
df_travel_country["ukos"].unique()

array(['UK residents'], dtype=object)

In [25]:
df_travel_country = df_travel_country[["country", "sample"]]
df_travel_country.head()

Unnamed: 0,country,sample
0,Austria,3.0
1,Austria,3.0
2,Austria,3.0
3,Austria,2.0
4,Austria,1.0


#### Get country totals

In [32]:
df_travel_country_totals = df_travel_country.groupby("country").sum()
df_travel_country_totals.rename(columns={"sample":"total"}, inplace=True)
df_travel_country_totals.head()

Unnamed: 0_level_0,total
country,Unnamed: 1_level_1
Australia,5626.0
Austria,5928.0
Barbados,757.0
Belgium,16019.0
Bulgaria,4039.0


In [28]:
!conda install -c conda-forge folium=0.5.0 --yes
import folium

print('Folium installed and imported!')

Collecting package metadata (current_repodata.json): done
Solving environment: done

## Package Plan ##

  environment location: /home/smbryar/miniconda2

  added / updated specs:
    - folium=0.5.0


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    archspec-0.1.1             |     pyh9f0ad1d_0          25 KB  conda-forge
    conda-4.8.4                |   py38h32f6830_1         3.0 MB  conda-forge
    ------------------------------------------------------------
                                           Total:         3.1 MB

The following NEW packages will be INSTALLED:

  archspec           conda-forge/noarch::archspec-0.1.1-pyh9f0ad1d_0

The following packages will be UPDATED:

  conda                                4.8.3-py38h32f6830_2 --> 4.8.4-py38h32f6830_1



Downloading and Extracting Packages
conda-4.8.4          | 3.0 MB    | ##################################### | 100% 
archs

In [29]:
url = "./countries.csv"
df_countries = pd.read_csv(url)
df_countries = df_countries[["latitude","longitude","name"]]
df_countries.head()

Unnamed: 0,latitude,longitude,name
0,42.546245,1.601554,Andorra
1,23.424076,53.847818,United Arab Emirates
2,33.93911,67.709953,Afghanistan
3,17.060816,-61.796428,Antigua and Barbuda
4,18.220554,-63.068615,Anguilla


In [33]:
df_country_locations = df_travel_country_totals.join(df_countries.set_index("name"))
df_country_locations

Unnamed: 0_level_0,total,latitude,longitude
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Australia,5626.0,-25.274398,133.775136
Austria,5928.0,47.516231,14.550072
Barbados,757.0,13.193887,-59.543198
Belgium,16019.0,50.503887,4.469936
Bulgaria,4039.0,42.733883,25.485830
...,...,...,...
Thailand,4851.0,15.870032,100.992541
Tunisia,2124.0,33.886917,9.537499
Turkey,11091.0,38.963745,35.243322
USA,39871.0,,


#### Find countries without latitude and longitude

In [34]:
missing_data = df_country_locations[df_country_locations.isnull().any(axis=1)]
missing_data

Unnamed: 0_level_0,total,latitude,longitude
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Channel Islands,88.0,,
China - Hong Kong,1861.0,,
China - Other,4501.0,,
Cyprus EU,5404.0,,
Cyprus Non EU,625.0,,
Irish Republic,16203.0,,
Other Africa,8468.0,,
Other Asia,9502.0,,
Other Caribbean,3617.0,,
Other Central & Sth.America,3090.0,,


#### Alter names of countries then re-join

In [40]:
country_list = df_travel_country_totals.index.tolist()
idx = country_list.index("USA")
country_list[idx] = "United States"
idx = country_list.index("Irish Republic")
country_list[idx] = "Ireland"
idx = country_list.index("Cyprus EU")
country_list[idx] = "Cyprus"
idx = country_list.index("Cyprus Non EU")
country_list[idx] = "Cyprus"
idx = country_list.index("China - Other")
country_list[idx] = "China"
idx = country_list.index("China - Hong Kong")
country_list[idx] = "Hong Kong"
df_travel_country_totals.index = country_list

In [41]:
df_country_locations = df_travel_country_totals.join(df_countries.set_index("name"))
df_country_locations.index.name = "country"
df_country_locations

Unnamed: 0_level_0,total,latitude,longitude
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Australia,5626.0,-25.274398,133.775136
Austria,5928.0,47.516231,14.550072
Barbados,757.0,13.193887,-59.543198
Belgium,16019.0,50.503887,4.469936
Bulgaria,4039.0,42.733883,25.485830
...,...,...,...
Thailand,4851.0,15.870032,100.992541
Tunisia,2124.0,33.886917,9.537499
Turkey,11091.0,38.963745,35.243322
United Arab Emirates,8513.0,23.424076,53.847818


In [42]:
missing_data = df_country_locations[df_country_locations.isnull().any(axis=1)]
missing_data

Unnamed: 0_level_0,total,latitude,longitude
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Channel Islands,88.0,,
Other Africa,8468.0,,
Other Asia,9502.0,,
Other Caribbean,3617.0,,
Other Central & Sth.America,3090.0,,
Other Europe,3487.0,,
Other Middle East,5342.0,,
Other North Africa,3223.0,,
Other countries,6251.0,,


In [43]:
df_country_locations.at["Channel Islands","latitude"] = 49.372284
df_country_locations.at["Channel Islands","longitude"] =  -2.364351
df_country_locations.at["Other Africa","latitude"] =  -8.7832
df_country_locations.at["Other Africa","longitude"] =  34.5085
df_country_locations.at["Other Asia","latitude"] =  34.0479
df_country_locations.at["Other Asia","longitude"] =  100.6197
df_country_locations.at["Other Caribbean","latitude"] =  21.4691
df_country_locations.at["Other Caribbean","longitude"] =  -78.6569
df_country_locations.at["Other Central & Sth.America","latitude"] =  12.7690
df_country_locations.at["Other Central & Sth.America","longitude"] =  -85.6024
df_country_locations.at["Other Europe","latitude"] =  54.5260
df_country_locations.at["Other Europe","longitude"] =  15.2551
df_country_locations.at["Other Middle East","latitude"] =  29.2985
df_country_locations.at["Other Middle East","longitude"] =  42.5510
df_country_locations.at["Other North Africa","latitude"] =  26.0198
df_country_locations.at["Other North Africa","longitude"] =  32.2778

In [44]:
df_country_locations.drop(["Other countries"], inplace=True)
df_country_locations.tail()

Unnamed: 0_level_0,total,latitude,longitude
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Thailand,4851.0,15.870032,100.992541
Tunisia,2124.0,33.886917,9.537499
Turkey,11091.0,38.963745,35.243322
United Arab Emirates,8513.0,23.424076,53.847818
United States,39871.0,37.09024,-95.712891


In [45]:
missing_data = df_country_locations[df_country_locations.isnull().any(axis=1)]
missing_data

Unnamed: 0_level_0,total,latitude,longitude
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1


### Create Map

In [46]:
UK_travel_map = folium.Map(location=[55.3781, -3.4360],zoom_start=2)
UK_travel_map

In [47]:
travel_totals = folium.map.FeatureGroup()

for lat, lng, label in zip(df_country_locations.latitude, df_country_locations.longitude, df_country_locations.index + ": " + df_country_locations.total.map(str)):
    travel_totals.add_child(
        folium.features.CircleMarker(
            [lat,lng],
            radius=5,
            color="yellow",
            fill=True,
            fill_color="blue",
            fill_opacity = 0.5,
            popup=label
        )
    )
UK_travel_map.add_child(travel_totals)