<a href="https://colab.research.google.com/github/rgmartin/greece_tourism_project/blob/main/Copy_of_greece_tourism_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Greece tourism destination analysis



## 1. Web scraping Excel files

In [1]:
! pip install plotly --upgrade



In [2]:
#####Step 1: start by importing all of the necessary packages#####
import io #wrapping downloaded bytes in a file-like object
import os
import re #regexp string manipulation
import time #setting the speed at which the requests run
import urllib.request #requesting URLs

import matplotlib.pyplot as plt #for plotting the scraped data
import pandas as pd #for simplifying data operations (e.g. creating dataframe objects)
import requests #requesting URLs
from bs4 import BeautifulSoup #for web-scraping operations

In [3]:
#####Step 2: connect to the URL in question for scraping#####
url = 'https://insete.gr/perifereies/'
response = requests.get(url, timeout=30) #Connect to the URL using the "requests" package
response.raise_for_status() #stop here on a non-2xx answer instead of parsing an error page
#####Step 3: read in the URL via the "BeautifulSoup" package#####
soup = BeautifulSoup(response.text, 'html.parser')
#####Step 4: filter the HTML object for all link objects######
# href=True keeps only anchors that carry an href attribute. Without it,
# link.get('href') returns None for bare <a> tags and the 'xlsx' membership
# test raises a TypeError.
excel_links = [link['href'] for link in soup.find_all('a', href=True) if 'xlsx' in link['href']]

In [4]:
# List of keyword-argument dictionaries (sheet position, header rows, index columns).
# NOTE(review): no visible cell reads this list, and read_inbound_tourism builds
# its own parameters with header=[3] rather than [2, 3] — confirm whether this
# is still needed.
df_parameters = [
    dict(sheet_name=6, header=[2, 3], index_col=[0, 1]),
]

In [5]:
def extract_region_name(url):
  """Return only the letters of the file name that ``url`` points to.

  The last path component is taken, its extension is dropped, and every
  character for which ``str.isalpha`` is false (digits, dashes, underscores,
  ...) is removed.
  """
  stem, _extension = os.path.splitext(os.path.basename(url))
  letters_only = filter(str.isalpha, stem)
  return ''.join(letters_only)

In [6]:
def read_inbound_tourism(url):
  """Download one Excel workbook and return its 'Σύνολο' row as a Series.

  Parameters
  ----------
  url : str
      Address of the ``.xlsx`` workbook to download.

  Returns
  -------
  pandas.Series
      The first row found under the 'Σύνολο' ("Total" in Greek) label of the
      sheet at position 6, named with ``extract_region_name(url)``.

  Raises
  ------
  requests.HTTPError
      If the server answers with a non-2xx status.
  KeyError
      If the parsed sheet has no 'Σύνολο' entry in its first index level.
  """
  response = requests.get(url, timeout=30)
  # Fail loudly rather than handing an HTML error page to the Excel parser.
  response.raise_for_status()
  parameters = {'sheet_name': 6, 'header':[3],'index_col':[0,1]}
  # Recent pandas versions deprecate passing raw bytes to the Excel readers,
  # so wrap the payload in BytesIO; the context manager closes the workbook
  # handle once the sheet is parsed.
  with pd.ExcelFile(io.BytesIO(response.content)) as xl:
    df = xl.parse(**parameters)
  series = df.loc['Σύνολο'].iloc[0]
  # rename() returns a new Series instead of mutating a slice of df.
  return series.rename(extract_region_name(url))

In [7]:
# One Series per workbook link, placed side by side (one column each);
# the shared row index is then labelled 'Year'.
inbound_tourism_df = (
    pd.concat([read_inbound_tourism(link) for link in excel_links], axis=1)
    .rename_axis('Year')
)

In [8]:
# Display the row index of the combined frame (its name was set to 'Year' when the frame was built).
inbound_tourism_df.index

Int64Index([2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020], dtype='int64', name='Year')

In [9]:
# Order the columns by their value in the 2020 row, largest first.
# The name is rebound instead of using inplace=True, so the frame shown by
# earlier cells is not mutated behind the reader's back and the cell stays
# safe to re-run.
inbound_tourism_df = inbound_tourism_df.sort_values(by=2020, axis=1, ascending=False)
# Divide every row by its own total and scale to percent.
inbound_tourism_shares = inbound_tourism_df.div(inbound_tourism_df.sum(axis=1), axis=0) * 100
inbound_tourism_shares

Unnamed: 0_level_0,Crete,Attica,IonianIslands,CentralMacedonia,SouthAegean,Peloponnese,Thessaly,EasternMacedoniaThrace,CentralGreece,WesternGreece,Epirus,NorthAegean,WesternMacedonia
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2010,28.260664,27.00276,11.027154,11.641536,4.186582,4.278634,3.167388,1.594131,2.305475,3.513032,0.80728,1.936322,0.279043
2011,30.331411,25.424492,10.808777,11.862583,4.361663,3.957447,3.027026,1.780052,2.148323,3.180805,0.808558,2.008231,0.300633
2012,32.842341,22.338077,11.745774,13.605866,4.078317,3.24582,2.709035,2.007416,1.626473,2.608993,0.838479,2.058555,0.294854
2013,32.568554,22.518324,11.248665,13.480498,4.505238,2.916547,2.859138,2.420278,1.560248,2.557135,0.816664,2.272205,0.276506
2014,30.336098,25.405419,10.99124,12.653784,5.097656,3.209608,2.985335,2.413795,1.729735,1.948138,0.810574,2.229178,0.18944
2015,28.701265,25.171607,11.45937,12.944135,5.312086,3.131549,3.001081,2.603866,1.796053,2.508087,0.796729,2.417224,0.156948
2016,31.377831,23.809769,12.18671,12.594433,5.288574,3.272879,2.515891,2.529223,1.786255,1.850952,0.882655,1.768303,0.136526
2017,29.991054,23.850993,12.424858,12.115419,5.570214,3.813966,2.888588,2.433445,2.003214,1.900148,0.976197,1.878976,0.152926
2018,29.631345,21.649635,12.375822,11.636936,8.345736,4.154666,3.00382,2.187099,2.249706,1.838447,1.147458,1.633175,0.146155
2019,28.414317,21.487375,12.350683,11.649616,9.504552,4.071052,3.132644,2.215384,2.34578,1.827139,1.215154,1.637189,0.149115


In [12]:
# visualisation based on https://towardsdatascience.com/create-effective-data-visualizations-of-proportions-94b69ad34410
# ideas of visualisation https://www.politico.eu/article/europe-tourism-boom-time-overtourism-top-destinations/
import plotly.express as px
# Wide-form input: every column of the shares frame becomes one bar trace.
# Plotly names the melted axes 'value' and 'variable' by default, so relabel
# them and add a title so the figure can be read on its own.
fig = px.bar(
    inbound_tourism_shares,
    x=inbound_tourism_shares.index,
    y=inbound_tourism_shares.columns,
    title='Share of inbound tourism by region',
    labels={'value': 'Share (%)', 'variable': 'Region'},
)
fig.show()

In [None]:
# Exploration: list the sheet names of a single workbook.
url= 'https://insete.gr/wp-content/uploads/2020/05/21-04_Eastern_Macedonia__Thrace-3.xlsx'
data = requests.get(url, timeout=30).content
# Recent pandas versions deprecate raw bytes as Excel input; wrap them in BytesIO.
xl = pd.ExcelFile(io.BytesIO(data))
xl.sheet_names

## Relevant questions
* Has the Greek tourism industry been devastated by COVID restrictions?
* How was the distribution of visitors' countries of origin affected by COVID?
  - Plot a graph of shares of visitors from different countries (see https://www.kaggle.com/ceshine/impact-of-chinese-tourism-ban-to-taiwan)
* Analysis of seasonality
* Draw insights on how tourism has changed from different countries over time.
* Use other world datasets to draw the possible reasons/factors affecting tourism.
* Domestic arrivals vs. international arrivals vs. road arrivals

TODO: create a function that can read any specific data frame from the Excel documents.
- Compare the share of tourism for each region.