In [52]:
import pandas as pd
import numpy as np
import seaborn as sns
from sagemaker import get_execution_role
import matplotlib.pyplot as plt
import boto3

# Look up the execution role for this SageMaker session.
# `role` is not referenced again in the cells visible here.
role = get_execution_role()

# Google-Covid-19 Dataset (Hieu)

### 1a.) Preliminary Analysis

In [53]:
# Load the Google COVID-19 mobility report CSV from S3 into `mobility_df`.

# `input_bucket` holds the bucket name plus the key prefix and `in_data_key` the
# file name; together they form the s3:// URI handed to pandas.
input_bucket = 'wwcode-google-covid19-mobilityreports/google-covid-19-reports/dataset'
in_data_key = 'google-covid-19-reports.csv'
training_data_location = 's3://{}/{}'.format(input_bucket, in_data_key)

# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk. The recorded output of this cell ends with the tail
# of a warning that looks like pandas' DtypeWarning (mixed types in a column),
# which is what chunked inference produces on sparsely filled columns.
mobility_df = pd.read_csv(training_data_location, low_memory=False)
# NOTE(review): the message says "uploaded", but this cell only reads from S3.
print('uploaded training data from location: {}'.format(training_data_location))

  interactivity=interactivity, compiler=compiler, result=result)


uploaded training data from location: s3://wwcode-google-covid19-mobilityreports/google-covid-19-reports/dataset/google-covid-19-reports.csv


In [54]:
# Display the raw mobility frame (pandas truncates the rendering to the first and last rows).
mobility_df

Unnamed: 0,country_region_code,country_region,sub_region_1,sub_region_2,metro_area,iso_3166_2_code,census_fips_code,date,retail_and_recreation_percent_change_from_baseline,grocery_and_pharmacy_percent_change_from_baseline,parks_percent_change_from_baseline,transit_stations_percent_change_from_baseline,workplaces_percent_change_from_baseline,residential_percent_change_from_baseline
0,AE,United Arab Emirates,,,,,,2020-02-15,0.0,4.0,5.0,0.0,2.0,1.0
1,AE,United Arab Emirates,,,,,,2020-02-16,1.0,4.0,4.0,1.0,2.0,1.0
2,AE,United Arab Emirates,,,,,,2020-02-17,-1.0,1.0,5.0,1.0,2.0,1.0
3,AE,United Arab Emirates,,,,,,2020-02-18,-2.0,1.0,5.0,0.0,2.0,1.0
4,AE,United Arab Emirates,,,,,,2020-02-19,-2.0,0.0,4.0,-1.0,2.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1556213,ZW,Zimbabwe,Midlands Province,Kwekwe,,,,2020-07-27,,,,,-4.0,
1556214,ZW,Zimbabwe,Midlands Province,Kwekwe,,,,2020-07-28,,,,,0.0,
1556215,ZW,Zimbabwe,Midlands Province,Kwekwe,,,,2020-07-29,,,,,-18.0,
1556216,ZW,Zimbabwe,Midlands Province,Kwekwe,,,,2020-07-30,,,,,-12.0,


In [70]:
# Restrict the worldwide frame to the rows whose country_region is 'United States',
# then drop five code / finer-grained location columns.
# NOTE(review): once 'sub_region_2' is dropped, rows that differ only in that column
# can no longer be told apart within a sub_region_1 - confirm this is intended.
us_mobility_df = mobility_df[mobility_df['country_region'].eq('United States')]
us_necessary_columns = us_mobility_df.drop(
    ['country_region_code', 'sub_region_2', 'metro_area',
     'iso_3166_2_code', 'census_fips_code'],
    axis=1,
)
us_necessary_columns

Unnamed: 0,country_region,sub_region_1,date,retail_and_recreation_percent_change_from_baseline,grocery_and_pharmacy_percent_change_from_baseline,parks_percent_change_from_baseline,transit_stations_percent_change_from_baseline,workplaces_percent_change_from_baseline,residential_percent_change_from_baseline
1089802,United States,,2020-02-15,6.0,2.0,15.0,3.0,2.0,-1.0
1089803,United States,,2020-02-16,7.0,1.0,16.0,2.0,0.0,-1.0
1089804,United States,,2020-02-17,6.0,0.0,28.0,-9.0,-24.0,5.0
1089805,United States,,2020-02-18,0.0,-1.0,6.0,1.0,0.0,1.0
1089806,United States,,2020-02-19,2.0,0.0,8.0,1.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...
1536532,United States,Wyoming,2020-07-27,,,,,-29.0,
1536533,United States,Wyoming,2020-07-28,,,,,-24.0,
1536534,United States,Wyoming,2020-07-29,,,,,-29.0,
1536535,United States,Wyoming,2020-07-30,,,,,-29.0,


In [85]:
# Keep only the rows in which every remaining column has a value
# (axis=0 / how='any' are dropna's defaults, spelled out here).
# In the recorded outputs this also removes the rows whose sub_region_1 is empty,
# and, despite its name, `us_na` ends up holding no missing values.
us_na = us_necessary_columns.dropna(axis=0, how='any')
us_na

Unnamed: 0,country_region,sub_region_1,date,retail_and_recreation_percent_change_from_baseline,grocery_and_pharmacy_percent_change_from_baseline,parks_percent_change_from_baseline,transit_stations_percent_change_from_baseline,workplaces_percent_change_from_baseline,residential_percent_change_from_baseline
1089970,United States,Alabama,2020-02-15,5.0,2.0,39.0,7.0,2.0,-1.0
1089971,United States,Alabama,2020-02-16,0.0,-2.0,-7.0,3.0,-1.0,1.0
1089972,United States,Alabama,2020-02-17,3.0,0.0,17.0,7.0,-17.0,4.0
1089973,United States,Alabama,2020-02-18,-4.0,-3.0,-11.0,-1.0,1.0,2.0
1089974,United States,Alabama,2020-02-19,4.0,1.0,6.0,4.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...
1535172,United States,Wyoming,2020-07-22,6.0,23.0,111.0,2.0,-26.0,5.0
1535173,United States,Wyoming,2020-07-23,7.0,19.0,108.0,17.0,-26.0,5.0
1535174,United States,Wyoming,2020-07-24,-3.0,15.0,116.0,6.0,-28.0,5.0
1535179,United States,Wyoming,2020-07-29,5.0,10.0,96.0,10.0,-27.0,5.0


In [87]:
# Summarise us_na: number of rows, plus each column's non-null count and dtype.
us_na.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 84080 entries, 1089970 to 1535181
Data columns (total 9 columns):
 #   Column                                              Non-Null Count  Dtype  
---  ------                                              --------------  -----  
 0   country_region                                      84080 non-null  object 
 1   sub_region_1                                        84080 non-null  object 
 2   date                                                84080 non-null  object 
 3   retail_and_recreation_percent_change_from_baseline  84080 non-null  float64
 4   grocery_and_pharmacy_percent_change_from_baseline   84080 non-null  float64
 5   parks_percent_change_from_baseline                  84080 non-null  float64
 6   transit_stations_percent_change_from_baseline       84080 non-null  float64
 7   workplaces_percent_change_from_baseline             84080 non-null  float64
 8   residential_percent_change_from_baseline            84080 non-null  f

#### Baseline changes

In [97]:
# Build `us_dataset` from `us_na`:
#   * reset_index() replaces the row labels with 0..n-1 and keeps the old
#     labels in a new `index` column;
#   * the six "*_percent_change_from_baseline" columns get the shorter
#     "*_per_chg" names.
# DataFrame.rename returns a new frame rather than modifying the caller, so the
# result is assigned to `us_dataset`; otherwise the short names would only appear
# in this cell's output and `us_dataset` itself would keep the long ones.
us_dataset = us_na.reset_index().rename(
    columns={
        "retail_and_recreation_percent_change_from_baseline": "retail_and_recreation_per_chg",
        "grocery_and_pharmacy_percent_change_from_baseline": "grocery_and_pharmacy_per_chg",
        "parks_percent_change_from_baseline": "parks_per_chg",
        "transit_stations_percent_change_from_baseline": "transit_stations_per_chg",
        "workplaces_percent_change_from_baseline": "workplaces_per_chg",
        "residential_percent_change_from_baseline": "residential_per_chg",
    },
    # Fail loudly if any of the long names is missing instead of silently skipping it.
    errors="raise",
)
us_dataset

Unnamed: 0,index,country_region,sub_region_1,date,retail_and_recreation_per_chg,grocery_and_pharmacy_per_chg,parks_per_chg,transit_stations_per_chg,workplaces_per_chg,residential_per_chg
0,1089970,United States,Alabama,2020-02-15,5.0,2.0,39.0,7.0,2.0,-1.0
1,1089971,United States,Alabama,2020-02-16,0.0,-2.0,-7.0,3.0,-1.0,1.0
2,1089972,United States,Alabama,2020-02-17,3.0,0.0,17.0,7.0,-17.0,4.0
3,1089973,United States,Alabama,2020-02-18,-4.0,-3.0,-11.0,-1.0,1.0,2.0
4,1089974,United States,Alabama,2020-02-19,4.0,1.0,6.0,4.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...
84075,1535172,United States,Wyoming,2020-07-22,6.0,23.0,111.0,2.0,-26.0,5.0
84076,1535173,United States,Wyoming,2020-07-23,7.0,19.0,108.0,17.0,-26.0,5.0
84077,1535174,United States,Wyoming,2020-07-24,-3.0,15.0,116.0,6.0,-28.0,5.0
84078,1535179,United States,Wyoming,2020-07-29,5.0,10.0,96.0,10.0,-27.0,5.0


# Apple Mobility Dataset (Fiona)

In [107]:
# Load the Apple mobility trends CSV from S3 into `mobility_df3`.
# HTTPS form of the same bucket/key:
# https://wwcode-google-covid19-mobilityreports.s3.amazonaws.com/apple-mobility-trends-reports/dataset/apple-mobility-trends-reports.csv

# Bucket name plus key prefix, then the file name; joined into an s3:// URI below.
input_bucket1 = 'wwcode-google-covid19-mobilityreports/apple-mobility-trends-reports/dataset'
in_data_key1 = 'apple-mobility-trends-reports.csv'
training_data_location1 = 's3://{}/{}'.format(input_bucket1, in_data_key1)

mobility_df3= pd.read_csv(training_data_location1)
# NOTE(review): the message says "uploaded", but this cell only reads from S3.
print('uploaded training data from location: {}'.format(training_data_location1))

uploaded training data from location: s3://wwcode-google-covid19-mobilityreports/apple-mobility-trends-reports/dataset/apple-mobility-trends-reports.csv


In [108]:
# Display the Apple frame (pandas truncates the rendering in both rows and columns).
mobility_df3

Unnamed: 0,geo_type,region,transportation_type,alternative_name,sub-region,country,2020-01-13,2020-01-14,2020-01-15,2020-01-16,...,2020-07-24,2020-07-25,2020-07-26,2020-07-27,2020-07-28,2020-07-29,2020-07-30,2020-07-31,2020-08-01,2020-08-02
0,country/region,Albania,driving,,,,100.0,95.30,101.43,97.20,...,190.83,206.45,202.15,178.29,168.26,169.05,176.53,197.01,233.20,229.20
1,country/region,Albania,walking,,,,100.0,100.68,98.93,98.46,...,133.39,137.95,116.08,134.41,131.60,125.50,121.19,124.26,156.25,137.10
2,country/region,Argentina,driving,,,,100.0,97.07,102.45,111.21,...,67.02,63.70,37.80,53.91,56.84,58.40,59.01,64.67,64.46,37.16
3,country/region,Argentina,walking,,,,100.0,95.11,101.37,112.67,...,49.10,46.25,32.78,41.44,44.63,46.32,46.18,48.13,48.92,31.31
4,country/region,Australia,driving,AU,,,100.0,102.98,104.21,108.63,...,98.50,75.84,83.95,87.17,91.46,92.73,98.44,101.88,88.20,87.57
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3620,county,York County,driving,,Virginia,United States,100.0,100.96,111.03,111.97,...,218.71,265.48,224.18,187.51,185.69,185.50,185.70,213.00,231.95,194.21
3621,county,York County,driving,,Maine,United States,100.0,103.05,103.98,82.24,...,300.87,336.71,311.78,264.63,251.26,262.84,266.27,294.46,344.35,297.13
3622,county,Young County,driving,,Texas,United States,100.0,98.85,97.76,92.21,...,209.96,178.46,158.54,160.50,158.06,159.96,167.34,193.50,194.99,150.47
3623,county,Yuba County,driving,,California,United States,100.0,102.98,103.10,114.08,...,185.12,186.12,163.26,157.79,156.33,159.02,167.03,181.93,183.92,172.46


In [113]:
# List the distinct values of the 'sub-region' column (NaN is included in the result).
mobility_df3['sub-region'].unique()

array([nan, 'North Rhine-Westphalia', 'South Australia', 'Ohio',
       'New York', 'New Mexico', 'Pennsylvania', 'North Holland',
       'Alaska', 'Michigan', 'Maryland', 'California', 'Antwerp Province',
       'Attica', 'Georgia', 'Auckland Region', 'Bavaria', 'Texas',
       'Catalonia', 'Canton of Basel-Stadt', 'Northern Ireland',
       'Minas Gerais', 'Lombardy Region', 'Canton of Bern', 'Alabama',
       'England', 'Idaho', 'Emilia-Romagna Region', 'Aquitaine Region',
       'Massachusetts', 'Bremen (state)', 'Connecticut', 'Queensland',
       'Alberta', 'Western Cape', 'Wales', 'Sicily Region',
       'North Carolina', 'Illinois', 'Colorado', 'South Carolina',
       'Capital Region of Denmark', 'Paraná', 'Iowa', 'Saxony',
       'Scotland', 'North Brabant', 'Oregon', 'Tuscany Region', 'Indiana',
       'Ceará', 'Hesse', 'Shizuoka Prefecture', 'Fukuoka Prefecture',
       'Pomerania Province', 'Canton of Geneva', 'Goiás',
       'Västra Götaland County', 'Styria', 'Rhône-Alpe

# Tableau 

In [None]:
#published/PUBLIC/COVID-19-Activity/1596595915/COVID-19 Activity.csv
# NOTE(review): the path above names a "COVID-19 Activity.csv" file, but the bucket
# and key below point at the same Google mobility CSV that is already read into
# `mobility_df` - confirm which file this cell is meant to load.

# Bucket name plus key prefix, then the file name; joined into an s3:// URI below.
bucket = 'wwcode-google-covid19-mobilityreports/google-covid-19-reports/dataset'
data_key = 'google-covid-19-reports.csv'
data_location = 's3://{}/{}'.format(bucket, data_key)

df = pd.read_csv(data_location)
# NOTE(review): the message says "uploaded", but this cell only reads from S3.
print('uploaded training data from location: {}'.format(data_location))