In [1]:

# https://github.com/cmu-delphi/epidatpy/blob/dev/docs/index.rst#api-keys

from epidatpy import EpiDataContext, EpiRange

# Build the API client with its default settings.
epidata = EpiDataContext()

# Query parameters: the JHU-CSSE cumulative confirmed-case signal, daily,
# for the whole US, over the range 2021-04-05 .. 2021-04-10.
covidcast_query = {
    "data_source": "jhu-csse",
    "signals": "confirmed_cumulative_num",
    "geo_type": "nation",
    "time_type": "day",
    "geo_values": "us",
    "time_values": EpiRange(20210405, 20210410),
}
apicall = epidata.pub_covidcast(**covidcast_query)

# .df() runs the query and returns the result as a DataFrame.
print(apicall.df())




     source                    signal geo_type geo_value time_type time_value  \
0  jhu-csse  confirmed_cumulative_num   nation        us       day 2021-04-05   
1  jhu-csse  confirmed_cumulative_num   nation        us       day 2021-04-06   
2  jhu-csse  confirmed_cumulative_num   nation        us       day 2021-04-07   
3  jhu-csse  confirmed_cumulative_num   nation        us       day 2021-04-08   
4  jhu-csse  confirmed_cumulative_num   nation        us       day 2021-04-09   
5  jhu-csse  confirmed_cumulative_num   nation        us       day 2021-04-10   

       issue  lag       value  stderr  sample_size  direction  missing_value  \
0 2023-03-10  704  30874278.0    <NA>         <NA>       <NA>              0   
1 2023-03-10  703  30937664.0    <NA>         <NA>       <NA>              0   
2 2023-03-10  702  31013399.0    <NA>         <NA>       <NA>              0   
3 2023-03-10  701  31093208.0    <NA>         <NA>       <NA>              0   
4 2023-03-10  700  31177602.0   

In [2]:
from epidatpy import CovidcastEpidata, EpiDataContext, EpiRange

# Re-create the client with caching switched on: with these arguments the
# results are cached on disk for 7 days, which avoids repeated downloads.
epidata = EpiDataContext(use_cache=True, cache_max_age_days=7)

# `pub_covidcast` only describes the request: it returns an `EpiDataCall`,
# a not-yet-executed query that can be printed and inspected.
date_range = EpiRange(20210405, 20210410)
apicall = epidata.pub_covidcast(
    data_source="jhu-csse",
    signals="confirmed_cumulative_num",
    geo_type="nation",
    time_type="day",
    geo_values="us",
    time_values=date_range,
)
print(apicall)

# Last expression of the cell: execute the query and display the DataFrame.
apicall.df()

EpiDataCall(endpoint=covidcast/, params={'data_source': 'jhu-csse', 'signals': 'confirmed_cumulative_num', 'geo_type': 'nation', 'time_type': 'day', 'geo_values': 'us', 'time_values': '20210405-20210410'})




Unnamed: 0,source,signal,geo_type,geo_value,time_type,time_value,issue,lag,value,stderr,sample_size,direction,missing_value,missing_stderr,missing_sample_size
0,jhu-csse,confirmed_cumulative_num,nation,us,day,2021-04-05,2023-03-10,704,30874278.0,,,,0,5,5
1,jhu-csse,confirmed_cumulative_num,nation,us,day,2021-04-06,2023-03-10,703,30937664.0,,,,0,5,5
2,jhu-csse,confirmed_cumulative_num,nation,us,day,2021-04-07,2023-03-10,702,31013399.0,,,,0,5,5
3,jhu-csse,confirmed_cumulative_num,nation,us,day,2021-04-08,2023-03-10,701,31093208.0,,,,0,5,5
4,jhu-csse,confirmed_cumulative_num,nation,us,day,2021-04-09,2023-03-10,700,31177602.0,,,,0,5,5
5,jhu-csse,confirmed_cumulative_num,nation,us,day,2021-04-10,2023-03-10,699,31247947.0,,,,0,5,5


In [3]:
# Signals to download, grouped by theme. The trailing comments give the
# short names used for these signals elsewhere in this notebook.

# Behaviour and beliefs.
behaviour_signals = (
    "smoothed_wwearing_mask_7d",           # mask
    "smoothed_wworried_catch_covid",       # worried_catch_covid
    "smoothed_wbelief_masking_effective",  # belief_masking_effective
)

# Sources from which COVID news was received.
received_news_signals = (
    "smoothed_wreceived_news_local_health",  # received_news_local_health
    "smoothed_wreceived_news_experts",
    "smoothed_wreceived_news_cdc",           # this is the CDC, not the WHO
    "smoothed_wreceived_news_govt_health",
    "smoothed_wreceived_news_politicians",
    "smoothed_wreceived_news_journalists",   # received_news_journalists
    "smoothed_wreceived_news_friends",
)

# Sources trusted for COVID information.
trust_signals = (
    "smoothed_wtrust_covid_info_doctors",
    "smoothed_wtrust_covid_info_experts",
    "smoothed_wtrust_covid_info_cdc",        # careful: this is the CDC, not the WHO
    "smoothed_wtrust_covid_info_govt_health",
    "smoothed_wtrust_covid_info_politicians",
    "smoothed_wtrust_covid_info_journalists",
    "smoothed_wtrust_covid_info_friends",
)

indicators = [*behaviour_signals, *received_news_signals, *trust_signals]

In [4]:
import os

# DataFrame.to_csv does not create missing parent folders, so make sure the
# output directory exists before the first file is written.
os.makedirs("Dataus", exist_ok=True)

# Download every signal listed in `indicators` and store one CSV per signal.
for indicator in indicators:
    # One fb-survey signal, daily, for all states ("*"), over the range
    # 2021-05-21 .. 2022-06-25.
    df = epidata.pub_covidcast(
        data_source="fb-survey",
        signals=indicator,
        geo_type="state",
        time_type="day",
        geo_values="*",
        time_values=EpiRange(20210521, 20220625),
    ).df()

    # Keep only the columns used later and order rows by state, then date.
    df = df[['signal', 'geo_value', 'time_value', 'value', 'sample_size']]
    df_sorted = df.sort_values(by=['geo_value', 'time_value'])

    # Rescale the values by 1/100.
    # NOTE(review): presumably converts percentages to proportions so they
    # match the world dataset merged in later - confirm the units.
    df_sorted['value'] = df_sorted['value'] / 100

    filename = f"Dataus/{indicator}.csv"
    df_sorted.to_csv(filename, index=False)



In [5]:
from functools import reduce
import pandas as pd
# Load each per-indicator CSV and rename its 'value' / 'sample_size' columns
# after the signal stored in the file, so the tables can sit side by side.
df_list = []
for indicator in indicators:
    table = pd.read_csv(f"Dataus/{indicator}.csv")
    signal = table['signal'].iloc[0]  # signal name taken from the first row
    new_names = {'value': signal, 'sample_size': f'sample_size_{signal}'}
    df_list.append(
        table[['geo_value', 'time_value', 'value', 'sample_size']].rename(columns=new_names)
    )


def _outer_join(left, right):
    """Outer-join two indicator tables on state and date."""
    return pd.merge(left, right, on=['geo_value', 'time_value'], how='outer')


# Fold all tables into one wide frame keyed by geo_value and time_value.
df_merged = reduce(_outer_join, df_list).sort_values(by=['geo_value', 'time_value'])

# Lookup from lowercase two-letter codes to full names: the 50 US states
# plus the District of Columbia.
_state_code_name_pairs = (
    ('al', 'Alabama'), ('ak', 'Alaska'), ('az', 'Arizona'),
    ('ar', 'Arkansas'), ('ca', 'California'), ('co', 'Colorado'),
    ('ct', 'Connecticut'), ('dc', 'District of Columbia'), ('de', 'Delaware'),
    ('fl', 'Florida'), ('ga', 'Georgia'), ('hi', 'Hawaii'),
    ('id', 'Idaho'), ('il', 'Illinois'), ('in', 'Indiana'),
    ('ia', 'Iowa'), ('ks', 'Kansas'), ('ky', 'Kentucky'),
    ('la', 'Louisiana'), ('me', 'Maine'), ('md', 'Maryland'),
    ('ma', 'Massachusetts'), ('mi', 'Michigan'), ('mn', 'Minnesota'),
    ('ms', 'Mississippi'), ('mo', 'Missouri'), ('mt', 'Montana'),
    ('ne', 'Nebraska'), ('nv', 'Nevada'), ('nh', 'New Hampshire'),
    ('nj', 'New Jersey'), ('nm', 'New Mexico'), ('ny', 'New York'),
    ('nc', 'North Carolina'), ('nd', 'North Dakota'), ('oh', 'Ohio'),
    ('ok', 'Oklahoma'), ('or', 'Oregon'), ('pa', 'Pennsylvania'),
    ('ri', 'Rhode Island'), ('sc', 'South Carolina'), ('sd', 'South Dakota'),
    ('tn', 'Tennessee'), ('tx', 'Texas'), ('ut', 'Utah'),
    ('vt', 'Vermont'), ('va', 'Virginia'), ('wa', 'Washington'),
    ('wv', 'West Virginia'), ('wi', 'Wisconsin'), ('wy', 'Wyoming'),
)
us_state_abbrev_lower = dict(_state_code_name_pairs)

# Replace each state code with its full name; a value with no entry in the
# lookup is kept unchanged.
df_merged['geo_value'] = df_merged['geo_value'].map(
    lambda code: us_state_abbrev_lower.get(code, code)
)

# Single file holding all the US indicators together.
df_merged.to_csv("Dataus/datiusa_unificati.csv", index=False)

# Report how many distinct states ended up in the merged table, and which.
num_unici = df_merged['geo_value'].nunique()
valori_unici = df_merged['geo_value'].unique()
print(f"Numero di valori unici in 'geo_value': {num_unici}")
print("Valori unici in 'geo_value':")
print(valori_unici)


Numero di valori unici in 'geo_value': 51
Valori unici in 'geo_value':
['Alaska' 'Alabama' 'Arkansas' 'Arizona' 'California' 'Colorado'
 'Connecticut' 'District of Columbia' 'Delaware' 'Florida' 'Georgia'
 'Hawaii' 'Iowa' 'Idaho' 'Illinois' 'Indiana' 'Kansas' 'Kentucky'
 'Louisiana' 'Massachusetts' 'Maryland' 'Maine' 'Michigan' 'Minnesota'
 'Missouri' 'Mississippi' 'Montana' 'North Carolina' 'North Dakota'
 'Nebraska' 'New Hampshire' 'New Jersey' 'New Mexico' 'Nevada' 'New York'
 'Ohio' 'Oklahoma' 'Oregon' 'Pennsylvania' 'Rhode Island' 'South Carolina'
 'South Dakota' 'Tennessee' 'Texas' 'Utah' 'Virginia' 'Vermont'
 'Washington' 'Wisconsin' 'West Virginia' 'Wyoming']


## Temporal coverage and continuity


In [6]:
# Non-null observations per state for every indicator column at once.
per_state_counts = df_merged.groupby('geo_value')[indicators].count()

# A state is flagged when at least one indicator has fewer than 300 values.
too_few = (per_state_counts < 300).any(axis=1)
paesimeno300 = set(per_state_counts.index[too_few])

print("Paesi che hanno meno di 300 righe in almeno un indicaore:")
print(sorted(paesimeno300))

Paesi che hanno meno di 300 righe in almeno un indicaore:
['District of Columbia']


In [7]:
# Parse the dates so that differences between them can be measured in days.
df_merged['time_value'] = pd.to_datetime(df_merged['time_value'])

# Collect every state where some indicator has more than 7 days between two
# consecutive non-missing observations.
paesibuchi7giorni = set()
for col in indicators:
    by_state = df_merged[['geo_value', 'time_value', col]].groupby('geo_value')
    for state, rows in by_state:
        # Dates on which this indicator has a value, in chronological order.
        observed_dates = rows.dropna(subset=[col])['time_value'].sort_values()
        # Whole days elapsed since the previous observed date.
        day_gaps = observed_dates.diff().dt.days
        if day_gaps.gt(7).any():
            paesibuchi7giorni.add(state)

print("Paesi con buchi > 7 giorni:")
print(sorted(paesibuchi7giorni))

Paesi con buchi > 7 giorni:
['District of Columbia', 'Wyoming']


In [8]:
# States flagged by either check (paesimeno300 or paesibuchi7giorni) are discarded.
paesidascartare = list(paesimeno300 | paesibuchi7giorni)

# Keep only the rows whose state is not in the discard list.
keep_rows = ~df_merged['geo_value'].isin(paesidascartare)
df_merged2 = df_merged.loc[keep_rows].copy()

df_merged2.to_csv("Dataus/dati1.csv", index=False)


## Merge the US data with the world data

In [9]:
import pandas as pd
# Load the two tables that will be stacked and list their columns, so the
# names can be compared before renaming.
df1 = pd.read_csv('csv/FULL_DATA1.csv')
df2 = pd.read_csv('Dataus/dati1.csv')
for label, frame in (("Colonne csv1:", df1), ("Colonne csv2:", df2)):
    print(label, frame.columns.tolist())

Colonne csv1: ['country', 'survey_date', 'sample_size_mask', 'pct_mask', 'sample_size_worried_catch_covid', 'pct_worried_catch_covid', 'sample_size_belief_masking_effective', 'pct_belief_masking_effective', 'sample_size_received_news_local_health', 'pct_received_news_local_health', 'sample_size_received_news_experts', 'pct_received_news_experts', 'sample_size_received_news_who', 'pct_received_news_who', 'sample_size_received_news_govt_health', 'pct_received_news_govt_health', 'sample_size_received_news_politicians', 'pct_received_news_politicians', 'sample_size_received_news_journalists', 'pct_received_news_journalists', 'sample_size_received_news_friends', 'pct_received_news_friends', 'sample_size_trust_covid_info_local_health', 'pct_trust_covid_info_local_health', 'sample_size_trust_covid_info_experts', 'pct_trust_covid_info_experts', 'sample_size_trust_covid_info_who', 'pct_trust_covid_info_who', 'sample_size_trust_covid_info_govt_health', 'pct_trust_covid_info_govt_health', 'sample

In [10]:

# Column-name mapping used to make the US columns match the other table.
# Each pair is (US signal name without its "smoothed_w" prefix, target name
# without its "pct_" / "sample_size_" prefix).
# Note that the two "cdc" signals are mapped onto the "who" columns and
# "trust_covid_info_doctors" onto "trust_covid_info_local_health".
_us_to_world_names = (
    ('wearing_mask_7d', 'mask'),
    ('worried_catch_covid', 'worried_catch_covid'),
    ('belief_masking_effective', 'belief_masking_effective'),

    ('received_news_local_health', 'received_news_local_health'),
    ('received_news_experts', 'received_news_experts'),
    ('received_news_cdc', 'received_news_who'),
    ('received_news_govt_health', 'received_news_govt_health'),
    ('received_news_politicians', 'received_news_politicians'),
    ('received_news_journalists', 'received_news_journalists'),
    ('received_news_friends', 'received_news_friends'),

    ('trust_covid_info_doctors', 'trust_covid_info_local_health'),
    ('trust_covid_info_experts', 'trust_covid_info_experts'),
    ('trust_covid_info_cdc', 'trust_covid_info_who'),
    ('trust_covid_info_govt_health', 'trust_covid_info_govt_health'),
    ('trust_covid_info_politicians', 'trust_covid_info_politicians'),
    ('trust_covid_info_journalists', 'trust_covid_info_journalists'),
    ('trust_covid_info_friends', 'trust_covid_info_friends'),
)

# The key columns come first, then a sample-size / value pair per signal.
rename_map = {'geo_value': 'country', 'time_value': 'survey_date'}
for _us_name, _world_name in _us_to_world_names:
    rename_map[f'sample_size_smoothed_w{_us_name}'] = f'sample_size_{_world_name}'
    rename_map[f'smoothed_w{_us_name}'] = f'pct_{_world_name}'

# Rename the csv2 columns, then select exactly df1's columns in df1's order
# (a column of df1 that is missing from df2 raises a KeyError here).
df2_renamed = df2.rename(columns=rename_map)
df2_aligned = df2_renamed.loc[:, df1.columns]

# Stack the two tables under a fresh 0..n-1 index and save the result.
df_finale = pd.concat([df1, df2_aligned], axis=0, ignore_index=True)
df_finale.to_csv("csv/usa_mondo.csv", index=False)

# Show which countries / states the combined table contains.
print(df_finale['country'].unique())

['Argentina' 'Australia' 'Austria' 'Belgium' 'Brazil' 'Canada' 'Chile'
 'Colombia' 'Czech Republic' 'Denmark' 'Ecuador' 'Egypt' 'Finland'
 'France' 'Germany' 'Greece' 'Hungary' 'India' 'Indonesia' 'Italy' 'Japan'
 'Malaysia' 'Mexico' 'Netherlands' 'New Zealand' 'Norway' 'Peru'
 'Philippines' 'Poland' 'Portugal' 'Romania' 'Spain' 'Sweden'
 'Switzerland' 'Taiwan' 'Thailand' 'Turkey' 'Ukraine' 'United Kingdom'
 'Venezuela' 'Vietnam' 'Alaska' 'Alabama' 'Arkansas' 'Arizona'
 'California' 'Colorado' 'Connecticut' 'Delaware' 'Florida' 'Georgia'
 'Hawaii' 'Iowa' 'Idaho' 'Illinois' 'Indiana' 'Kansas' 'Kentucky'
 'Louisiana' 'Massachusetts' 'Maryland' 'Maine' 'Michigan' 'Minnesota'
 'Missouri' 'Mississippi' 'Montana' 'North Carolina' 'North Dakota'
 'Nebraska' 'New Hampshire' 'New Jersey' 'New Mexico' 'Nevada' 'New York'
 'Ohio' 'Oklahoma' 'Oregon' 'Pennsylvania' 'Rhode Island' 'South Carolina'
 'South Dakota' 'Tennessee' 'Texas' 'Utah' 'Virginia' 'Vermont'
 'Washington' 'Wisconsin' 'West Virgin