In [109]:
#dependencies
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as stats
from matplotlib.lines import Line2D

In [110]:
# Read the state lookup table (columns shown below: State, Abbrev, Code)
# and preview its last rows.
path = '../google_trends/statesAbbrev.csv'
statesAbbrev_df = pd.read_csv(path)
statesAbbrev_df.tail(12)

Unnamed: 0,State,Abbrev,Code
39,Rhode Island,R.I.,RI
40,South Carolina,S.C.,SC
41,South Dakota,S.D.,SD
42,Tennessee,Tenn.,TN
43,Texas,Tex.,TX
44,Utah,Utah,UT
45,Vermont,Vt.,VT
46,Virginia,Va.,VA
47,Washington,Wash.,WA
48,West Virginia,W.Va.,WV


In [111]:
# Two-letter state codes as a plain Python list, in table order.
state_codes = statesAbbrev_df['Code'].tolist()

In [112]:
# Full state names as a plain Python list, in table order.
state_names = statesAbbrev_df['State'].tolist()

In [113]:
# Build states_df_dict: one '<CODE>_data_df' key per state at positions 40-50
# of state_codes. Each value is a one-element list holding that state's frame,
# so later cells can append derived results after it.
states_df_dict = {}
for idx in range(40, 51):
    code = state_codes[idx]
    path = f'../google_trends/google_trends_csvs/state_csvs/US-{code}_cat_data.csv'
    states_df_dict[f'{code}_data_df'] = [pd.read_csv(path)]

In [114]:
# Inspect the dict keys: one '<CODE>_data_df' entry per loaded state.
states_df_dict.keys()

dict_keys(['SC_data_df', 'SD_data_df', 'TN_data_df', 'TX_data_df', 'UT_data_df', 'VT_data_df', 'VA_data_df', 'WA_data_df', 'WV_data_df', 'WI_data_df', 'WY_data_df'])

In [115]:
# Sanity check: print the code stored at position 40, then preview the first
# ten rows of that state's raw frame (index 0 of its dict entry).
sample_code = state_codes[40]
print(sample_code)
states_df_dict[f'{sample_code}_data_df'][0].head(10)

SC


Unnamed: 0,time,arts_entertainment,autos,beauty_fitness,books_lit,action_adventure,campaigns_elections,celebs,discrimination,entertainment_media,...,mobiles,online_vids,scifi_fantasy,sport_news,tv_shows,voice_vid_chat,weather,covid_cases,stay_at_home,mass_gathering_ban
0,2019-01-06,96,78,70,87,68,0,65,22,68,...,60,92,66,80,91,37,59,,False,False
1,2019-01-13,95,76,73,87,74,0,66,10,83,...,61,87,54,48,79,35,68,,False,False
2,2019-01-20,88,80,71,81,69,0,61,18,73,...,62,83,55,54,89,41,68,,False,False
3,2019-01-27,93,76,70,83,71,0,66,17,80,...,61,94,43,44,83,48,69,,False,False
4,2019-02-03,92,78,72,88,65,0,61,26,60,...,61,81,42,73,89,58,57,,False,False
5,2019-02-10,95,80,68,92,58,0,63,20,72,...,64,91,51,47,78,43,70,,False,False
6,2019-02-17,95,82,76,90,66,0,78,23,67,...,67,91,40,40,83,53,80,,False,False
7,2019-02-24,100,84,73,94,64,0,69,25,68,...,64,87,55,45,86,70,74,,False,False
8,2019-03-03,97,82,72,93,63,0,59,46,75,...,70,89,51,39,91,52,76,,False,False
9,2019-03-10,93,83,68,86,55,0,68,22,66,...,64,84,46,63,84,55,58,,False,False


In [116]:
# All column names, taken from the frame of the state at position 40.
col_names = states_df_dict[f'{state_codes[40]}_data_df'][0].columns.tolist()

In [117]:
# Category columns = every column except the time stamp and the covid / policy columns.
non_category_cols = {'time', 'covid_cases', 'stay_at_home', 'mass_gathering_ban'}
cat_names = [col for col in col_names if col not in non_category_cols]

In [118]:
# Values of the 'time' column, taken from the frame of the state at position 40.
times_list = states_df_dict[f'{state_codes[40]}_data_df'][0]['time'].tolist()

In [119]:
# Reset every state's entry to just its raw frame (index 0), discarding any
# extra items appended by earlier runs of means_comp, then print the state
# code and the remaining list length as a check.
for idx in range(40, 51):
    code = state_codes[idx]
    entry = states_df_dict[f'{code}_data_df']
    # Slice deletion removes all extras in one step. Deleting index by index
    # while the list shrinks skips items and then raises IndexError.
    del entry[1:]
    print(code)
    print(len(entry))

SC
1
SD
1
TN
1
TX
1
UT
1
VT
1
VA
1
WA
1
WV
1
WI
1
WY
1


In [120]:
# For one state: compare each category's 2019 vs 2020 values with an
# independent (unequal-variance) t-test, flag whether the p-value is
# significant, record each year's standard deviation, and store the results
# back into states_df_dict.
def _signif_label(p_val):
    """Map a p-value to 'Yes' (<= 0.05), 'No' (> 0.05) or 'n/a' (NaN)."""
    if pd.isna(p_val):
        return 'n/a'
    return 'Yes' if p_val <= 0.05 else 'No'


def means_comp(ind_no):
    """Build the 2019-vs-2020 comparison for one state and store it.

    Parameters
    ----------
    ind_no : int
        Position of the state in ``state_codes``; selects the
        ``'<CODE>_data_df'`` entry of ``states_df_dict``.

    Returns
    -------
    None
        The state's list in ``states_df_dict`` is updated in place:
        index 1 becomes the comparison DataFrame (indexed by category) and
        index 2 becomes a list of ``(category, p_value)`` tuples for the rows
        flagged ``'Yes'``. Calling the function again replaces these two
        items instead of appending further copies.
    """
    state_entry = states_df_dict[f'{state_codes[ind_no]}_data_df']
    all_data_df = state_entry[0]

    # The split is positional: the first 52 rows are treated as 2019 and the
    # remaining rows as 2020. Only the category columns are kept, so the
    # non-numeric 'time' column and the covid/policy columns never reach
    # mean()/std().
    data_2019_df = all_data_df.iloc[:52][cat_names]
    data_2020_df = all_data_df.iloc[52:][cat_names]

    # equal_var=False selects the unequal-variance form of the test.
    p_vals = pd.Series(
        [
            stats.ttest_ind(data_2019_df[col], data_2020_df[col], equal_var=False).pvalue
            for col in cat_names
        ],
        index=cat_names,
        dtype=float,
    )

    # Every column is aligned on the category name rather than on list
    # position, so no placeholder rows are needed and numeric columns stay
    # numeric.
    comparison_df = pd.DataFrame(
        {
            '2019_means': data_2019_df.mean(),
            '2020_means': data_2020_df.mean(),
            'p_vals': p_vals,
            'stats_signif': [_signif_label(p_val) for p_val in p_vals],
            '2019_stdev': data_2019_df.std(ddof=0),
            '2020_stdev': data_2020_df.std(ddof=0),
        },
        index=pd.Index(cat_names, name='categories'),
    )

    # Fixed display order for the rows; a listed category that is missing
    # from the data becomes an all-NaN row.
    new_index = ['action_adventure', 'scifi_fantasy', 'tv_shows', 'online_vids', 'mobiles', 'voice_vid_chat', 'infectious_diseases', 'health_news', 'games_systems_consoles', 'arts_entertainment', 'books_lit', 'lottos', 'celebs', 'entertainment_media', 'campaigns_elections', 'discrimination', 'law_enf', 'weather', 'autos', 'beauty_fitness', 'sport_news']
    comparison_df = comparison_df.reindex(new_index)
    comparison_df['diff_btwn_2019_2020_stdevs'] = comparison_df['2020_stdev'] - comparison_df['2019_stdev']

    signif_rows = comparison_df[comparison_df['stats_signif'] == 'Yes']
    signif_cats = [(category, float(p_val)) for category, p_val in signif_rows['p_vals'].items()]

    # Replace anything after the raw frame so re-running stays idempotent.
    state_entry[1:] = [comparison_df, signif_cats]

In [121]:
# Run means_comp for every state at positions 40-50 of state_codes.
for idx in range(40, 51):
    means_comp(idx)

In [122]:
# Display the comparison table (index 1 of the entry) for the state at position 47 of state_codes.
states_df_dict[f'{state_codes[47]}_data_df'][1]

Unnamed: 0_level_0,2019_means,2020_means,p_vals,stats_signif,2019_stdev,2020_stdev,diff_btwn_2019_2020_stdevs
categories,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
action_adventure,70.403846,69.176471,0.355167,No,6.11826,7.11173,0.993473
scifi_fantasy,52.576923,57.352941,0.0705694,No,15.847,9.68893,-6.15811
tv_shows,83.942308,81.745098,0.0609781,No,5.98849,5.66324,-0.325254
online_vids,74.365385,74.666667,0.802566,No,3.63227,7.68923,4.05695
mobiles,60.692308,55.784314,0.000584717,Yes,8.19366,5.36625,-2.82741
voice_vid_chat,53.057692,48.686275,0.100608,No,10.7836,15.2695,4.4859
infectious_diseases,10.057692,26.0,2.17457e-06,Yes,1.89547,21.0182,19.1227
health_news,54.634615,58.647059,0.126503,No,10.9913,14.8361,3.84482
games_systems_consoles,54.211538,63.470588,4.10685e-05,Yes,6.74062,13.4344,6.69382
arts_entertainment,93.096154,87.784314,1.50535e-09,Yes,3.31813,4.52147,1.20334


In [106]:
# For each state, print its code followed by the four comparison rows with the smallest p-values.
for idx in range(40, 51):
    code = state_codes[idx]
    print(code)
    comparison = states_df_dict[f'{code}_data_df'][1]
    print(comparison.sort_values(by='p_vals').head(4))

SC
                        2019_means  2020_means       p_vals stats_signif  \
categories                                                                 
sport_news               57.038462   41.254902  1.20608e-10          Yes   
online_vids              86.750000   78.235294  8.08299e-10          Yes   
games_systems_consoles   48.730769   60.156863  2.25364e-06          Yes   
lottos                   65.230769   57.627451  5.16424e-06          Yes   

                       2019_stdev 2020_stdev  
categories                                    
sport_news                10.6228    11.4238  
online_vids                4.4716    7.44282  
games_systems_consoles    6.61035     14.249  
lottos                    9.01839    6.61834  
SD
                     2019_means  2020_means       p_vals stats_signif  \
categories                                                              
infectious_diseases    9.653846   24.274510  3.68499e-06          Yes   
sport_news            53.250000   38

In [107]:
# For each state, print its code and the stored list of (category, p-value) pairs flagged as significant.
for idx in range(40, 51):
    code = state_codes[idx]
    print(code)
    print(states_df_dict[f'{code}_data_df'][2])

SC
[('action_adventure', 0.037213368887122865), ('scifi_fantasy', 0.020561479597784815), ('tv_shows', 0.001723158935242587), ('online_vids', 8.082991587792804e-10), ('infectious_diseases', 8.313448149762176e-06), ('health_news', 8.172355066885259e-05), ('games_systems_consoles', 2.2536372559833473e-06), ('arts_entertainment', 1.1386869137802196e-05), ('books_lit', 0.0036629705074250007), ('lottos', 5.164235097920391e-06), ('celebs', 2.0464965258284322e-05), ('entertainment_media', 0.0409192335375911), ('discrimination', 0.0006658984716343508), ('sport_news', 1.2060797821829332e-10)]
SD
[('action_adventure', 0.02517127820578754), ('scifi_fantasy', 0.00842259325095429), ('tv_shows', 0.012613457326598672), ('online_vids', 0.027560319692913875), ('infectious_diseases', 3.6849899850973405e-06), ('arts_entertainment', 0.0026597091290213636), ('books_lit', 0.0005535286715042026), ('lottos', 0.0005213150936593188), ('celebs', 0.0007263319652055333), ('discrimination', 0.01967980766116836), ('w

In [108]:
# Count, across the states at positions 40-50, how often each category is
# flagged as statistically significant.
# Categories significant in every state suggest an effect of the pandemic;
# categories significant in only a few states could reflect differences
# between states.
stats_count_list = []
for idx in range(40, 51):
    comparison = states_df_dict[f'{state_codes[idx]}_data_df'][1]
    flagged = comparison[comparison['stats_signif'] == 'Yes']
    stats_count_list.extend(flagged.index)
pd.Series(stats_count_list).value_counts()

books_lit                 11
lottos                    11
celebs                    11
infectious_diseases       11
sport_news                11
discrimination            11
arts_entertainment        10
games_systems_consoles    10
online_vids                9
scifi_fantasy              6
mobiles                    6
tv_shows                   6
health_news                6
autos                      5
beauty_fitness             4
weather                    4
action_adventure           3
voice_vid_chat             2
entertainment_media        2
dtype: int64

#### Guide to indices w/in states_df_dict:

index 0 is a df containing all the data for a state (the category values, the covid cases, and the stay-at-home order / mass-gathering-ban flags)

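index 1 is a df showing, for each category, the mean popularity rating in 2019 and in 2020, the p-value of an independent t-test between the two years, whether that p-value is significant (p <= 0.05), each year's standard deviation, and the difference between the two standard deviations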

index 2 is a list of all the categories with a significant change in popularity, each paired with its p-value