In [45]:
#dependencies
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as stats
from matplotlib.lines import Line2D

In [46]:
#loading the lookup table of state names, abbreviations and two-letter codes
path = '../google_trends/statesAbbrev.csv'
statesAbbrev_df = pd.read_csv(path)
#showing the last 12 rows to confirm the file was read through to the end
statesAbbrev_df.iloc[-12:]

Unnamed: 0,State,Abbrev,Code
39,Rhode Island,R.I.,RI
40,South Carolina,S.C.,SC
41,South Dakota,S.D.,SD
42,Tennessee,Tenn.,TN
43,Texas,Tex.,TX
44,Utah,Utah,UT
45,Vermont,Vt.,VT
46,Virginia,Va.,VA
47,Washington,Wash.,WA
48,West Virginia,W.Va.,WV


In [47]:
#two-letter state codes, in the same row order as statesAbbrev_df
state_codes = statesAbbrev_df['Code'].tolist()

In [48]:
#full state names, in the same row order as statesAbbrev_df
state_names = statesAbbrev_df['State'].tolist()

In [49]:
#building the dict of per-state data: each key is '<state code>_data_df' and
#each value is a list whose first item (index 0) is the df read from that state's csv
states_df_dict = {}
for x in (4, 50):
    code = state_codes[x]
    path = f'../google_trends/google_trends_csvs/state_csvs/US-{code}_cat_data.csv'
    states_df_dict[f'{code}_data_df'] = [pd.read_csv(path)]

In [50]:
#inspecting the keys (ie df names); each key has the form '<state code>_data_df'
states_df_dict.keys()

dict_keys(['CA_data_df', 'WY_data_df'])

In [51]:
#sanity check: print the code stored at position 4, then preview the first
#10 rows of that state's raw df (index 0 of its list in states_df_dict)
print(state_codes[4])
states_df_dict[f'{state_codes[4]}_data_df'][0].iloc[:10]

CA


Unnamed: 0,time,arts_entertainment,autos,beauty_fitness,books_lit,action_adventure,campaigns_elections,celebs,discrimination,entertainment_media,...,mobiles,online_vids,scifi_fantasy,sport_news,tv_shows,voice_vid_chat,weather,covid_cases,stay_at_home,mass_gathering_ban
0,2019-01-06,98,82,95,80,70,0,71,24,86,...,53,84,56,87,92,64,66,,False,False
1,2019-01-13,96,81,89,82,72,0,66,27,78,...,52,86,56,74,88,59,82,,False,False
2,2019-01-20,88,78,90,79,67,0,58,29,74,...,50,83,54,71,87,51,37,,False,False
3,2019-01-27,93,78,89,82,65,0,63,32,70,...,52,82,55,74,85,67,75,,False,False
4,2019-02-03,89,79,88,81,61,0,60,25,67,...,49,81,51,73,85,62,88,,False,False
5,2019-02-10,96,78,84,84,59,0,62,31,71,...,49,83,58,63,87,60,100,,False,False
6,2019-02-17,96,83,88,79,66,0,78,30,73,...,54,79,54,65,85,67,72,,False,False
7,2019-02-24,99,82,86,90,61,0,70,35,69,...,53,80,53,68,92,63,66,,False,False
8,2019-03-03,100,86,92,88,63,0,58,77,71,...,51,80,55,76,100,73,76,,False,False
9,2019-03-10,94,83,90,82,60,0,62,29,70,...,49,80,52,90,90,64,46,,False,False


In [52]:
#raw df (list index 0) for the state at position 4 of state_codes
ca_df = states_df_dict[state_codes[4] + '_data_df'][0]

In [53]:
#raw df (list index 0) for the state at position 50 of state_codes
wy_df = states_df_dict[state_codes[50] + '_data_df'][0]

In [54]:
#every column label of the CA df, in column order
col_names = ca_df.columns.tolist()

In [55]:
#the category columns are whatever is left once the time stamp and the
#covid / policy columns are filtered out of col_names
non_category_cols = ('time', 'covid_cases', 'stay_at_home', 'mass_gathering_ban')
cat_names = [name for name in col_names if name not in non_category_cols]

In [56]:
#values of the 'time' column of the CA df, as a plain list
times_list = ca_df['time'].tolist()

In [57]:
#making sure there is only 1 df w/in the dict per state (in case extra
#entries were appended after the raw df and need to be cleared)
for x in (4, 50):
    state_entries = states_df_dict[f'{state_codes[x]}_data_df']
    #slice deletion drops everything after index 0 in a single step; deleting
    #one index at a time while walking forward shrinks the list under the loop,
    #which skips items and raises IndexError once 3+ entries are present
    del state_entries[1:]
    print(state_codes[x])
    print(len(state_entries))

CA
1
WY
1


In [58]:
#applying independent t-tests between each year's mean (2019, 2020),
#whether the p-val is signif, the 2019 and 2020 stdevs
#storing the resulting df (and the list of signif cats) alongside the raw df
def means_comp(ind_no):
    """Build the 2019-vs-2020 comparison for one state and store it.

    For every column named in ``cat_names`` this runs an independent t-test
    with ``equal_var=False`` between the first 52 rows of the state's raw df
    (treated as 2019) and the remaining rows (treated as 2020), and records
    each period's mean and standard deviation (``ddof=0``).

    Parameters
    ----------
    ind_no : int
        Position of the state in ``state_codes``.

    Returns
    -------
    None
        The results are written into ``states_df_dict['<code>_data_df']``:
        index 0 keeps the raw df, index 1 becomes the comparison df (indexed
        by category) and index 2 becomes a list of ``(category, p_value)``
        tuples for the categories flagged 'Yes' (p <= 0.05).

    Notes
    -----
    Calling this again for the same state overwrites indices 1 and 2 instead
    of appending further entries, so re-running the cell is safe.
    """
    state_entries = states_df_dict[f'{state_codes[ind_no]}_data_df']
    all_data_df = state_entries[0]

    # Keep only the category columns. Averaging the full frame would pull in
    # the string 'time' column (a TypeError on pandas >= 2.0) and the covid /
    # policy columns, which previously had to be removed by hard-coded position.
    data_2019_df = all_data_df.iloc[:52][cat_names]
    data_2020_df = all_data_df.iloc[52:][cat_names]

    p_vals = [
        float(stats.ttest_ind(data_2019_df[col], data_2020_df[col], equal_var=False)[1])
        for col in cat_names
    ]

    signif_list = []
    for p_val in p_vals:
        if pd.isna(p_val):
            # the test is undefined (e.g. a column that is constant in both
            # periods); label it explicitly so the column lengths still match
            signif_list.append('n/a')
        elif p_val <= 0.05:
            signif_list.append('Yes')
        else:
            signif_list.append('No')

    comparison_df = pd.DataFrame(
        {
            '2019_means': data_2019_df.mean().values,
            '2020_means': data_2020_df.mean().values,
            'p_vals': p_vals,
            'stats_signif': signif_list,
            '2019_stdev': data_2019_df.std(ddof=0).values,
            '2020_stdev': data_2020_df.std(ddof=0).values,
        },
        index=pd.Index(cat_names, name='categories'),
    )

    # fixed presentation order for the rows; a label missing from cat_names
    # comes back from reindex as an all-NaN row
    new_index = ['action_adventure', 'scifi_fantasy', 'tv_shows', 'online_vids', 'mobiles', 'voice_vid_chat', 'infectious_diseases', 'health_news', 'games_systems_consoles', 'arts_entertainment', 'books_lit', 'lottos', 'celebs', 'entertainment_media', 'campaigns_elections', 'discrimination', 'law_enf', 'weather', 'autos', 'beauty_fitness', 'sport_news']
    comparison_df = comparison_df.reindex(new_index)
    comparison_df['diff_btwn_means'] = comparison_df['2020_means'] - comparison_df['2019_means']

    signif_cats = [
        (cat, float(p_val))
        for cat, p_val, flag in zip(comparison_df.index, comparison_df['p_vals'], comparison_df['stats_signif'])
        if flag == 'Yes'
    ]

    # overwrite everything after the raw df so the layout is always
    # [raw df, comparison df, signif cats], however often this is run
    state_entries[1:] = [comparison_df, signif_cats]

In [59]:
#running means_comp for the two loaded states (positions 4 and 50 in state_codes)
for state_pos in (4, 50):
    means_comp(state_pos)

In [62]:
#comparison df (list index 1) for the state at position 4 of state_codes
ca_code = state_codes[4]
print(ca_code)
states_df_dict[f'{ca_code}_data_df'][1]

CA


Unnamed: 0_level_0,2019_means,2020_means,p_vals,stats_signif,2019_stdev,2020_stdev,diff_btwn_means
categories,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
action_adventure,66.384615,62.45098,0.00525988,Yes,7.86461,5.84208,-3.933635
scifi_fantasy,57.230769,61.215686,0.114452,No,14.6663,10.0826,3.984917
tv_shows,84.807692,81.078431,0.00157805,Yes,5.19629,6.26801,-3.729261
online_vids,80.384615,78.72549,0.250488,No,3.42027,9.5303,-1.659125
mobiles,52.826923,48.823529,0.0059075,Yes,8.8746,4.78029,-4.003394
voice_vid_chat,58.634615,51.784314,0.00480011,Yes,9.85867,13.6171,-6.850302
infectious_diseases,9.769231,25.980392,4.73493e-07,Yes,1.01177,19.8084,16.211161
health_news,55.903846,64.372549,7.43562e-05,Yes,5.95257,12.9402,8.468703
games_systems_consoles,51.423077,62.647059,2.14724e-06,Yes,5.30365,14.2812,11.223982
arts_entertainment,91.057692,84.803922,1.44875e-08,Yes,3.35934,6.13578,-6.253771


In [63]:
#comparison df (list index 1) for the state at position 50 of state_codes
wy_code = state_codes[50]
print(wy_code)
states_df_dict[f'{wy_code}_data_df'][1]

WY


Unnamed: 0_level_0,2019_means,2020_means,p_vals,stats_signif,2019_stdev,2020_stdev,diff_btwn_means
categories,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
action_adventure,58.538462,58.333333,0.92855,No,11.9268,10.9949,-0.205128
scifi_fantasy,45.365385,47.529412,0.552689,No,17.043,19.3627,2.164027
tv_shows,81.25,75.588235,0.0035644,Yes,10.0381,9.00583,-5.661765
online_vids,78.615385,73.941176,0.0111616,Yes,8.28612,9.79478,-4.674208
mobiles,53.730769,51.078431,0.22755,No,12.2162,9.60156,-2.652338
voice_vid_chat,26.519231,30.196078,0.346462,No,16.5314,22.0668,3.676848
infectious_diseases,9.173077,24.72549,1.91166e-05,Yes,2.37553,23.2076,15.552413
health_news,30.615385,34.117647,0.471215,No,22.8365,25.7116,3.502262
games_systems_consoles,46.326923,53.843137,0.00981761,Yes,12.9447,15.582,7.516214
arts_entertainment,88.25,84.490196,0.000299478,Yes,4.58205,5.44625,-3.759804


In [37]:
#for each loaded state, print its code followed by its list of
#(category, p-value) pairs stored at list index 2
for x in (4, 50):
    code = state_codes[x]
    print(code, states_df_dict[f'{code}_data_df'][2], sep='\n')

CA
[('action_adventure', 0.005259883307413347), ('tv_shows', 0.0015780476362960306), ('mobiles', 0.005907503423500433), ('voice_vid_chat', 0.004800109860980946), ('infectious_diseases', 4.7349311840776417e-07), ('health_news', 7.435622542078884e-05), ('games_systems_consoles', 2.147243137223962e-06), ('arts_entertainment', 1.4487455345945572e-08), ('books_lit', 0.001268119205973526), ('lottos', 1.7210942047083445e-10), ('celebs', 0.01591889717263027), ('discrimination', 0.008122600787532191), ('weather', 0.0018921408919584373), ('autos', 0.0036665181994584442), ('beauty_fitness', 0.0019426154589504772), ('sport_news', 1.9568210454165457e-13)]
WY
[('tv_shows', 0.003564403911922712), ('online_vids', 0.011161572055350935), ('infectious_diseases', 1.9116629466131967e-05), ('games_systems_consoles', 0.009817608796413819), ('arts_entertainment', 0.000299478163159776), ('books_lit', 0.009342488939380368), ('lottos', 0.035131713792526664), ('celebs', 0.008825692307801296), ('discrimination', 0

#### Guide to the indices of the list stored under each key of `states_df_dict`:

Index 0 is a DataFrame containing all of the data for a state: the category popularity ratings, the COVID case counts, and the stay-at-home order and mass-gathering-ban flags.

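Index 1 is a DataFrame showing, for each category, the average popularity rating in 2019 and in 2020, the p-value of an independent t-test between the two years, whether that p-value is significant (p ≤ 0.05), the 2019 and 2020 standard deviations, and the difference between the two means.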

Index 2 is a list of (category, p-value) tuples, one for each category whose change in popularity was statistically significant.