In [1]:
#dependencies
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as stats
from matplotlib.lines import Line2D

In [2]:
#load the lookup table of state names, abbreviations and postal codes
path = '../google_trends/statesAbbrev.csv'
statesAbbrev_df = pd.read_csv(filepath_or_buffer=path)
#preview the final 12 rows of the table
statesAbbrev_df.iloc[-12:]

Unnamed: 0,State,Abbrev,Code
39,Rhode Island,R.I.,RI
40,South Carolina,S.C.,SC
41,South Dakota,S.D.,SD
42,Tennessee,Tenn.,TN
43,Texas,Tex.,TX
44,Utah,Utah,UT
45,Vermont,Vt.,VT
46,Virginia,Va.,VA
47,Washington,Wash.,WA
48,West Virginia,W.Va.,WV


In [3]:
#two-letter state codes, in the same order as the rows of statesAbbrev_df
state_codes = statesAbbrev_df['Code'].tolist()

In [4]:
#full state names, in the same order as the rows of statesAbbrev_df
state_names = statesAbbrev_df['State'].tolist()

In [5]:
#build states_df_dict: one '<code>_data_df' key per state at positions 40-50,
#each mapped to a one-element list holding that state's dataframe
states_df_dict = {}
for state_idx in range(40, 51):
    code = state_codes[state_idx]
    path = f'../google_trends/google_trends_csvs/state_csvs/US-{code}_cat_data.csv'
    state_df = pd.read_csv(path)
    states_df_dict[f'{code}_data_df'] = [state_df]

In [6]:
#inspecting the keys of states_df_dict; each key has the form '<state code>_data_df'
#and maps to the list of objects stored for that state
states_df_dict.keys()

dict_keys(['SC_data_df', 'SD_data_df', 'TN_data_df', 'TX_data_df', 'UT_data_df', 'VT_data_df', 'VA_data_df', 'WA_data_df', 'WV_data_df', 'WI_data_df', 'WY_data_df'])

In [7]:
#sanity check: show which state sits at position 40 and preview its first ten rows
sample_code = state_codes[40]
print(sample_code)
states_df_dict[f'{sample_code}_data_df'][0].head(10)

SC


Unnamed: 0,time,arts_entertainment,autos,beauty_fitness,books_lit,action_adventure,campaigns_elections,celebs,discrimination,entertainment_media,...,mobiles,online_vids,scifi_fantasy,sport_news,tv_shows,voice_vid_chat,weather,covid_cases,stay_at_home,mass_gathering_ban
0,2019-01-06,96,78,70,87,68,0,65,22,68,...,60,92,66,80,91,37,59,,False,False
1,2019-01-13,95,76,73,87,74,0,66,10,83,...,61,87,54,48,79,35,68,,False,False
2,2019-01-20,88,80,71,81,69,0,61,18,73,...,62,83,55,54,89,41,68,,False,False
3,2019-01-27,93,76,70,83,71,0,66,17,80,...,61,94,43,44,83,48,69,,False,False
4,2019-02-03,92,78,72,88,65,0,61,26,60,...,61,81,42,73,89,58,57,,False,False
5,2019-02-10,95,80,68,92,58,0,63,20,72,...,64,91,51,47,78,43,70,,False,False
6,2019-02-17,95,82,76,90,66,0,78,23,67,...,67,91,40,40,83,53,80,,False,False
7,2019-02-24,100,84,73,94,64,0,69,25,68,...,64,87,55,45,86,70,74,,False,False
8,2019-03-03,97,82,72,93,63,0,59,46,75,...,70,89,51,39,91,52,76,,False,False
9,2019-03-10,93,83,68,86,55,0,68,22,66,...,64,84,46,63,84,55,58,,False,False


In [8]:
#every column name of the state dataframe at position 40, as a plain list
col_names = states_df_dict[f'{state_codes[40]}_data_df'][0].columns.tolist()

In [9]:
#keep only the category columns: drop the date column and the covid / policy columns
non_category_cols = ('time', 'covid_cases', 'stay_at_home', 'mass_gathering_ban')
col_names_for_max = [name for name in col_names if name not in non_category_cols]

In [10]:
#values of the 'time' column of the state dataframe at position 40, as a plain list
times_list = states_df_dict[f'{state_codes[40]}_data_df'][0]['time'].tolist()

In [11]:
#reset every state's list back to just its raw dataframe (index 0), discarding any
#derived objects that earlier runs of means_comp added after it
for x in range(40, 51):
    state_key = f'{state_codes[x]}_data_df'
    # Delete with a slice rather than index-by-index: deleting element y inside
    # `for y in range(1, len(lst))` shifts the remaining items left, so a
    # 3-element list raises IndexError at y == 2 and longer lists keep stale items.
    del states_df_dict[state_key][1:]
    print(state_codes[x])
    print(len(states_df_dict[state_key]))

SC
1
SD
1
TN
1
TX
1
UT
1
VT
1
VA
1
WA
1
WV
1
WI
1
WY
1


In [12]:
def means_comp(ind_no):
    """Compare a state's 2019 vs 2020 category popularity and store the results.

    For the state at position ``ind_no`` in ``state_codes``, the state's
    dataframe (index 0 of its ``states_df_dict`` entry) is split into its first
    52 rows and the remaining rows, and Welch's independent t-test
    (``equal_var=False``) is run on every column in ``col_names_for_max``.

    Side effects: positions 1 and 2 of ``states_df_dict['<code>_data_df']`` are
    set to

    1. a dataframe indexed by category with columns ``2019_means``,
       ``2020_means``, ``p_vals`` and ``stats_signif`` ('Yes' when p <= 0.05,
       'No' when p > 0.05, 'n/a' when the p-value is undefined);
    2. a list of ``(category, p_value)`` tuples for the 'Yes' rows.

    Calling the function again for the same state overwrites those two entries
    instead of appending duplicates.

    Parameters
    ----------
    ind_no : int
        Position of the state in ``state_codes``.

    Returns
    -------
    None
    """
    state_key = f'{state_codes[ind_no]}_data_df'
    all_data_df = states_df_dict[state_key][0]
    # The first 52 rows are treated as 2019 and the rest as 2020
    # (assumes weekly rows starting in January 2019 -- TODO confirm for every state file).
    data_2019_df = all_data_df.iloc[:52, :]
    data_2020_df = all_data_df.iloc[52:, :]

    # Average only the category columns. Calling .mean() on the whole frame also
    # hits the string 'time' column, which raises TypeError on pandas >= 2.0, and
    # forced positional padding/dropping of the covid/policy rows afterwards.
    means_2019 = data_2019_df[col_names_for_max].mean()
    means_2020 = data_2020_df[col_names_for_max].mean()

    # Welch's t-test p-value for each category; index [1] of the result is the p-value
    p_vals = [
        stats.ttest_ind(data_2019_df[col], data_2020_df[col], equal_var=False)[1]
        for col in col_names_for_max
    ]

    signif_list = []
    for p_val in p_vals:
        if pd.isna(p_val):
            # e.g. a column that is constant in both years: the test is undefined.
            # Every p-value must still get a label so the columns stay equal length.
            signif_list.append('n/a')
        elif p_val <= 0.05:
            signif_list.append('Yes')
        else:
            signif_list.append('No')

    comparison_df = pd.DataFrame(
        {
            '2019_means': means_2019.values,
            '2020_means': means_2020.values,
            'p_vals': p_vals,
            'stats_signif': signif_list,
        },
        index=pd.Index(col_names_for_max, name='categories'),
    )
    # fixed presentation order for the category rows
    new_index = ['action_adventure', 'scifi_fantasy', 'tv_shows', 'online_vids', 'mobiles', 'voice_vid_chat', 'infectious_diseases', 'health_news', 'games_systems_consoles', 'arts_entertainment', 'books_lit', 'lottos', 'celebs', 'entertainment_media', 'campaigns_elections', 'discrimination', 'law_enf', 'weather', 'autos', 'beauty_fitness', 'sport_news']
    comparison_df = comparison_df.reindex(new_index)

    # (category, p-value) pairs for the significant rows, in display order
    signif_rows = comparison_df[comparison_df['stats_signif'] == 'Yes']
    signif_cats = list(zip(signif_rows.index, signif_rows['p_vals']))

    # Overwrite everything after the raw dataframe so a re-run never leaves
    # stale results at indices 1 and 2 or grows the list.
    states_df_dict[state_key][1:] = [comparison_df, signif_cats]

In [13]:
#run the 2019-vs-2020 comparison for every loaded state (positions 40 through 50)
for state_idx in range(40, 51):
    means_comp(state_idx)

In [14]:
#show the comparison dataframe (index 1) stored for the state at position 49
state_key = f'{state_codes[49]}_data_df'
print(state_codes[49])
means_meds_df = states_df_dict[state_key][1]
means_meds_df

WI


Unnamed: 0_level_0,2019_means,2020_means,p_vals,stats_signif
categories,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
action_adventure,67.076923,67.078431,0.999225,No
scifi_fantasy,54.557692,60.490196,0.0363816,Yes
tv_shows,80.384615,77.647059,0.0626203,No
online_vids,83.076923,76.941176,3.9618e-05,Yes
mobiles,65.076923,59.803922,0.000564451,Yes
voice_vid_chat,41.615385,38.411765,0.260876,No
infectious_diseases,7.25,20.72549,2.92265e-06,Yes
health_news,44.826923,52.392157,0.00960556,Yes
games_systems_consoles,48.961538,56.568627,0.000665709,Yes
arts_entertainment,90.173077,83.72549,1.3864e-08,Yes


In [15]:
#index 2 holds the (category, p-value) tuples that means_comp stored for the state at position 49
states_df_dict[f'{state_codes[49]}_data_df'][2]

[('scifi_fantasy', 0.03638155101062401),
 ('online_vids', 3.961799496019733e-05),
 ('mobiles', 0.0005644508870334183),
 ('infectious_diseases', 2.9226481069975736e-06),
 ('health_news', 0.009605562021610666),
 ('games_systems_consoles', 0.000665709128181784),
 ('arts_entertainment', 1.386402826914742e-08),
 ('books_lit', 0.004117605812143661),
 ('lottos', 0.0005142065561865732),
 ('celebs', 0.00014024411608071764),
 ('discrimination', 0.02193062856392299),
 ('weather', 0.02270266117185006),
 ('autos', 0.00315201959296557),
 ('beauty_fitness', 0.007995966611744162),
 ('sport_news', 1.0266544809071982e-11)]

In [19]:
#print each loaded state's code followed by its list of significant categories (index 2)
for state_idx in range(40, 51):
    code = state_codes[state_idx]
    print(code)
    print(states_df_dict[f'{code}_data_df'][2])

SC
[('action_adventure', 0.037213368887122865), ('scifi_fantasy', 0.020561479597784815), ('tv_shows', 0.001723158935242587), ('online_vids', 8.082991587792804e-10), ('infectious_diseases', 8.313448149762176e-06), ('health_news', 8.172355066885259e-05), ('games_systems_consoles', 2.2536372559833473e-06), ('arts_entertainment', 1.1386869137802196e-05), ('books_lit', 0.0036629705074250007), ('lottos', 5.164235097920391e-06), ('celebs', 2.0464965258284322e-05), ('entertainment_media', 0.0409192335375911), ('discrimination', 0.0006658984716343508), ('sport_news', 1.2060797821829332e-10)]
SD
[('action_adventure', 0.02517127820578754), ('scifi_fantasy', 0.00842259325095429), ('tv_shows', 0.012613457326598672), ('online_vids', 0.027560319692913875), ('infectious_diseases', 3.6849899850973405e-06), ('arts_entertainment', 0.0026597091290213636), ('books_lit', 0.0005535286715042026), ('lottos', 0.0005213150936593188), ('celebs', 0.0007263319652055333), ('discrimination', 0.01967980766116836), ('w

In [21]:
#summary statistics for the raw dataframe (index 0) of the state at position 43
states_df_dict[f'{state_codes[43]}_data_df'][0].describe()

Unnamed: 0,arts_entertainment,autos,beauty_fitness,books_lit,action_adventure,campaigns_elections,celebs,discrimination,entertainment_media,games_systems_consoles,...,law_enf,lottos,mobiles,online_vids,scifi_fantasy,sport_news,tv_shows,voice_vid_chat,weather,covid_cases
count,103.0,103.0,103.0,103.0,103.0,103.0,103.0,103.0,103.0,103.0,...,103.0,103.0,103.0,103.0,103.0,103.0,103.0,103.0,103.0,42.0
mean,88.058252,74.203883,82.990291,72.76699,71.07767,1.640777,59.621359,26.135922,73.61165,59.699029,...,51.796117,50.92233,56.747573,72.902913,56.07767,68.796117,80.038835,64.0,57.058252,488216.3
std,4.993771,7.238729,5.42777,12.717291,6.776096,9.871582,8.439074,13.357856,8.191618,12.109658,...,8.0188,8.549383,7.306616,7.716184,13.40814,16.417565,5.269078,10.875408,17.63843,472525.9
min,73.0,59.0,64.0,50.0,59.0,0.0,47.0,12.0,50.0,44.0,...,41.0,39.0,47.0,60.0,42.0,36.0,63.0,44.0,24.0,5.0
25%,86.0,70.0,80.0,60.5,67.0,0.0,54.0,19.5,69.0,51.0,...,48.0,45.0,53.0,67.0,48.0,59.0,77.0,56.5,44.0,49675.0
50%,89.0,73.0,84.0,76.0,70.0,0.0,58.0,22.0,73.0,54.0,...,51.0,50.0,55.0,73.0,51.0,73.0,80.0,63.0,57.0,409183.5
75%,91.0,78.0,87.0,82.0,74.5,1.0,63.0,27.5,77.0,66.0,...,54.0,54.5,57.5,77.0,59.0,80.5,83.0,69.0,70.0,785832.0
max,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,...,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,1581668.0


#### Guide to indices w/in states_df_dict:

Index 0 is a DataFrame containing all of the data for a state: the category popularity scores, the COVID case counts, and the stay-at-home order and mass-gathering-ban flags.

Index 1 is a DataFrame showing the mean popularity score of each category in 2019 and in 2020, the p-value of an independent t-test comparing the two years for each category, and whether that p-value is statistically significant (p <= 0.05).

Index 2 is a list of (category, p-value) tuples, one for each category whose change in popularity between 2019 and 2020 is statistically significant.