In [1]:
import os, sys
import pandas as pd
import numpy as np
import re
from googletrans import Translator
import sqlalchemy
from tqdm import tqdm, trange
from scipy.stats.stats import pearsonr, spearmanr  
from scipy.stats import zscore
import random
import pingouin as pg
import math
from sklearn.preprocessing import normalize

### Read in relevant files/MySQL tables

In [2]:
db_twitter = sqlalchemy.engine.url.URL(drivername='mysql', host='127.0.0.1', database='twitterSuperUsers', query={'read_default_file': '~/.my.cnf', 'charset':'utf8mb4'})
db_lexica = sqlalchemy.engine.url.URL(drivername='mysql', host='127.0.0.1', database='dlatk_lexica', query={'read_default_file': '~/.my.cnf', 'charset':'utf8'})
db_county = sqlalchemy.engine.url.URL(drivername='mysql', host='127.0.0.1', database='county_data', query={'read_default_file': '~/.my.cnf', 'charset':'utf8'})

engine = sqlalchemy.create_engine(db_twitter)
county_scores = pd.read_sql("feat$cat_individVsCollectFinal$msgs_100u$cnty$1gra", con=engine)
state_scores = pd.read_sql("feat$cat_individVsCollectFinal$msgs_100u$state$1gra", con=engine)

engine = sqlalchemy.create_engine(db_lexica)
lexicon = pd.read_sql("individVsCollectFinal", con=engine)

engine = sqlalchemy.create_engine(db_county)
county_mapping = pd.read_sql("county_by_state_reg_div", con=engine)
community_mapping = pd.read_sql("superCrossWalk_FINAL_FINAL", con=engine)

  """Entry point for launching an IPython kernel.
  
  This is separate from the ipykernel package so we can avoid doing imports until
Set the environment variable OUTDATED_RAISE_EXCEPTION=1 for a full traceback.
  **kwargs
  **kwargs


### Utils

In [78]:
#go from county name --> fips code
def name_to_county(name):
    #get state name
    state = name.split(",")[-1].strip()
    if(state == "Puerto Rico"): 
        return None
    #get county name
    county = (name.split(",")[0]).split("County")[0].strip()
    county = county.split("Municipality")[0].strip()
    county = county.split("Borough")[0].strip()
    county = county.split("Parish")[0].strip()
    county = county.replace("city", "City")
    county = county.replace("ñ", "n")
   
    temp = county_mapping[county_mapping["state"] == state]
    temp = temp[temp["county"] == county]["cnty"]
    try:
        return np.array(temp)[0]
    except:
        print("Couldn't map to fips code: ", name)
        return None

#calculate correlation between two county-level data points
def calculate_correlation(data1, data2):
    overlapping_keys = np.intersect1d(list(data1.keys()), list(data2.keys()))
    values1 = [data1[x] for x in overlapping_keys]
    values2 = [data2[x] for x in overlapping_keys]
    return pearsonr(values1, values2)

def get_county_map():
    mapping = {}
    with open("GCI Data/counties.txt") as f:
        lines = [line.rstrip() for line in f]
        for l in lines:
            parts = l.split(",")
            mapping[parts[1]] = int(parts[0])
    return mapping

#get state-aggregated data for county-level data (average over all counties)
def get_state_data(county_data):
    mapping = get_county_map()
    state_data = {}
    for c in county_data:
        try:
            state = list((county_mapping[county_mapping["cnty"] == c])["state"])[0]
        except:
            continue
        if state not in state_data.keys():
            state_data[state] = []
        state_data[state].append(county_data[c])       
    
    state_data_final = {}
    for s in state_data:
        state_data_final[s] = np.average(state_data[s])
    return normalize_data(state_data_final)

def normalize_data(data):
    keys = data.keys()
    vals = np.array([data[x] for x in keys])
    normalized_vals = zscore(vals) 
    return dict(zip(keys,normalized_vals))

#get correlation between single (list) and (list of lists) at the county level
def validate_data(input_data, other_data):
    overlapping_keys = list(input_data.keys())
    for d in other_data:
        overlapping_keys = np.intersect1d(list(d.keys()), overlapping_keys)
    
    data_values = np.array([input_data[x] for x in overlapping_keys])
    other_data_intersect = []
    for d in other_data:
        other_data_intersect.append(np.array([d[x] for x in overlapping_keys]))   
    print(pearsonr(data_values, sum(other_data_intersect)))   

def get_overlapping_keys(dicts):
    overlapping_keys = list(dicts[0].keys())
    for d in dicts:
        overlapping_keys = np.intersect1d(list(d.keys()), overlapping_keys)
    return overlapping_keys

def process_dicts(dicts):
    keys = get_overlapping_keys(dicts)
    processed_lists = []   
    for d in dicts:
        processed_list = np.array([d[x] for x in keys])
        processed_lists.append(processed_list)
    return keys, processed_lists

def get_community_mapping():
    cnty = community_mapping["cnty"]
    ACP_name = community_mapping["ACP_name"]
    mappings = {}   
    communities = np.array(ACP_name)
    communities = np.unique(communities[communities != np.array(None)])
    for i in range(len(cnty)):      
        mappings[cnty[i]] = ACP_name[i]
    return communities, mappings

def zero_one_normalize(data):
    return (data - np.min(data)) / (np.max(data) - np.min(data))
    

In [4]:
income_data = pd.read_csv("GCI Data/income_census.csv")
income_data.set_index("Label (Grouping)", inplace=True)
income_data = income_data.T

income_string = "Median earnings (dollars) for full-time, year-round workers with earnings"

#remove unecessary data
temp_columns = income_data.columns
for c in temp_columns:
    if (income_string in c):
        income_data["income"] = income_data[c]
income_data.drop(columns=temp_columns, inplace=True)

income_control = {}
for index, row in income_data.iterrows():
    if("Total!!Estimate" not in index): continue
    name = index.split("!!")[0]
    fips = name_to_county(name)
    if fips is not None:                       
        try:
            income = int(row["income"].replace(",", ""))
            income_control[fips] = income
        except:
            continue
income_control = normalize_data(income_control)

Couldn't map to fips code:  United States


### Indicator 1: Fertility Rate

##### Total fertility rate for each county, US Census, county level

In [5]:
fertility_data = pd.read_csv("GCI Data/fertility_census.csv")
fertility_data.set_index("Label (Grouping)", inplace=True)
rate_string = "!!Women with births in the past 12 months !!Rate per 1,000 women!!Estimate"

#remove unecessary data
for c in fertility_data.columns:
    if(rate_string in c):
        name = c.split(rate_string)[0]
        fertility_data[name] = fertility_data[c]
    fertility_data.drop(columns=[c], inplace=True)

fertility_data = fertility_data.T
columns_temp = fertility_data.columns

#keep only relevant columns
fertility_data["all"] = fertility_data["Women 15 to 50 years"]
fertility_data["15-19"] = fertility_data["15 to 19 years"]
fertility_data["20-34"] = fertility_data["20 to 34 years"]
fertility_data["35-50"] = fertility_data["35 to 50 years"]
fertility_data.drop(columns=columns_temp, inplace=True)

In [6]:
def get_tfr(row):
    try:
        return (int(row["15-19"])*5 + int(row["20-34"])*15 + int(row["35-50"])*16)/1000
    except:
        return None

tfr_data = {}
for index,row in fertility_data.iterrows():
    tfr = get_tfr(row)
    fips = name_to_county(index)
    if(tfr is not None and fips is not None):
        tfr_data[fips] = tfr

tfr_data = normalize_data(tfr_data)

### Indicator 2: Living Arrangements

##### Number of households with grandparents living with grandchildren, US Census, county level

In [7]:
household_data = pd.read_csv("GCI Data/household_census.csv")
household_data.set_index("Label (Grouping)", inplace=True)
household_data = household_data.T

coll_string = "Household with grandparents living with grandchildren:"

#remove unecessary data
temp_columns = household_data.columns
temp_columns = temp_columns.drop("Total:")
for c in temp_columns:
    if (coll_string in c):
        household_data["total_coll"] = household_data[c]
household_data.drop(columns=temp_columns, inplace=True)

living_data = {}
for index, row in household_data.iterrows():
    name = index.split("!!")[0]
    fips = name_to_county(name)
    if fips is not None:                       
        try:
            total = int(row["Total:"].replace(",", ""))
            total_coll = int(row["total_coll"].replace(",", ""))
            living_data[fips] = (total_coll)/total
        except:
            continue

living_data = normalize_data(living_data)

Couldn't map to fips code:  United States


### Indicator 3: Stability of Marriage

##### Ratio of married individuals to divorced individuals 15 years and over (sum of men and women), US Census, county level

In [8]:
marital_data = pd.read_csv("GCI Data/divorce_census.csv")
marital_data.set_index("Label (Grouping)", inplace=True)
marital_data = marital_data.T

total_string_male = "Males 15 years and over"
total_string_female = "Females 15 years and over"
divorce_string = "Divorced"
separated_string = "Separated"
married_string = "Now married, except separated"

#remove unecessary data
temp_columns = marital_data.columns
for c in temp_columns:
    if(divorce_string in c):
        label = (c.split("("))[1].split(")")[0]
        marital_data["num_divorced_"+label] = marital_data[c]
    elif(married_string in c):
        label = (c.split("("))[1].split(")")[0]
        marital_data["num_married_"+label] = marital_data[c]
    elif(separated_string in c):
        label = (c.split("("))[1].split(")")[0]
        marital_data["num_separated_"+label] = marital_data[c]
    elif(total_string_male in c):
        marital_data["total_male"] = marital_data[c]
    elif(total_string_female in c):
        marital_data["total_female"] = marital_data[c]
marital_data.drop(columns=temp_columns, inplace=True)

marriage_data = {}
for index, row in marital_data.iterrows():
    if("Percent" in index): continue
    name = index.split("!!")[0]
    fips = name_to_county(name)
    if fips is not None:                       
        try:
            married = int(row["num_married_male"].replace(",", "")) + int(row["num_married_female"].replace(",", ""))
            divorced = int(row["num_divorced_male"].replace(",", "")) + int(row["num_divorced_female"].replace(",", "")) 
            separated = int(row["num_separated_male"].replace(",", "")) + int(row["num_separated_female"].replace(",", ""))
            total = int(row["total_male"].replace(",", "")) + int(row["total_female"].replace(",", ""))
            marriage_data[fips] = married/divorced
        except:
            continue

marriage_data = normalize_data(marriage_data)

Couldn't map to fips code:  United States


### Indicator 4: Religiosity

##### Score for question "For each of the following aspects, indicate how important it is in your life. Would you say it is very important, rather important, not very important or not important at all? – Religion" (scale of 1-4) 
##### Inverse weighted so high importance --> high score, WVS Survey, state level
##### Standard devation per state = 1.02 --> enforcing that each state must have > 10 responses

In [9]:
wvs_data = pd.read_csv("GCI Data/WVS_Cross-National_Wave_7.csv")
wvs_data_usa = wvs_data[wvs_data["B_COUNTRY_ALPHA"] == "USA"]

wvs_state_mapping = {}
with open("GCI Data/wvs_states.txt") as f:
    lines = [line.rstrip() for line in f]
for l in lines:
    parts = l.split(" ")
    wvs_state_mapping[int(parts[0])] = " ".join(parts[3:])    

  interactivity=interactivity, compiler=compiler, result=result)


In [10]:
religion_data_temp = {}

for index, row in wvs_data_usa.iterrows():
    state = wvs_state_mapping[row["N_REGION_WVS"]]
    religious_importance = row["Q6"]
    if(religious_importance > 0):
        if(state not in religion_data_temp.keys()):
            religion_data_temp[state] = []
        religion_data_temp[state].append(religious_importance)

religion_data = {}
std_devs = []
for s in religion_data_temp:
    if(len(religion_data_temp[s]) > 10):
        religion_data[s] = 1/np.average(religion_data_temp[s])
    std_devs.append(np.std(religion_data_temp[s]))
print("STD DEV:", np.average(std_devs))
    
religion_data = normalize_data(religion_data)

STD DEV: 1.0217751798668622


### Indicator 5: Collective Transportation

##### Average number of cars per household, inverse weights so high number of cars --> low score, US Census, county level

In [11]:
transportation_data = pd.read_csv("GCI Data/transportation_census.csv")
transportation_data.set_index("Label (Grouping)", inplace=True)
transportation_data = transportation_data.T

temp_columns = transportation_data.columns
temp_columns = temp_columns.drop("Total:")
for c in temp_columns:
    col_name = c.strip()
    transportation_data[col_name] = transportation_data[c]
transportation_data.drop(columns=temp_columns, inplace=True)

def get_cars_per_household(row):
    total = int(row["Total:"].replace(",", ""))
    none = int(row["No vehicle available"].replace(",", ""))
    car1 = int(row["1 vehicle available"].replace(",", ""))
    car2 = int(row["2 vehicles available"].replace(",", ""))
    car3 = int(row["3 vehicles available"].replace(",", ""))
    car4 = int(row["4 or more vehicles available"].replace(",", ""))
    return (car1 + car2*2 + car3*3 + car4*4)/total

car_data = {}
for index, row in transportation_data.iterrows():
    name = index.split("!!")[0]
    fips = name_to_county(name)
    if fips is not None:                       
        try:
            car_data[fips] = 1/get_cars_per_household(row)
        except:
            continue
car_data = normalize_data(car_data)

Couldn't map to fips code:  United States


### Indicator 6: Ingroup Bias

##### Score for question "Do you agree, disagree or neither agree nor disagree with the following statements? - When jobs are scarce, employers should give priority to people of this country over immigrants." (scale of 1-5) 
##### Inverse weighted so high agreement --> high score, WVS Survey, state level
##### Average standard devation per state = 0.82 --> enforcing each state must have > 5 responses

In [12]:
compatriotism_data_temp = {}

for index, row in wvs_data_usa.iterrows():
    state = wvs_state_mapping[row["N_REGION_WVS"]]
    bias = row["Q34"]
    if(bias > 0):
        if(state not in compatriotism_data_temp.keys()):
            compatriotism_data_temp[state] = []
        compatriotism_data_temp[state].append(bias)

compatriotism_data = {}
std_devs = []
for s in compatriotism_data_temp:
    compatriotism_data[s] = 1/np.average(compatriotism_data_temp[s]) 
    std_devs.append(np.std(compatriotism_data_temp[s]))
print("STD DEV:", np.average(std_devs))

compatriotism_data = normalize_data(compatriotism_data)

STD DEV: 0.8247487844563858


### Aggregate Data

##### All data at the county level

In [13]:
county_data = [tfr_data, living_data, marriage_data, car_data]
county_labels = ["fertility", "grandparents", "marriage", "cars"]
#make county df
fips_codes, state_data_processed = process_dicts(county_data)
county_df = pd.DataFrame(columns=county_labels)
for i in range(len(county_labels)):
    county_df[county_labels[i]] = state_data_processed[i]
county_df.set_index(fips_codes, inplace=True)

print("county df shape: ", county_df.shape)
print("Pairwise Pearson Correlations at the county level (all)")
display(county_df.corr())
print("Cronbach alpha")
print(pg.cronbach_alpha(data=county_df))

county df shape:  (741, 4)
Pairwise Pearson Correlations at the county level (all)


Unnamed: 0,fertility,grandparents,marriage,cars
fertility,1.0,0.193518,0.029378,-0.129945
grandparents,0.193518,1.0,0.044293,-0.029113
marriage,0.029378,0.044293,1.0,-0.144604
cars,-0.129945,-0.029113,-0.144604,1.0


Cronbach alpha
(-0.03172253562599832, array([-0.159,  0.084]))


##### All data at the state level

In [14]:
tfr_data_state = get_state_data(tfr_data)
living_data_state = get_state_data(living_data)
marriage_data_state = get_state_data(marriage_data)
car_data_state = get_state_data(car_data)
income_data_state = get_state_data(income_control)

state_data = [tfr_data_state, living_data_state, marriage_data_state, car_data_state, religion_data, compatriotism_data]
state_labels = ["fertility", "grandparents", "marriage", "cars", "religion", "compatriotism"]
#make state df
fips_codes, state_data_processed = process_dicts(state_data)
state_df = pd.DataFrame(columns=state_labels)
for i in range(len(state_labels)):
    state_df[state_labels[i]] = state_data_processed[i]
state_df.set_index(fips_codes, inplace=True)

print("state df shape: ",  state_df.shape)
print("Pairwise Pearson Correlations at the state level (all)")
display(state_df.corr())
print("Cronbach alpha")
print(pg.cronbach_alpha(data=state_df))

state df shape:  (42, 6)
Pairwise Pearson Correlations at the state level (all)


Unnamed: 0,fertility,grandparents,marriage,cars,religion,compatriotism
fertility,1.0,0.044625,-0.0296,-0.48979,0.007723,-0.252338
grandparents,0.044625,1.0,-0.186644,-0.050615,0.343947,0.16882
marriage,-0.0296,-0.186644,1.0,-0.080827,-0.232621,-0.128473
cars,-0.48979,-0.050615,-0.080827,1.0,-0.191369,-0.050225
religion,0.007723,0.343947,-0.232621,-0.191369,1.0,0.749607
compatriotism,-0.252338,0.16882,-0.128473,-0.050225,0.749607,1.0


Cronbach alpha
(0.03246216873252079, array([-0.504,  0.424]))


##### Ran experiments with every subset of >3 variables at the state level to maximize Cronbach's Alpha. Result: Include living arrangements, religiosity, and compatriotism

In [15]:
state_data = [living_data_state, religion_data, compatriotism_data]
state_labels = ["grandparents", "religion", "compatriotism"]

# state_data = [tfr_data_state, living_data_state, religion_data, compatriotism_data]
# state_labels = ["fertility", "grandparents", "religion", "compatriotism"]

#make state df
fips_codes, state_data_processed = process_dicts(state_data)
state_df = pd.DataFrame(columns=state_labels)
for i in range(len(state_labels)):
    state_df[state_labels[i]] = state_data_processed[i]
state_df.set_index(fips_codes, inplace=True)

print("state df shape: ",  state_df.shape)
print("Pairwise Pearson Correlations at the state level")
display(state_df.corr())
print("Cronbach alpha")
print(pg.cronbach_alpha(data=state_df))

state_data = [living_data_state, religion_data, compatriotism_data, income_data_state]
state_labels = ["grandparents", "religion", "compatriotism", "income"]
#make state df
fips_codes, state_data_processed = process_dicts(state_data)
state_df = pd.DataFrame(columns=state_labels)
for i in range(len(state_labels)):
    state_df[state_labels[i]] = state_data_processed[i]
state_df.set_index(fips_codes, inplace=True)

print("Pairwise Pearson Correlations at the state level (including income)")
display(state_df.corr())
state_df.corr().to_csv("test.csv")
print("Partial correlations controlling for income")
display(state_df.pcorr())


state df shape:  (42, 3)
Pairwise Pearson Correlations at the state level


Unnamed: 0,grandparents,religion,compatriotism
grandparents,1.0,0.343947,0.16882
religion,0.343947,1.0,0.749607
compatriotism,0.16882,0.749607,1.0


Cronbach alpha
(0.70165249761289, array([0.502, 0.83 ]))
Pairwise Pearson Correlations at the state level (including income)


Unnamed: 0,grandparents,religion,compatriotism,income
grandparents,1.0,0.343947,0.16882,-0.256767
religion,0.343947,1.0,0.749607,-0.413523
compatriotism,0.16882,0.749607,1.0,-0.180937
income,-0.256767,-0.413523,-0.180937,1.0


Partial correlations controlling for income


Unnamed: 0,grandparents,religion,compatriotism,income
grandparents,1.0,0.265671,-0.118308,-0.10686
religion,0.265671,1.0,0.752806,-0.380808
compatriotism,-0.118308,0.752806,1.0,0.198764
income,-0.10686,-0.380808,0.198764,1.0


### Regression Correlation Analysis

In [16]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
import math

print("COUNTY LEVEL: what amount of variance in one variable can be explained by the other variables")
_, data = process_dicts(county_data)
print(county_labels)

X = np.array(data[1:3]).T
y = np.array(data[0])
reg = LinearRegression().fit(X, y)
pred = reg.predict(X)
print(pearsonr(y, pred))

X_a = np.array(data[0:1])
X_b = np.array(data[2:3])
X = np.concatenate([X_a, X_b]).T
y = np.array(data[1])
reg = LinearRegression().fit(X, y)
pred = reg.predict(X)
print(pearsonr(y, pred))

X_a = np.array(data[0:2])
X_b = np.array([data[3]])
X = np.concatenate([X_a, X_b]).T
y = np.array(data[2])
reg = LinearRegression().fit(X, y)
pred = reg.predict(X)
print(pearsonr(y, pred))

X = np.array(data[0:3]).T
y = np.array(data[3])
reg = LinearRegression().fit(X, y)
pred = reg.predict(X)
print(pearsonr(y, pred))

COUNTY LEVEL: what amount of variance in one variable can be explained by the other variables
['fertility', 'grandparents', 'marriage', 'cars']
(0.19463552238206278, 9.271893541015743e-08)
(0.19733491878351186, 6.095010368196594e-08)
(0.15009215017902852, 4.09475802784803e-05)
(0.1916402327149177, 1.4667170058511747e-07)


In [17]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error
import math

print("STATE LEVEL: what amount of variance in one variable can be explained by the other variables")
_, data = process_dicts(state_data)
print(state_labels[:-1])

X = np.array(data[1:2]).T
y = np.array(data[0])
reg = LinearRegression().fit(X, y)
pred = reg.predict(X)
print(pearsonr(y, pred))

X_a = np.array(data[0:1])
X_b = np.array(data[2:])
X = np.concatenate([X_a, X_b]).T
y = np.array(data[1])
reg = LinearRegression().fit(X, y)
pred = reg.predict(X)
print(pearsonr(y, pred))

X = np.array(data[0:2]).T
y = np.array(data[2])
reg = LinearRegression().fit(X, y)
pred = reg.predict(X)
print(pearsonr(y, pred))


STATE LEVEL: what amount of variance in one variable can be explained by the other variables
['grandparents', 'religion', 'compatriotism']
(0.3439465074872927, 0.025727988276223662)
(0.8167209701066344, 4.2571467276119256e-11)
(0.7555757790641874, 7.306801203417557e-09)


### Looking at Collectivism/Individualism Scores: County level

In [18]:
coll_scores_county = county_scores[county_scores["feat"] == "COLLECTIVISM"]
coll_scores_county = dict(zip(coll_scores_county.group_id, coll_scores_county.group_norm))

indv_scores_county = county_scores[county_scores["feat"] == "INDIVIDUALISM"]
indv_scores_county = dict(zip(indv_scores_county.group_id, indv_scores_county.group_norm))

county_data=[tfr_data, living_data, marriage_data, car_data, coll_scores_county, indv_scores_county, income_control]
county_labels = ["fertility", "grandparents", "marriage", "cars", "collectivism", "individualism", "income"]

#make county df
fips_codes, state_data_processed = process_dicts(county_data)
county_df = pd.DataFrame(columns=county_labels)
for i in range(len(county_labels)):
    county_df[county_labels[i]] = zero_one_normalize(state_data_processed[i])
county_df.set_index(fips_codes, inplace=True)

print("county df shape: ", county_df.shape)
print("Pairwise Pearson Correlations at the county level (all)")
display(county_df.corr())
print("Cronbach alpha")
print(pg.cronbach_alpha(data=county_df))


county df shape:  (740, 7)
Pairwise Pearson Correlations at the county level (all)


Unnamed: 0,fertility,grandparents,marriage,cars,collectivism,individualism,income
fertility,1.0,0.192716,0.0287,-0.129584,0.056068,-0.187889,-0.163251
grandparents,0.192716,1.0,0.040996,-0.027164,-0.037043,-0.35087,-0.205674
marriage,0.0287,0.040996,1.0,-0.143842,-0.17555,0.1731,0.52268
cars,-0.129584,-0.027164,-0.143842,1.0,0.078122,-0.021528,0.082803
collectivism,0.056068,-0.037043,-0.17555,0.078122,1.0,0.108789,0.015457
individualism,-0.187889,-0.35087,0.1731,-0.021528,0.108789,1.0,0.435143
income,-0.163251,-0.205674,0.52268,0.082803,0.015457,0.435143,1.0


Cronbach alpha
(0.18967829713442358, array([0.097, 0.276]))


In [82]:
county_data = [coll_scores_county, indv_scores_county]
county_labels = ["collectivism", "individualism"]

#make county df
fips_codes, state_data_processed = process_dicts(county_data)
county_df = pd.DataFrame(columns=county_labels)
for i in range(len(county_labels)):
    county_df[county_labels[i]] = zero_one_normalize(state_data_processed[i])
county_df.set_index(fips_codes, inplace=True)
county_df["cnty"] = fips_codes
county_df["diff"] = zero_one_normalize(county_df["collectivism"]-county_df["individualism"])
print(len(county_df))

print(county_df.head())
engine = sqlalchemy.create_engine(db_county)
county_df.to_sql("cnty_coll_indv_outcomes", con=engine, index=False, if_exists='replace', chunksize=50000)

2042
      collectivism  individualism  cnty      diff
0         0.260847       0.230808     0  0.335455
1001      0.503897       0.194162  1001  0.541831
1003      0.530369       0.270219  1003  0.505245
1005      0.641623       0.190335  1005  0.646277
1007      0.538226       0.224859  1007  0.544511


In [68]:
max(county_df["collectivism"])

1.0

### Looking at Collectivism/Individualism Scores: State level

In [69]:
coll_scores_state = get_state_data(coll_scores_county)
indv_scores_state = get_state_data(indv_scores_county)

state_data = [living_data_state, religion_data, compatriotism_data, coll_scores_state, indv_scores_state, income_data_state]
state_labels = ["grandparents", "religion", "compatriotism", "collectivism", "individualism", "income"]

#make state df
states, state_data_processed = process_dicts(state_data)
state_df = pd.DataFrame(columns=state_labels)
for i in range(len(state_labels)):
    state_df[state_labels[i]] = zscore(state_data_processed[i])
state_df.set_index(states, inplace=True)

print("Pairwise Pearson Correlations at the state level")
display(state_df.corr())

state_df.to_csv("test.csv")

Pairwise Pearson Correlations at the state level


Unnamed: 0,grandparents,religion,compatriotism,collectivism,individualism,income
grandparents,1.0,0.343947,0.16882,0.200423,-0.29103,-0.256767
religion,0.343947,1.0,0.749607,0.399528,-0.658271,-0.413523
compatriotism,0.16882,0.749607,1.0,0.464271,-0.51301,-0.180937
collectivism,0.200423,0.399528,0.464271,1.0,-0.470284,-0.273389
individualism,-0.29103,-0.658271,-0.51301,-0.470284,1.0,0.424323
income,-0.256767,-0.413523,-0.180937,-0.273389,0.424323,1.0


### Looking at Collectivism/Individualism Scores: Community level

In [99]:
_, mappings = get_community_mapping()

coll_scores_community = {}
indv_scores_community = {}

for fips, row in county_df.iterrows():
    try:
        c = mappings[fips]
        if(c is None): continue
        if(c not in coll_scores_community.keys()):
            coll_scores_community[c] = []
            indv_scores_community[c] = []
        coll_scores_community[c].append(row["collectivism"])
        indv_scores_community[c].append(row["individualism"])
    except(KeyError):
        continue

communities = list(coll_scores_community.keys())
for c in communities:
    if(len(coll_scores_community[c]) < 20):
        del coll_scores_community[c]
        del indv_scores_community[c]
    else:
        print(c, " & ", len(coll_scores_community[c]), "\\\\")

community_df = pd.DataFrame()
communities = list(coll_scores_community.keys())
community_df["community"] = communities
community_df["collectivism"] = zero_one_normalize([np.median(coll_scores_community[x]) for x in communities])
community_df["individualism"] = zero_one_normalize([np.median(indv_scores_community[x]) for x in communities])

community_df.to_csv("community_scores.csv")

Exurbs 207
Graying America 164
African American South 252
Evangelical Hubs 269
Working Class Country 159
Military Posts 70
Urban Suburbs 103
College Towns 151
Big Cities 46
Hispanic Centers 87
Rural Middle America 403
Middle Suburbs 77


### Vandello-Cohen Correlation

In [86]:
vc_scores = pd.read_csv("GCI Data/vandello_cohen.csv")
vc_scores = dict(zip(vc_scores.State, vc_scores.Score))

state_data = [vc_scores, coll_scores_state, indv_scores_state]
state_labels = ["vandello-cohen", "collectivism", "individualism"]

#make state df
states, state_data_processed = process_dicts(state_data)
state_df = pd.DataFrame(columns=state_labels)
for i in range(len(state_labels)):
    state_df[state_labels[i]] = zero_one_normalize(state_data_processed[i])
state_df.set_index(states, inplace=True)

print("Pairwise Pearson Correlations at the state level")
display(state_df.corr())



Pairwise Pearson Correlations at the state level


Unnamed: 0,vandello-cohen,collectivism,individualism
vandello-cohen,1.0,0.38833,-0.374328
collectivism,0.38833,1.0,-0.363711
individualism,-0.374328,-0.363711,1.0


In [23]:
db_indv_coll = sqlalchemy.engine.url.URL(drivername='mysql', host='127.0.0.1', database='individualism_collectivism', query={'read_default_file': '~/.my.cnf', 'charset':'utf8'})

engine = sqlalchemy.create_engine(db_indv_coll)
interpolations = pd.read_sql("ic2s2_interpolations", con=engine)

cnty = interpolations["cnty"]
diff_interpolated = interpolations["diff_interpolated"]

norm_cnty = []
norm_diff_interpolated = []
test = []
for i in range(len(cnty)):
    if(diff_interpolated[i] > 0 or diff_interpolated[i] <= 0):
        norm_cnty.append(cnty[i])
        norm_diff_interpolated.append(diff_interpolated[i])
        test.append(1)
norm_diff_interpolated = zero_one_normalize(norm_diff_interpolated)
print(len(norm_diff_interpolated))
normalized_diff = pd.DataFrame()
normalized_diff["cnty"] = norm_cnty
normalized_diff["diff"] = norm_diff_interpolated
normalized_diff["test"] = test

normalized_diff.to_sql("ic2s2_interpolations_normalized", con=engine, index=False, if_exists='replace', chunksize=50000)


  """Entry point for launching an IPython kernel.


3134
