In [None]:
import os, sys
import pandas as pd
import numpy as np
import re
from googletrans import Translator
import sqlalchemy
from tqdm import tqdm, trange
from scipy.stats.stats import pearsonr  

## Read Data

In [None]:
# Connection URLs for the two MySQL databases on the local server. Both point
# the driver at ~/.my.cnf through the read_default_file option; they differ
# only in database name and charset.
db = sqlalchemy.engine.url.URL(
    drivername='mysql',
    host='127.0.0.1',
    database='individualism_collectivism',
    query={'read_default_file': '~/.my.cnf', 'charset':'utf8mb4'}
)
db_lexica = sqlalchemy.engine.url.URL(
    drivername='mysql',
    host='127.0.0.1',
    database='dlatk_lexica',
    query={'read_default_file': '~/.my.cnf', 'charset':'utf8'}
)

# Every table read in this cell goes through the individualism_collectivism engine.
engine = sqlalchemy.create_engine(db)


def _load_table(table_name):
    """Read the whole of `table_name` through `engine` into a DataFrame."""
    return pd.read_sql(table_name, con=engine)


# Lexicon scores per county / state.
county_scores = _load_table("feat$cat_ablation_04_075_w$msgs_100u$cnty$1gra")
state_scores = _load_table("feat$cat_individVsCollect_w$msgs_100u$state$1gra")

# Per-word rows per county / state.
county_counts = _load_table("feat$1gram$msgs_100u$cnty$ablation_all_words")
state_counts = _load_table("feat$words_individVsCollect$msgs_100u$state$1gra")

# Lexicon file; later cells read its WORD and CATEGORY columns.
lexicon = pd.read_csv("ablation/ablation_0.4_0.75.csv")


In [None]:
def validate_word(word, lexicon_score, word_counts):
    """Correlate one word's per-group value with the overall lexicon score.

    Rows are paired by ``group_id`` (county/state id) with an inner join, so
    only groups present in BOTH frames contribute. Pairing by position after
    sorting each frame separately breaks as soon as ``word_counts`` contains a
    group that ``lexicon_score`` lacks: the two series then differ in length
    and ``pearsonr`` raises.

    Parameters
    ----------
    word : str
        The word being validated; used only in the diagnostic messages.
    lexicon_score : pandas.DataFrame
        Overall lexicon score per group. Must have ``group_id`` and
        ``group_norm`` columns.
    word_counts : pandas.DataFrame
        Rows for this word only. Must have ``group_id`` and ``group_norm``
        columns. Fully duplicated rows are dropped before pairing.

    Returns
    -------
    The ``pearsonr`` result (correlation at index 0, p-value at index 1), or
    ``None`` when the word has no rows at all or fewer than two groups overlap
    (a Pearson correlation needs at least two points).
    """
    # bypass words that do not show up in any Tweets
    if len(word_counts) == 0:
        print("No Tweets for " + word)
        return None

    # Inner join keeps exactly the groups that have both a score and a count,
    # and guarantees each score is compared with the count of the same group.
    paired = lexicon_score[["group_id", "group_norm"]].merge(
        word_counts.drop_duplicates()[["group_id", "group_norm"]],
        on="group_id",
        how="inner",
        suffixes=("_score", "_count"),
    ).sort_values(by=["group_id"])

    # pearsonr raises on fewer than two observations; treat that like a
    # missing word so one sparse word does not abort a whole lexicon run.
    if len(paired) < 2:
        print("Not enough groups for " + word)
        return None

    # return Pearson correlation between overall score and word count
    return pearsonr(paired["group_norm_score"], paired["group_norm_count"])


def validate_lexicon(lexicon, lexicon_score, total_word_counts):
    """Run ``validate_word`` for every entry of ``lexicon["WORD"]``.

    Each word is checked against the rows of ``total_word_counts`` whose
    ``feat`` column equals that word. Returns a dict mapping each word to
    whatever ``validate_word`` returned for it; a word listed more than once
    keeps the result of its last occurrence.
    """
    def rows_for(term):
        # Rows of the full count table that belong to this one word.
        return total_word_counts[total_word_counts["feat"] == term]

    return {
        term: validate_word(term, lexicon_score, rows_for(term))
        for term in lexicon["WORD"]
    }

def get_valid_words(correlations_dict):
    """Keep the words that have a result and rank them by correlation.

    ``correlations_dict`` maps each word to either ``None`` or an indexable
    result whose element 0 is the correlation. Entries that are ``None`` are
    dropped. The returned dict maps word to correlation, ordered from the
    highest correlation to the lowest (ties keep their input order).
    """
    scored = {
        term: outcome[0]
        for term, outcome in correlations_dict.items()
        if outcome is not None
    }
    # Negating the value sorts descending while keeping the sort stable.
    ranked = sorted(scored.items(), key=lambda pair: -pair[1])
    return dict(ranked)

In [None]:
# Split the lexicon by category so each half is validated against its own score.
lexicon_indv = lexicon[lexicon["CATEGORY"] == "INDIVIDUALISM"]
lexicon_coll = lexicon[lexicon["CATEGORY"] == "COLLECTIVISM"]

# Per-word correlation results at county level, one dict per category.
validate_indv_county = validate_lexicon(lexicon_indv, county_scores[county_scores["feat"]=="INDIVIDUALISM"], county_counts)
validate_coll_county = validate_lexicon(lexicon_coll, county_scores[county_scores["feat"]=="COLLECTIVISM"], county_counts)

# Drop words without a result and rank the rest by correlation.
validated_indv_county = get_valid_words(validate_indv_county)
validated_coll_county = get_valid_words(validate_coll_county)

# Write one purified collectivism lexicon per correlation threshold.
# NOTE(review): only the collectivism words are exported here;
# validated_indv_county is computed but not written in this cell -- confirm
# that is intended.
thresholds = [0.1, 0.15, 0.2, 0.25, 0.3]
for t in thresholds:
    FILENAME = "purification_coll_{}.csv".format(t)
    # Collect all rows first and build the frame once. DataFrame.append was
    # removed in pandas 2.0 and copied the whole frame on every call.
    rows = [
        {'WORD': word, 'CATEGORY': 'COLLECTIVISM', 'WEIGHT': weight}
        for word, weight in validated_coll_county.items()
        if weight > t
    ]
    # Explicit columns keep the header (and its order) even when no word passes.
    df = pd.DataFrame(rows, columns=['WORD', 'CATEGORY', 'WEIGHT'])
    df.to_csv(FILENAME, index=False)