In [None]:
import os, sys
import pandas as pd
import numpy as np
import re
from googletrans import Translator
import sqlalchemy
from tqdm import tqdm, trange
from scipy.stats.stats import pearsonr  

## Read Data

In [None]:
# Connection URLs for the two MySQL schemas read below. The driver is pointed at
# ~/.my.cnf through `read_default_file` (presumably where the credentials live --
# none are given here).
# URL.create() is the public constructor from SQLAlchemy 1.4 onward; calling URL()
# directly is deprecated in 1.4 and fails on 2.0. Fall back to URL() itself on
# older releases that do not have create().
_make_url = getattr(sqlalchemy.engine.url.URL, "create", sqlalchemy.engine.url.URL)
db = _make_url(drivername='mysql', host='127.0.0.1', database='twitterSuperUsers', query={'read_default_file': '~/.my.cnf', 'charset':'utf8mb4'})
db_lexica = _make_url(drivername='mysql', host='127.0.0.1', database='dlatk_lexica', query={'read_default_file': '~/.my.cnf', 'charset':'utf8'})

# One engine per schema, each under its own name, so re-running a single line
# cannot silently read from the wrong database.
engine_twitter = sqlalchemy.create_engine(db)

# Lexicon-category tables at county and state level; later cells filter them on
# `feat` (INDIVIDUALIST / COLLECTIVIST) and read `group_id` / `group_norm`.
county_scores = pd.read_sql("feat$cat_individVsCollect_w$msgs_100u$cnty$1gra", con=engine_twitter)
state_scores = pd.read_sql("feat$cat_individVsCollect_w$msgs_100u$state$1gra", con=engine_twitter)

# Per-word tables at county and state level; later cells filter them on `feat`
# (the word) and read `group_id` / `group_norm`.
county_counts = pd.read_sql("feat$words_individVsCollect$msgs_100u$cnty$1gra", con=engine_twitter)
state_counts = pd.read_sql("feat$words_individVsCollect$msgs_100u$state$1gra", con=engine_twitter)

# The lexicon itself; later cells read its `term` and `category` columns.
engine_lexica = sqlalchemy.create_engine(db_lexica)
lexicon = pd.read_sql("individVsCollect", con=engine_lexica)

# Keep `engine` bound the way the original cell left it (pointing at the lexica schema).
engine = engine_lexica


In [None]:
#correlate the count of each individual word across states/counties and the overall lexicon score across states/counties
def validate_word(word, lexicon_score, word_counts):
    """Correlate one word's per-group value with the overall lexicon score.

    Parameters
    ----------
    word : str
        The lexicon term being validated. It is not used in the computation;
        the caller passes ``word_counts`` already filtered to this term.
    lexicon_score : pandas.DataFrame
        Lexicon-category rows with (at least) ``group_id`` and ``group_norm``.
    word_counts : pandas.DataFrame
        Rows for this word with (at least) ``group_id`` and ``group_norm``.

    Returns
    -------
    None
        If the word has no rows, or fewer than two groups have both a lexicon
        score and a value for the word (Pearson r needs at least two pairs).
    otherwise
        Whatever ``pearsonr`` returns for the paired ``group_norm`` columns;
        callers read element ``[0]`` as the correlation coefficient.
    """
    # bypass words that do not show up in any Tweets
    if len(word_counts) == 0:
        return None

    scores = lexicon_score[["group_id", "group_norm"]]
    counts = word_counts[["group_id", "group_norm"]].drop_duplicates()

    # Pair the two tables on group_id explicitly. Sorting each side separately and
    # relying on position only lines up when both sides hold exactly the same
    # groups; a group with a word value but no lexicon score would otherwise
    # shift every later pair (or make the lengths differ and crash pearsonr).
    paired = scores.merge(
        counts, on="group_id", how="inner", suffixes=("_score", "_count")
    ).sort_values(by=["group_id"])

    # pearsonr rejects inputs with fewer than two observations
    if len(paired) < 2:
        return None

    # return Pearson correlation between overall score and word value
    return pearsonr(paired["group_norm_score"], paired["group_norm_count"])


# get correlations of each word in the lexicon with the overall lexicon score
def validate_lexicon(lexicon, lexicon_score, total_word_counts):
    """Run ``validate_word`` for every term of a lexicon.

    Parameters
    ----------
    lexicon : pandas.DataFrame
        Lexicon rows; only the ``term`` column is read.
    lexicon_score : pandas.DataFrame
        Lexicon-category scores, passed through unchanged to ``validate_word``.
    total_word_counts : pandas.DataFrame
        Per-word rows for all words; ``feat`` holds the word each row belongs to.

    Returns
    -------
    dict
        Maps each term to the value ``validate_word`` returned for it.
    """
    # Split the counts table by word once, instead of scanning the whole table
    # with a boolean mask for every term in the lexicon.
    counts_by_word = total_word_counts.groupby("feat")
    words_with_counts = counts_by_word.groups
    # Zero-row frame with the same columns, for terms that never occur.
    no_counts = total_word_counts.iloc[0:0]

    correlations = {}
    for word in lexicon["term"]:
        if word in words_with_counts:
            word_counts = counts_by_word.get_group(word)
        else:
            word_counts = no_counts
        correlations[word] = validate_word(word, lexicon_score, word_counts)
    return correlations

# get all relevant words in the lexicon and their correlation with the overall score
def get_valid_words(correlations_dict):
    """Keep the terms that have a correlation and rank them, highest first.

    ``correlations_dict`` maps a term to either None or an indexable result whose
    first element is the correlation coefficient. Terms mapped to None are left
    out. The returned dict maps term -> coefficient, ordered by descending
    coefficient.
    """
    ranked = [
        (term, result[0])
        for term, result in correlations_dict.items()
        if result is not None
    ]
    # sort ascending on the negated coefficient, i.e. largest coefficient first
    ranked.sort(key=lambda pair: -pair[1])
    return dict(ranked)

In [None]:
# Partition the lexicon by category.
lexicon_indv = lexicon.loc[lexicon["category"] == "INDIVIDUALIST"]
lexicon_coll = lexicon.loc[lexicon["category"] == "COLLECTIVIST"]

# County level: correlate every term with the score of its own category.
validate_indv_county = validate_lexicon(
    lexicon=lexicon_indv,
    lexicon_score=county_scores.loc[county_scores["feat"] == "INDIVIDUALIST"],
    total_word_counts=county_counts,
)
validate_coll_county = validate_lexicon(
    lexicon=lexicon_coll,
    lexicon_score=county_scores.loc[county_scores["feat"] == "COLLECTIVIST"],
    total_word_counts=county_counts,
)

# State level: same procedure on the state tables.
validate_indv_state = validate_lexicon(
    lexicon=lexicon_indv,
    lexicon_score=state_scores.loc[state_scores["feat"] == "INDIVIDUALIST"],
    total_word_counts=state_counts,
)
validate_coll_state = validate_lexicon(
    lexicon=lexicon_coll,
    lexicon_score=state_scores.loc[state_scores["feat"] == "COLLECTIVIST"],
    total_word_counts=state_counts,
)

In [None]:
# For each category / geographic level, drop the terms that have no correlation
# and rank the rest by correlation coefficient (highest first).
(
    validated_indv_county,
    validated_coll_county,
    validated_indv_state,
    validated_coll_state,
) = (
    get_valid_words(correlations)
    for correlations in (
        validate_indv_county,
        validate_coll_county,
        validate_indv_state,
        validate_coll_state,
    )
)

def _average_correlations(terms, county_correlations, state_correlations):
    """Average each term's county-level and state-level correlation coefficient.

    A term that is missing from one of the dicts, or mapped to None there,
    contributes 0 for that level; the sum is always divided by 2. The returned
    dict maps term -> average, ordered from highest to lowest average.
    """
    averaged = {}
    for term in terms:
        # `.get(term)` yields None both for a missing term and for a stored None
        county_r = (county_correlations.get(term) or (0, 0))[0]
        state_r = (state_correlations.get(term) or (0, 0))[0]
        averaged[term] = (county_r + state_r) / 2
    return dict(sorted(averaged.items(), key=lambda item: -item[1]))


# Same computation for both categories, so it lives in one helper instead of
# two copy-pasted loops.
validated_indv_lexicon = _average_correlations(
    lexicon_indv["term"], validate_indv_county, validate_indv_state
)
validated_coll_lexicon = _average_correlations(
    lexicon_coll["term"], validate_coll_county, validate_coll_state
)