In [None]:
import numpy as np
import os
import pandas as pd
import pickle
import re
import time
from collections import Counter, defaultdict
from sklearn.linear_model import LogisticRegression

In [None]:
def separate_chunks(list_, n_per_chunk_):
    """Yield successive slices of ``list_`` holding at most ``n_per_chunk_`` items.

    The last slice is shorter when ``len(list_)`` is not a multiple of
    ``n_per_chunk_``. Used in this notebook to split the flat output from R.
    """
    chunk_starts = range(0, len(list_), n_per_chunk_)
    for start in chunk_starts:
        stop = start + n_per_chunk_
        yield list_[start:stop]
        

def docs_as_dists(topic_list_):
    """Turn per-topic sequences into per-document vectors (a transpose).

    ``topic_list_[t][d]`` is read as the value of topic ``t`` for document
    ``d``; the result holds one list per document containing that document's
    value for every topic, in topic order. All topic sequences must have the
    same length, otherwise the assertion fails.
    """
    distinct_lengths = {len(topic_) for topic_ in topic_list_}
    assert len(distinct_lengths) == 1
    (n_docs,) = distinct_lengths
    n_topics = len(topic_list_)
    return [
        [topic_list_[t][d] for t in range(n_topics)]
        for d in range(n_docs)
    ]


def _tjur_r2(X, y):
    """Fit a logistic regression of ``y`` on ``X`` and return Tjur's R^2.

    The value is the mean fitted probability (second column of
    ``predict_proba``) over observations where ``y == True`` minus the same
    mean over observations where ``y == False``.
    """
    # NOTE(review): scikit-learn deprecated the string penalty='none' in 1.2 in
    # favour of penalty=None; left as written -- confirm against the installed version.
    model = LogisticRegression(penalty='none', dual=False, tol=0.0001, C=1.0, fit_intercept=True, intercept_scaling=1, max_iter=200, multi_class='auto', verbose=0).fit(X, y)
    p_second_class = model.predict_proba(X)[:, 1]
    y_arr = np.asarray(y)
    # Element-wise `== True` / `== False` is deliberate: it works for boolean
    # and 0/1 outcomes alike and leaves any other value out of both groups.
    return p_second_class[y_arr == True].mean() - p_second_class[y_arr == False].mean()


def tjur_r2_ALL(X_df, y_df):
    """
    Tjur's R^2 for every (outcome, predictor) pair plus a full model per outcome.

    For each column of ``y_df`` a separate logistic regression is fitted on
    every single column of ``X_df``, and then one more on all columns of
    ``X_df`` together. The predictor and outcome names are printed whenever a
    single-predictor R^2 is at least 0.2.

    Parameters
    ----------
    X_df : pandas.DataFrame
        Predictors, one per column.
    y_df : pandas.DataFrame
        Outcomes, one per column; values are compared against True/False.

    Returns
    -------
    numpy.ndarray
        One row per column of ``y_df``. Each row holds one value per column of
        ``X_df`` (in column order) followed by the full-model value.

    Reference
    ---------
    Tjur, T. (2009). Coefficients of determination in logistic regression 
    models—a new proposal: The coefficient of discrimination. The American 
    Statistician, 63(4),366-372. DOI: 10.1198/tast.2009.08210
    """
    X_cols = X_df.columns
    # The full design matrix does not depend on the outcome, so build it once.
    X_full = X_df[X_cols].to_numpy()
    r2s = []
    for y_col in y_df.columns:
        y = y_df[y_col].values
        r2_row = []
        for x_col in X_cols:
            # scikit-learn expects a 2-D design matrix, hence the reshape.
            r2 = _tjur_r2(X_df[x_col].values.reshape(-1, 1), y)
            r2_row.append(r2)
            if r2 >= 0.2:
                print(x_col, y_col)
        # Last entry of the row: all predictors together.
        r2_row.append(_tjur_r2(X_full, y))
        r2s.append(r2_row)
    return np.array(r2s)

In [None]:
path_to_sims = "simulated_corpora/"

# Read the directory once and filter that single snapshot for every artifact
# type, instead of re-listing the directory for each of the eleven filters.
sim_dir_entries = os.listdir(path_to_sims)

# File names containing "ngrams_df_sim".
fs1 = [f for f in sim_dir_entries if "ngrams_df_sim" in f]

# File names carrying the k15 / lda15 tags.
fs2 = [f for f in sim_dir_entries if "topicmodel_k15_sim" in f]
fs3 = [f for f in sim_dir_entries if "topic_probs_k15_sim" in f]
fs4 = [f for f in sim_dir_entries if "_lda15gamma" in f]
fs5 = [f for f in sim_dir_entries if "_lda15beta" in f]
fs6 = [f for f in sim_dir_entries if "likeliest_terms_k15" in f]

# File names carrying the k35 / lda35 tags.
fs7 = [f for f in sim_dir_entries if "topicmodel_k35_sim" in f]
fs8 = [f for f in sim_dir_entries if "topic_probs_k35_sim" in f]
fs9 = [f for f in sim_dir_entries if "_lda35gamma" in f]
fs10 = [f for f in sim_dir_entries if "_lda35beta" in f]
fs11 = [f for f in sim_dir_entries if "likeliest_terms_k35" in f]

In [None]:
# Collect the eleven file-name lists so later cells can treat them uniformly.
fs = [
    fs1,
    fs2, fs3, fs4, fs5, fs6,
    fs7, fs8, fs9, fs10, fs11,
]

# Show how many file names each substring filter matched, one count per line.
for file_names in fs:
    n_matched = len(file_names)
    print(n_matched)

In [None]:
reg = "sim[0-9]{1,4}"

# Every list except fs[0] contributes the simulation numbers found in its file names.
file_lists = fs[1:]

sim_nums = []
for file_names in file_lists:
    # First "sim<digits>" match in each file name, reduced to its integer part.
    sim_nums += [int(re.findall(reg, name)[0].replace("sim", "")) for name in file_names]

# A simulation number that occurs once in each file list occurs exactly
# len(file_lists) times overall; derive that count instead of hard-coding it.
c = Counter(sim_nums)
expected_occurrences = len(file_lists)
for sim_num, value in c.items():
    assert value == expected_occurrences, (
        f"simulation {sim_num} occurs {value} times, expected {expected_occurrences}"
    )

sim_nums = set(sim_nums)
print(len(sim_nums))
print(min(sim_nums))
print(max(sim_nums))

# NOTE(review): 1..1084 is presumably the full range of simulation ids -- confirm.
missing = [i for i in range(1,1085) if i not in sim_nums]
print(len(missing))

In [None]:
# Load the comparison table from CSV; later cells slice the qualitative code
# columns out of it.
df15 = pd.read_csv("df_for_comparison_k15.csv")

In [None]:
# Number of columns in df15, shown as the cell output.
len(df15.columns)

In [None]:
# Keep the columns from position 13 up to (not including) the last 18, then
# drop "Index Codes Applied" from that selection.
cols = df15.columns[13:-18].tolist()
cols.remove("Index Codes Applied")
print(cols)

In [None]:
# Expected column names, hard-coded so the following cell can assert that the
# positional slice of df15 selected exactly these columns in this order.
orig_cols = ['Academic Medicine Applied',
 'Culture of medicine Applied',
 'Expectations Applied',
 'Hospital/clinic hours and environment Applied',
 'Incentive/payment structure Applied',
 'Interpersonal Applied',
 'Job changes Applied',
 'Medical training Applied',
 'Missed opportunities Applied',
 'Pay/Compensation Applied',
 'Psychological Applied',
 'Sub-specialities Applied',
 'Great quote/example Applied',
 'Motherhood Specific Applied',
 'Motherhood Specific/Breastfeeding/Pumping Applied',
 'Motherhood Specific/Childcare/Household challenges Applied',
 'Motherhood Specific/Family leave Applied']

In [None]:
# Guard against the positional slice picking up the wrong columns (names and order must match).
assert cols == orig_cols

In [None]:
# Restrict df15 to the selected columns; df_qual is passed as y_df to tjur_r2_ALL.
df_qual = df15[cols]

In [None]:
# Preview the first rows of df_qual.
df_qual.head()

In [None]:
# Column labels topic_1 ... topic_35; the k15 frame uses the first 15 of them
# and the k35 frame uses all of them.
topic_cols = [f"topic_{i}" for i in range(1,36)]

In [None]:
count = 0
lengths = set()

# Diagnostics: element types seen in the R^2 matrices, and how many cells were exactly 0.0.
cell_types = set()
zero_cells = 0

# r2s_dict_kXX[row][col] -> list of R^2 values, one per simulation in processing
# order; row/col index the matrix returned by tjur_r2_ALL.
r2s_dict_k15 = defaultdict(lambda: defaultdict(list))
r2s_dict_k35 = defaultdict(lambda: defaultdict(list))

# Chunk size used to split the flat token list of a topic_probs file. Each chunk
# becomes one topic column, so this is also the number of rows of each frame.
n_docs_per_topic = 988

start_time = time.time()

for sim in sorted(sim_nums):

    count += 1

    # Same pipeline for both model sizes; k15 is handled before k35 for each simulation.
    for k, r2s_store in ((15, r2s_dict_k15), (35, r2s_dict_k35)):
        f = os.path.join(path_to_sims, f"topic_probs_k{k}_sim{sim}.txt")
        # Context manager so the file handle is closed as soon as it has been read.
        with open(f, "r") as fh:
            tokens = fh.read().split()
        topic_list = list(separate_chunks(tokens, n_docs_per_topic))
        response_vecs = docs_as_dists(topic_list)
        lengths.add(len(response_vecs))

        # NOTE(review): the tokens are still strings here; the regression
        # presumably relies on scikit-learn coercing them to float -- confirm.
        df_k = pd.DataFrame(response_vecs, columns=topic_cols[:k])

        r2s = tjur_r2_ALL(df_k, df_qual)

        for row_i in range(r2s.shape[0]):
            for col_j in range(r2s.shape[1]):
                cell_r2 = r2s[row_i][col_j]
                cell_types.add(type(cell_r2))
                if cell_r2 == 0.0:
                    zero_cells += 1
                r2s_store[row_i][col_j].append(cell_r2)

    if count % 100 == 0:
        print(f"Processed {count}.", "{:.2f} seconds elapsed.".format(time.time() - start_time))

print("Finished. {:.2f} seconds elapsed.".format(time.time() - start_time))

print(cell_types) 
print(count)
print(lengths)

In [None]:
# Re-pack the nested defaultdicts as plain dict-of-dict structures; these plain
# dicts are what the final cell pickles. The per-cell lists are shared with the
# source dicts, not copied. Rows without any cells are left out.
k15_dict = {
    row_key: dict(cells)
    for row_key, cells in r2s_dict_k15.items()
    if cells
}

k35_dict = {
    row_key: dict(cells)
    for row_key, cells in r2s_dict_k35.items()
    if cells
}

In [None]:
# Verify that the plain dicts mirror the nested defaultdicts completely. The
# comparison covers every stored cell, including the final full-model column,
# and iterates with .items() so the check cannot insert keys into the defaultdicts.
for plain, nested in ((k15_dict, r2s_dict_k15), (k35_dict, r2s_dict_k35)):
    expected = {row_key: dict(cells) for row_key, cells in nested.items() if cells}
    assert plain == expected, "plain dict differs from its source defaultdict"
    # One row per outcome column handed to tjur_r2_ALL.
    assert len(plain) == len(df_qual.columns), "unexpected number of rows"

In [None]:
# Summarise the collected results: for k15 and then k35, print the number of
# rows (17 codes) and the number of columns in row 0 (topics + full model,
# i.e. 16 and 36), and gather the distinct per-cell list lengths.
lengths = set()

for nested in (r2s_dict_k15, r2s_dict_k35):
    print(len(nested))
    print(len(nested[0]))
    lengths.update(
        len(cell_values)
        for row_cells in nested.values()
        for cell_values in row_cells.values()
    )

print(lengths)

In [None]:
# Persist both result dicts. The context managers make sure each file is
# flushed and closed as soon as it has been written.
outf = "simulations_tjur_r2_k15.d"
with open(outf, "wb") as fh:
    pickle.dump(k15_dict, fh)

outf = "simulations_tjur_r2_k35.d"
with open(outf, "wb") as fh:
    pickle.dump(k35_dict, fh)