In [20]:
import pandas as pd
import collections
import re
import json
import random

In [2]:
# Load the SOC-mapped profession taxonomy (one row per profession / word-sense
# pair). No column is used as the index, so an index column saved in the CSV
# surfaces as a regular "Unnamed: 0" column.
taxonomy_csv_path = "../csv/soc-mapped-expanded-taxonomy.csv"
taxonomy = pd.read_csv(taxonomy_csv_path, index_col=None)

In [3]:
# Preview the first five rows to check the column layout.
taxonomy.head(n=5)

Unnamed: 0,profession,sense-name,sense-definition,primary-soc,secondary-soc
0,accountant,accountant.n.01,someone who maintains and audits business acco...,13:Business and Financial Operations Occupations,
1,acrobat,acrobat.n.01,an athlete who performs acts requiring skill a...,"27:Arts, Design, Entertainment, Sports, and Me...",
2,actor,actor.n.01,a theatrical performer,"27:Arts, Design, Entertainment, Sports, and Me...",
3,actor,actor.n.02,a person who acts and gets things done,"27:Arts, Design, Entertainment, Sports, and Me...",
4,actress,actress.n.01,a female actor,"27:Arts, Design, Entertainment, Sports, and Me...",


In [10]:
# Tally how many taxonomy rows fall under each primary SOC group; rows with a
# missing group are ignored. Sorting the labels first fixes the counter's
# insertion order, so the report below is printed sorted by group label.
primary_soc_sorted = taxonomy["primary-soc"].dropna().sort_values()
soc_values_dist = collections.Counter(primary_soc_sorted)
for soc_value in soc_values_dist:
    n = soc_values_dist[soc_value]
    print(f"{soc_value:70s} = {n:3d}")

11:Management Occupations                                              =  63
13:Business and Financial Operations Occupations                       =   9
15:Computer and Mathematical Occupations                               =   3
17:Architecture and Engineering Occupations                            =   7
19:Life, Physical, and Social Science Occupations                      =  17
21:Community and Social Service Occupations                            =  45
23:Legal Occupations                                                   =  22
25:Educational Instruction and Library Occupations                     =  16
27:Arts, Design, Entertainment, Sports, and Media Occupations          = 131
29:Healthcare Practitioners and Technical Occupations                  =  27
31:Healthcare Support Occupations                                      =   2
33:Protective Service Occupations                                      =  70
35:Food Preparation and Serving Related Occupations                    =  13

In [47]:
# Number of taxonomy rows for every (SOC group, word sense) pair, largest first.
df = (
    taxonomy
    .groupby(["primary-soc", "sense-name"])["profession"]
    .agg(len)
    .sort_values(ascending=False)
)

# SOC group label of each sense that is shared by at least two rows.
shared_sense_groups = df[df >= 2].index.get_level_values(0)

# (SOC group, number of shared senses) pairs ordered by descending count;
# groups with equal counts keep the order in which they were first seen.
arr = collections.Counter(shared_sense_groups).most_common()
arr[:10]

[('33:Protective Service Occupations', 12),
 ('27:Arts, Design, Entertainment, Sports, and Media Occupations', 12),
 ('53:Transportation and Material Moving Occupations', 7),
 ('11:Management Occupations', 6),
 ('55:Military Specific Occupations', 4),
 ('51:Production Occupations', 4),
 ('23:Legal Occupations', 4),
 ('37:Building and Grounds Cleaning and Maintenance Occupations', 3),
 ('43:Office and Administrative Support Occupations', 3),
 ('39:Personal Care and Service Occupations', 3)]

In [45]:
n_soc = 5  # number of soc groups to show
n_senses = 5  # number of senses to show per soc group

# Hand-picked SOC groups to visualise; only the first `n_soc` entries are used.
soc_groups = [
    "33:Protective Service Occupations",
    "27:Arts, Design, Entertainment, Sports, and Media Occupations",
    "53:Transportation and Material Moving Occupations",
    "11:Management Occupations",
    "29:Healthcare Practitioners and Technical Occupations"
]


def build_taxonomy_tree(taxonomy_df, n_senses, root_name="Taxonomy"):
    """Build a nested ``{"name", "children"}`` tree from taxonomy rows.

    The tree has three levels below the root: SOC group -> word sense ->
    profession. For every SOC group only the ``n_senses`` senses with the most
    rows are kept; ties are resolved by sense-name order.

    Parameters
    ----------
    taxonomy_df : pandas.DataFrame
        Must contain the columns "primary-soc", "sense-name",
        "sense-definition" and "profession".
    n_senses : int
        Maximum number of senses to keep per SOC group.
    root_name : str
        Label of the root node.

    Returns
    -------
    dict
        Root node; every non-leaf node has "name" and "children" keys, and
        every leaf (profession) node has only "name".

    Notes
    -----
    Rows with a missing "primary-soc" or "sense-definition" are dropped by the
    groupings (``dropna=True``), so a selected sense whose definition is
    missing does not appear in the tree.
    """
    root = {"name": root_name, "children": []}

    for soc, soc_df in taxonomy_df.groupby("primary-soc", dropna=True, sort=True):
        # Rank the senses of this group by how many rows they cover. The
        # grouping yields senses in name order and sorted() is stable, so
        # equally sized senses stay in name order.
        sense_sizes = soc_df.groupby("sense-name").size()
        ranked_senses = sorted(sense_sizes.items(), key=lambda item: item[1], reverse=True)
        top_sense_names = [sense_name for sense_name, _ in ranked_senses[:n_senses]]
        top_senses_df = soc_df[soc_df["sense-name"].isin(top_sense_names)]

        # Build each sense node completely before attaching it, instead of
        # reaching back into the tree through positional indices.
        sense_nodes = []
        for (sense_name, sense_definition), sense_df in top_senses_df.groupby(
                ["sense-name", "sense-definition"], dropna=True, sort=True):
            sense_nodes.append({
                "name": f"{sense_name} - {sense_definition}",
                "children": [{"name": profession} for profession in sense_df["profession"]],
            })

        # Drop the numeric "NN:" code prefix whatever its width; a label
        # without a colon is kept unchanged.
        root["children"].append({"name": soc.split(":", 1)[-1], "children": sense_nodes})

    return root


# Apply the n_soc limit that was previously declared but never used; with five
# listed groups and n_soc = 5 the selection is unchanged.
small_taxonomy = taxonomy[taxonomy["primary-soc"].isin(soc_groups[:n_soc])]

taxonomy_dict = build_taxonomy_tree(small_taxonomy, n_senses)

In [46]:
# Write the tree for the d3 visualisation. The context manager guarantees the
# file is flushed and closed even if serialisation fails part-way.
with open("../d3/flare.json", "w") as flare_file:
    json.dump(taxonomy_dict, flare_file, indent=2)