In [None]:
# Clear every user-defined name from the interactive namespace; -f skips the
# confirmation prompt so the cell can run unattended.
%reset -f

In [2]:
import os
import random
import re
from pathlib import Path
import numpy as np
import pandas as pd
from scipy.spatial.distance import cdist, pdist, squareform
from tqdm import tqdm

In [3]:
# Directory layout, anchored on the current working directory:
#   PROJ - the working directory with symlinks resolved
#   ROOT - the directory that contains PROJ
#   DATA - the "data" folder directly under ROOT
PROJ = Path(os.path.realpath("."))
ROOT = PROJ.parent
DATA = ROOT.joinpath("data")

In [4]:
# Load the embedding tables: one frame for classification A, then one frame
# per classification-B scheme, kept in the same order as clas_b_list.
clas_a = pd.read_parquet(DATA / "intermediate/clas_a_vec.parquet")
clas_b_list = ["HS2", "HS4", "NAICS2", "NAICS4"]
clas_b_dfs = list(
    pd.read_parquet(DATA / f"intermediate/clas_b_vec_{scheme}.parquet")
    for scheme in clas_b_list
)
# Preview the first rows of the classification-A embeddings.
clas_a.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,0,1,2,3,4,5,6,7,8,9,...,290,291,292,293,294,295,296,297,298,299
clas_a_title,clas_a_code,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
Business Support Services,a0,-0.025925,0.025082,0.01221,0.04168,-0.037457,0.041801,0.055334,0.005141,-0.022281,0.001679,...,0.030224,0.034443,-0.018788,0.03035,0.011706,0.009458,0.005382,0.043995,-0.007689,0.015419
Fishing,a1,0.067962,0.046946,0.048757,0.081487,-0.025844,0.041846,0.05249,-0.045384,0.019322,-0.050553,...,-0.040666,-0.004089,-0.001307,0.025173,-0.010149,-0.046725,-0.11087,0.111892,-0.01331,-0.068479
Agriculture Employment,a2,0.008636,0.021638,0.011066,0.013775,0.00163,0.02051,0.035688,0.008609,0.042362,0.00388,...,-0.000912,0.05352,-0.024568,0.040491,0.054232,-0.031612,-0.000425,0.036069,0.03319,0.009171
Animal production,a3,0.03675,0.033058,0.01889,0.080253,0.013234,-0.004848,0.049736,-0.008246,-0.018134,0.019848,...,-0.029893,0.039475,-0.038676,0.03565,0.064141,-0.022897,-0.0345,0.088468,0.039201,0.01525
Support activities of Mining,a4,-0.017894,0.035146,0.002398,0.047889,-0.005247,0.007037,0.041748,-0.031951,0.020205,-0.021872,...,0.014639,0.013979,-0.018705,0.036255,0.039297,-0.005247,0.022188,0.051874,0.015406,0.001576


In [5]:
def get_topn(clas_a, clas_b, nlargest=5):
    """Find, for every row of ``clas_a``, the most similar rows of ``clas_b``.

    Similarity is the cosine similarity (``1 - cosine distance``) between the
    row vectors of the two frames.

    Parameters
    ----------
    clas_a : pandas.DataFrame
        One vector per row. Its index is reused as the index of both results.
    clas_b : pandas.DataFrame
        One vector per row, with as many columns as ``clas_a``. Iterating over
        its index must yield 2-tuples: the first element of each tuple is
        reported in the "codes" result, the second in the "names" result.
    nlargest : int, default 5
        Number of matches to keep per ``clas_a`` row. If it exceeds the number
        of rows in ``clas_b``, all rows of ``clas_b`` are returned instead.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(result_codes, result_names)``. Both are indexed like ``clas_a`` and
        have columns ``top1``, ``top2``, ... ordered from most to least
        similar; equal similarities keep the row order of ``clas_b``.

    Raises
    ------
    ValueError
        If ``nlargest`` is negative.
    """
    if nlargest < 0:
        raise ValueError("nlargest must be non-negative, got {}".format(nlargest))

    similarity = 1 - cdist(clas_a.values, clas_b.values, metric="cosine")

    # Never ask for more matches than there are candidates; otherwise the
    # column labels below would not match the width of the selected data.
    n_keep = min(nlargest, similarity.shape[1])

    # Negate so that the ascending sort puts the highest similarity first.
    # A stable sort makes the order of tied candidates deterministic.
    order = np.argsort(-similarity, axis=1, kind="stable")[:, :n_keep]

    candidates = list(clas_b.index)
    clas_b_codes = np.array([code for code, _ in candidates])
    clas_b_names = np.array([name for _, name in candidates])

    top_columns = ["top{}".format(rank) for rank in range(1, n_keep + 1)]
    result_codes = pd.DataFrame(
        clas_b_codes[order], columns=top_columns, index=clas_a.index
    )
    result_names = pd.DataFrame(
        clas_b_names[order], columns=top_columns, index=clas_a.index
    )
    return result_codes, result_names

In [7]:
# Number of best matches kept per clas_a row; it is the same for every
# scheme, so it is set once instead of on each loop iteration.
nlargest = 5

# to_csv does not create missing directories, so make sure the target exists.
processed_dir = DATA / "processed"
processed_dir.mkdir(parents=True, exist_ok=True)

# Write one codes file and one titles file per classification-B scheme.
# After the loop, topn_codes / topn_titles hold the results of the last scheme.
for clas_b, clas_b_type in zip(clas_b_dfs, clas_b_list):
    topn_codes, topn_titles = get_topn(clas_a, clas_b, nlargest)
    topn_codes.to_csv(processed_dir / f"top{nlargest}_codes_{clas_b_type}.csv", index=True)
    topn_titles.to_csv(processed_dir / f"top{nlargest}_titles_{clas_b_type}.csv", index=True)

In [9]:
# topn_titles is reassigned on every iteration of the export loop, so this
# previews only the last scheme processed (the final entry of clas_b_list, "NAICS4").
topn_titles.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,top1,top2,top3,top4,top5
clas_a_title,clas_a_code,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Business Support Services,a0,Business Support Services,Other Support Services,Educational Support Services,Facilities Support Services,Other Information Services
Fishing,a1,Fishing,Hunting and Trapping,Aquaculture,Hog and Pig Farming,"Deep Sea, Coastal, and Great Lakes Water Trans..."
Agriculture Employment,a2,"Agriculture, Construction, and Mining Machiner...",Hog and Pig Farming,Employment Services,Support Activities for Forestry,Cattle Ranching and Farming
Animal production,a3,Other Animal Production,Animal Slaughtering and Processing,Support Activities for Animal Production,Animal Food Manufacturing,Poultry and Egg Production
Support activities of Mining,a4,Support Activities for Mining,Support Activities for Forestry,"Agriculture, Construction, and Mining Machiner...",Support Activities for Air Transportation,Other Support Activities for Transportation
