In [1]:
# Load (or reload) IPython's autoreload extension and select mode 2, which
# re-imports all modules before each cell runs, so edits to imported project
# modules take effect without restarting the kernel.
%reload_ext autoreload
%autoreload 2

In [2]:
import os
import re
import sys
from pathlib import Path

import fasttext as ft
import nltk
import numpy as np
import pandas as pd
from tqdm import tqdm

In [3]:
# Directory layout, derived from the directory the notebook is run from:
#   PROJ - the current working directory (symlinks resolved)
#   ROOT - its parent directory
#   DATA - the "data" folder directly under ROOT
PROJ = Path(os.path.realpath("."))
ROOT = PROJ.parent
DATA = ROOT.joinpath("data")

In [4]:
# Make the modules under ROOT/src importable from this notebook.
# The membership check keeps the cell idempotent: re-running it no longer
# piles duplicate entries onto sys.path.
src_dir = str(ROOT / "src")
if src_dir not in sys.path:
    sys.path.append(src_dir)

In [5]:
from get_closest_match import prepare_data_and_embeddings, process_data_and_match

# Read data

In [6]:
# Classification A
# Read the processed CSV and drop the "clas_a_code" column in one chain, so
# re-running the cell always rebuilds the frame from the file.
clas_a_path = DATA / "processed/clas_a.csv"
clas_a_df = pd.read_csv(clas_a_path).drop(columns="clas_a_code")
clas_a_df.head()

Unnamed: 0,clas_a_title
0,Business Support Services
1,Fishing
2,Agriculture Employment
3,Animal production
4,Support activities of Mining


In [7]:
# Classification B
# The workbook has one sheet per code list (HS / NAICS at two levels, going by
# the sheet names). Stack them into a single frame and tag each row with the
# sheet it came from.
sheets = ["HS2", "HS4", "NAICS2", "NAICS4"]

# Passing the list of sheet names opens the workbook once and returns a dict
# {sheet name: DataFrame}. dtype=str reads every column as text, which the
# string concatenation of the code column below relies on.
sheet_frames = pd.read_excel(DATA / "raw/codes.xlsx", sheet_name=sheets, dtype=str)

frames = []
for sheet in sheets:
    clas_b_df_sheet = sheet_frames[sheet]
    # Each sheet is expected to have exactly two columns: code, then title.
    clas_b_df_sheet.columns = ["clas_b_code", "clas_b_title"]
    clas_b_df_sheet["clas_b_type"] = sheet
    frames.append(clas_b_df_sheet)

# DataFrame.append was removed in pandas 2.0 and copied the growing frame on
# every iteration; a single concat replaces it. As before, the per-sheet row
# index is kept, so index labels repeat across sheets.
clas_b_df = pd.concat(frames)

# Prefix every code with its sheet name (e.g. "HS2_1") so codes from different
# sheets cannot collide.
clas_b_df["clas_b_code"] = clas_b_df["clas_b_type"] + "_" + clas_b_df["clas_b_code"]
clas_b_df.head()

Unnamed: 0,clas_b_code,clas_b_title,clas_b_type
0,HS2_1,Live animals,HS2
1,HS2_2,Meat,HS2
2,HS2_3,Fish,HS2
3,HS2_4,Diary products,HS2
4,HS2_5,Animal products,HS2


# Test

In [8]:
# Run the matching of Classification A titles against Classification B and
# keep the 5 best candidates per title (n_best). The column-name options are
# gathered in one place so they are easy to adjust.
match_options = {
    "titlecol_a": "clas_a_title",
    "titlecol_b": "clas_b_title",
    "codecol_b": "clas_b_code",
    "n_best": 5,
}
result_dict = process_data_and_match(clas_a_df, clas_b_df, **match_options)

Pre-processing text


  0%|          | 0/186 [00:00<?, ?it/s]

Preparing embeddings


100%|██████████| 186/186 [00:01<00:00, 169.64it/s]
100%|██████████| 186/186 [00:00<00:00, 6742.21it/s]
100%|██████████| 1678/1678 [00:00<00:00, 49218.80it/s]
100%|██████████| 1678/1678 [00:00<00:00, 13273.01it/s]


In [9]:
# Preview the matched titles: one row per Classification A title, with the
# candidate Classification B titles in columns top1..top5.
result_dict["names"].head()

Unnamed: 0_level_0,Unnamed: 1_level_0,top1,top2,top3,top4,top5
clas_a_title,codecol,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Business Support Services,a0,Business Support Services,Other Support Services,Educational Support Services,Facilities Support Services,Other Information Services
Fishing,a1,Fishing,Fishing and hunting equipment,"Agriculture, Forestry, Fishing and Hunting",Fishing vessels,Fish
Agriculture Employment,a2,"Agriculture, Construction, and Mining Machiner...",Hog and Pig Farming,Employment Services,"Agriculture, Forestry, Fishing and Hunting",Other agricultural machinery
Animal production,a3,Other Animal Production,Other animal fats and oils,Animal products n.e.c.,Animal products,"Animal or vegetable fats, oils or waxes"
Support activities of Mining,a4,Support Activities for Mining,"Mining, Quarrying, and Oil and Gas Extraction",Support Activities for Forestry,"Agriculture, Construction, and Mining Machiner...",Other Support Activities for Transportation


In [10]:
# Preview the scores of the candidates, in the same top1..top5 layout.
# NOTE(review): a score of 1.0 shows up even when the two titles differ in
# wording (e.g. "Animal production" vs "Other Animal Production"), presumably
# because pre-processing normalises the text before comparison - confirm in
# get_closest_match.
result_dict["scores"].head()

Unnamed: 0_level_0,Unnamed: 1_level_0,top1,top2,top3,top4,top5
clas_a_title,codecol,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Business Support Services,a0,1.0,0.919244,0.889297,0.863508,0.788597
Fishing,a1,1.0,0.858864,0.810416,0.789181,0.639044
Agriculture Employment,a2,0.660291,0.638609,0.638341,0.635391,0.623924
Animal production,a3,1.0,0.922842,0.874221,0.874221,0.862017
Support activities of Mining,a4,1.0,0.785229,0.776676,0.760678,0.756821
