In [1]:
import os
import re
from pathlib import Path

import fasttext as ft
import nltk
import numpy as np
import pandas as pd
from tqdm import tqdm

ModuleNotFoundError: No module named 'fasttext'

In [31]:
# One-time setup: uncomment and run once per environment to fetch NLTK data.
# `stopwords` is read when `en_stop` is built; `wordnet` is presumably what
# WordNetLemmatizer relies on (confirm).
# NOTE(review): `punkt` does not appear to be needed by the code in this notebook.
# nltk.download('punkt')
# nltk.download('wordnet')
# nltk.download('stopwords')

In [23]:
# Project layout, resolved from the directory the kernel was started in:
#   PROJ - the current working directory (symlinks resolved)
#   ROOT - its parent directory
#   DATA - the "data" folder directly under ROOT
PROJ = Path(os.path.realpath(os.curdir))
ROOT = PROJ.parent
DATA = ROOT.joinpath("data")

# Read and process data

In [54]:
# Classification A
# Load the raw category workbook and keep only the two yearly title columns.
clas_df_a = pd.read_excel(DATA / "raw/Categories 2019_2018.xlsx")
clas_df_a = clas_df_a[["MAIN_TAXABLE_ACTIVITY 2018", "MAIN_TAXABLE_ACTIVITY 2019"]]
# Create one unique classification from BOTH years. The 2019 column must be
# stacked under the 2018 one, otherwise 2019-only categories are lost.
# pd.concat is used because Series.append was removed in pandas 2.0.
clas_a = pd.DataFrame(
    pd.concat(
        [
            clas_df_a["MAIN_TAXABLE_ACTIVITY 2018"],
            clas_df_a["MAIN_TAXABLE_ACTIVITY 2019"],
        ],
        ignore_index=True,
    ).unique(),
    columns=["clas_a_title"],
)
# Remove text in parentheses. regex=True is passed explicitly: without it,
# pandas >= 2.0 treats the pattern as a literal string and removes nothing.
clas_a["clas_a_title"] = (
    clas_a["clas_a_title"].str.replace(r"\(.*\)", "", regex=True).str.strip()
)
# Remove nulls
clas_a = clas_a.dropna()
# Create ID: "a0", "a1", ... in row order
clas_a["clas_a_code"] = [f"a{x}" for x in range(len(clas_a))]
# Export
clas_a.to_csv(DATA / "processed/clas_a.csv", index=False)
clas_a.head()

Unnamed: 0,clas_a_title,clas_a_code
0,Business Support Services,a0
1,Fishing,a1
2,Agriculture Employment,a2
3,Animal production,a3
4,Support activities of Mining,a4


In [83]:
# Classification B
# Each sheet of the codes workbook is one classification level. Every sheet is
# read as text, given the common (code, title) column names, and written to
# its own CSV under data/processed.
sheets = ["HS2", "HS4", "NAICS2", "NAICS4"]
codes_workbook = DATA / "raw/codes.xlsx"
for sheet in sheets:
    # dtype=str keeps the codes exactly as written in the workbook
    clas_df_b = pd.read_excel(codes_workbook, sheet_name=sheet, dtype=str)
    clas_df_b.columns = ["clas_b_code", "clas_b_title"]
    sheet_csv = DATA / f"processed/clas_b_{sheet}.csv"
    clas_df_b.to_csv(sheet_csv, index=False)
# Preview of the last sheet processed
clas_df_b.head()

Unnamed: 0,clas_b_code,clas_b_title
0,1111,Oilseed and Grain Farming
1,1112,Vegetable and Melon Farming
2,1113,Fruit and Tree Nut Farming
3,1114,"Greenhouse, Nursery, and Floriculture Production"
4,1119,Other Crop Farming


# Get embeddings

## Preprocess text

In [32]:
# Shared helpers for text preprocessing, built once and reused by preprocess_text.
# Tokenizer applied to the cleaned text
word_punctuation_tokenizer = nltk.tokenize.WordPunctTokenizer()
# NOTE: despite its name, `stemmer` is a WordNet lemmatizer
stemmer = nltk.stem.WordNetLemmatizer()
# English stop words, held in a set for fast membership tests
en_stop = {*nltk.corpus.stopwords.words("english")}

In [33]:
def preprocess_text(document):
    """Clean a piece of text and return its content tokens.

    The input is converted with ``str()``, stripped of non-word characters and
    isolated single letters, lowercased, lemmatized with the module-level
    ``stemmer``, and filtered to drop English stop words (``en_stop``) and any
    token of three characters or fewer.

    Parameters
    ----------
    document : object
        Text to clean; anything accepted by ``str()``.

    Returns
    -------
    list of str
        The surviving tokens, in their original order. May be empty.
    """
    # Replace every non-word character (punctuation, symbols) with a space
    document = re.sub(r"\W", " ", str(document))

    # Remove single letters that stand alone between whitespace
    document = re.sub(r"\s+[a-zA-Z]\s+", " ", document)

    # Remove a single letter at the very start of the text. The caret must be
    # an unescaped anchor: an escaped "\^" looks for a literal "^", which can
    # never be present after the \W substitution above.
    document = re.sub(r"^[a-zA-Z]\s+", " ", document)

    # Collapse runs of whitespace into one space
    document = re.sub(r"\s+", " ", document)

    # Lowercase before lemmatizing and stop-word lookup
    document = document.lower()

    # Lemmatize, then drop stop words and very short tokens (<= 3 characters)
    tokens = [stemmer.lemmatize(word) for word in document.split()]
    tokens = [word for word in tokens if word not in en_stop]
    tokens = [word for word in tokens if len(word) > 3]

    preprocessed_text = " ".join(tokens)
    word_tokenized_corpus = word_punctuation_tokenizer.tokenize(preprocessed_text)
    return word_tokenized_corpus

In [36]:
# Quick sanity check of preprocess_text on one example sentence
sample_sentence = (
    "Artificial intelligence, is the most advanced technology of the present era"
)
sent = preprocess_text(sample_sentence)
print(sent)

['artificial', 'intelligence', 'advanced', 'technology', 'present']


In [37]:
def preprocess_df(df, text_colname):
    """Run ``preprocess_text`` over one column of ``df``.

    Parameters
    ----------
    df : DataFrame-like
        Object indexable by ``text_colname``.
    text_colname : str
        Name of the column holding the raw text.

    Returns
    -------
    list
        One ``preprocess_text`` result per entry of the column, in order.
        A tqdm progress bar is shown while iterating.
    """
    final_corpus = []
    for raw_text in tqdm(df[text_colname]):
        final_corpus.append(preprocess_text(raw_text))
    return final_corpus

## Load model and obtain word vectors

In [42]:
# Load the fastText binary model from <ROOT>/models. The path is passed as a
# plain string, as in the original call.
model_path = ROOT / "models/cc.en.300.bin"
ft_model = ft.load_model(str(model_path))



In [43]:
def get_mean_word_vector(doc_text):
    """Average the ``ft_model`` word vectors of the tokens in ``doc_text``.

    Parameters
    ----------
    doc_text : sequence of str
        Tokens of one document.

    Returns
    -------
    numpy.ndarray
        Element-wise NaN-ignoring mean of the token vectors. For an empty
        ``doc_text``, an all-NaN vector with the same length as a word vector.
    """
    # No tokens: nothing to average, so return an all-NaN placeholder
    if len(doc_text) == 0:
        vector_size = ft_model.get_word_vector("").shape[0]
        return np.full(vector_size, np.nan)

    token_vectors = np.array([ft_model.get_word_vector(token) for token in doc_text])
    return np.nanmean(token_vectors, axis=0)

In [44]:
def vectorize_textlist(textlist):
    """Turn a list of tokenized documents into an array of document vectors.

    Parameters
    ----------
    textlist : sequence
        Tokenized documents; each item is passed to ``get_mean_word_vector``.

    Returns
    -------
    numpy.ndarray
        The per-document vectors stacked in input order. A tqdm progress bar
        is shown while iterating.
    """
    progress = tqdm(textlist, total=len(textlist))
    doc_vectors = [get_mean_word_vector(tokens) for tokens in progress]
    return np.array(doc_vectors)

## Apply to data

In [86]:
# Reload classification A from the processed CSV written earlier
clas_a_csv = DATA / "processed/clas_a.csv"
clas_a = pd.read_csv(clas_a_csv)
clas_a.head()

Unnamed: 0,clas_a_title,clas_a_code
0,Business Support Services,a0
1,Fishing,a1
2,Agriculture Employment,a2
3,Animal production,a3
4,Support activities of Mining,a4


In [87]:
def get_wv_df(df, textcol):
    """Build a DataFrame of document vectors for the text in ``df[textcol]``.

    The text column is preprocessed with ``preprocess_df`` and vectorized with
    ``vectorize_textlist``. All original columns of ``df`` become the index of
    the result, and the vector components become the data columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Input rows; every column ends up in the result's index.
    textcol : str
        Name of the column in ``df`` that holds the text to embed.

    Returns
    -------
    pandas.DataFrame
        One row per row of ``df``, indexed by all of ``df``'s columns, with
        one string-named column ("0", "1", ...) per vector component.
    """
    df_text = preprocess_df(df, textcol)
    vec = vectorize_textlist(df_text)
    # concat(axis=1) aligns on index labels. Give the vectors df's own index so
    # rows pair up positionally even when df does not have a default RangeIndex.
    vec_df = pd.concat([df, pd.DataFrame(vec, index=df.index)], axis=1)
    vec_df = vec_df.set_index(keys=list(df.columns))
    # Integer column labels are converted to strings (parquet writers require
    # string column names).
    vec_df.columns = [str(x) for x in vec_df.columns]
    return vec_df

In [88]:
# Embed the classification A titles and store them as parquet, index included
clas_a_vec = get_wv_df(clas_a, "clas_a_title")
clas_a_vec_path = DATA / "intermediate/clas_a_vec.parquet"
clas_a_vec.to_parquet(clas_a_vec_path, index=True)
clas_a_vec.head()

100%|██████████| 187/187 [00:00<00:00, 8203.91it/s]
100%|██████████| 187/187 [00:00<00:00, 2623.78it/s]


Unnamed: 0_level_0,Unnamed: 1_level_0,0,1,2,3,4,5,6,7,8,9,...,290,291,292,293,294,295,296,297,298,299
clas_a_title,clas_a_code,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
Business Support Services,a0,-0.025925,0.025082,0.01221,0.04168,-0.037457,0.041801,0.055334,0.005141,-0.022281,0.001679,...,0.030224,0.034443,-0.018788,0.03035,0.011706,0.009458,0.005382,0.043995,-0.007689,0.015419
Fishing,a1,0.067962,0.046946,0.048757,0.081487,-0.025844,0.041846,0.05249,-0.045384,0.019322,-0.050553,...,-0.040666,-0.004089,-0.001307,0.025173,-0.010149,-0.046725,-0.11087,0.111892,-0.01331,-0.068479
Agriculture Employment,a2,0.008636,0.021638,0.011066,0.013775,0.00163,0.02051,0.035688,0.008609,0.042362,0.00388,...,-0.000912,0.05352,-0.024568,0.040491,0.054232,-0.031612,-0.000425,0.036069,0.03319,0.009171
Animal production,a3,0.03675,0.033058,0.01889,0.080253,0.013234,-0.004848,0.049736,-0.008246,-0.018134,0.019848,...,-0.029893,0.039475,-0.038676,0.03565,0.064141,-0.022897,-0.0345,0.088468,0.039201,0.01525
Support activities of Mining,a4,-0.017894,0.035146,0.002398,0.047889,-0.005247,0.007037,0.041748,-0.031951,0.020205,-0.021872,...,0.014639,0.013979,-0.018705,0.036255,0.039297,-0.005247,0.022188,0.051874,0.015406,0.001576


In [89]:
# Reload every classification B table from its processed CSV, in the order of
# clas_b_list. dtype=str keeps the codes as text.
clas_b_list = ["HS2", "HS4", "NAICS2", "NAICS4"]
clas_b_dfs = []
for x in clas_b_list:
    clas_b_dfs.append(pd.read_csv(DATA / f"processed/clas_b_{x}.csv", dtype=str))
# Preview of the first table (HS2)
clas_b_dfs[0].head()

Unnamed: 0,clas_b_code,clas_b_title
0,1,Live animals
1,2,Meat
2,3,Fish
3,4,Diary products
4,5,Animal products


In [90]:
# Embed the titles of every classification B table and store each result as
# its own parquet file, index included.
for clas_b, clas_b_type in zip(clas_b_dfs, clas_b_list):
    clas_b_vec = get_wv_df(clas_b, "clas_b_title")
    clas_b_vec.to_parquet(
        DATA / f"intermediate/clas_b_vec_{clas_b_type}.parquet", index=True
    )
# Preview the last table processed. Inside the loop body this expression was
# evaluated and discarded, so nothing was ever displayed.
clas_b_vec.head()

100%|██████████| 102/102 [00:00<00:00, 1244.78it/s]
100%|██████████| 102/102 [00:00<00:00, 256.82it/s]
100%|██████████| 1248/1248 [00:00<00:00, 2665.82it/s]
100%|██████████| 1248/1248 [00:02<00:00, 417.67it/s]
100%|██████████| 17/17 [00:00<00:00, 21574.33it/s]
100%|██████████| 17/17 [00:00<00:00, 5092.72it/s]
100%|██████████| 311/311 [00:00<00:00, 20159.94it/s]
100%|██████████| 311/311 [00:00<00:00, 744.58it/s]
