## Perform Imports

In [None]:
# import dependencies
import pandas as pd
from os import listdir
from os.path import join, isfile
from random import sample

In [None]:
# import confidential information
from sys import path
path.insert(0, "..")
from config import eir_cleaned_destination_csv

## Define Functions

In [None]:
# this function is applied to the metadata's inspector and operator columns

# single-pass translation table used by empname_column_str:
# punctuation/bracket characters are deleted, separator characters become commas
_EMPNAME_TRANSLATION = str.maketrans(
    {
        # characters that are removed outright
        ".": None,
        "(": None,
        ")": None,
        "[": None,
        "]": None,
        "{": None,
        "}": None,
        "<": None,
        ">": None,
        # characters that separate one name from the next
        "\\": ",",
        "/": ",",
        " ": ",",
        "-": ",",
    }
)


def empname_column_str(row: object) -> list:
    """Split one employee-name cell into a list of lower-case name tokens.

    The value is converted with ``str()`` and lower-cased, the characters
    ``. ( ) [ ] { } < >`` are removed, and backslashes, forward slashes,
    spaces and dashes are treated like commas. The result is then split on
    commas and empty pieces are discarded.

    Args:
        row: A single cell value (despite the name, it is stringified as one
            value, not iterated). Non-string values are stringified as-is,
            so a missing value (NaN) yields ``["nan"]``.

    Returns:
        A list of non-empty string tokens; empty when nothing is left after
        cleaning.
    """
    # enforce lower case, then drop/replace every special character in one pass
    cleaned = str(row).lower().translate(_EMPNAME_TRANSLATION)

    # return the value as a list of strings, skipping the empty pieces that
    # consecutive separators leave behind
    return [token for token in cleaned.split(",") if token]

In [None]:
# this function is applied to the metadata's item number column

# placeholder every delimiter character is normalised to before splitting
_ITEMNUMBER_DELIMITER = "%"

# single-pass translation table used by column_cleaner_itemnumber:
# spaces are deleted, slashes and parentheses become the placeholder delimiter
_ITEMNUMBER_TRANSLATION = str.maketrans(
    {
        " ": None,
        "\\": _ITEMNUMBER_DELIMITER,
        "/": _ITEMNUMBER_DELIMITER,
        "(": _ITEMNUMBER_DELIMITER,
        ")": _ITEMNUMBER_DELIMITER,
    }
)


def column_cleaner_itemnumber(row: object) -> str:
    """Normalise one item-number cell into a pipe-separated string.

    The value is converted with ``str()`` (case is left unchanged), space
    characters are removed, and backslashes, forward slashes and parentheses
    are treated as delimiters. The pieces between delimiters are re-joined
    with ``|``; a value without delimiters is returned with only its spaces
    removed.

    Note that ``%`` is the internal placeholder for delimiters, so a literal
    ``%`` in the input is also treated as a delimiter and dropped.

    Args:
        row: A single cell value; it is called once per element through
            ``Series.apply``. Non-string values are stringified as-is.

    Returns:
        The cleaned item number, with multiple parts separated by ``|``.
    """
    # remove spaces and map every delimiter character to the placeholder
    normalised = str(row).translate(_ITEMNUMBER_TRANSLATION)

    # drop the empty pieces left by leading/trailing/consecutive delimiters;
    # a value without any delimiter is a single piece and comes back unchanged
    pieces = [piece for piece in normalised.split(_ITEMNUMBER_DELIMITER) if piece]
    return "|".join(pieces)

## Read Raw Data

In [None]:
# load the raw metadata and raw measurements tables from their csv exports,
# both located in the directory named by eir_cleaned_destination_csv
raw_metadata_df, raw_measurements_df = (
    pd.read_csv(join(eir_cleaned_destination_csv, csv_name), low_memory=False)
    for csv_name in ("raw_metadata.csv", "raw_measurements.csv")
)

## Explore the Datasets

#### Show Unique Filtered Quantities

In [None]:
# inconsistencies to look out for...
# spaces
# braces {}
# brackets []
# parentheses ()
# angle brackets <>
# backslash \\
# forwardslash /
# comma ,
# dashes -
# equals =
# periods .
# pipes |
# NOTE(review): dashes are listed above but "-" is not in `characters`, so
# values whose only special character is a dash are not reported below —
# confirm whether that is intended
characters = [" ", "{", "}", "[", "]", "(", ")", "<", ">", "\\", "/", ",", "=", ".", "|"]

# remove rows where the 'drawing' contains a space; rows with a missing
# 'drawing' are removed as well (na=True), as the earlier `== False` test did
drawing_has_space = raw_metadata_df["drawing"].str.contains(" ", na=True)
red_metadata_df = raw_metadata_df.loc[~drawing_has_space, :]

# apply column functions on an explicit copy: this defines adj_metadata_df
# before it is written to and leaves the filtered frame untouched
adj_metadata_df = red_metadata_df.copy()
adj_metadata_df["item_number"] = adj_metadata_df["item_number"].apply(column_cleaner_itemnumber)

print(f"Rows Lost: {len(raw_metadata_df) - len(adj_metadata_df)}")

# show all unique items in the columns that still contain a flagged character
for column in adj_metadata_df.columns:
    my_list = [x for x in adj_metadata_df[column].unique() if any(i in str(x) for i in characters)]
    print("")
    print(f"----- {column}: {len(my_list)} -----")
    for item in my_list:
        print(str(item))