## Perform Imports

In [None]:
# import dependencies
import datetime
from os import listdir
from os.path import join, isfile
from random import sample
from typing import Optional

import pandas as pd

In [None]:
# import confidential information
from sys import path
path.insert(0, "..")
from config import eir_cleaned_destination_csv

## Define Functions

In [None]:
# this function is applied to the metadata's item number column
def clean_itemnumber(row: object) -> Optional[str]:
    """Normalize a single item-number cell.

    The value is lower-cased, spaces are removed, and backslashes, forward
    slashes and parentheses are treated as separators between multiple item
    numbers. A literal "%" in the input is also treated as a separator
    because it is used as the internal delimiter.

    Args:
        row: One cell value (despite the name, a scalar, not a whole row).

    Returns:
        The cleaned item number; several item numbers are joined with "|".
        None when the cell is missing (None, NaN, NA or NaT).
    """
    # pandas stores missing cells as NaN/NA rather than None, so a plain
    # `is not None` check would turn them into the literal string "nan"
    if row is None or (pd.api.types.is_scalar(row) and pd.isna(row)):
        return None

    # enforce lowercase and remove space characters
    item_str = str(row).lower().replace(" ", "")

    # replace every separator character with the internal delimiter
    delimiter = "%"
    for char in ("\\", "/", "(", ")"):
        item_str = item_str.replace(char, delimiter)

    if delimiter not in item_str:
        return item_str

    # drop the empty fragments left behind by adjacent or trailing separators
    return "|".join(part for part in item_str.split(delimiter) if part)

In [None]:
# this function is applied to the metadata's inspection date column
def clean_inspectiondate(row: object) -> Optional[datetime.date]:
    """Convert a single inspection-date cell into a date object.

    Only values written as month/day/four-digit-year (for example
    "01/31/2020") are accepted.

    Args:
        row: One cell value (despite the name, a scalar, not a whole row).

    Returns:
        The parsed date, or None when the cell is missing, contains no "/",
        or cannot be parsed with the "%m/%d/%Y" format.
    """
    if row is None:
        return None

    # surrounding whitespace carries no meaning but would make strptime fail
    item_str = str(row).strip()

    # values without a slash (including "nan" from missing cells) are not dates
    if "/" not in item_str:
        return None

    # a single malformed value must not abort the whole Series.apply call
    try:
        return datetime.datetime.strptime(item_str, "%m/%d/%Y").date()
    except ValueError:
        return None

In [None]:
# this function is applied to the metadata's inspector and operator columns
def clean_names(row: object) -> Optional[str]:
    """Normalize a single inspector/operator name cell.

    The value is lower-cased; periods and all bracket characters are removed;
    backslashes, forward slashes, spaces, hyphens and commas are treated as
    separators between name parts. A literal "%" in the input is also treated
    as a separator because it is used as the internal delimiter.

    Args:
        row: One cell value (despite the name, a scalar, not a whole row).

    Returns:
        The name parts joined with "|". None when the cell is missing (None,
        NaN, NA or NaT) or when any separated part is purely numeric. A value
        that contains no separator at all is returned as-is, even if numeric.
    """
    # pandas stores missing cells as NaN/NA rather than None, so a plain
    # `is not None` check would turn them into the literal string "nan"
    if row is None or (pd.api.types.is_scalar(row) and pd.isna(row)):
        return None

    # one pass: map separator characters to "%" and delete punctuation/brackets
    table = str.maketrans("\\/ -,", "%%%%%", ".()[]{}<>")
    item_str = str(row).lower().translate(table)

    if "%" not in item_str:
        return item_str

    # drop the empty fragments left behind by adjacent separators
    parts = [part for part in item_str.split("%") if part]

    # a purely numeric part means the cell does not hold a name
    if any(part.isnumeric() for part in parts):
        return None

    return "|".join(parts)

In [None]:
# this function is applied to the metadata's receiver number column
def clean_receiver(row: object) -> Optional[str]:
    """Normalize a single receiver-number cell.

    The value is lower-cased and spaces are removed. Hyphens, forward slashes
    and commas separate multiple receiver numbers (a literal "%" does too,
    because it is used as the internal delimiter). When a numeric entry is
    followed by a shorter numeric entry, the shorter one is treated as an
    abbreviated suffix and is completed with the leading digits of its
    predecessor, e.g. "12345-47" becomes "12345|12347".

    Args:
        row: One cell value (despite the name, a scalar, not a whole row).

    Returns:
        The receiver number(s) joined with "|". None when the cell is missing
        (None, NaN, NA or NaT) or when the text contains "no".
    """
    # pandas stores missing cells as NaN/NA rather than None, so a plain
    # `is not None` check would turn them into the literal string "nan"
    if row is None or (pd.api.types.is_scalar(row) and pd.isna(row)):
        return None

    # enforce lowercase and remove space characters
    item_str = str(row).lower().replace(" ", "")

    # replace variants of 'no receiver' with None
    # NOTE: this matches any value containing the substring "no"
    if "no" in item_str:
        return None

    # prepare for delimiting
    for char in ("-", "/", ","):
        item_str = item_str.replace(char, "%")

    if "%" not in item_str:
        return item_str

    parts = [part for part in item_str.split("%") if part]

    # expand abbreviated numbers left to right, so that each expansion can
    # serve as the reference for the entry that follows it
    for i in range(1, len(parts)):
        previous, current = parts[i - 1], parts[i]
        if (
            previous.isnumeric()
            and current.isnumeric()
            and len(previous) > len(current)
        ):
            parts[i] = previous[:len(previous) - len(current)] + current

    return "|".join(parts)

In [None]:
# this function is applied to the metadata's purchase order column
def clean_purchase(row: object) -> Optional[str]:
    """Normalize a single purchase-order cell.

    The value is lower-cased and spaces are removed. Hyphens, forward slashes
    and commas separate multiple orders (a literal "%" does too, because it is
    used as the internal delimiter). Only values whose first three characters
    are letters are accepted. Each following numeric entry receives the
    three-letter prefix of the entry before it, e.g. "ABC123-124" becomes
    "abc123|abc124".

    Args:
        row: One cell value (despite the name, a scalar, not a whole row).

    Returns:
        The purchase order(s) joined with "|". None when the cell is missing
        (None, NaN, NA or NaT), contains "no", does not start with letters,
        or holds a multi-part value that does not follow the
        "prefix + digits, digits, ..." pattern.
    """
    # pandas stores missing cells as NaN/NA rather than None; without this
    # check the literal string "nan" passes the letter-prefix test below and
    # would be returned as if it were a purchase order
    if row is None or (pd.api.types.is_scalar(row) and pd.isna(row)):
        return None

    # enforce lowercase and remove space characters
    item_str = str(row).lower().replace(" ", "")

    # replace variants of 'no job order' with None
    # NOTE: this matches any value containing the substring "no"
    if "no" in item_str:
        return None

    # prepare for delimiting
    for char in ("-", "/", ","):
        item_str = item_str.replace(char, "%")

    # only permit values with letters at the beginning
    if not item_str[:3].isalpha():
        return None

    if "%" not in item_str:
        return item_str

    # fragments of a single character are discarded along with empty ones
    parts = [part for part in item_str.split("%") if len(part) > 1]

    for i in range(1, len(parts)):
        previous, current = parts[i - 1], parts[i]

        # every follow-up entry must be a bare number after an entry that
        # ends in digits; anything else is rejected as a whole
        if not (previous[-3:].isnumeric() and current.isnumeric()):
            return None

        parts[i] = previous[:3] + current

    return "|".join(parts)

## Read Raw Data

In [None]:
# read the two dataframes from csv
# metadata table
raw_metadata_df = pd.read_csv(
    filepath_or_buffer=join(eir_cleaned_destination_csv, "raw_metadata.csv"),
    low_memory=False,
)

# measurements table
raw_measurements_df = pd.read_csv(
    filepath_or_buffer=join(eir_cleaned_destination_csv, "raw_measurements.csv"),
    low_memory=False,
)

## Explore the Datasets

#### Show Unique Filtered Quantities

In [None]:
# characters to view by
# characters whose presence marks a value as worth a manual look
characters = [" ", "{", "}", "[", "]", "(", ")", "<", ">", "\\", "/", ",", "=", ".", "|", "-"]

# get the initial row count
initial_row_count = raw_metadata_df.shape[0]

# remove targeted rows: item numbers containing a space, drawings containing a
# space or period, revisions containing a hyphen, space or slash
# NOTE(review): the `== False` comparison presumably also drops rows whose
# value is missing (a missing result never equals False) -- confirm intended
red_metadata_df = raw_metadata_df.loc[
    (raw_metadata_df["item_number"].str.contains(" ") == False) &
    (raw_metadata_df["drawing"].str.contains(r" |\.", regex=True) == False) &
    (raw_metadata_df["revision"].str.contains("-| |/", regex=True) == False), :
]

# deep copy the dataframe object
cln_metadata_df = red_metadata_df.copy(deep=True)

# clean the targeted columns
# (the plain lower-casing lambdas test with pd.notna because missing cells
# read from csv are NaN, which `is not None` would turn into the text "nan")
cln_metadata_df.loc[:, "item_number"] = red_metadata_df["item_number"].apply(clean_itemnumber)
cln_metadata_df.loc[:, "drawing"] = red_metadata_df["drawing"].apply(lambda x: str(x).lower() if pd.notna(x) else None)
cln_metadata_df.loc[:, "inspection_date"] = red_metadata_df["inspection_date"].apply(clean_inspectiondate)
cln_metadata_df.loc[:, "inspector"] = red_metadata_df["inspector"].apply(clean_names)
cln_metadata_df.loc[:, "disposition"] = red_metadata_df["disposition"].apply(lambda x: str(x).lower() if pd.notna(x) else None)
cln_metadata_df.loc[:, "receiver_number"] = red_metadata_df["receiver_number"].apply(clean_receiver)
cln_metadata_df.loc[:, "purchase_order"] = red_metadata_df["purchase_order"].apply(clean_purchase)
# job_order cleaning is currently disabled, so that column is shown uncleaned below
# cln_metadata_df.loc[:, "job_order"] = red_metadata_df["job_order"].apply(clean_purchase)
cln_metadata_df.loc[:, "operator"] = red_metadata_df["operator"].apply(clean_names)

# get the final row count
final_row_count = cln_metadata_df.shape[0]

# report the rows lost to the filter above
print(f"Rows Lost: {initial_row_count - final_row_count:,.0f}")

# specify what columns to show, leave col_names0 empty if you want all
col_names0 = ["job_order"]
col_names1 = cln_metadata_df.columns
column_names = col_names0 if len(col_names0) > 0 else col_names1

# show all unique items in the column(s) that still contain a flagged character
for column in column_names:
    my_list = [x for x in cln_metadata_df[column].unique() if any(i in str(x) for i in characters)]
    print("")
    print(f"----- {column}: {len(my_list)} -----")
    for item in my_list:
        print(str(item))