### Perform Imports

In [1]:
# import dependencies
import pandas as pd
import datetime
from os import listdir
from os.path import join, isfile
from random import sample

In [2]:
# import eir functions
from eir_functions import clean_item_number, \
                          clean_drawing, \
                          clean_revision, \
                          clean_inspection_date, \
                          clean_inspector_operator, \
                          clean_disposition, \
                          clean_supplier, \
                          clean_receiver_number, \
                          clean_purchase_order

In [3]:
# import confidential information
from sys import path
path.insert(0, "..")
from config import eir_cleaned_destination_csv

### Define Functions

In [4]:
# this function prints out unique dataframe column values that contain certain values
def print_unique_values(df: pd.DataFrame, find_vals: list, show_cols: list = []) -> None:
    """Print, column by column, the distinct values whose text contains a search string.

    Args:
        df: dataframe whose columns are inspected.
        find_vals: substrings to look for; a value is listed when its `str()` form
            contains at least one of them.
        show_cols: columns to report on; when empty, every column of `df` is reported.

    Returns:
        None. For each column a blank line, a header with the number of matching
        unique values and the column's NaN count, and then one matching value per
        line are written to stdout.
    """

    # fall back to every column when no subset was requested
    targets = show_cols if len(show_cols) > 0 else df.columns

    for col_name in targets:
        series = df[col_name]
        missing = series.isna().sum()

        # keep each distinct value whose string form holds any of the search strings
        matches = []
        for value in series.unique():
            text = str(value)
            if any(token in text for token in find_vals):
                matches.append(value)

        print("")
        print(f"----- {col_name}: {len(matches):,.0f} (NaN: {missing:,.0f}) -----")
        for value in matches:
            print(str(value))

### Build Functional Objects

In [5]:
# metadata function object
def _cleaning_rule(func, none_if_contains = (), remove_substrings = (), replace_delimitors = ()):
    """Bundle a cleaning function with the argument lists stored alongside it.

    Every sequence is copied into a fresh list, so no two rules share a list object.
    """
    return {
        "func": func,
        "args": {
            "none_if_contains": list(none_if_contains),
            "remove_substrings": list(remove_substrings),
            "replace_delimitors": list(replace_delimitors)
        }
    }

# maps each metadata column name to its cleaning function ("func") and the
# character lists passed to that function ("args"); a "func" of None marks a
# column that has no cleaning function assigned
# NOTE(review): how each list is interpreted is decided inside eir_functions - confirm there
meta_func_obj = {
    "item_number": _cleaning_rule(
        clean_item_number,
        remove_substrings = [" "],
        replace_delimitors = ["\\", "/", "(", ")"]
    ),
    "drawing": _cleaning_rule(
        clean_drawing,
        none_if_contains = [" ", "."]
    ),
    "revision": _cleaning_rule(
        clean_revision,
        none_if_contains = [" ", "-", "/"]
    ),
    "inspection_date": _cleaning_rule(clean_inspection_date),
    "inspector": _cleaning_rule(
        clean_inspector_operator,
        remove_substrings = [".", "(", ")", "{", "}", "[", "]", "<", ">"],
        replace_delimitors = ["\\", "/", " ", "-", ","]
    ),
    "disposition": _cleaning_rule(clean_disposition),
    "supplier": _cleaning_rule(clean_supplier),
    "receiver_number": _cleaning_rule(
        clean_receiver_number,
        none_if_contains = ["no"],
        remove_substrings = [" "],
        replace_delimitors = ["-", "/", ","]
    ),
    "purchase_order": _cleaning_rule(
        clean_purchase_order,
        none_if_contains = ["no"],
        remove_substrings = [" "],
        replace_delimitors = ["-", "/", ","]
    ),
    "job_order": _cleaning_rule(
        None,
        none_if_contains = [" ", ".", "-"]
    ),
    "operator": _cleaning_rule(
        clean_inspector_operator,
        remove_substrings = [".", "(", ")", "{", "}", "[", "]", "<", ">"],
        replace_delimitors = ["\\", "/", " ", "-", ","]
    ),
    "full_inspect_qty": _cleaning_rule(None),
    "received_qty": _cleaning_rule(None),
    "completed_qty": _cleaning_rule(None)
}

### Read Raw Data

In [6]:
# load the raw metadata and raw measurements tables from the configured csv folder;
# low_memory = False has pandas read each file in one pass rather than in chunks
raw_metadata_df = pd.read_csv(
    filepath_or_buffer = join(eir_cleaned_destination_csv, "raw_metadata.csv"),
    low_memory = False
)
raw_measurements_df = pd.read_csv(
    filepath_or_buffer = join(eir_cleaned_destination_csv, "raw_measurements.csv"),
    low_memory = False
)

### Explore the Datasets

##### Show Unique Filtered Quantities

In [7]:
# special characters that may 'clutter' a column
all_special_chars = list(" {}[]()<>\\/,=.|-")

# to experiment, list characters here to leave them out of the search, e.g. ["-", " "]
skip_vals = []

# characters to view by
find_vals = [char for char in all_special_chars if char not in skip_vals]

# print the unique raw values that contain at least one of the find_vals characters
print_unique_values(raw_metadata_df, find_vals, show_cols = ["item_number"])


----- item_number: 1,716 (NaN: 5) -----
3004-111
9004-152
9004-153
1844-157
9004-154
3220-111
9039-165
9033-220
9033-221
9033-217
9039-162
9039-178
9039-179
9039-174
9033-303
7720-108
CMR5-102
CMR5-101
9033-219
9033-224
MSLB16-101
9029-200
9029-238
9029-220
9029-201
9029-203
9029-221
9029-204
9029-205
9029-222
9029-215
9029-223
9029-216
9029-217
9029-224
9029-218
9029-219
9029-226
9029-208
9029-209
9029-227
9029-211
9029-228
9063-105
9014-334
9014-336
9017-101
9022-101
1210-101Z
1210-102Z
9063-106
9014-335
9014-337
9017-102
9022-102
9063-107
9063-111
9063-109
9063-108
1210-103Z
1210-104Z
1210-105
1210-105Z
1210-106Z
1210-108Z
1210-109Z
1210-110Z
9063-110
9033-226
9039-142 / 9039-143
9029-196
9033-222
1299-117
9006-104
9006-105
9032-142
9035-140
9029-295
9063-113
9029-294
1210-114Z
1210-115Z
1844-113
1844-120
1844-122
1844-128
1844-147
1844-119
1844-121
1844-138
1844-142
9032-145
1844-108
1844-127
1844-117
1844-116
1845-105
1299-118
9029-298
9029-119
9039-196
9039-200
9033-218
9039-198

## Clean the Datasets

### Metadata

This cell turns all unwanted values into `None` for easier handling down the road.

In [8]:
# create a deep copy of the raw dataframe so the raw data is never modified
std_metadata_df = raw_metadata_df.copy(deep = True)

# apply the metadata function object to standardize 'unwanted' values
for k in meta_func_obj:

    # reference the object children
    my_func = meta_func_obj[k]["func"]
    my_args = meta_func_obj[k]["args"]

    # columns without a cleaning function are kept exactly as they were read
    if my_func is None:
        continue

    # assign the column by name so it is replaced outright and takes the dtype of
    # the cleaned values; writing through .loc[:, k] sets values into the existing
    # column instead, which recent pandas versions warn about (or reject) when the
    # cleaned values do not fit the raw column's dtype
    std_metadata_df[k] = raw_metadata_df[k].apply(my_func, args = (my_args,))

# special characters that may 'clutter' a column
all_special_chars = list(" {}[]()<>\\/,=.|-")

# to experiment, list characters here to leave them out of the search, e.g. ["-", " "]
skip_vals = []

# characters to view by
find_vals = [char for char in all_special_chars if char not in skip_vals]

# print the unique standardized values that contain at least one of the find_vals characters
print_unique_values(std_metadata_df, find_vals, show_cols = ["item_number"])


----- item_number: 1,715 (NaN: 5) -----
3004-111
9004-152
9004-153
1844-157
9004-154
3220-111
9039-165
9033-220
9033-221
9033-217
9039-162
9039-178
9039-179
9039-174
9033-303
7720-108
cmr5-102
cmr5-101
9033-219
9033-224
mslb16-101
9029-200
9029-238
9029-220
9029-201
9029-203
9029-221
9029-204
9029-205
9029-222
9029-215
9029-223
9029-216
9029-217
9029-224
9029-218
9029-219
9029-226
9029-208
9029-209
9029-227
9029-211
9029-228
9063-105
9014-334
9014-336
9017-101
9022-101
1210-101z
1210-102z
9063-106
9014-335
9014-337
9017-102
9022-102
9063-107
9063-111
9063-109
9063-108
1210-103z
1210-104z
1210-105
1210-105z
1210-106z
1210-108z
1210-109z
1210-110z
9063-110
9033-226
9039-142|9039-143
9029-196
9033-222
1299-117
9006-104
9006-105
9032-142
9035-140
9029-295
9063-113
9029-294
1210-114z
1210-115z
1844-113
1844-120
1844-122
1844-128
1844-147
1844-119
1844-121
1844-138
1844-142
9032-145
1844-108
1844-127
1844-117
1844-116
1845-105
1299-118
9029-298
9029-119
9039-196
9039-200
9033-218
9039-198
9

This cell removes every row that has no value (`None`) in the `item_number`, `drawing`, or `revision` column.

In [15]:
# columns that must hold a value for a row to be kept
required_cols = ["item_number", "drawing", "revision"]

# record the row count before reduction
rc_initial = std_metadata_df.shape[0]

# create a reduced dataframe by dropping every row that is missing a value in any
# required column; dropna returns a new dataframe and keeps the original index labels
red_metadata_df = std_metadata_df.dropna(subset = required_cols, how = "any")

# record the row count after reduction
rc_reduced = red_metadata_df.shape[0]

# show how many rows were lost to the reduction
print(f"Rows Lost: {rc_initial - rc_reduced:,.0f}")

Rows Lost: 104
