In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import re # regex
from datetime import datetime

In [None]:
## frequency of parts overall and across sets
# raw inventory rows and the part catalogue
inv_parts = pd.read_csv("../data/raw/inventory_parts.csv")
parts = pd.read_csv("../data/raw/parts.csv")

# freq_sets[part_num] = number of rows in inventory_parts.csv that list that part_num
freq_sets = {}
for part_num in inv_parts.part_num.tolist():
    freq_sets[part_num] = freq_sets.get(part_num, 0) + 1

# todo: convert to df, add column with sum of quantity for each part_num

In [None]:
## visualize rise and cease of lego parts over time in a density plot each part_num
# note that processing takes forever (45 min)

# read every raw table this cell works with (the reads are independent of each other)
relation = pd.read_csv("../data/raw/part_relationships.csv")
inventories = pd.read_csv("../data/raw/inventories.csv")
sets = pd.read_csv("../data/raw/sets.csv")
parts = pd.read_csv("../data/raw/parts.csv")
inv_fig = pd.read_csv("../data/raw/inventory_minifigs.csv")
inv_parts = pd.read_csv("../data/raw/inventory_parts.csv")

# start the step log: every entry is [part_num (0 for global steps), step label, HH:MM:SS]
log = [[0, "begin processing", datetime.now().strftime("%H:%M:%S")]]
print("begin processing, time: ", datetime.now().strftime("%H:%M:%S"))

# replace child_part_num with parent_part_num in inventories df
# (work on a copy so the raw inv_parts table stays untouched)
invParent = inv_parts.copy()
# child -> parent lookup; when a child is listed several times the last row wins
childToParent = pd.Series(relation.parent_part_num.values, index=relation.child_part_num).to_dict()
# Assign the result back instead of calling .replace(..., inplace=True) on the
# column selection: that chained in-place call warns on pandas 2.x and leaves
# invParent unchanged under Copy-on-Write. An exact dict lookup per value gives
# the same one-step (non-transitive) replacement as Series.replace with a dict,
# without comparing the whole column once per mapping key.
invParent['part_num'] = invParent['part_num'].map(lambda num: childToParent.get(num, num))

log.append([0, "after childToParent", datetime.now().strftime("%H:%M:%S")])
print("after childToParent, time: ", datetime.now().strftime("%H:%M:%S"))

# eliminate all part_num containing stickers, non-lego (eg booklets), printed or pattern or mold or alternate versions st only parents remain in partsClean.part_num
partsClean = parts.copy()

# Rows whose part_num appears as a child in the relationship table.
# isin() finds them in one pass and simply ignores children that are missing
# from parts.csv; the previous per-child lookup raised IndexError in that case
# and rescanned the whole frame for every relationship row.
isChildPart = partsClean['part_num'].isin(relation.child_part_num)
# index labels of the dropped child rows (kept under the original variable name)
childPartNumToId = partsClean.index[isChildPart].tolist()

# part_cat_id values that are removed entirely
excludedPartCategories = {
    58: "Stickers",
    17: "Non-Lego (eg booklets)",
    4: "Duplo, Quatro and Primo",
    43: "Znap",
    48: "Clikits",
    57: "Non-Buildable Figures",
    42: "Belville, Scala, Fabuland",
    # 41: "Bionicle, Hero factory, Constraction",  # was disabled before, left disabled
}
isExcludedCategory = partsClean['part_cat_id'].isin(list(excludedPartCategories))

# .copy() keeps partsClean an independent frame (original index labels preserved),
# so later in-place edits do not act on a slice of another frame
partsClean = partsClean.loc[~(isChildPart | isExcludedCategory)].copy()

log.append([0, "after partsClean", datetime.now().strftime("%H:%M:%S")])
print("after partsClean, time: ", datetime.now().strftime("%H:%M:%S"))

# now replace remaining "<1>pr<2>" with "<1>" and drop duplicated "<1>" entries to eliminate remaining prints, necessary because they were not dropped above due to no relationship by missing parent (so we make one up as "<1>")

# A part_num is a candidate when, after at least one leading character, it contains
# pr / pat / pb / px, or one of c / e / d / x followed by digits. This single
# pattern accepts exactly the part_nums the eight separate re.match calls accepted.
variantMarker = re.compile(r"\S+(?:pr|pat|pb|px|[cedx][0-9]+)")

# Stripping rules, applied in this order; each match is reduced to capture group 1.
# Raw strings are required here: "\S" in a normal string literal is an invalid
# escape sequence and triggers a warning on current Python versions.
# NOTE(review): the kc / 973p / 970c rules only run for part_nums that already
# passed variantMarker (same as before) - confirm that is intended.
variantRules = [re.compile(rule) for rule in (
    r"(\S+)(pat)(\S+)",
    r"(\S+)(pr)(\S+)",
    r"(\S+)(pb)(\S+)",
    r"(\S+)(px)(\S+)",
    r"(\S+)(c)([0-9]+)",
    r"(\S+)(x)([0-9]+)",
    r"(\S+)(d)([0-9]+)",
    r"(\S+)(e)([0-9]+)",
    r"(\S+)(kc)",
    r"(973)(p)(\S+)",
    r"(970)(c)",
)]

# original part_num -> made-up parent part_num
partsToRemove = {}
for p in partsClean.part_num:
    if variantMarker.match(p):
        pSub = p
        for rule in variantRules:
            pSub = rule.sub(lambda pat: pat.group(1), pSub)
        partsToRemove[p] = pSub

# Assign the mapped column back instead of .replace(..., inplace=True) on the
# column selection: the chained in-place form warns on pandas 2.x and does not
# update the frame under Copy-on-Write. The exact dict lookup per value performs
# the same one-step replacement without scanning the column once per key.
partsClean['part_num'] = partsClean['part_num'].map(lambda num: partsToRemove.get(num, num))
partsClean = partsClean.drop_duplicates(subset='part_num', keep='first')

invParent['part_num'] = invParent['part_num'].map(lambda num: partsToRemove.get(num, num))

log.append([0, "after partsToRemove", datetime.now().strftime("%H:%M:%S")])
print("after partsToRemove, time: ", datetime.now().strftime("%H:%M:%S"))

# map the sets which contain part_num to the according year
yearColumns = list(range(1949, 2021))

# Build every lookup once instead of filtering a whole DataFrame per part,
# per inventory and per set inside the loop.
# NOTE(review): positional column 2 is kept from the previous code, whose comments
# treat it as the set number in inventories.csv and the year in sets.csv - confirm
# against the files. keep='first' mirrors the previous "first matching row" lookup.
firstInventory = inventories.drop_duplicates(subset='id', keep='first')
setNumByInventoryId = dict(zip(firstInventory['id'].tolist(), firstInventory.iloc[:, 2].tolist()))
firstSet = sets.drop_duplicates(subset='set_num', keep='first')
yearBySetNum = dict(zip(firstSet['set_num'].tolist(), firstSet.iloc[:, 2].tolist()))
# distinct inventory ids per part: a part listed several times in one inventory
# (different colors) counts once for that inventory
inventoryIdsByPart = (
    invParent.drop_duplicates(subset=['part_num', 'inventory_id'])
    .groupby('part_num', sort=False)['inventory_id']
    .agg(list)
    .to_dict()
)
# inventory ids that contain each minifig (repeated ids are kept, as before)
inventoryIdsByFig = inv_fig.groupby('fig_num', sort=False)['inventory_id'].agg(list).to_dict()
figPattern = re.compile(r"fig-[0-9]+")

# Collect one dict per part and build the DataFrame once at the end:
# DataFrame.append was removed in pandas 2.0 and copied the whole frame per row.
partRows = []
# loop over all part_num p in partsClean to fill years of set occurrences
for p, name in zip(partsClean['part_num'].tolist(), partsClean.iloc[:, 1].tolist()):
    # init part dict with part_num, name and years from 1949 to 2020
    part = {'part_num': p, 'name': name}
    part.update(dict.fromkeys(yearColumns, 0))
    # part -> inventory ids -> set numbers
    idToSet = [setNumByInventoryId[e] for e in inventoryIdsByPart.get(p, [])]
    # in case of figures (set_num = fig-...) go through inv_fig to the inventories
    # containing that figure and on to their set numbers; idToSet grows while it is
    # iterated so the appended entries are examined as well
    setNums = []
    for f in idToSet:
        if figPattern.match(f):
            idToSet.extend(setNumByInventoryId[e] for e in inventoryIdsByFig.get(f, []))
        else:
            setNums.append(f)
    # set num to year; a year outside 1949-2020 raises KeyError, as before
    # NOTE(review): a set reached through several inventories or figures is counted
    # once per route (unchanged behavior) - confirm this is the intended count
    for s in setNums:
        part[yearBySetNum[s]] += 1
    partRows.append(part)

    log.append([p, "after fill years in partsOverTime_sets", datetime.now().strftime("%H:%M:%S")])
    print(p, ", after fill years in partsOverTime_sets, time: ", datetime.now().strftime("%H:%M:%S"))

partsOverTime_sets = pd.DataFrame(partRows, columns=['part_num', 'name'] + yearColumns)

# create csv with sets(years) series: part_num, name, year_<min>, ..., year_<max> <- which contain the quantity of sets with part_num in this year
# note that min(year) = 1949 and max(year) = 2020 but only until may
partsOverTime_sets.to_csv("../data/processed/partsOverTime.csv", index = False)

log.append([0, "after write csv", datetime.now().strftime("%H:%M:%S")])
print("after write csv, time: ", datetime.now().strftime("%H:%M:%S"))

## maybe later:
# sum up sets / occurences across all years
# map part quantity to the accoring year, note that diffrent colors shall count as same part
# create csv with quantity(years) series: part_num, name, year_<min>, ..., year_<max> <- which contain the quantity overall for part_num in this year

In [None]:
# persist only the part_num column of the cleaned part list
partsClean['part_num'].to_csv("../data/processed/partsClean.part_num.csv", index=False)

In [None]:
# reload all raw tables from csv (the reads are independent of each other)
relation = pd.read_csv("../data/raw/part_relationships.csv")
inventories = pd.read_csv("../data/raw/inventories.csv")
sets = pd.read_csv("../data/raw/sets.csv")
parts = pd.read_csv("../data/raw/parts.csv")
inv_fig = pd.read_csv("../data/raw/inventory_minifigs.csv")
inv_parts = pd.read_csv("../data/raw/inventory_parts.csv")

# report the start time
print("begin processing, time: ", datetime.now().strftime("%H:%M:%S"))

# plain working copies of the raw tables: in this cell no child -> parent
# replacement and no category filtering is applied to them
partsClean = parts.copy()
invParent = inv_parts.copy()

# now replace remaining "<1>pr<2>" with "<1>" and drop duplicated "<1>" entries to eliminate remaining prints, necessary because they have no relationship entry with a parent (so we make one up as "<1>")

# A part_num is a candidate when, after at least one leading character, it contains
# pr / pat / pb / px, or one of c / e / d / x followed by digits. This single
# pattern accepts exactly the part_nums the eight separate re.match calls accepted.
variantMarker = re.compile(r"\S+(?:pr|pat|pb|px|[cedx][0-9]+)")

# Stripping rules, applied in this order; each match is reduced to capture group 1.
# Raw strings are required here: "\S" in a normal string literal is an invalid
# escape sequence and triggers a warning on current Python versions.
# NOTE(review): the kc / 973p / 970c rules only run for part_nums that already
# passed variantMarker (same as before) - confirm that is intended.
variantRules = [re.compile(rule) for rule in (
    r"(\S+)(pat)(\S+)",
    r"(\S+)(pr)(\S+)",
    r"(\S+)(pb)(\S+)",
    r"(\S+)(px)(\S+)",
    r"(\S+)(c)([0-9]+)",
    r"(\S+)(x)([0-9]+)",
    r"(\S+)(d)([0-9]+)",
    r"(\S+)(e)([0-9]+)",
    r"(\S+)(kc)",
    r"(973)(p)(\S+)",
    r"(970)(c)",
)]

# original part_num -> made-up parent part_num
partsToRemove = {}
for p in partsClean.part_num:
    if variantMarker.match(p):
        pSub = p
        for rule in variantRules:
            pSub = rule.sub(lambda pat: pat.group(1), pSub)
        partsToRemove[p] = pSub

# Assign the mapped column back instead of .replace(..., inplace=True) on the
# column selection: the chained in-place form warns on pandas 2.x and does not
# update the frame under Copy-on-Write. The exact dict lookup per value performs
# the same one-step replacement without scanning the column once per key.
partsClean['part_num'] = partsClean['part_num'].map(lambda num: partsToRemove.get(num, num))
partsClean = partsClean.drop_duplicates(subset='part_num', keep='first')

invParent['part_num'] = invParent['part_num'].map(lambda num: partsToRemove.get(num, num))

print("after partsToRemove, time: ", datetime.now().strftime("%H:%M:%S"))