# Data Preprocessing and Exploration

### Import necessary modules

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pickle

# Notebook-wide matplotlib defaults, applied in a single update call.
plt.rcParams.update({
    'figure.figsize': (10.0, 8.0),      # default size of plots
    'image.interpolation': 'nearest',
    'image.cmap': 'gray',
})

### Processing component information

In [2]:
# Components
#
# Every ``input/comp_*.csv`` file describes one family of components.  Each
# file is loaded, a few columns are renamed so that equivalent measurements
# share one name across families, and each row is tagged with a coarse
# ``type`` label.  All families are then stacked into one ``components`` table.


def _load_component(name, type_label=None, renames=None):
    """Read ``input/comp_<name>.csv`` and return it as a DataFrame.

    Parameters
    ----------
    name : str
        Suffix of the CSV file name (``'adaptor'`` -> ``input/comp_adaptor.csv``).
    type_label : str, optional
        Constant written to a ``type`` column.  When omitted no ``type``
        column is added by this helper.
    renames : dict, optional
        ``{old_column: new_column}`` mapping applied to the loaded frame.
    """
    frame = pd.read_csv('input/comp_{}.csv'.format(name))
    if renames:
        frame.rename(columns=renames, inplace=True)
    if type_label is not None:
        frame['type'] = type_label
    return frame


# adaptor
comp_adaptor = _load_component('adaptor', 'adaptor')

# boss
comp_boss = _load_component('boss', 'boss')

# elbow
comp_elbow = _load_component('elbow', 'elbow')

# float
comp_float = _load_component('float', 'float')

# hfl
comp_hfl = _load_component('hfl', 'hfl', {'hose_diameter': 'diameter'})

# Nuts are standardized, low-cost parts; only the number and weight of the
# nuts in a tube are of interest later on.
comp_nut = _load_component('nut', 'nut', {'length': 'overall_length'})
nut_id = comp_nut['component_id']

# sleeve component: align its column names with the other families
comp_sleeve = _load_component('sleeve', 'sleeve', {
    'connection_type_id': 'connection_type_id_1',
    'intended_nut_thread': 'thread_size',
    'intended_nut_pitch': 'thread_pitch',
    'length': 'overall_length',
})

# straight component
comp_straight = _load_component('straight', 'straight')

# tee component
comp_tee = _load_component('tee', 'tee')

# threaded component (labelled 'thread')
comp_threaded = _load_component('threaded', 'thread')

# other component: the free-text part name serves as the initial type label
comp_other = _load_component('other', renames={'part_name': 'type'})
comp_other['type'] = comp_other['type'].str.lower()


# Combine the component types with similar names: any part name containing
# one of these keywords is collapsed to the keyword itself.  Keywords are
# applied in list order.
component_type_keywords = ['adapter', 'fitting', 'nut', 'tee', 'block',
                           'plug', 'flange', 'elbow', 'tube', 'coupling', 'bracket',
                           'connector', 'plate', 'orifice', 'boss',
                           'sleeve', 'pin', 'screen', 'bolt', 'cap', 'hook', 'joint',
                           'rivet', 'hasp', 'filler', 'washer', 'valve', 'hose',
                           'clamp', 'angle', 'bellow', 'cover', 'body']

for key_word in component_type_keywords:
    # regex=False: keywords are plain substrings.
    # na=False: a missing part name matches no keyword instead of producing
    # a NaN in the boolean mask.
    matches_keyword = comp_other['type'].str.contains(key_word, na=False, regex=False)
    comp_other.loc[matches_keyword, 'type'] = key_word

# Spelling/synonym normalisation.  This runs AFTER the keyword collapse so
# that every part name reduced to 'adapter' by the loop above (not only the
# ones that were exactly 'adapter' to begin with) is merged into the
# 'adaptor' family loaded from comp_adaptor.csv.
comp_other.loc[comp_other['type'] == 'adapter', 'type'] = 'adaptor'
comp_other.loc[comp_other['type'] == 'pipe', 'type'] = 'tube'


component_list = [comp_adaptor, comp_boss, comp_elbow, comp_float,
                  comp_hfl, comp_nut, comp_other, comp_sleeve,
                  comp_straight, comp_tee, comp_threaded]

# Combine all components together; columns missing from a family become NaN.
components = pd.concat(component_list, ignore_index=True)

# Types with fewer than 5 components are lumped into a single 'other' type.
component_type_count = components['type'].value_counts()
other_type = component_type_count[component_type_count < 5].index.values.astype('str')
components.loc[components['type'].astype('str').isin(other_type), 'type'] = 'other'




In [3]:
# Feature engineering for the combined component table:
#   * the individual thread sizes are added together into one total
#     ``thread`` value, and the per-end lengths into one ``length`` value;
#   * Yes/No columns become 0/1 flags;
#   * the remaining text columns become "value present" indicators.

# Suffixes of the per-end columns (``*_1`` .. ``*_4``).
slots = ['_{}'.format(i) for i in range(1, 5)]

# Thread sizes written like "M10" are ISO (metric) sizes rather than inches,
# so strip the leading "M" and convert the number to the English unit.
inch_to_mm = 25.4
thread_size_text = components['thread_size'].astype('str')
is_ISO_thread = thread_size_text.str[0] == 'M'
components.loc[is_ISO_thread, 'thread_size'] = thread_size_text.str[1:]
components['thread_size'] = components['thread_size'].astype('float64')
components.loc[is_ISO_thread, 'thread_size'] /= inch_to_mm

# Clean up erroneous entries: the 'See Drawing' placeholder is not a number,
# and 9999 is treated as a missing-value marker everywhere in the table.
drawing_only = components['nominal_size_1'] == 'See Drawing'
components.loc[drawing_only, 'nominal_size_1'] = np.nan
components['nominal_size_1'] = components['nominal_size_1'].astype('float64')
components[components == 9999] = np.nan

thread_feature = (['thread_pitch'] + ['thread_pitch' + s for s in slots]
                  + ['thread_size'] + ['thread_size' + s for s in slots]
                  + ['nominal_size' + s for s in slots])

components.loc[:, thread_feature] = components.loc[:, thread_feature].fillna(0)

# For every end, keep the larger of its thread size and its nominal size.
for s in slots:
    size_pair = ['thread_size' + s, 'nominal_size' + s]
    components['thread_size' + s] = components.loc[:, size_pair].max(skipna=False, axis=1)

# Total thread size over the un-numbered column and the four ends.
thread_size_columns = ['thread_size'] + ['thread_size' + s for s in slots]
components['thread'] = np.nansum(components.loc[:, thread_size_columns], axis=1)

components = components.drop(thread_feature, axis=1)

# Total length over the four ends.
length_feature = ['length' + s for s in slots]
components['length'] = np.nansum(components.loc[:, length_feature], axis=1)
components = components.drop(length_feature, axis=1)

# Yes/No columns -> 0/1 flags (missing counts as 0).
#   unique_feature: a component with a unique feature needs an extra
#                   manufacturing process to make that feature
#   groove, orientation: plain Yes/No attributes
for flag in ['unique_feature', 'groove', 'orientation']:
    components.loc[components[flag] == 'Yes', flag] = 1
    components.loc[components[flag] == 'No', flag] = 0
    components[flag] = components[flag].fillna(0).astype('int16')

# Boolean mask over the columns: object-dtype columns other than the
# identifier/label columns listed here.
is_object_column = (components.dtypes == object).values
is_excluded = np.isin(components.columns.astype('str'),
                      ['component_type_id', 'component_id', 'type', 'material'])
object_feature = is_object_column & ~is_excluded

# In those columns any non-empty value becomes 1; empty cells become 0 in
# the table-wide fillna below.
comp_object = components.loc[:, object_feature].copy()
comp_object[comp_object.notnull()] = 1

components.loc[:, object_feature] = comp_object
components = components.fillna(0)

# Collapse the per-end connection-type / end-form columns into one sum each.
# NOTE(review): presumably these were converted to 0/1 indicators above, so
# the sums count populated entries -- confirm against the raw dtypes.
numbered_connection_columns = ['connection_type_id' + s for s in slots]
end_form_columns = ['end_form_id' + s for s in slots]

components['connection_type_id'] = components[
    ['connection_type_id'] + numbered_connection_columns].sum(axis=1)
components['end_form_id'] = components[end_form_columns].sum(axis=1)

components = components.drop(numbered_connection_columns + end_form_columns, axis=1)

# add a dummy variable for future use
components['number'] = 1

In [4]:
# Column summary of the component table: names, non-null counts and dtypes.
components.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2047 entries, 0 to 2046
Data columns (total 38 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   component_id         2047 non-null   object 
 1   component_type_id    2047 non-null   object 
 2   adaptor_angle        2047 non-null   float64
 3   overall_length       2047 non-null   float64
 4   hex_size             2047 non-null   float64
 5   unique_feature       2047 non-null   int16  
 6   orientation          2047 non-null   int16  
 7   weight               2047 non-null   float64
 8   type                 2047 non-null   object 
 9   connection_type_id   2047 non-null   int64  
 10  outside_shape        2047 non-null   int64  
 11  base_type            2047 non-null   int64  
 12  height_over_tube     2047 non-null   float64
 13  bolt_pattern_long    2047 non-null   float64
 14  bolt_pattern_wide    2047 non-null   float64
 15  groove               2047 non-null   i

In [5]:
# Preview the first rows of the component table.
components.head()

Unnamed: 0,component_id,component_type_id,adaptor_angle,overall_length,hex_size,unique_feature,orientation,weight,type,connection_type_id,...,material,plating,hex_nut_size,seat_angle,blind_hole,head_diameter,thread,length,end_form_id,number
0,C-0005,CP-028,0.0,58.4,34.93,0,0,0.206,adaptor,2,...,0,0,0.0,0.0,0,0.0,2.312,0.0,2,1
1,C-0006,CP-028,0.0,34.8,22.2,0,0,0.083,adaptor,2,...,0,0,0.0,0.0,0,0.0,1.187,0.0,2,1
2,C-1435,CP-028,0.0,20.3,22.22,0,0,0.023,adaptor,2,...,0,0,0.0,0.0,0,0.0,16.755,0.0,2,1
3,C-1546,CP-028,0.0,26.4,15.88,0,0,0.026,adaptor,2,...,0,0,0.0,0.0,0,0.0,0.25,0.0,2,1
4,C-1583,CP-028,0.0,44.5,38.1,0,0,0.256,adaptor,2,...,0,0,0.0,0.0,0,0.0,2.374,0.0,2,1


The component table still has 38 columns and is very sparse.

In [6]:
# Column summary of the component table: names, non-null counts and dtypes.
components.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2047 entries, 0 to 2046
Data columns (total 38 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   component_id         2047 non-null   object 
 1   component_type_id    2047 non-null   object 
 2   adaptor_angle        2047 non-null   float64
 3   overall_length       2047 non-null   float64
 4   hex_size             2047 non-null   float64
 5   unique_feature       2047 non-null   int16  
 6   orientation          2047 non-null   int16  
 7   weight               2047 non-null   float64
 8   type                 2047 non-null   object 
 9   connection_type_id   2047 non-null   int64  
 10  outside_shape        2047 non-null   int64  
 11  base_type            2047 non-null   int64  
 12  height_over_tube     2047 non-null   float64
 13  bolt_pattern_long    2047 non-null   float64
 14  bolt_pattern_wide    2047 non-null   float64
 15  groove               2047 non-null   i

In [7]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
components.describe()

Unnamed: 0,adaptor_angle,overall_length,hex_size,unique_feature,orientation,weight,connection_type_id,outside_shape,base_type,height_over_tube,...,coupling_class,plating,hex_nut_size,seat_angle,blind_hole,head_diameter,thread,length,end_form_id,number
count,2047.0,2047.0,2047.0,2047.0,2047.0,2047.0,2047.0,2047.0,2047.0,2047.0,...,2047.0,2047.0,2047.0,2047.0,2047.0,2047.0,2047.0,2047.0,2047.0,2047.0
mean,2.39619,11.713711,1.921847,0.089399,0.338544,0.747973,0.221788,0.060576,0.060576,1.452003,...,0.002931,0.027357,0.611646,0.282853,0.011236,1.997548,2.294788,3.022062,0.223742,1.0
std,14.457807,26.687619,7.943596,0.285389,0.47333,1.353164,0.460199,0.23861,0.23861,5.720714,...,0.054074,0.163162,4.531138,3.304314,0.105428,11.108337,7.628894,16.337709,0.654157,0.0
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
25%,0.0,0.0,0.0,0.0,0.0,0.07,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
50%,0.0,0.0,0.0,0.0,0.0,0.24,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
75%,0.0,0.0,0.0,0.0,1.0,0.843,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
max,90.0,190.55,76.2,1.0,1.0,13.77,3.0,1.0,1.0,54.7,...,1.0,1.0,57.15,45.0,1.0,127.0,84.0,155.9,4.0,1.0


### Merge bom and components

In [8]:
# Load the bill of materials; 9999 entries are read as missing values.
bom = pd.read_csv('input/bill_of_materials.csv', na_values=9999)

# Distinct component types present in the component table.
comp_type_list = components['type'].unique()

# Component feature columns: every column except identifiers and labels.
delete_feature = ['component_type_id', 'component_id', 'type', 'material']
all_component_columns = components.columns.values.astype('str')
comp_feature_list = all_component_columns[~np.isin(all_component_columns, delete_feature)]

# One output column per (component type, feature) combination.
type_feature_list = [type_name + '_' + feature_name
                     for type_name in comp_type_list
                     for feature_name in comp_feature_list]

# Per-assembly feature table, aligned row-for-row with ``bom``.
bom_comp = pd.DataFrame(columns=type_feature_list)
bom_comp['tube_assembly_id'] = bom['tube_assembly_id']

# Total number of components in each assembly (sum of the eight quantity slots).
quantity_feature = ['quantity_{}'.format(slot) for slot in range(1, 9)]
bom_comp['total_comp_number'] = bom[quantity_feature].sum(axis=1)

In [None]:
# Aggregate component features per tube assembly.
#
# For each of the eight component slots of a bill-of-materials row, the
# component's feature values are multiplied by the slot quantity and added
# to the "<type>_<feature>" columns of that assembly.  Contributions of
# several components of the same type in one assembly are summed.

# Index the component table once instead of scanning it for every slot.
# If a component_id occurs more than once, the first occurrence is used.
component_lookup = components.drop_duplicates(subset='component_id').set_index('component_id')
component_types = component_lookup['type']
component_features = component_lookup[list(comp_feature_list)]

for idx in bom_comp.index:
    if idx % 1000 == 0:
        print('Processing {}/{}'.format(idx, len(bom_comp)))
    for i in range(1, 9):
        comp_id = bom.loc[idx, 'component_id_' + str(i)]
        if pd.isnull(comp_id):
            # empty slot
            continue

        # A component id absent from the component table raises KeyError here.
        comp_type = component_types.loc[comp_id]
        comp_quantity = bom.loc[idx, 'quantity_' + str(i)]
        bom_comp_feature = [comp_type + '_' + x for x in comp_feature_list]
        contribution = component_features.loc[comp_id].values * comp_quantity

        # Start from the totals accumulated so far for this type, treating
        # not-yet-filled cells as 0.  Unconditionally resetting the cells to
        # 0 here would discard earlier components of the same type.
        accumulated = bom_comp.loc[idx, bom_comp_feature].astype('float64').fillna(0.0)
        bom_comp.loc[idx, bom_comp_feature] = accumulated.values + contribution

# Weight statistics for each assembly, taken across the per-type weight
# totals (one "<type>_weight" column per component type).
weight_feature = [x + '_weight' for x in comp_type_list]
bom_comp['total_weight'] = bom_comp[weight_feature].sum(axis=1)
bom_comp['min_weight'] = bom_comp[weight_feature].min(axis=1)
bom_comp['max_weight'] = bom_comp[weight_feature].max(axis=1)
bom_comp['mean_weight'] = bom_comp[weight_feature].mean(axis=1)


# Drop columns that were never filled, then treat remaining gaps as 0.
bom_comp = bom_comp.dropna(axis=1, how='all')
bom_comp = bom_comp.fillna(0)

# Drop columns that are zero for every assembly.
bom_comp = bom_comp.loc[:, (bom_comp != 0).any(axis=0)]

# Save results; the context manager closes the file even if dumping fails.
with open('bom_comp.pkl', 'wb') as pickle_file:
    pickle.dump(bom_comp, pickle_file)

Processing 0/21198
Processing 1000/21198
Processing 2000/21198


In [None]:
# Column summary of the per-assembly feature table.
bom_comp.info()

In [None]:
# Preview the first rows of the per-assembly feature table.
bom_comp.head()