In [1]:
import numpy as np
import pandas as pd
import os

In [2]:
# Dataset identifier; the output directory and output file name below are built from it.
dataset_name = "cylinder_bands"

In [3]:
# Raw input: directory and file name of the source CSV.
input_dir, inp_fname = './raw/', 'cylinder_bands.csv'

# Processed output: a per-dataset directory and the CSV path inside it.
output_dir = f'./../../processed/{dataset_name}/'
outp_fname = os.path.join(
    output_dir,
    f'{dataset_name}.csv',
)


## Read Data

In [4]:
# Build the path to the raw CSV, load it, and preview the first rows.
raw_csv_path = os.path.join(input_dir, inp_fname)
data = pd.read_csv(raw_csv_path)
data.head()

Unnamed: 0,timestamp,cylinder_number,customer,job_number,grain_screened,ink_color,proof_on_ctd_ink,blade_mfg,cylinder_division,paper_type,...,solvent_pct,ESA_Voltage,ESA_Amperage,wax,hardener,roller_durometer,current_density,anode_space_ratio,chrome_content,band_type
0,b'19910108',b'x126',b'tvguide',25503.0,b'yes',b'key',b'yes',b'benton',b'gallatin',b'uncoated',...,36.4,0.0,0.0,2.5,1.0,34.0,b'40',105.0,b'100',b'band'
1,b'19910109',b'x266',b'tvguide',25503.0,b'yes',b'key',b'yes',b'benton',b'gallatin',b'uncoated',...,38.5,0.0,0.0,2.5,0.7,34.0,b'40',105.0,b'100',b'noband'
2,b'19910104',b'b7',b'modmat',47201.0,b'yes',b'key',b'yes',b'benton',b'gallatin',b'uncoated',...,39.8,0.0,0.0,2.8,0.9,40.0,b'40',103.87,b'100',b'noband'
3,b'19910104',b't133',b'massey',39039.0,b'yes',b'key',b'yes',b'benton',b'gallatin',b'uncoated',...,38.8,0.0,0.0,2.5,1.3,40.0,b'40',108.06,b'100',b'noband'
4,b'19910111',b'j34',b'kmart',37351.0,b'no',b'key',b'yes',b'benton',b'gallatin',b'uncoated',...,42.5,5.0,0.0,2.3,0.6,35.0,b'40',106.67,b'100',b'noband'


In [5]:
# Names of the identifier column and the target column.
id_col, target_col = "id", "band_type"

## Insert Id Column

In [6]:
# Prepend a sequential integer id (0 .. N-1) as the first column.
# The membership check makes re-running this cell a no-op once the column exists.
if id_col not in data.columns:
    N = len(data)
    data.insert(loc=0, column=id_col, value=np.arange(N))
    print(data.head())


   id    timestamp cylinder_number    customer  job_number grain_screened  \
0   0  b'19910108'         b'x126'  b'tvguide'     25503.0         b'yes'   
1   1  b'19910109'         b'x266'  b'tvguide'     25503.0         b'yes'   
2   2  b'19910104'           b'b7'   b'modmat'     47201.0         b'yes'   
3   3  b'19910104'         b't133'   b'massey'     39039.0         b'yes'   
4   4  b'19910111'          b'j34'    b'kmart'     37351.0          b'no'   

  ink_color proof_on_ctd_ink  blade_mfg cylinder_division  ... solvent_pct  \
0    b'key'           b'yes'  b'benton'       b'gallatin'  ...        36.4   
1    b'key'           b'yes'  b'benton'       b'gallatin'  ...        38.5   
2    b'key'           b'yes'  b'benton'       b'gallatin'  ...        39.8   
3    b'key'           b'yes'  b'benton'       b'gallatin'  ...        38.8   
4    b'key'           b'yes'  b'benton'       b'gallatin'  ...        42.5   

  ESA_Voltage ESA_Amperage  wax hardener roller_durometer current_de

## Convert byte strings to strings

In [7]:
# Object-dtype columns are the candidates for the byte-string clean-up below.
byte_string_columns = data.select_dtypes(include="object").columns
list(byte_string_columns)

['timestamp',
 'cylinder_number',
 'customer',
 'grain_screened',
 'ink_color',
 'proof_on_ctd_ink',
 'blade_mfg',
 'cylinder_division',
 'paper_type',
 'ink_type',
 'direct_steam',
 'solvent_type',
 'type_on_cylinder',
 'press_type',
 'press',
 'cylinder_size',
 'paper_mill_location',
 'plating_tank',
 'caliper',
 'current_density',
 'chrome_content',
 'band_type']

In [8]:
import ast
def convert_byte_string_repr(entry):
    """Decode the textual repr of a bytes literal into a plain string.

    Values such as ``"b'tvguide'"`` are converted to ``"tvguide"``. Both quote
    styles are recognised, because ``repr(bytes)`` switches to double quotes
    (``b"o'brien"``) when the payload contains an apostrophe.

    Parameters
    ----------
    entry : object
        A single cell value. Anything that is not a ``str`` is returned as-is.

    Returns
    -------
    object
        The UTF-8 decoded text when ``entry`` is a valid bytes-literal repr;
        otherwise ``entry`` unchanged (never raises for malformed input).
    """
    if not isinstance(entry, str):
        return entry

    # Require the b-prefix plus a matching opening/closing quote character.
    if len(entry) < 3 or entry[0] != "b" or entry[1] not in "'\"" or entry[-1] != entry[1]:
        return entry

    try:
        byte_value = ast.literal_eval(entry)
    except (ValueError, SyntaxError):
        # Not a well-formed literal; keep the original text.
        return entry

    # Inputs like "b'a', 'b'" parse to a tuple rather than bytes; leave them untouched
    # instead of failing on a missing .decode attribute.
    if not isinstance(byte_value, bytes):
        return entry

    try:
        return byte_value.decode('utf-8')
    except UnicodeDecodeError:
        # Payload is not valid UTF-8; return the original entry.
        return entry

# Clean every object-dtype column cell by cell.
# DataFrame.applymap is deprecated in recent pandas releases (FutureWarning), so each
# column is mapped with Series.map instead, which behaves the same on old and new versions.
# NOTE(review): the trailing astype(str) stringifies every cell, so any missing values
# in these columns would likely become the literal text 'nan' — confirm none are expected.
data[byte_string_columns] = (
    data[byte_string_columns]
    .apply(lambda column: column.map(convert_byte_string_repr))
    .astype(str)
)
data.head()

Unnamed: 0,id,timestamp,cylinder_number,customer,job_number,grain_screened,ink_color,proof_on_ctd_ink,blade_mfg,cylinder_division,...,solvent_pct,ESA_Voltage,ESA_Amperage,wax,hardener,roller_durometer,current_density,anode_space_ratio,chrome_content,band_type
0,0,19910108,x126,tvguide,25503.0,yes,key,yes,benton,gallatin,...,36.4,0.0,0.0,2.5,1.0,34.0,40,105.0,100,band
1,1,19910109,x266,tvguide,25503.0,yes,key,yes,benton,gallatin,...,38.5,0.0,0.0,2.5,0.7,34.0,40,105.0,100,noband
2,2,19910104,b7,modmat,47201.0,yes,key,yes,benton,gallatin,...,39.8,0.0,0.0,2.8,0.9,40.0,40,103.87,100,noband
3,3,19910104,t133,massey,39039.0,yes,key,yes,benton,gallatin,...,38.8,0.0,0.0,2.5,1.3,40.0,40,108.06,100,noband
4,4,19910111,j34,kmart,37351.0,no,key,yes,benton,gallatin,...,42.5,5.0,0.0,2.3,0.6,35.0,40,106.67,100,noband


## Drop uninformative columns (single distinct value)

In [9]:
# Count distinct values per column and keep the names of columns with exactly one.
distinct_counts = data.nunique()
unique_columns = distinct_counts[distinct_counts == 1].index.tolist()
print(unique_columns)

# Remove those single-valued columns from the frame in place.
data.drop(columns=unique_columns, inplace=True)

['ink_color', 'cylinder_division']


## Save Main Data File

In [13]:
# to_csv does not create missing parent folders, so make sure the output
# directory exists before writing (no-op when it is already there).
os.makedirs(output_dir, exist_ok=True)

# Write the processed frame without the pandas index; the explicit id column is kept.
data.to_csv(outp_fname, index=False)