# Cylinder Band dataset EDA

In [424]:
# -*- coding: utf-8 -*-
import numpy as np
import pandas as pd

In [425]:
## Notebook-wide folder names.
## Data_Folder is the directory the input CSV is read from; Result_Folder is not
## referenced in the cells visible here (presumably the output directory -- confirm).
Data_Folder = "data"
Result_Folder = "result"

In [426]:
## Column names applied to bands.csv, in file order (passed to read_csv via names=).
## NOTE(review): ' cylinder_number' carries a leading space; it is kept exactly as-is
## because other cells may look the column up by this exact key -- confirm before renaming.
columns = [
    'timestamp', ' cylinder_number', 'customer', 'job_number', 'grain_screened',
    'ink_color', 'proof_on_ctd_ink', 'blade_mfg', 'cylinder_division', 'paper_type',
    'ink_type', 'direct_steam', 'solvent_type', 'type_on_cylinder', 'press_type',
    'press', 'unit_number', 'cylinder_size', 'paper_mil_location', 'plating_tank',
    'proof_cut', 'viscosity', 'caliper', 'ink_temperature', 'humify',
    'roughness', 'blade_pressure', 'varnish_pct', 'press_speed', 'ink_pct',
    'solvent_pct', 'ESA_Voltage', 'ESA_Amperage', 'wax', 'hardener',
    'roller_durometer', 'current_density', 'anode_spac_ratio', 'chrome_content',
    'band_type',
]

## Read the raw file; every "?" cell is treated as a missing value and parsed as null
data = pd.read_csv(Data_Folder + "/bands.csv", names=columns, na_values=["?"])

## Put the '?' marker back in place of every null, then store all cells as strings
data = data.fillna('?').astype(str)

In [427]:
## Data Preprocessing
## The last column is the target measurement; every column before it is an input.
Target_Key = data.columns[-1]
Input_Key = data.columns[:-1]

print("Shape of database: Samples x Measurements : {}\n".format(data.shape))
print("Input Measurement: {}\n".format(Input_Key))
print("Target Measurement: {}\n".format(Target_Key))

Shape of database: Samples x Measurements : (541, 40)

Input Measurement: Index(['timestamp', ' cylinder_number', 'customer', 'job_number',
       'grain_screened', 'ink_color', 'proof_on_ctd_ink', 'blade_mfg',
       'cylinder_division', 'paper_type', 'ink_type', 'direct_steam',
       'solvent_type', 'type_on_cylinder', 'press_type', 'press',
       'unit_number', 'cylinder_size', 'paper_mil_location', 'plating_tank',
       'proof_cut', 'viscosity', 'caliper', 'ink_temperature', 'humify',
       'roughness', 'blade_pressure', 'varnish_pct', 'press_speed', 'ink_pct',
       'solvent_pct', 'ESA_Voltage', 'ESA_Amperage', 'wax', 'hardener',
       'roller_durometer', 'current_density', 'anode_spac_ratio',
       'chrome_content'],
      dtype='object')

Target Measurement: band_type



## Creating a missing value map csv file

In [428]:
import os

## Boolean map: True wherever a cell holds the '?' missing-value marker
DTF = data == '?'
## Number of '?' cells per column, in column order
ncol_count = [DTF[c].sum() for c in DTF.columns]

## Adding the per-column missing value counts as one extra row below the samples.
## pd.concat is used because DataFrame.append was removed in pandas 2.0.
data2 = pd.DataFrame(np.array([ncol_count]), columns=data.columns)
dataM = pd.concat([data, data2], ignore_index=True)

## Count the empty (null) or '?' cells in each row, axis = 1 for row.
## The appended count row is part of dataM, so it gets a value here as well.
dataM['missing_cells_in_row'] = dataM.isnull().sum(axis=1) + dataM.eq("?").sum(axis=1)

## Count the empty (null) or '?' cells in the target column
target_column = dataM[Target_Key]
Missing_Value_in_Target_Column = target_column.isnull().sum() + target_column.eq("?").sum()

print("Missing Value of Target Label Column = "+str(Missing_Value_in_Target_Column))

## NOTE(review): the original cell carried a bare "542" comment here -- presumably
## the row count of dataM (541 samples + 1 count row); confirm.

## Best-effort removal of an export left by a previous run; a missing file is fine.
## Any other OS error is no longer silently swallowed.
try:
    os.remove("missing_value_count_xy.csv")
except FileNotFoundError:
    pass

## Write the missing-value map (samples + count row + per-row counts) to CSV
dataM.to_csv("missing_value_count_xy.csv", index=False)

Missing Value of Target Label Column = 2


## **Removing the samples that have no target label. We cannot assign a random value to a binary category, so we drop them.

In [429]:
## Keep only the samples whose target label is not the '?' missing-value marker
has_target_label = data[Target_Key].ne('?')
data = data.loc[has_target_label]
print(data.shape)

(539, 40)


## ** Dataset Type:
### Measurement types: Mix (Numeric(Continuous) + Categorical)
### Problem Intuition: Some conditional rules for cylinder banding were discovered by a human team, but keeping track of all those conditions is hard.
### Hypothesis of Model: 1st Estimate: Decision Tree

## ** Attribute Information:
### 1. timestamp: numeric;19500101 - 21001231 
### 2. cylinder number: nominal 
### 3. customer: nominal; 
### 4. job number: nominal; 
### 5. grain screened: nominal; yes, no 
### 6. ink color: nominal; key, type 
### 7. proof on ctd ink: nominal; yes, no 
### 8. blade mfg: nominal; benton, daetwyler, uddeholm 
### 9. cylinder division: nominal; gallatin, warsaw, mattoon 
### 10. paper type: nominal; uncoated, coated, super 
### 11. ink type: nominal; uncoated, coated, cover 
### 12. direct steam: nominal; use; yes, no * 
### 13. solvent type: nominal; xylol, lactol, naptha, line, other 
### 14. type on cylinder: nominal; yes, no 
### 15. press type: nominal; use; 70 wood hoe, 70 motter, 70 albert, 94 motter 
### 16. press: nominal; 821, 802, 813, 824, 815, 816, 827, 828 
### 17. unit number: nominal; 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 
### 18. cylinder size: nominal; catalog, spiegel, tabloid 
### 19. paper mill location: nominal; north us, south us, canadian, scandanavian, mid european 
### 20. plating tank: nominal; 1910, 1911, other 
### 21. proof cut: numeric; 0-100 
### 22. viscosity: numeric; 0-100 
### 23. caliper: numeric; 0-1.0 
### 24. ink temperature: numeric; 5-30 
### 25. humidity: numeric; 5-120 
### 26. roughness: numeric; 0-2 
### 27. blade pressure: numeric; 10-75 
### 28. varnish pct: numeric; 0-100 
### 29. press speed: numeric; 0-4000 
### 30. ink pct: numeric; 0-100 
### 31. solvent pct: numeric; 0-100 
### 32. ESA Voltage: numeric; 0-16 
### 33. ESA Amperage: numeric; 0-10 
### 34. wax: numeric ; 0-4.0 
### 35. hardener: numeric; 0-3.0 
### 36. roller durometer: numeric; 15-120 
### 37. current density: numeric; 20-50 
### 38. anode space ratio: numeric; 70-130 
### 39. chrome content: numeric; 80-120 
### 40. band type: nominal; class; band, no band *