# Bosch Production Line Performance data
## John Burt
### Portland Data Science Group<br/>Applied Data Science Meetup series

### Notebook purpose: Look at data files 

#### NOTE: This notebook uses the reduced dataset, which can be downloaded from [dive-into.info](http://dive-into.info/3936/bosch_small_data.zip)

The full dataset can be [downloaded from Kaggle](https://www.kaggle.com/c/bosch-production-line-performance)

**From the Kaggle data description:**

The data for this competition represents measurements of parts as they move through Bosch's production lines. Each part has a unique Id. The goal is to predict which parts will fail quality control (represented by a 'Response' = 1).

The dataset contains an extremely large number of anonymized features. Features are named according to a convention that tells you the production line, the station on the line, and a feature number. E.g. L3_S36_F3939 is a feature measured on line 3, station 36, and is feature number 3939.

On account of the large size of the dataset, we have separated the files by the type of feature they contain: numerical, categorical, and finally, a file with date features. The date features provide a timestamp for when each measurement was taken. Each date column ends in a number that corresponds to the previous feature number. E.g. the value of L0_S0_D1 is the time at which L0_S0_F0 was taken.


### Generic notebook imports

In [3]:
# Suppress all Python warnings from here on so that they do not
# clutter the notebook output.
import warnings
warnings.filterwarnings('ignore')
# ---

import pandas as pd
import numpy as np



## Read data files into pandas dataframes

Note: this assumes the data files are located in a folder named "bosch_small_data" inside the notebook's working directory.

In [4]:
# Folder that holds the reduced Bosch CSV files, and the name of each file.
location = './bosch_small_data/'
datefile = 'train_date.csv'
catfile = 'train_cat.csv'
numericfile = 'train_numeric.csv'

# Load the date, categorical and numeric files (in that order), each into
# its own dataframe.
data_date, data_cat, data_numeric = (
    pd.read_csv(location + csv_name)
    for csv_name in (datefile, catfile, numericfile)
)


In [5]:
# Preview the first five rows of the date dataframe.
data_date.head(n=5)

Unnamed: 0,Id,L0_S0_D1,L0_S0_D3,L0_S0_D5,L0_S0_D7,L0_S0_D9,L0_S0_D11,L0_S0_D13,L0_S0_D15,L0_S0_D17,...,L3_S50_D4248,L3_S50_D4250,L3_S50_D4252,L3_S50_D4254,L3_S51_D4255,L3_S51_D4257,L3_S51_D4259,L3_S51_D4261,L3_S51_D4263,Response
0,23,,,,,,,,,,...,,,,,,,,,,0.0
1,71,1458.06,1458.06,1458.06,1458.06,1458.06,1458.06,1458.06,1458.06,1458.06,...,,,,,,,,,,0.0
2,76,,,,,,,,,,...,,,,,,,,,,0.0
3,86,922.9,922.9,922.9,922.9,922.9,922.9,922.9,922.9,922.9,...,,,,,,,,,,0.0
4,97,,,,,,,,,,...,,,,,,,,,,0.0


In [6]:
# Preview the first five rows of the categorical dataframe.
data_cat.head(n=5)

Unnamed: 0,Id,L0_S1_F25,L0_S1_F27,L0_S1_F29,L0_S1_F31,L0_S2_F33,L0_S2_F35,L0_S2_F37,L0_S2_F39,L0_S2_F41,...,L3_S49_F4227,L3_S49_F4229,L3_S49_F4230,L3_S49_F4232,L3_S49_F4234,L3_S49_F4235,L3_S49_F4237,L3_S49_F4239,L3_S49_F4240,Response
0,23,,,,,,,,,,...,,,,,,,,,,0.0
1,71,,,,,,,,,,...,,,,,,,,,,0.0
2,76,,,,,,,,,,...,,,,,,,,,,0.0
3,86,,,,,,,,,,...,,,,,,,,,,0.0
4,97,,,,,,,,,,...,,,,,,,,,,0.0


In [7]:
# Preview the first five rows of the numeric dataframe.
data_numeric.head(n=5)

Unnamed: 0,Id,L0_S0_F0,L0_S0_F2,L0_S0_F4,L0_S0_F6,L0_S0_F8,L0_S0_F10,L0_S0_F12,L0_S0_F14,L0_S0_F16,...,L3_S50_F4245,L3_S50_F4247,L3_S50_F4249,L3_S50_F4251,L3_S50_F4253,L3_S51_F4256,L3_S51_F4258,L3_S51_F4260,L3_S51_F4262,Response
0,23,,,,,,,,,,...,,,,,,,,,,0.0
1,71,-0.167,-0.168,0.276,0.33,0.074,0.161,0.052,0.248,0.163,...,,,,,,,,,,0.0
2,76,,,,,,,,,,...,,,,,,,,,,0.0
3,86,-0.003,0.041,-0.033,-0.016,0.074,0.161,0.0,-0.072,0.025,...,,,,,,,,,,0.0
4,97,,,,,,,,,,...,,,,,,,,,,0.0


## How many defect samples are in the training dataset?

In [8]:
# Report the class balance: total number of rows, number of rows whose
# Response is greater than zero, and that count as a percentage of all rows.
n_samples = data_numeric.shape[0]
n_defects = data_numeric.Response[data_numeric.Response > 0].shape[0]
defect_pct = 100 * n_defects / n_samples
print("total training samples = %d, # defect samples = %d, or %1.2f percent of all samples"
      % (n_samples, n_defects, defect_pct))


total training samples = 69742, # defect samples = 6189, or 8.87 percent of all samples


## What do the file contents look like?


In [14]:
# Look at the raw text of the date file instead of the parsed dataframe.
# The `with` block closes the file handle even if reading raises.
with open(location+datefile, 'r') as f:
    # The argument is a size hint: whole lines are read only until roughly
    # 30000 characters have been consumed, so the file is not loaded in full.
    x = f.readlines(30000)

# Skip the first line (index 0, the CSV header) and show the next two raw lines.
x[1:3]

['23,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,156.27,156.27,156.27,156.27,156.27,156.27,156.27,156.27,156.27,156.27,156.27,156.27,156.27,156.27,156.27,156.27,156.27,156.27,156.27,156.27,156.27,156.27,156.27,,,,,,,,,,,,156.3,156.3,156.3,156.3,156.3,156.3,156.3,,,,,,156.3,156.3,156.3,156.3,156.31,156.31,156.31,156.31,156.31,156.31,156.31,156.31,156.31,156.31,156.31,156.31,156.31,156.31,156.31,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,

In [15]:
# Look at the raw text of the categorical file instead of the parsed dataframe.
# The `with` block closes the file handle even if reading raises.
with open(location+catfile, 'r') as f:
    # The argument is a size hint: whole lines are read only until roughly
    # 30000 characters have been consumed, so the file is not loaded in full.
    x = f.readlines(30000)

# Skip the first line (index 0, the CSV header) and show the next two raw lines.
x[1:3]

['23,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,

In [16]:
# Look at the raw text of the numeric file instead of the parsed dataframe.
# The `with` block closes the file handle even if reading raises.
with open(location+numericfile, 'r') as f:
    # The argument is a size hint: whole lines are read only until roughly
    # 30000 characters have been consumed, so the file is not loaded in full.
    x = f.readlines(30000)

# Skip the first line (index 0, the CSV header) and show the next two raw lines.
x[1:3]

['23,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,-0.008,-0.003,-0.065,-0.136,-0.053,-0.154,-0.006,-0.040999999999999995,-0.049,-0.059000000000000004,-0.281,-0.281,-0.03,-0.23399999999999999,-0.067,0.445,-0.034,0.138,-0.001,0.0,-0.069,0.242,-0.003,,,,,,,,,,,,0.114,-0.253,-0.01,-0.083,-0.03,,,,-0.19699999999999998,0.0,0.0,-0.017,0.004,-0.022000000000000002,0.09,0.013000000000000001,0.009000000000000001,0.053,0.053,-0.022000000000000002,-0.003,0.005,0.036000000000000004,0.004,-0.001,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,