In [1]:
import numpy as np
import pandas as pd

# Raise the pandas display limits to 500 rows and 50 columns for displayed frames.
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 50)

# Render matplotlib figures inline in the notebook output.
%matplotlib inline

### Read file

In [2]:
# Load the raw CSV (path is relative to the notebook's working directory).
DATA = pd.read_csv(filepath_or_buffer='./data/imports-85.data.csv')

### Remove Symboling column

In [3]:
# Drop the leading column by position (presumably `symboling`, per the section
# heading -- the column name itself is not shown here; confirm against the CSV).
# NOTE: this is positional, so re-running the cell drops one more column each time.
print('Before removing, nums of column : %d' % len(DATA.columns))
DATA = DATA.drop(columns=DATA.columns[0])
print('After removing, nums of column : %d' % len(DATA.columns))

Before removing, nums of column : 26
After removing, nums of column : 25


### Check data types

In [4]:
# List each column's dtype to spot numeric-looking columns that were parsed as object.
DATA.dtypes

normalized-losses     object
make                  object
fuel-type             object
aspiration            object
num-of-doors          object
body-style            object
drive-wheels          object
engine-location       object
wheel-base           float64
length               float64
width                float64
height               float64
curb-weight            int64
engine-type           object
num-of-cylinders      object
engine-size            int64
fuel-system           object
bore                  object
stroke                object
compression-ratio    float64
horsepower            object
peak-rpm              object
city-mpg               int64
highway-mps            int64
price                 object
dtype: object

### Remove rows with a question mark ('?') in the normalized-losses column

In [5]:
# Boolean mask of rows whose normalized-losses value is the '?' placeholder.
# (The printed messages say "quotation mark"; the placeholder is a question mark.
# The message text is left unchanged so it matches previously recorded output.)
NORMALIZED_LOSSES_MISSING = DATA['normalized-losses'] == '?'
print('Total nums of rows that have quotation mark : %d' % NORMALIZED_LOSSES_MISSING.sum())

# True when at least one row has to be removed.
NORMALIZED_LOSSES_QUOTATION_DELETE = bool(NORMALIZED_LOSSES_MISSING.any())

if NORMALIZED_LOSSES_QUOTATION_DELETE:
    print('Before removing : %d' % DATA.shape[0])
    # Filter with the mask in one step instead of dropping row by row: each
    # DATA.drop(idx) copied the whole frame, and the positional loop only worked
    # when the index was exactly 0..n-1.
    DATA = DATA[~NORMALIZED_LOSSES_MISSING].reset_index(drop=True)
    print('After removing %d' % DATA.shape[0])

Total nums of rows that have quotation mark : 41
Before removing : 205
After removing 164


### Check which columns still contain question-mark ('?') values

In [6]:
# Only non-numeric columns can hold the '?' placeholder. Testing
# `'?' in DATA[col].values` on numeric columns makes NumPy compare a string with
# a float/int array, which can emit an elementwise-comparison warning; restrict
# the check to non-numeric columns and compare through pandas instead.
NON_NUMERIC = DATA.select_dtypes(exclude='number')
print([col for col in NON_NUMERIC.columns if NON_NUMERIC[col].eq('?').any()])

['num-of-doors', 'bore', 'stroke']


  if __name__ == '__main__':


### Change num-of-doors missing value : ? -> four 

https://en.wikipedia.org/wiki/Dodge_Colt 

In the reference above, the fifth-generation Dodge Colt is similar to the sample with the missing value.

That car is from 1984, and its wheelbase, width, length, and height are similar.

The engine specifications are similar as well.

Therefore, I change the num-of-doors value from '?' to 'four', because the fifth-generation Dodge Colt sedan has four doors.

In [7]:
# Display the rows whose num-of-doors is still the '?' placeholder.
DATA.loc[DATA['num-of-doors'].eq('?')]

Unnamed: 0,normalized-losses,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,engine-location,wheel-base,length,width,height,curb-weight,engine-type,num-of-cylinders,engine-size,fuel-system,bore,stroke,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mps,price
17,148,dodge,gas,turbo,?,sedan,fwd,front,93.7,157.3,63.8,50.6,2191,ohc,four,98,mpfi,3.03,3.39,7.6,102,5500,24,30,8558


In [8]:
# The affected row is a Dodge sedan, and that sedan has four doors,
# so the '?' placeholder is replaced with 'four'.
DATA.loc[DATA['num-of-doors'].eq('?'), 'num-of-doors'] = 'four'

### Remove bore, stroke missing value 

https://en.wikipedia.org/wiki/Mazda_RX-7
http://timrizal.blogspot.kr/2008/07/mazda-rotary-engine.html

The rows that have '?' are probably the Mazda RX-7 SA22C model.

The car specifications are very similar, and the SA22C model later gained a turbocharger.

So I think three of the rows are the same car, and the last one is the RX-7 SA22C turbocharged model.

However, a rotary engine has no bore or stroke. In this dataset, the rows whose engine type is rotor (rotary engine) are all Mazda cars, and these are the rows with the missing bore and stroke values. So I chose to remove these samples.


In [9]:
# Keep every row except Mazda cars with a rotor engine, then renumber the
# index from 0 so it is contiguous again.
DATA = (
    DATA
    .loc[~(DATA['make'].eq('mazda') & DATA['engine-type'].eq('rotor'))]
    .reset_index(drop=True)
)

### Switching columns

In [10]:
# Move 'normalized-losses' to the last position by name rather than rotating
# the columns by position. The positional form shifted a different column to
# the end every time the cell was re-run; selecting by name gives the same
# layout on the first run and leaves it unchanged on any later run.
DATA = DATA[[col for col in DATA.columns if col != 'normalized-losses'] + ['normalized-losses']]

### Change Data Types

In [11]:
# Re-check the dtypes after cleaning; several numeric-looking columns are still object.
DATA.dtypes

make                  object
fuel-type             object
aspiration            object
num-of-doors          object
body-style            object
drive-wheels          object
engine-location       object
wheel-base           float64
length               float64
width                float64
height               float64
curb-weight            int64
engine-type           object
num-of-cylinders      object
engine-size            int64
fuel-system           object
bore                  object
stroke                object
compression-ratio    float64
horsepower            object
peak-rpm              object
city-mpg               int64
highway-mps            int64
price                 object
normalized-losses     object
dtype: object

* normalized-losses -> float64
* bore -> float64
* stroke -> float64
* horsepower -> float64
* peak-rpm -> float64
* price -> float64

In [12]:
# Convert the string-typed numeric columns to float64 in a single astype call.
# Like the per-column form, this raises if any value cannot be parsed as a float.
DATA = DATA.astype({
    'normalized-losses': np.float64,
    'bore': np.float64,
    'stroke': np.float64,
    'horsepower': np.float64,
    'peak-rpm': np.float64,
    'price': np.float64,
})

In [13]:
# Confirm that the converted columns now report float64.
DATA.dtypes

make                  object
fuel-type             object
aspiration            object
num-of-doors          object
body-style            object
drive-wheels          object
engine-location       object
wheel-base           float64
length               float64
width                float64
height               float64
curb-weight            int64
engine-type           object
num-of-cylinders      object
engine-size            int64
fuel-system           object
bore                 float64
stroke               float64
compression-ratio    float64
horsepower           float64
peak-rpm             float64
city-mpg               int64
highway-mps            int64
price                float64
normalized-losses    float64
dtype: object

### Save preprocessed data

I think that it's 

In [14]:
# Write the cleaned frame to disk; index=False leaves the row index out of the file.
DATA.to_csv(path_or_buf='./data/preprocessed_automobile_data.csv', index=False)