# MLP2

- __Data Preprocessing__

In [62]:

import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer

In [63]:
# Read the raw CSV. The file has no header row, so header=None makes
# pandas label the columns with integers starting at 0.
auto_imports = pd.read_csv(filepath_or_buffer="../rawdata/auto_imports.csv", header=None)

# Preview the first and the last five rows.
print(auto_imports.head(), auto_imports.tail(), sep="\n")

   0    1            2    3    4     5            6    7      8     9   ...  \
0   3    ?  alfa-romero  gas  std   two  convertible  rwd  front  88.6  ...   
1   3    ?  alfa-romero  gas  std   two  convertible  rwd  front  88.6  ...   
2   1    ?  alfa-romero  gas  std   two    hatchback  rwd  front  94.5  ...   
3   2  164         audi  gas  std  four        sedan  fwd  front  99.8  ...   
4   2  164         audi  gas  std  four        sedan  4wd  front  99.4  ...   

    16    17    18    19    20   21    22  23  24     25  
0  130  mpfi  3.47  2.68   9.0  111  5000  21  27  13495  
1  130  mpfi  3.47  2.68   9.0  111  5000  21  27  16500  
2  152  mpfi  2.68  3.47   9.0  154  5000  19  26  16500  
3  109  mpfi  3.19  3.40  10.0  102  5500  24  30  13950  
4  136  mpfi  3.19  3.40   8.0  115  5500  18  22  17450  

[5 rows x 26 columns]
     0   1      2       3      4     5      6    7      8      9   ...   16  \
196  -1  95  volvo     gas    std  four  sedan  rwd  front  109.1  ..

In [64]:
# Missing entries in the raw file are encoded as the literal string "?".
# Replace them with NaN across the whole frame (no hard-coded column list,
# so this keeps working if the number of columns changes).
# `np.nan` is used because the `np.NaN` alias was removed in NumPy 2.0.
auto_imports = auto_imports.replace("?", np.nan)

# Total number of NaN values in each column
print(auto_imports.isnull().sum())

0      0
1     37
2      0
3      0
4      0
5      2
6      0
7      0
8      0
9      0
10     0
11     0
12     0
13     0
14     0
15     0
16     0
17     0
18     4
19     4
20     0
21     2
22     2
23     0
24     0
25     0
dtype: int64


In [65]:
# Integer-encode the non-numerical columns (labels 2-8, 14, 15 and 17).
def _encode_column(column):
    """Return 1-based integer codes for the distinct values of `column`.

    pd.factorize numbers the distinct values 0, 1, 2, ... in order of first
    appearance and, by default, marks missing values with -1; after the +1
    shift the categories start at 1 and missing entries are encoded as 0.
    """
    return pd.factorize(column)[0] + 1


# Selecting with a list of labels yields a new frame, so the raw
# `auto_imports` frame is left untouched by the encoding.
auto_imports_strings = auto_imports[[2, 3, 4, 5, 6, 7, 8, 14, 15, 17]].apply(_encode_column)

In [66]:
# Remove the categorical columns (they are integer-encoded separately in
# `auto_imports_strings`) so that only numeric columns reach the imputer.
#
# The drop is done by column label rather than by position, rebinding the
# name instead of mutating in place, and with errors="ignore": re-running
# this cell is then a no-op. The previous positional/inplace version raised
# an IndexError on a second run because the frame had already shrunk.
# Labels equal positions here because the CSV was read with header=None.
temp_drop_cols = [2, 3, 4, 5, 6, 7, 8, 14, 15, 17]
auto_imports = auto_imports.drop(columns=temp_drop_cols, errors="ignore")

# Fill missing values with the mean of each column
values = auto_imports.values
imputer = SimpleImputer(strategy="mean")
transformed_values = imputer.fit_transform(values)

# Total number of NaN values left in the whole array (a single number,
# not a per-column count); expected to be 0 after imputation.
print(np.isnan(transformed_values).sum())

0


In [67]:
# Names for the columns of the imputed array, in the array's column order.
numeric_columns = [
    "symboling",
    "normalized-losses",
    "wheel-base",
    "length",
    "width",
    "height",
    "curb-weight",
    "engine-size",
    "bore",
    "stroke",
    "compression-ratio",
    "horsepower",
    "peak-rpm",
    "city-mpg",
    "highway-mpg",
    "price",
]

# Wrap the imputed values in a labelled DataFrame.
new_data = pd.DataFrame(data=transformed_values, columns=numeric_columns)

In [68]:
# Re-attach the integer-encoded categorical columns to the numeric frame.
# Keys are the new column names; values are the matching column labels in
# `auto_imports_strings`. The mapping order determines the column order.
encoded_column_sources = {
    "make": 2,
    "fuel-type": 3,
    "aspiration": 4,
    "num-of-doors": 5,
    "body-style": 6,
    "drive-wheels": 7,
    "engine-location": 8,
    "engine-type": 14,
    "num-of-cylinders": 15,
    "fuel-system": 17,
}
for column_name, source_label in encoded_column_sources.items():
    new_data[column_name] = auto_imports_strings[source_label]


print(new_data.head())

   symboling  normalized-losses  wheel-base  length  width  height  \
0        3.0              122.0        88.6   168.8   64.1    48.8   
1        3.0              122.0        88.6   168.8   64.1    48.8   
2        1.0              122.0        94.5   171.2   65.5    52.4   
3        2.0              164.0        99.8   176.6   66.2    54.3   
4        2.0              164.0        99.4   176.6   66.4    54.3   

   curb-weight  engine-size  bore  stroke  ...  make  fuel-type  aspiration  \
0       2548.0        130.0  3.47    2.68  ...     1          1           1   
1       2548.0        130.0  3.47    2.68  ...     1          1           1   
2       2823.0        152.0  2.68    3.47  ...     1          1           1   
3       2337.0        109.0  3.19    3.40  ...     2          1           1   
4       2824.0        136.0  3.19    3.40  ...     2          1           1   

   num-of-doors  body-style  drive-wheels  engine-location  engine-type  \
0             1           1  

In [69]:
# Write the processed frame to the input folder as CSV, leaving out the
# row index.
new_data.to_csv(path_or_buf="../input/processed_data.csv", index=False)