In [3]:
import os
# for nice dark theme in Jupyter Notebooks
from jupyterthemes import jtplot
jtplot.style(theme='monokai', context='notebook', ticks=True, grid=False)
import warnings
warnings.filterwarnings('once')

import pandas as pd
import numpy as np

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer


%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [7]:
# Load the intermediate (cleaned) training set from disk.
data_interim = '../data/interim/'
train_data = 'df_train_cleaned.csv'
df_train = pd.read_csv(os.path.join(data_interim, train_data))

# Separate the target column ('mpg') from the remaining feature columns.
y_train = df_train['mpg']
X_train = df_train.drop(columns='mpg')

In [11]:
# Continuous features
continuous_features = ['displacement',
                       'horsepower',
                       'weight',
                       'acceleration',]
# Categorical features
ordinal_features = ['cylinders',
                    'year',]

nominal_features = ['region']


# Build transformation pipelines adapted to feature types:
# median imputation + standardisation for the continuous columns,
# most-frequent imputation + standardisation for the ordinal columns.
cont_pipeline = Pipeline([
    ('imputer_cont', SimpleImputer(strategy='median')),
    ('std_scaler_cont', StandardScaler()),
])

ord_pipeline = Pipeline([
    ('imputer_ord', SimpleImputer(strategy='most_frequent')),
    ('std_scaler_ord', StandardScaler()),
])

# sparse_threshold=0 makes the transformer always return a dense array.
# OneHotEncoder emits a sparse matrix by default, and without this setting the
# stacked result is dense only while its overall density stays above the
# default threshold; pd.DataFrame below needs a dense array.
full_pipeline = ColumnTransformer([
    ('cont', cont_pipeline, continuous_features),
    ('ord', ord_pipeline, ordinal_features),
    ('nom', OneHotEncoder(), nominal_features),
], sparse_threshold=0)

X_train_processed_values = full_pipeline.fit_transform(X_train)

# Add column names to build the processed dataframe.
# OneHotEncoder.get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2, so the one-hot column names are rebuilt from the fitted
# categories instead. The 'x0_<category>' form is exactly what
# get_feature_names() returned when called without input feature names, which
# keeps the saved column names unchanged.
region_encoder = full_pipeline.named_transformers_['nom']
region_ohe_features = ['x0_' + str(category)
                       for category in region_encoder.categories_[0]]
# The output columns follow the order of the transformers declared above.
column_names = continuous_features + ordinal_features + region_ohe_features
# Reuse X_train's index so the index-based join with y_train below lines up
# row for row.
X_train_processed = pd.DataFrame(X_train_processed_values,
                                 columns=column_names,
                                 index=X_train.index)
# Drop one of the ohe features to limit correlations in the data set
X_train_processed = X_train_processed.drop(columns='x0_EUROPE')
# Save the data.
# NOTE: to_csv writes the index as a first, unnamed column by default; this is
# left as-is so that any existing reader of the file keeps working.
X_train_processed.join(y_train).to_csv('../data/processed/df_train_processed.csv')

