In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _subdirs, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for file_path in input_file_paths:
    print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Data Transformation

This code covers some techniques to transforming data according to the data properties, including log transformation, clipping methods, minmax scaler, standard scaler and robust scaler. Please visit article "[Data Transformation and Feature Engineering](https://towardsdatascience.com/data-transformation-and-feature-engineering-e3c7dfbb4899)" for step by step guide or visit [my website](http://www.visual-design.net) for more articles like this. 

![Data Transformation Cheatsheet](https://miro.medium.com/max/1400/1*Tjy4bJ_YB_Lbx4fP-Tldzw.png)

In [None]:
import matplotlib.pyplot as plt
from pandas.api.types import is_string_dtype, is_numeric_dtype

# Read the marketing dataset into `df` and preview its first rows.
marketing_csv = "../input/marketing-data/marketing_data.csv"
df = pd.read_csv(marketing_csv)
df.head()

# Feature Engineering

In [None]:
# 1. Transform Year_Birth into Age
from datetime import date

# Evaluate the reference year once so Age and Enrollment_Length are computed
# against the same value. NOTE: both features depend on the date the notebook
# is run, so their values shift by one every calendar year.
current_year = date.today().year
df['Age'] = current_year - df['Year_Birth']

# 2. Transform Dt_Customer into Enrollment_Length
df['Year'] = pd.DatetimeIndex(df['Dt_Customer']).year
df['Enrollment_Length'] = current_year - df['Year']

# 3. Transform Currency format into numbers
# Strip only the currency symbol and the thousands separators, as literal
# characters (regex=False): '$' is a regex metacharacter and the default of
# `regex` differs between pandas versions.
# The decimal point is deliberately kept: deleting it would turn any amount
# with a fractional part (e.g. '$1,234.50') into a value 100x too large.
income_text = (
    df[' Income ']
    .str.replace('$', '', regex=False)
    .str.replace(',', '', regex=False)
    .str.strip()
)
# Missing incomes become 0; amounts are rounded to whole dollars.
df['Income$'] = pd.to_numeric(income_text).fillna(0).round().astype(int)
# Income in millions of dollars, rounded to 2 decimals (vectorized).
df['Income_M$'] = (df['Income$'] / 1_000_000).round(2)

# Preview the derived columns instead of printing the whole frame.
df[['Income_M$', 'Income$']].head()

# EDA - Histogram

In [None]:
# select numeric variables
numeric_vars = ['Age', 'Enrollment_Length', 'Income_M$', 'Recency', 'MntWines', 'MntFruits', 'MntMeatProducts', 'MntFishProducts', 'MntSweetProducts', 'MntGoldProds', 'NumDealsPurchases', 'NumWebPurchases', 'NumCatalogPurchases', 'NumStorePurchases', 'NumWebVisitsMonth']

# create one histogram per numeric variable
# The grid is sized from the list so that adding or removing a variable never
# overflows a hard-coded layout.
n_cols = 5
n_rows = max(1, -(-len(numeric_vars) // n_cols))  # ceiling division
fig, axes = plt.subplots(n_rows, n_cols, figsize=(24, 15), squeeze=False)
panels = axes.ravel()
for ax, var in zip(panels, numeric_vars):
    # Draw on an explicit Axes rather than relying on pyplot's "current axes".
    df[var].plot(kind='hist', ax=ax)
    # Label after plotting so the plotting call cannot reset the x-label.
    ax.set_xlabel(var)
for ax in panels[len(numeric_vars):]:
    ax.set_visible(False)  # hide grid cells that have no variable

# Log Transformation - Right Skewed Data

In [None]:
## log transformation - power law distribution ##
log_var = ['Income_M$', 'MntWines', 'MntFruits', 'MntMeatProducts', 'MntFishProducts',
           'MntSweetProducts', 'MntGoldProds', 'NumDealsPurchases', 'NumWebPurchases',
           'NumCatalogPurchases']

# Size the grid from the list instead of hard-coding 2x5.
n_cols = 5
n_rows = max(1, -(-len(log_var) // n_cols))  # ceiling division
fig, axes = plt.subplots(n_rows, n_cols, figsize=(24, 10), squeeze=False)
panels = axes.ravel()

for ax, var in zip(panels, log_var):
    transformed = "log_" + var
    # log10(x + 1): the +1 maps 0 to 0 instead of -inf.
    df[transformed] = np.log10(df[var] + 1)

    # Plot on the explicit Axes and label it with the column actually shown
    # (the log-transformed one), after plotting so the label is not reset.
    df[transformed].plot(kind='hist', ax=ax)
    ax.set_xlabel(transformed)

for ax in panels[len(log_var):]:
    ax.set_visible(False)  # hide grid cells that have no variable

# Clipping Method

In [None]:
## clipping methods - handle outliers ##

clip_var = ['Age', 'NumWebVisitsMonth']

for var in clip_var:
    transformed = 'clipped_' + var

    # clip bounds: the 5th and 95th percentiles of the original column
    lower_limit = df[var].quantile(0.05)
    upper_limit = df[var].quantile(0.95)

    # values outside [lower_limit, upper_limit] are replaced by the nearest bound
    df[transformed] = df[var].clip(lower=lower_limit, upper=upper_limit)

    # summary statistics before and after clipping
    print(df[var].describe())
    print(df[transformed].describe())

    # Show both distributions side by side, titled, so the original and the
    # clipped histogram can be told apart at a glance.
    fig, (ax_original, ax_clipped) = plt.subplots(1, 2, figsize=(10, 5))
    df[var].plot(kind='hist', ax=ax_original)
    ax_original.set_title(var + ' (original)')
    ax_original.set_xlabel(var)

    df[transformed].plot(kind='hist', ax=ax_clipped)
    ax_clipped.set_title(transformed + ' (clipped to 5th-95th percentile)')
    ax_clipped.set_xlabel(transformed)

# Data Scaling
- MinMax Scaler
- Standard Scaler
- Robust Scaler

In [None]:
## data scaling methods ##

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import RobustScaler

scale_var = ['Enrollment_Length', 'Recency', 'NumStorePurchases', 'clipped_Age', 'clipped_NumWebVisitsMonth']
scalers_list = [StandardScaler(), RobustScaler(), MinMaxScaler()]

# One figure per scaler, one panel per variable.
# NOTE: every scaler writes to the same "scaled_<var>" columns, so after the
# loop those columns hold the output of the LAST scaler in scalers_list.
for scaler in scalers_list:
    fig, axes = plt.subplots(1, max(1, len(scale_var)), figsize=(26, 5), squeeze=False)
    # Figure-level title: plt.title() on an empty figure would create an extra
    # full-size Axes that stays behind the subplots.
    fig.suptitle(type(scaler).__name__, fontsize=20)
    panels = axes.ravel()
    for ax, var in zip(panels, scale_var):
        scaled_var = "scaled_" + var
        # Scalers expect a 2-D (n_samples, 1) input; flatten the result back
        # to 1-D before storing it as a column.
        column_2d = df[var].values.reshape(-1, 1)
        df[scaled_var] = np.ravel(scaler.fit_transform(column_2d))

        # Plot on the explicit Axes; label after plotting so it is not reset.
        df[scaled_var].plot(kind='hist', ax=ax)
        ax.set_xlabel(scaled_var)
    for ax in panels[len(scale_var):]:
        ax.set_visible(False)  # only relevant if scale_var is empty

In [None]:
# visualize data after transformation

transformed_var = ['scaled_Recency', 'scaled_NumStorePurchases', 'scaled_clipped_Age', 'scaled_clipped_NumWebVisitsMonth', 'scaled_Enrollment_Length','log_Income_M$', 'log_MntWines', 'log_MntFruits', 'log_MntMeatProducts', 'log_MntFishProducts',
           'log_MntSweetProducts', 'log_MntGoldProds', 'log_NumDealsPurchases', 'log_NumWebPurchases',
           'log_NumCatalogPurchases']

# Size the grid from the list so a longer list cannot overflow a fixed 3x5 layout.
n_cols = 5
n_rows = max(1, -(-len(transformed_var) // n_cols))  # ceiling division
fig, axes = plt.subplots(n_rows, n_cols, figsize=(24, 15), squeeze=False)
panels = axes.ravel()
for ax, var in zip(panels, transformed_var):
    # Draw on an explicit Axes rather than relying on pyplot's "current axes".
    df[var].plot(kind='hist', ax=ax)
    # Label after plotting so the plotting call cannot reset the x-label.
    ax.set_xlabel(var)
for ax in panels[len(transformed_var):]:
    ax.set_visible(False)  # hide grid cells that have no variable