## Getting the dataframe ready for Machine Learning

In [1]:
# Jupyter Notebook with the interactive Matplotlib 'notebook' backend
%matplotlib notebook
# Import required modules
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib as plt
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
import math

In [2]:
# Load the cleaned train and test sets from the comma-separated files.
train, test = (pd.read_csv(path, sep=',') for path in ("train.csv", "test.csv"))

In [3]:
# Two independent deep copies of the training data to work on:
# `z` and `df` are both derived from `train`, which is left untouched.
z, df = train.copy(), train.copy()

In [4]:
# Consolidate 'job', 'education' and 'month' variables based on percentage of positive and negative responses.
# Each original class is mapped to a coarser level code; classes that are not listed are left unchanged.
job_levels = {
    'blue-collar': 'j1l4', 'services': 'j1l4', 'entrepreneur': 'j1l4',
    'housemaid': 'j1l3', 'self-employed': 'j1l3', 'technician': 'j1l3', 'management': 'j1l3',
    'unknown': 'j1l2', 'admin.': 'j1l2', 'unemployed': 'j1l2',
    'retired': 'j1l1', 'student': 'j1l1',
}
education_levels = {
    'basic.9y': 'e1l4', 'basic.6y': 'e1l4',
    'basic.4y': 'e1l3', 'high.school': 'e1l3', 'professional.course': 'e1l3',
    'university.degree': 'e1l2', 'unknown': 'e1l2',
    'illiterate': 'e1l1',
}
month_levels = {
    'may': 'm1l3', 'jul': 'm1l3', 'nov': 'm1l3', 'aug': 'm1l3', 'jun': 'm1l3',
    'apr': 'm1l2',
    'oct': 'm1l1', 'sep': 'm1l1', 'dec': 'm1l1', 'mar': 'm1l1',
}

# Assign the result back instead of calling replace(..., inplace=True) on the
# column selection: the in-place form mutates a temporary object and is not
# guaranteed to update `z` (it is a no-op under pandas Copy-on-Write).
for column, levels in (('job', job_levels),
                       ('education', education_levels),
                       ('month', month_levels)):
    z[column] = z[column].replace(levels)

### 'Age' Variable
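Binning the low and high outliers into a single bin each, and grouping the ages in between into narrower bins. As implemented in `final_test` below, the upper edges of the bins are 24, 31, 35, 41, 49, 60 and 100: ages up to 24 share the lowest bin and ages above 60 share the highest.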

In [11]:
# Age bins (upper edge inclusive): (0, 24], (24, 31], (31, 35], (35, 41], (41, 49], (49, 60], (60, 100]
def final_test(a):
    """Return the age-category label ("Cat 1" .. "Cat 7") for the age `a`.

    An age that is not greater than 0, or that is greater than 100, matches
    no bin and yields None.
    """
    labelled_edges = (
        (24, "Cat 1"),
        (31, "Cat 2"),
        (35, "Cat 3"),
        (41, "Cat 4"),
        (49, "Cat 5"),
        (60, "Cat 6"),
        (100, "Cat 7"),
    )
    if not a > 0:
        return None
    # The edges are ascending, so the first edge that is >= a identifies the bin.
    for upper_edge, label in labelled_edges:
        if a <= upper_edge:
            return label
    return None

# Add the binned 'age_cat' column to both working frames.
# final_test only needs the age value, so map it over the 'age' column directly
# instead of building a full row object for every record with apply(axis=1).
for frame in (z, df):
    frame['age_cat'] = frame['age'].map(final_test)
# df_duration_bounds['age_cat'] = df_duration_bounds.apply(lambda row: final_test(row['age']), axis=1)
# df_employees_bounds['age_cat'] = df_duration_bounds.apply(lambda row: final_test(row['age']), axis=1)

### 'Marital' variable

All classes in this variable are similarly distributed.

<u>Note:</u>  Come back to this step when using Machine Learning to assess variable (feature) importance

In [12]:
# Row-normalised response rates per marital class: the share of 'no' is similar
# across all classes (roughly 84%-90%).
pd.crosstab(index=z['marital'], columns=z['y'], normalize='index')

y,no,yes
marital,Unnamed: 1_level_1,Unnamed: 2_level_1
divorced,0.893209,0.106791
married,0.897718,0.102282
single,0.859706,0.140294
unknown,0.83871,0.16129


### 'Day' variable 

In [14]:
# Derive 'day_cat' from 'day', grouping the days into 'weekend', 'weekday_1' and 'weekday_2'.
# The original mapping listed 'sum', which looks like a typo for 'sun'.
day_groups = {
    'sun': 'weekend', 'sat': 'weekend',
    'mon': 'weekday_1', 'tue': 'weekday_1', 'wed': 'weekday_1',
    'thu': 'weekday_2', 'fri': 'weekday_2',
}
for dataframe in (z, df):
    # Build the new column in one assignment; calling replace(..., inplace=True)
    # on a column selection is not guaranteed to write back to the frame.
    dataframe['day_cat'] = dataframe['day'].replace(day_groups)

### 'Duration' Variable
<u>Important note:</u> this attribute highly affects the output target (e.g., if duration=0 then y='no'). 

Yet, the duration is not known before a call is performed. 
Also, after the end of the call y is obviously known. 
Thus, this input should only be included for benchmark purposes and should be discarded if the intention is to have a realistic predictive model.

<u> Approach</u>: Create three dataframes, one with 'duration' variable, one without, and one with the variable and upper, lower bounds applied

In [15]:
# Two independent deep copies of `z`: one intended to keep the 'duration'
# column and one intended to go without it.
df_duration_yes, df_duration_no = z.copy(), z.copy()

# # Duplicating and applying 'lower' and 'upper' bounds
# df_duration_bounds = z.copy(deep = True)
# df_employees_bounds = z.copy(deep = True)

 <u>Notes</u>: del df['column_name']
 
 The advantage of drop over del is that drop allows you to drop multiple columns at once, 
 perform the operation in place or not, and also delete records along any axis (especially useful for a 3-D matrix).

In [16]:
# # .......................................DROPPPING DURATION COLUMN ......................(IMP)

# # deleting duration column 
# # We are not dropping the column just yet. We will drop the column after the normalization and standardization
# df_duration_no.drop('duration',axis=1, inplace=True) #axis = 1 deletes column; axis = 0 delets rows

## OUTLIERS

Replace valid outliers with logarithmic transformation

Replace invalid outliers (human-error) with 90th percentile or upper bounds OR exclude the record from the dataframe. 

IQR - https://www.youtube.com/watch?v=dNHGVLXBTgI

(a) Applying upper and lower bounds to the 'duration' and 'employees' variables (each value is rounded to the nearest multiple of 10)

In [17]:
# Round 'duration' and 'employees' to the nearest multiple of 10
# (a remainder below 5 rounds down, 5 and above rounds up).
def _round_to_nearest_ten(value):
    """Return `value` rounded to the nearest multiple of 10 as an int."""
    rounder = math.floor if value % 10 < 5 else math.ceil
    return int(rounder(value / 10.0)) * 10

for column in ('duration', 'employees'):
    z[column] = z[column].apply(_round_to_nearest_ten)

(b) Capping the lower and upper outliers at the 10th and 90th percentiles respectively (the active cell further below uses the 5th and 95th percentiles instead)

In [98]:
# # Calculating the 90th percentile (uq) and 10th percentile (lq)
# duration_uq = int(z['duration'].quantile(0.90))
# duration_lq = int(z['duration'].quantile(0.10))

# dcontacts_uq = int(z['dcontacts'].quantile(0.90))
# dcontacts_lq = int(z['dcontacts'].quantile(0.10))

# pdays_uq = int(z['pdays'].quantile(0.90))
# pdays_lq = int(z['pdays'].quantile(0.10))

# pcontacts_uq = int(z['pcontacts'].quantile(0.90))
# pcontacts_lq = int(z['pcontacts'].quantile(0.10))

# evr_uq = int(z['evr'].quantile(0.90))
# evr_lq = int(z['evr'].quantile(0.10))

# cpi_uq = int(z['cpi'].quantile(0.90))
# cpi_lq = int(z['cpi'].quantile(0.10))

# cci_uq = int(z['cci'].quantile(0.90))
# cci_lq = int(z['cci'].quantile(0.10))

# euribor_uq = int(z['euribor'].quantile(0.90))
# euribor_lq = int(z['euribor'].quantile(0.10))

# employees_uq = int(z['employees'].quantile(0.90))
# employees_lq = int(z['employees'].quantile(0.10))

In [99]:
# # Replacing the outliers of respective variable
# def age_outliers(x):
#     if x['age'] < 23 and x['age'] > 0: return 23
#     elif x['age'] > 71 and x['age'] < 78: return 71 # median
#     elif x['age'] > 78 and x['age'] < 83: return 81
#     elif x['age'] > 83 and x['age'] < 100: return 86
#     else: return x['age']

# z['age_outliers'] = z.apply(age_outliers, axis=1)

# def duration_outliers(x):
#     if x['duration'] < duration_lq: return duration_lq
#     elif x['duration'] > duration_uq: return duration_uq
#     else: return x['duration']

# z['duration_outliers'] = z.apply(duration_outliers, axis=1)

# def dcontacts_outliers(x):
#     if x['dcontacts'] < dcontacts_lq: return dcontacts_lq
#     elif x['dcontacts'] > dcontacts_uq: return dcontacts_uq
#     else: return x['dcontacts']

# z['dcontacts_outliers'] = z.apply(dcontacts_outliers, axis=1)

# def pdays_outliers(x):
#     if x['pdays'] < pdays_lq: return pdays_lq
#     elif x['pdays'] > pdays_uq: return pdays_uq
#     else: return x['pdays']

# z['pdays_outliers'] = z.apply(pdays_outliers, axis=1)

# def pcontacts_outliers(x):
#     if x['pcontacts'] < pcontacts_lq: return pcontacts_lq
#     elif x['pcontacts'] > pcontacts_uq: return pcontacts_uq
#     else: return x['pcontacts']

# z['pcontacts_outliers'] = z.apply(pcontacts_outliers, axis=1)

# def evr_outliers(x):
#     if x['evr'] < evr_lq: return evr_lq
#     elif x['evr'] > evr_uq: return evr_uq
#     else: return x['evr']

# z['evr_outliers'] = z.apply(evr_outliers, axis=1)

# def cpi_outliers(x):
#     if x['cpi'] < cpi_lq: return cpi_lq
#     elif x['cpi'] > cpi_uq: return 94.465     # Returning median 
#     else: return x['cpi']

# z['cpi_outliers'] = z.apply(cpi_outliers, axis=1)

# def cci_outliers(x):
#     if x['cci'] < cci_lq: return cci_lq
#     elif x['cci'] > cci_uq: return cci_uq
#     else: return x['cci']

# z['cci_outliers'] = z.apply(cci_outliers, axis=1)

# def euribor_outliers(x):
#     if x['euribor'] < euribor_lq: return euribor_lq
#     elif x['euribor'] > euribor_uq: return euribor_uq # SHould I replace with mean-4.81; median-4.95 or max-5.04
#     else: return x['euribor']

# z['euribor_outliers'] = z.apply(euribor_outliers, axis=1)

# def employees_outliers(x):
#     if x['employees'] < employees_lq: return employees_lq
#     elif x['employees'] > employees_uq: return employees_uq # What should I replace it with?
#     # overall_mean - 5167, overall_median - 5191
#     # outlier_mean - 5228, outlier_median - 5228
#     # upper_bound (0.9 percentile) = 5228, lower_bound (0.1 percentile) - 4963
#     else: return x['employees']

# z['employees_outliers'] = z.apply(employees_outliers, axis=1)

In [18]:
def _clip_to_percentiles(series, lower_q=0.05, upper_q=0.95):
    """Cap `series` at its `lower_q` and `upper_q` quantiles.

    Both quantiles are truncated with int() before use.
    Returns a tuple (clipped_series, lower_bound, upper_bound).
    """
    # NOTE(review): int() drops the fractional part of the quantile; if a column
    # holds fractional values the caps will be coarser than the true
    # percentiles - TODO confirm this is intended.
    lower_bound = int(series.quantile(lower_q))
    upper_bound = int(series.quantile(upper_q))
    # Series.clip replaces clip_lower / clip_upper, which were removed in pandas 1.0.
    return series.clip(lower=lower_bound, upper=upper_bound), lower_bound, upper_bound


# For every variable: store the capped values in '<name>_outliers' and keep the
# integer caps in <name>_lq / <name>_uq.
z['duration_outliers'], duration_lq, duration_uq = _clip_to_percentiles(z['duration'])
z['dcontacts_outliers'], dcontacts_lq, dcontacts_uq = _clip_to_percentiles(z['dcontacts'])
z['pdays_outliers'], pdays_lq, pdays_uq = _clip_to_percentiles(z['pdays'])
z['pcontacts_outliers'], pcontacts_lq, pcontacts_uq = _clip_to_percentiles(z['pcontacts'])
z['evr_outliers'], evr_lq, evr_uq = _clip_to_percentiles(z['evr'])
# The original cell clipped 'cci' with the cpi bounds and stored it in
# 'cci_outliers' (then overwrote it), so 'cpi_outliers' was never created.
z['cpi_outliers'], cpi_lq, cpi_uq = _clip_to_percentiles(z['cpi'])
z['cci_outliers'], cci_lq, cci_uq = _clip_to_percentiles(z['cci'])
z['euribor_outliers'], euribor_lq, euribor_uq = _clip_to_percentiles(z['euribor'])
z['employees_outliers'], employees_lq, employees_uq = _clip_to_percentiles(z['employees'])

(c) Apply logarithmic transformations to valid outliers (applied not only to the outliers but to every entry in the selected numeric columns). 


Creating a new dataframe to apply logarithm transformations.
From all the numerical columns, logarithmic transformations can be applied to only a few since others have '0' and negative values

In [206]:
# z.astype(bool).sum(axis=0)      # Count of zeros in a columns
# z[z<0].count()                  # Count of negative values in each column

In [19]:
# Columns that receive the logarithmic transformation.
num = [
    'age',
    'dcontacts',
    'cpi',
    'euribor',
    'employees',
    'duration_outliers',
    'dcontacts_outliers',
    'pdays_outliers',
    'euribor_outliers',
    'employees_outliers',
]

In [20]:
# Copy of `z` in which every column listed in `num` is replaced by its natural
# logarithm; `z` itself is not modified.
z_log = z.assign(**{column: np.log(z[column]) for column in num})

## <b>Data Rescaling</b>

Preprocessed data may contain attributes with a mixtures of scales for various quantities such as dollars, kilograms 
and sales volume.

Many machine learning methods expect or are more effective if the data attributes have the same scale. 
Two popular data scaling methods are normalization and standardization.

Data Normalization
Normalization refers to rescaling real valued numeric attributes into the range 0 and 1.

It is useful to scale the input attributes for a model that relies on the magnitude of values, 
such as distance measures used in k-nearest neighbors and in the preparation of coefficients in regression.

Data Standardization
Standardization refers to shifting the distribution of each attribute to have a mean of zero and a 
standard deviation of one (unit variance).

It is useful to standardize attributes for a model that relies on the distribution of attributes such as Gaussian processes.

Which Method To Use
It is hard to know whether rescaling your data will improve the performance of your algorithms before you apply them. 
It often can, but not always.

A good tip is to create rescaled copies of your dataset and race them against each other using your test harness 
and a handful of algorithms you want to spot check. This can quickly highlight the benefits (or lack thereof) of 
rescaling your data with given models, and which rescaling method may be worthy of further investigation.

### Normalizing and Standardizing the data

NORMALIZATION
The process of converting all input values into a common scale usually between 0 and 1. 
It is not always a good idea to normalize the data since we might lose the minimum and maximum values.
But most often it is a good idea.
Advantage: ML algorithms such as Linear Regression and SVM perform faster on normalized data.

STANDARDIZATION
Standardization refers to shifting the distribution of each attribute to have a mean of zero and a 
standard deviation of one (unit variance).

In [21]:
# Independent deep copies of `z` (with all changes made so far) to hold the
# normalized and the standardized columns.
z_normalized, z_standardized = z.copy(), z.copy()

# Refresh the with/without-'duration' frames from the current state of `z`.
df_duration_yes, df_duration_no = z.copy(), z.copy()

# Normalized / standardized variants of each of those two frames.
df_duration_yes_normalized, df_duration_yes_standardized = (
    df_duration_yes.copy(), df_duration_yes.copy())
df_duration_no_normalized, df_duration_no_standardized = (
    df_duration_no.copy(), df_duration_no.copy())

In [22]:
# Numeric columns to rescale: the raw variables followed by their capped
# '<name>_outliers' counterparts.
numerical = [
    'age', 'duration', 'dcontacts', 'pdays', 'pcontacts',
    'evr', 'cpi', 'cci', 'euribor', 'employees',
    'duration_outliers', 'dcontacts_outliers', 'pdays_outliers', 'pcontacts_outliers',
    'evr_outliers', 'cci_outliers', 'euribor_outliers', 'employees_outliers',
]

In [24]:
def _append_scaled_columns(frames, columns, make_scaler, suffix):
    """Add a rescaled copy of each column in `columns` to every frame in `frames`.

    A fresh scaler from `make_scaler()` is fitted per frame and per column, and
    the result is stored in a new column named `<column><suffix>`.
    """
    for frame in frames:
        for name in columns:
            values = frame[[name]].values.astype(float)
            scaled = make_scaler().fit_transform(values)
            # Assign the flattened array rather than a new DataFrame: a DataFrame
            # would be aligned on its own 0..n-1 index and produce NaN for any
            # frame whose index is not the default one.
            frame[name + suffix] = scaled.ravel()


# Min-max normalization -> '<column>_normalized'
_append_scaled_columns(
    (z_normalized, df_duration_yes_normalized, df_duration_no_normalized),
    numerical, preprocessing.MinMaxScaler, '_normalized')

# Standardization -> '<column>_standardized'
_append_scaled_columns(
    (z_standardized, df_duration_yes_standardized, df_duration_no_standardized),
    numerical, preprocessing.StandardScaler, '_standardized')

Delete 'duration' column

In [None]:
# df_duration_no_standardized.drop('duration',axis=1, inplace=True) #axis = 1 deletes column; axis = 0 delets rows
# df_duration_no_normalized.drop('duration',axis=1, inplace=True)
# df_duration_no.drop('duration',axis=1, inplace=True)

In [27]:
# Remove call duration from the "no duration" frames.
# Dropping only 'duration' would leave its derived columns ('duration_outliers',
# 'duration_normalized', ...) behind, so every column whose name starts with
# 'duration' is removed. df_duration_no is included so that all three
# "no duration" frames are consistent. Selecting by prefix also makes the cell
# safe to re-run (nothing left to drop -> no KeyError).
for frame in (df_duration_no, df_duration_no_normalized, df_duration_no_standardized):
    duration_columns = [name for name in frame.columns if str(name).startswith('duration')]
    frame.drop(columns=duration_columns, inplace=True)

## Dummy Variables

In [None]:
# for d in (z, df, z_normalized, z_standardized, df_duration_yes, df_duration_no, df_duration_yes_normalized, 
#                  df_duration_yes_standardized, df_duration_no_normalized, df_duration_no_standardized):
#     d = pd.get_dummies(d)

In [41]:
# Replace every frame with its dummy-encoded version (pd.get_dummies with
# default settings). The right-hand side is fully evaluated before any of the
# names are rebound.
(z, df,
 z_normalized, z_standardized,
 df_duration_yes, df_duration_no,
 df_duration_yes_normalized, df_duration_yes_standardized,
 df_duration_no_normalized, df_duration_no_standardized) = [
    pd.get_dummies(frame)
    for frame in (z, df,
                  z_normalized, z_standardized,
                  df_duration_yes, df_duration_no,
                  df_duration_yes_normalized, df_duration_yes_standardized,
                  df_duration_no_normalized, df_duration_no_standardized)
]

In [64]:
# Names of the object-dtype (categorical) columns of the test set, as an array.
test.select_dtypes(include='object').columns.values

array(['job', 'marital', 'education', 'default', 'housing', 'personal',
       'contact_type', 'month', 'day', 'poutcome', 'y'], dtype=object)

<u> Note </u>:
Using multiple conditions in Lambda functions: lambda x: x*10 if x<20 else (x**2 if x<4 else x+10)

<u> Warning </u> : More about 'SettingWithCopyWarning' warning.
https://www.dataquest.io/blog/settingwithcopywarning/