# Feature Scaling

In [1]:
# Some of this code is from 1nhee/space on github

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
import pandas as pd
import numpy as np

# Create a DataFrame with some data.
data_train = pd.DataFrame({
    'age': [20, 25, 30, 35, 40],
    'income': [50000, 60000, 70000, 80000, 90000],
    'height' : [1.6, 1.7, 1.8, 1.5, 1.55]
})
data_test = pd.DataFrame({
    'age': [18, 5, 20, 30, 17],
    'income': [9000,8000,7000,6000,5000],
    'height' : [1.2, 2.0, 1.9, 1.5, 1.6]
})


In [2]:
data_train

Unnamed: 0,age,income,height
0,20,50000,1.6
1,25,60000,1.7
2,30,70000,1.8
3,35,80000,1.5
4,40,90000,1.55


In [3]:
data_test

Unnamed: 0,age,income,height
0,18,9000,1.2
1,5,8000,2.0
2,20,7000,1.9
3,30,6000,1.5
4,17,5000,1.6


In [4]:

# Scale the data using StandardScaler.
scaler1 = StandardScaler()
scaler1.fit(data_train[['age', 'income']])
data_train[['age_std_scale', 'income_std_scale']] = scaler1.transform(data_train[['age', 'income']])
data_test[['age_std_scale', 'income_std_scale']] = scaler1.transform(data_test[['age', 'income']])

# Scale the data using MinMaxScaler.
scaler2 = MinMaxScaler()
scaler2.fit(data_train[['age', 'income']])
data_train[['age_minmax_scale', 'income_minmax_scale']] = scaler2.transform(data_train[['age', 'income']])
data_test[['age_minmax_scale', 'income_minmax_scale']] = scaler2.transform(data_test[['age', 'income']])


In [5]:
data_train

Unnamed: 0,age,income,height,age_std_scale,income_std_scale,age_minmax_scale,income_minmax_scale
0,20,50000,1.6,-1.414214,-1.414214,0.0,0.0
1,25,60000,1.7,-0.707107,-0.707107,0.25,0.25
2,30,70000,1.8,0.0,0.0,0.5,0.5
3,35,80000,1.5,0.707107,0.707107,0.75,0.75
4,40,90000,1.55,1.414214,1.414214,1.0,1.0


In [6]:
data_test

Unnamed: 0,age,income,height,age_std_scale,income_std_scale,age_minmax_scale,income_minmax_scale
0,18,9000,1.2,-1.697056,-4.313351,-0.1,-1.025
1,5,8000,2.0,-3.535534,-4.384062,-0.75,-1.05
2,20,7000,1.9,-1.414214,-4.454773,0.0,-1.075
3,30,6000,1.5,0.0,-4.525483,0.5,-1.1
4,17,5000,1.6,-1.838478,-4.596194,-0.15,-1.125


In [7]:
# We can peak into the mean, the stdev of the std scaler
# (from the data before the transformation)
print (scaler1.mean_)
print (scaler1.scale_)

[3.e+01 7.e+04]
[7.07106781e+00 1.41421356e+04]


In [8]:
# We can peak into the min, the max of the minmax scaler
# (from the training data before the transformation)
print (scaler2.data_min_)
print (scaler2.data_max_)

[2.e+01 5.e+04]
[4.e+01 9.e+04]


In [9]:
# Data scaling is really easy, we could have done this "manually"
min_ = data_train[["age", "income"]].min()
max_ = data_train[["age", "income"]].max()
print (min_)
print (max_)

age          20
income    50000
dtype: int64
age          40
income    90000
dtype: int64


In [10]:
data_train[["age_minmax_scale2", "income_minmax_scale2"]] = (data_train[["age", "income"]]  - min_)/(max_ - min_)
data_train

Unnamed: 0,age,income,height,age_std_scale,income_std_scale,age_minmax_scale,income_minmax_scale,age_minmax_scale2,income_minmax_scale2
0,20,50000,1.6,-1.414214,-1.414214,0.0,0.0,0.0,0.0
1,25,60000,1.7,-0.707107,-0.707107,0.25,0.25,0.25,0.25
2,30,70000,1.8,0.0,0.0,0.5,0.5,0.5,0.5
3,35,80000,1.5,0.707107,0.707107,0.75,0.75,0.75,0.75
4,40,90000,1.55,1.414214,1.414214,1.0,1.0,1.0,1.0


In [11]:
data_test[["age_minmax_scale2", "income_minmax_scale2"]] = (data_test[["age", "income"]]  - min_)/(max_ - min_)
data_test

Unnamed: 0,age,income,height,age_std_scale,income_std_scale,age_minmax_scale,income_minmax_scale,age_minmax_scale2,income_minmax_scale2
0,18,9000,1.2,-1.697056,-4.313351,-0.1,-1.025,-0.1,-1.025
1,5,8000,2.0,-3.535534,-4.384062,-0.75,-1.05,-0.75,-1.05
2,20,7000,1.9,-1.414214,-4.454773,0.0,-1.075,0.0,-1.075
3,30,6000,1.5,0.0,-4.525483,0.5,-1.1,0.5,-1.1
4,17,5000,1.6,-1.838478,-4.596194,-0.15,-1.125,-0.15,-1.125


# Outlier detection

In [12]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler

data = pd.DataFrame({'a': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16],
                   'b': [8,6,4,2, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32]})

data


Unnamed: 0,a,b
0,1,8
1,2,6
2,3,4
3,4,2
4,5,10
5,6,12
6,7,14
7,8,16
8,9,18
9,10,20


## Outlier detection using z-score

In [13]:

# z-score done using standard scaler to get z-scores
scaler = StandardScaler()
scaler.fit(data[['a', 'b']])
data[['a_scaled', 'b_scaled']] = scaler.transform(data[['a', 'b']])

print ("Data after z-score computation (standard scaler")
data

Data after z-score computation (standard scaler


Unnamed: 0,a,b,a_scaled,b_scaled
0,1,8,-1.626978,-0.976187
1,2,6,-1.410048,-1.193118
2,3,4,-1.193118,-1.410048
3,4,2,-0.976187,-1.626978
4,5,10,-0.759257,-0.759257
5,6,12,-0.542326,-0.542326
6,7,14,-0.325396,-0.325396
7,8,16,-0.108465,-0.108465
8,9,18,0.108465,0.108465
9,10,20,0.325396,0.325396


In [14]:
# Mark all those with zscore >1.6 or <-1.6 as outliers (missing data)
# Always safer to create a new column rather than working "in place"
data["a_outliers_removed"] = data["a"].where(data["a_scaled"].abs()<1.6)
data["b_outliers_removed"] = data["b"].where(data["b_scaled"].abs()<1.6)
data

Unnamed: 0,a,b,a_scaled,b_scaled,a_outliers_removed,b_outliers_removed
0,1,8,-1.626978,-0.976187,,8.0
1,2,6,-1.410048,-1.193118,2.0,6.0
2,3,4,-1.193118,-1.410048,3.0,4.0
3,4,2,-0.976187,-1.626978,4.0,
4,5,10,-0.759257,-0.759257,5.0,10.0
5,6,12,-0.542326,-0.542326,6.0,12.0
6,7,14,-0.325396,-0.325396,7.0,14.0
7,8,16,-0.108465,-0.108465,8.0,16.0
8,9,18,0.108465,0.108465,9.0,18.0
9,10,20,0.325396,0.325396,10.0,20.0


In [15]:
# Let's remove the rows with missing data
data_rows_removed = data.dropna(axis=0) # dropping rows (you can also remove columns with axis=1)
data_rows_removed

Unnamed: 0,a,b,a_scaled,b_scaled,a_outliers_removed,b_outliers_removed
1,2,6,-1.410048,-1.193118,2.0,6.0
2,3,4,-1.193118,-1.410048,3.0,4.0
4,5,10,-0.759257,-0.759257,5.0,10.0
5,6,12,-0.542326,-0.542326,6.0,12.0
6,7,14,-0.325396,-0.325396,7.0,14.0
7,8,16,-0.108465,-0.108465,8.0,16.0
8,9,18,0.108465,0.108465,9.0,18.0
9,10,20,0.325396,0.325396,10.0,20.0
10,11,22,0.542326,0.542326,11.0,22.0
11,12,24,0.759257,0.759257,12.0,24.0


## Outlier detection using IQR

In [16]:
# Let's create some fresh data first
data = pd.DataFrame({'a': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 500, 12, 13, 14, 15, 16],
                   'b': [8,6,4,2, 10, -100, 14, 16, 18, 20, 22, 24, 26, 28, 30, 100]})

# Make sure you understand what is printed
quantiles = data.quantile([0.25, 0.75])
quantiles

Unnamed: 0,a,b
0.25,4.75,7.5
0.75,13.25,24.5


In [17]:
IQR = quantiles.loc[0.75] - quantiles.loc[0.25]
IQR

a     8.5
b    17.0
dtype: float64

In [18]:
# Let's do IQR outlier removal with alpha = 1.5
# This time let's do it in-place
data["a"] = data["a"].where(
    (data["a"] <= quantiles["a"][0.75] + IQR["a"] * 1.5) &
    (data["a"] >= quantiles["a"][0.25] - IQR["a"] * 1.5))
data["a"] = data["b"].where(
    (data["b"] <= quantiles["b"][0.75] + IQR["b"] * 1.5) &
    (data["b"] >= quantiles["b"][0.25] - IQR["b"] * 1.5))

In [19]:
data

Unnamed: 0,a,b
0,8.0,8
1,6.0,6
2,4.0,4
3,2.0,2
4,10.0,10
5,,-100
6,14.0,14
7,16.0,16
8,18.0,18
9,20.0,20


In [20]:
data_rows_removed = data.dropna(axis=0)
data_rows_removed

Unnamed: 0,a,b
0,8.0,8
1,6.0,6
2,4.0,4
3,2.0,2
4,10.0,10
6,14.0,14
7,16.0,16
8,18.0,18
9,20.0,20
10,22.0,22


# Missing Value Handling

In [21]:
# Let's create some fresh data first
data = pd.DataFrame({
    'a': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, None, 12, 13, 14, 15, 16],
    'b': [8,6,4,2, 10, None, 14, 16, 18, 20, 22, 24, 26, 28, 30, None],
    'c' : [100, None, None, None, 5, None, None, None,100, None, None, None, 5, None, None, None]})



In [22]:
# Note that you can compute stats on columns with missing data
# The missing data will be simply ignored
print(data['a'].mean())
print(data['b'].std())

8.333333333333334
9.21060067721779


## Removing columns with rate of missing values threshold


In [23]:
# How many missing values in each column?
num_missing = data.isna().sum()
num_missing

a     1
b     2
c    12
dtype: int64

In [24]:
data_cols_removed = data.loc[:, num_missing<=6]
data_cols_removed

Unnamed: 0,a,b
0,1.0,8.0
1,2.0,6.0
2,3.0,4.0
3,4.0,2.0
4,5.0,10.0
5,6.0,
6,7.0,14.0
7,8.0,16.0
8,9.0,18.0
9,10.0,20.0


## Imputation with mean and median

In [25]:
data_cols_removed_mean_imp = data_cols_removed.fillna(data_cols_removed.mean())
data_cols_removed_mean_imp

Unnamed: 0,a,b
0,1.0,8.0
1,2.0,6.0
2,3.0,4.0
3,4.0,2.0
4,5.0,10.0
5,6.0,16.285714
6,7.0,14.0
7,8.0,16.0
8,9.0,18.0
9,10.0,20.0


In [26]:
data_cols_removed_median_imp = data_cols_removed.fillna(data_cols_removed.median())
data_cols_removed_median_imp

Unnamed: 0,a,b
0,1.0,8.0
1,2.0,6.0
2,3.0,4.0
3,4.0,2.0
4,5.0,10.0
5,6.0,17.0
6,7.0,14.0
7,8.0,16.0
8,9.0,18.0
9,10.0,20.0


# Feature Transformations

In [27]:
# Let's create some fresh data first
data = pd.DataFrame({
    'a': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 7, 12, 13, 14, 15, 16],
    'b': [8,6,4,2, 10, 19, 14, 16, 18, 20, 22, 24, 26, 28, 30, 21]})

In [28]:
# Logarithm, Box-Cox transformation with lambda-2
data["log_a"] = np.log(data["a"])
data["box_cox2_b"] = (np.power(data["b"], 2) - 1) / 2
data
# could also use box-cox from here:
# https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.boxcox.html


Unnamed: 0,a,b,log_a,box_cox2_b
0,1,8,0.0,31.5
1,2,6,0.693147,17.5
2,3,4,1.098612,7.5
3,4,2,1.386294,1.5
4,5,10,1.609438,49.5
5,6,19,1.791759,180.0
6,7,14,1.94591,97.5
7,8,16,2.079442,127.5
8,9,18,2.197225,161.5
9,10,20,2.302585,199.5


In [29]:
# Bucketization/discretization
data["a_bucketized"] = pd.cut(data["a"], bins=5) # number of bins, equally spaced
data

Unnamed: 0,a,b,log_a,box_cox2_b,a_bucketized
0,1,8,0.0,31.5,"(0.985, 4.0]"
1,2,6,0.693147,17.5,"(0.985, 4.0]"
2,3,4,1.098612,7.5,"(0.985, 4.0]"
3,4,2,1.386294,1.5,"(0.985, 4.0]"
4,5,10,1.609438,49.5,"(4.0, 7.0]"
5,6,19,1.791759,180.0,"(4.0, 7.0]"
6,7,14,1.94591,97.5,"(4.0, 7.0]"
7,8,16,2.079442,127.5,"(7.0, 10.0]"
8,9,18,2.197225,161.5,"(7.0, 10.0]"
9,10,20,2.302585,199.5,"(7.0, 10.0]"


In [30]:
# Bucketization/discretization
# Specifying the bin endpoints
data["a_bucketized2"] = pd.cut(data["a"], bins=[-100,3.5, 9.5, 100])
data

Unnamed: 0,a,b,log_a,box_cox2_b,a_bucketized,a_bucketized2
0,1,8,0.0,31.5,"(0.985, 4.0]","(-100.0, 3.5]"
1,2,6,0.693147,17.5,"(0.985, 4.0]","(-100.0, 3.5]"
2,3,4,1.098612,7.5,"(0.985, 4.0]","(-100.0, 3.5]"
3,4,2,1.386294,1.5,"(0.985, 4.0]","(3.5, 9.5]"
4,5,10,1.609438,49.5,"(4.0, 7.0]","(3.5, 9.5]"
5,6,19,1.791759,180.0,"(4.0, 7.0]","(3.5, 9.5]"
6,7,14,1.94591,97.5,"(4.0, 7.0]","(3.5, 9.5]"
7,8,16,2.079442,127.5,"(7.0, 10.0]","(3.5, 9.5]"
8,9,18,2.197225,161.5,"(7.0, 10.0]","(3.5, 9.5]"
9,10,20,2.302585,199.5,"(7.0, 10.0]","(9.5, 100.0]"


In [31]:
# interaction variables
data["ab"] = data["a"] * data["b"]
data

Unnamed: 0,a,b,log_a,box_cox2_b,a_bucketized,a_bucketized2,ab
0,1,8,0.0,31.5,"(0.985, 4.0]","(-100.0, 3.5]",8
1,2,6,0.693147,17.5,"(0.985, 4.0]","(-100.0, 3.5]",12
2,3,4,1.098612,7.5,"(0.985, 4.0]","(-100.0, 3.5]",12
3,4,2,1.386294,1.5,"(0.985, 4.0]","(3.5, 9.5]",8
4,5,10,1.609438,49.5,"(4.0, 7.0]","(3.5, 9.5]",50
5,6,19,1.791759,180.0,"(4.0, 7.0]","(3.5, 9.5]",114
6,7,14,1.94591,97.5,"(4.0, 7.0]","(3.5, 9.5]",98
7,8,16,2.079442,127.5,"(7.0, 10.0]","(3.5, 9.5]",128
8,9,18,2.197225,161.5,"(7.0, 10.0]","(3.5, 9.5]",162
9,10,20,2.302585,199.5,"(7.0, 10.0]","(9.5, 100.0]",200


In [32]:
# notice that the bucketized columns are categorical
data.dtypes

a                   int64
b                   int64
log_a             float64
box_cox2_b        float64
a_bucketized     category
a_bucketized2    category
ab                  int64
dtype: object

# Dealing with categorical variables

In [33]:
data = pd.DataFrame({
    'boro': ['Brooklyn', 'Bronx', 'Bronx', 'Manhattan', 'Queens', 'Brooklyn'],
    'salary' : [1000, 2000, 3000, 4000, 5000, 6000],
    'satisfaction' : [4,3,3,5,1,2]})
data.dtypes

boro            object
salary           int64
satisfaction     int64
dtype: object

In [34]:
# First, we want to cast the variable to type category
data["boro"] = data["boro"].astype("category")
data["satisfaction"] = data["satisfaction"].astype("category")
data.dtypes

boro            category
salary             int64
satisfaction    category
dtype: object

In [35]:
# Categories inferred from data.
data["boro"].cat.categories

Index(['Bronx', 'Brooklyn', 'Manhattan', 'Queens'], dtype='object')

In [36]:
# Notice that a boro named "Staten Island" is missing from the data.
# In cases where we want to determine the categories in advance
# (before we see the data), the right way to do this is as follows
boro_type = pd.CategoricalDtype(categories=['Manhattan', 'Bronx', 'Brooklyn', "Queens", "Staten Island"])
data["boro"] = data["boro"].astype(boro_type)

In [37]:
# ordinal encoding
# Note that the encoding is based on the order that we determined in the
# previous cell
data["boro_ordinal"] = data["boro"].cat.codes
data

Unnamed: 0,boro,salary,satisfaction,boro_ordinal
0,Brooklyn,1000,4,2
1,Bronx,2000,3,1
2,Bronx,3000,3,1
3,Manhattan,4000,5,0
4,Queens,5000,1,3
5,Brooklyn,6000,2,2


In [38]:
# One-hot encoding
data_dummies = pd.get_dummies(data, columns=['boro', 'satisfaction'])
data_dummies

Unnamed: 0,salary,boro_ordinal,boro_Manhattan,boro_Bronx,boro_Brooklyn,boro_Queens,boro_Staten Island,satisfaction_1,satisfaction_2,satisfaction_3,satisfaction_4,satisfaction_5
0,1000,2,False,False,True,False,False,False,False,False,True,False
1,2000,1,False,True,False,False,False,False,False,True,False,False
2,3000,1,False,True,False,False,False,False,False,True,False,False
3,4000,0,True,False,False,False,False,False,False,False,False,True
4,5000,3,False,False,False,True,False,True,False,False,False,False
5,6000,2,False,False,True,False,False,False,True,False,False,False


In [39]:
# Impact (Stats) encoding
stats = data['salary'].groupby(data['boro']).agg(['mean'])
stats

Unnamed: 0_level_0,mean
boro,Unnamed: 1_level_1
Manhattan,4000.0
Bronx,2500.0
Brooklyn,3500.0
Queens,5000.0
Staten Island,


In [40]:
mapper = {s : stats.loc[s,'mean'] for s in stats.index }
data["boro_impact"] = data["boro"].map(mapper)

In [41]:
data

Unnamed: 0,boro,salary,satisfaction,boro_ordinal,boro_impact
0,Brooklyn,1000,4,2,3500.0
1,Bronx,2000,3,1,2500.0
2,Bronx,3000,3,1,2500.0
3,Manhattan,4000,5,0,4000.0
4,Queens,5000,1,3,5000.0
5,Brooklyn,6000,2,2,3500.0


# Feature Selection (Filter Method)

In [42]:
# Let's create synthetic data with label 0,1,2,...99
# x1 the same as the label plus some small noise
# x2 is minus the label, plus some small noise
# x3 is simply random
data = pd.DataFrame({
    'x1' : np.arange(100) + np.random.rand(100) * 10,
    'x2' : -np.arange(100) + np.random.rand(100) * 20,
    'x3' : np.random.rand(100),
    'y' : np.arange(100)
})
data.head()

Unnamed: 0,x1,x2,x3,y
0,9.067614,15.458216,0.106081,0
1,6.579719,5.561896,0.602963,1
2,5.143018,-1.890308,0.663178,2
3,4.264153,0.340428,0.298402,3
4,5.310807,1.954611,0.210284,4


## Let's compute different types of correlation between all features, and y

In [43]:
pearson = data.corr(method='pearson')
pearson

Unnamed: 0,x1,x2,x3,y
x1,1.0,-0.975824,0.024095,0.994594
x2,-0.975824,1.0,-0.023781,-0.980053
x3,0.024095,-0.023781,1.0,0.038034
y,0.994594,-0.980053,0.038034,1.0


In [44]:
spearman = data.corr(method="spearman")
spearman

Unnamed: 0,x1,x2,x3,y
x1,1.0,-0.97751,0.025659,0.994995
x2,-0.97751,1.0,-0.02267,-0.981434
x3,0.025659,-0.02267,1.0,0.037768
y,0.994995,-0.981434,0.037768,1.0


In [45]:
# Now we can rank the features x1,x2,x3 in decreasing abs-value-of-corr-with-y
# order.
# (You can do the same thing using Pearson)
sorted = spearman['y'].abs().sort_values(ascending=False)
sorted

y     1.000000
x1    0.994995
x2    0.981434
x3    0.037768
Name: y, dtype: float64

In [46]:
# Select the first two.  Don't forget to skip the first in the sorted list, which
# is y itself.
chosen_features = sorted.iloc[1:3].index
chosen_features

Index(['x1', 'x2'], dtype='object')

# Handling Imbalanced Data

In [47]:
data = pd.DataFrame({
    'x' : np.random.rand(1000),
    'y' : (np.random.rand(1000) > 0.9) # 90% False, 10% True
})
data['y'].value_counts()

y
False    885
True     115
Name: count, dtype: int64

In [48]:
# First let's create arrays of false and true indices
false_indices = data.index[data['y']==False]
true_indices = data.index[data['y']==True]

In [49]:
# Subsample the abundant class
# In next line, replace=False means that we don't allow repeted samples
subsampled_false_indices = np.random.choice(false_indices, size = 100,replace=False)
subsampled_data = pd.concat([data.loc[subsampled_false_indices], data.loc[true_indices]])
subsampled_data['y'].value_counts()

y
True     115
False    100
Name: count, dtype: int64

In [50]:
#Oversample the rare class
# In the next line, replace has to be True because there is no way to
# create more positive samples without repetition.
oversampled_true_indices = np.random.choice(true_indices, size = 1000,replace=True)
oversampled_data = pd.concat([data.loc[false_indices], data.loc[oversampled_true_indices]])
oversampled_data['y'].value_counts()

y
True     1000
False     885
Name: count, dtype: int64

# Splitting to Train/Val/Test

In [51]:
from sklearn.model_selection import train_test_split
data = pd.DataFrame({
    'x1' : np.random.rand(1000),
    'x2' : np.random.rand(1000),
    'y' : (np.random.rand(1000) > 0.5)
})
train, test = train_test_split(data, test_size=0.1, random_state=111)

In [52]:
data.shape

(1000, 3)

In [53]:
train.shape

(900, 3)

In [54]:
test.shape

(100, 3)

In [55]:
# Let's also get a validation set
train, val = train_test_split(train, test_size=0.1, random_state=222)

In [56]:
train.shape

(810, 3)

In [57]:
val.shape

(90, 3)

In [58]:
# We can also do this by shuffling and the slicing
data_shuffled = data.sample(frac=1) # random shuffle in Pandas
train = data_shuffled.iloc[:700]
val = data_shuffled.iloc[700:850]
test = data_shuffled.iloc[850:]

In [59]:
train.shape

(700, 3)

In [60]:
val.shape

(150, 3)

In [61]:
test.shape

(150, 3)