# Feature Engineering

In [None]:
### Creating a bunch of datetime columns

In [2]:
import pandas as pd
import numpy as np
from sklearn import preprocessing

In [2]:
# Build a demo Series of ten timestamps, 10 hours apart, between
# 2020-01-01 and 2020-01-05 (index and values are the same timestamps).
# The step is passed as a Timedelta instead of the '10H' string alias:
# the uppercase 'H' alias is deprecated in recent pandas releases, whereas
# a Timedelta does not depend on how the alias is spelled.
s = pd.date_range('2020-01-01', '2020-01-05', freq=pd.Timedelta(hours=10)).to_series()
s

2020-01-01 00:00:00   2020-01-01 00:00:00
2020-01-01 10:00:00   2020-01-01 10:00:00
2020-01-01 20:00:00   2020-01-01 20:00:00
2020-01-02 06:00:00   2020-01-02 06:00:00
2020-01-02 16:00:00   2020-01-02 16:00:00
2020-01-03 02:00:00   2020-01-03 02:00:00
2020-01-03 12:00:00   2020-01-03 12:00:00
2020-01-03 22:00:00   2020-01-03 22:00:00
2020-01-04 08:00:00   2020-01-04 08:00:00
2020-01-04 18:00:00   2020-01-04 18:00:00
Freq: 10H, dtype: datetime64[ns]

In [3]:
# Derive calendar features from the datetime Series `s`.
# Each entry is a plain NumPy array aligned row-for-row with `s`.
features = {
    "dayofweek": s.dt.dayofweek.values,
    "dayofyear": s.dt.dayofyear.values,
    "hour": s.dt.hour.values,
    "is_leap_year": s.dt.is_leap_year.values,
    "quarter": s.dt.quarter.values,
    # `Series.dt.weekofyear` was deprecated in pandas 1.1 and removed in 2.0.
    # `isocalendar().week` is its replacement; it yields a nullable UInt32
    # column, so cast back to int64 to keep an ordinary NumPy array like the
    # other features.
    "weekofyear": s.dt.isocalendar().week.astype("int64").values
}

In [4]:
# Tabulate the feature arrays: one column per dictionary key.
pd.DataFrame(features)

Unnamed: 0,dayofweek,dayofyear,hour,is_leap_year,quarter,weekofyear
0,2,1,0,True,1,1
1,2,1,10,True,1,1
2,2,1,20,True,1,1
3,3,2,6,True,1,1
4,3,2,16,True,1,1
5,4,3,2,True,1,1
6,4,3,12,True,1,1
7,4,3,22,True,1,1
8,5,4,8,True,1,1
9,5,4,18,True,1,1


In [5]:
# Create Polynomial Features

In [7]:
# Synthetic data: 100 rows of two uniform [0, 1) features named f_1 and f_2.
# A dedicated, seeded generator makes every "Restart & Run All" produce the
# same numbers without touching NumPy's global random state.
rng = np.random.RandomState(42)
df = pd.DataFrame(
    rng.rand(100, 2),
    columns=[f"f_{i}" for i in range(1, 3)]
)
df

Unnamed: 0,f_1,f_2
0,0.399324,0.292520
1,0.060391,0.546471
2,0.979883,0.554564
3,0.272559,0.420075
4,0.196427,0.915723
...,...,...
95,0.522244,0.324493
96,0.465360,0.020053
97,0.308104,0.753399
98,0.302488,0.341839


In [None]:
# Create degree-two polynomial features using sklearn

In [10]:
# Degree-2 expansion of the input columns. interaction_only=False keeps the
# squared terms as well as the cross term; include_bias=False leaves out the
# constant column.
pf = preprocessing.PolynomialFeatures(degree=2, interaction_only=False, include_bias=False)

# Fit on df; the fitted transformer is shown as the cell output.
pf.fit(df)

PolynomialFeatures(degree=2, include_bias=False, interaction_only=False,
                   order='C')

In [11]:
# Create the polynomial features: apply the fitted transformer `pf` to df.
# The result is the expanded feature matrix (rows match df's rows).
poly_feats = pf.transform(df)

In [12]:
# Count the generated features: the feature matrix is 2-D, so its last axis
# holds one entry per feature. The count is used to name the columns later.
num_feats = poly_feats.shape[-1]
print(f"num feats {num_feats}")

num feats 5


In [14]:
# Wrap the feature matrix in a DataFrame with generic positional column
# names f_1 ... f_<num_feats>.
feature_names = [f"f_{i}" for i in range(1, num_feats + 1)]
df_transformed = pd.DataFrame(data=poly_feats, columns=feature_names)

In [15]:
# Show the expanded feature table (a bare last expression gets rich rendering).
df_transformed

Unnamed: 0,f_1,f_2,f_3,f_4,f_5
0,0.399324,0.292520,0.159460,0.116810,0.085568
1,0.060391,0.546471,0.003647,0.033002,0.298630
2,0.979883,0.554564,0.960171,0.543408,0.307542
3,0.272559,0.420075,0.074288,0.114495,0.176463
4,0.196427,0.915723,0.038584,0.179873,0.838549
...,...,...,...,...,...
95,0.522244,0.324493,0.272739,0.169464,0.105295
96,0.465360,0.020053,0.216560,0.009332,0.000402
97,0.308104,0.753399,0.094928,0.232125,0.567609
98,0.302488,0.341839,0.091499,0.103402,0.116854


In [16]:
# Binning


In [17]:
# Bucket f_1 into 10 bins; labels=False stores the integer bin index
# (0-9) rather than an interval label. Adds the column to df in place.
df["f_bin_10"] = pd.cut(x=df.f_1, bins=10, labels=False)
df

Unnamed: 0,f_1,f_2,f_bin_10
0,0.399324,0.292520,3
1,0.060391,0.546471,0
2,0.979883,0.554564,9
3,0.272559,0.420075,2
4,0.196427,0.915723,1
...,...,...,...
95,0.522244,0.324493,5
96,0.465360,0.020053,4
97,0.308104,0.753399,2
98,0.302488,0.341839,2


In [18]:
# Same binning of f_1 at a finer resolution: 100 bins, stored as the integer
# bin index (0-99). Adds the column to df in place.
df["f_bin_100"] = pd.cut(x=df.f_1, bins=100, labels=False)
df

Unnamed: 0,f_1,f_2,f_bin_10,f_bin_100
0,0.399324,0.292520,3,37
1,0.060391,0.546471,0,2
2,0.979883,0.554564,9,97
3,0.272559,0.420075,2,24
4,0.196427,0.915723,1,16
...,...,...,...,...
95,0.522244,0.324493,5,50
96,0.465360,0.020053,4,44
97,0.308104,0.753399,2,28
98,0.302488,0.341839,2,27


In [5]:
## How to Identify High-Variance Features and Reduce Their Variance

# Toy data set: f_1 and f_2 live in [0, 1], while f_3 is in the thousands.
d = {
    "f_1": [0.143, 0.421, 0.224, 0.859, 0.082],
    "f_2": [0.286, 0.967, 0.075, 0.652, 0.662],
    "f_3": [8048, 7433, 2289, 1153, 2201],
}
new_df = pd.DataFrame(d)

# Report the sample variance of every column, labelled f1, f2, f3 in order.
for position, column in enumerate(new_df.columns, start=1):
    print(f"f{position} variance {new_df[column].var()}")

print("We can see variance of f3 is way higher than other features")

f1 variance 0.09864569999999999
f2 variance 0.12245629999999999
f3 variance 10547112.2
We can see variance of f3 is way higher than other features


In [7]:
# To reduce variance, apply a log transformation.
#
# Log transformation
#
# The log transformation is, arguably, the most popular among the different
# types of transformations used to transform skewed data to approximately
# conform to normality.
#
# If the original data follows a log-normal distribution or approximately so,
# then the log-transformed data follows a normal or near normal distribution.

# np.log1p(x) computes log(1 + x). Applying the NumPy ufunc to the whole
# column is vectorized, unlike calling a Python lambda once per row.
np.log1p(new_df.f_3).var()

# By using this transformation, we have reduced the variance of this feature significantly, and it is pretty much on the same scale as the other features.

0.7132366590543565

Unnamed: 0,f_1,f_2,f_3
0,0.143,0.286,8048
1,0.421,0.967,7433
2,0.224,0.075,2289
3,0.859,0.652,1153
4,0.082,0.662,2201
