# Feature engineering and selection

In this notebook the following tasks will be performed:

1. Dataset loading
2. Dataset split
3. Continuous vs discrete variables
4. Discrete variables reordering
5. Continuous Target Transformation
7. Removing Correlated variables
8. Feature Scaling
9. Feature selection by Lasso
10. Saving processed dataset

## 1. Dataset loading

In [75]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# re-imported before each cell runs (handy while editing helper .py files).
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [76]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn import preprocessing
from sklearn.linear_model import Lasso
from sklearn.feature_selection import SelectFromModel

In [77]:
# Load the raw dataset; the path is relative to the notebook's working directory.
df = pd.read_csv('../data/completion_rate.csv')

In [78]:
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,form_id,views,submissions,feat_01,feat_02,feat_03,feat_04,feat_05,feat_06,feat_07,...,feat_38,feat_39,feat_40,feat_41,feat_42,feat_43,feat_44,feat_45,feat_46,feat_47
0,1113027,33,27,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,1.0,2.0
1,1115313,147,111,0.0,2.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1115907,528,136,0.0,1.0,0.0,0.0,1.0,0.0,6.0,...,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,30.0
3,1116299,55,21,0.0,2.0,0.0,0.0,0.0,1.0,2.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,7.0
4,1120373,62,54,0.0,0.0,0.0,0.0,1.0,0.0,4.0,...,0.0,0.0,0.0,0.0,1.0,1.0,1.0,11.0,7.0,21.0


Let's add the target column and remove unwanted columns from the dataset

In [79]:
# Target: share of views that ended in a submission.
df['completion_ratio'] = df['submissions'] / df['views']

# Keep every column except the identifier and the two raw counts the target
# was derived from (a missing column raises ValueError, as before).
selected_columns = df.columns.tolist()
for unwanted in ('form_id', 'views', 'submissions'):
    selected_columns.remove(unwanted)
df = df.loc[:, selected_columns]

In [80]:
target_name = 'completion_ratio'
# Every column except the target is a candidate feature.
# `list.remove` is used instead of `_ = features.pop(...)`: assigning to `_`
# shadows IPython's "last output" variable for the rest of the session.
features = list(df.columns)
features.remove(target_name)

Let's work on a random 40% sample of the dataset so that the analysis finishes in a reasonable time.

In [81]:
# Keep the full frame around and continue with a 40% random sample.
df_original = df.copy()
print("original dataset length",len(df_original))
# A fixed seed makes the sample (and everything derived from it) reproducible
# across kernel restarts; 7 matches the seed used for the split and the Lasso.
df = df_original.sample(frac=0.4, random_state=7)
print("sampled dataset length",len(df))

original dataset length 1031284
sampled dataset length 412514


## 2. Dataset split

In [82]:

# Hold out 10% of the sampled rows for testing (fixed seed for repeatability).
# The target column is left inside X_train / X_test on purpose: the later
# steps (discrete reordering, target transformation, feature selection)
# read 'completion_ratio' straight from these frames.
X_train, X_test, y_train, y_test = train_test_split(
    df, df['completion_ratio'], test_size=0.1, random_state=7)

# Detach both frames from `df` so later column assignments act on real copies.
X_train, X_test = X_train.copy(), X_test.copy()
(X_train.shape, X_test.shape)

((371262, 48), (41252, 48))

In [83]:
# Registry of every dataset variant, keyed by name. Each processing step below
# adds a suffixed train/test pair derived from the pairs already present.
datasets = dict(
    X_train=X_train.copy(),
    X_test=X_test.copy(),
)


## 3. Continuous vs discrete variables

Let's determine which are continuous and discrete variables in the dataset.

In [84]:
# A column counts as discrete when it has fewer than 20 distinct values;
# a column whose values are all distinct is only reported (id or continuous).
discrete_vars = []
for var in df.columns:
    n_unique = len(df[var].unique())
    if n_unique == len(df):
        print("found id or continuous var",var)
        continue
    if n_unique >= 20:
        continue
    # Fewer than 20 distinct values: discrete either way, only the message differs.
    if df[var].dtypes == 'int64':
        print("found possible discrete var",var)
    else:
        print("found possible discrete var with misleading type",var, n_unique)
    discrete_vars.append(var)

# Everything that was not flagged as discrete is treated as continuous.
discrete_lookup = set(discrete_vars)
continuous_vars = [col for col in df.columns if col not in discrete_lookup]
print("\ndiscrete vars", discrete_vars)
print("\ncontinuous vars", continuous_vars)

found possible discrete var with misleading type feat_01 2
found possible discrete var with misleading type feat_04 4
found possible discrete var with misleading type feat_08 13
found possible discrete var with misleading type feat_10 10
found possible discrete var with misleading type feat_13 17
found possible discrete var with misleading type feat_15 10
found possible discrete var with misleading type feat_20 2
found possible discrete var with misleading type feat_35 14
found possible discrete var with misleading type feat_44 19

discrete vars ['feat_01', 'feat_04', 'feat_08', 'feat_10', 'feat_13', 'feat_15', 'feat_20', 'feat_35', 'feat_44']

continuous vars ['feat_02', 'feat_03', 'feat_05', 'feat_06', 'feat_07', 'feat_09', 'feat_11', 'feat_12', 'feat_14', 'feat_16', 'feat_17', 'feat_18', 'feat_19', 'feat_21', 'feat_22', 'feat_23', 'feat_24', 'feat_25', 'feat_26', 'feat_27', 'feat_28', 'feat_29', 'feat_30', 'feat_31', 'feat_32', 'feat_33', 'feat_34', 'feat_36', 'feat_37', 'feat_38', 

## 4. Discrete variables reordering

In [85]:

def assign_encoding_id(variable, x, encoding_dict):
    """Return the ordinal code of label ``x`` for ``variable``.

    ``encoding_dict`` maps each variable name to a ``{label: code}`` dict.
    A label that is already known returns its stored code. An unknown label
    is given the next free code (highest existing code + 1, or 0 when the
    mapping is empty), stored in ``encoding_dict`` and returned, so the same
    unknown label always maps to the same code afterwards.

    Raises ``KeyError`` if ``variable`` has no mapping in ``encoding_dict``.
    """
    mapping = encoding_dict[variable]
    try:
        return mapping[x]
    except KeyError:
        # The new id must come from the existing *codes* (dict values), not
        # from the labels (dict keys): labels are raw category values and
        # adding 1 to one of them can collide with a code already in use.
        mapping[x] = max(mapping.values(), default=-1) + 1
        return mapping[x]
    
def discrete_reordering(X_train,X_test, discrete_vars ):
    """Re-encode discrete variables as ranks ordered by the target median.

    For every column named in ``discrete_vars`` the labels are ranked
    (0, 1, 2, ...) by the median ``completion_ratio`` observed for that label
    in ``X_train``; the column is then replaced by those ranks in both frames.
    Labels that occur only in ``X_test`` are handed to ``assign_encoding_id``.

    Both frames are modified in place and returned as ``(X_train, X_test)``.
    """
    encoding_dict = {}
    for variable in discrete_vars:
        # The ranking is learnt from the training split only. The split cannot
        # be stratified on every discrete variable at once, so a label may show
        # up only in the test split; instead of fitting on both splits (which
        # would leak test information), such labels receive a new id on top of
        # the existing ones.
        medians_by_label = X_train.groupby([variable])['completion_ratio'].median()
        labels_by_median = medians_by_label.sort_values().index
        train_codes = dict(zip(labels_by_median, range(len(labels_by_median))))
        # Keep the mapping so unseen test labels can extend it.
        encoding_dict[variable] = train_codes

        def encode_seen(label):
            return encoding_dict[variable][label]

        def encode_any(label):
            return assign_encoding_id(variable, label, encoding_dict)

        X_train[variable] = X_train[variable].apply(encode_seen)
        X_test[variable] = X_test[variable].apply(encode_any)
    return X_train, X_test

# Add a "_dr" (discrete reordering) variant of every train/test pair that is
# currently in `datasets`.
new_modification_name = 'dr'
# Snapshot of the keys: the dict grows while the loop runs.
old_keys = list(datasets.keys())
for k in old_keys:
    if 'train' not in k:
        continue
    k_old_train = k
    k_old_test = k.replace('train','test')
    # Build the suffix from new_modification_name, like the other steps do,
    # instead of repeating the literal '_dr'.
    k_train = k+'_'+new_modification_name
    k_test = k_train.replace('train','test')
    datasets[k_train], datasets[k_test] = discrete_reordering(
        datasets[k_old_train].copy(), datasets[k_old_test].copy(), discrete_vars)

datasets.keys()

dict_keys(['X_train', 'X_test', 'X_train_dr', 'X_test_dr'])

## 5. Target transformation

In [86]:
# Function that transforms the target in both the train and the test set.
# The fitted quantile transformer is pickled to a file named with the given suffix.



import pickle


def quantile_transformation(X_train, X_test, suffix, target_name):
    """Quantile-transform the target column and persist the fitted transformer.

    A ``QuantileTransformer`` with a normal output distribution is fitted on
    ``X_train[target_name]`` only and then applied to the same column of both
    frames. The fitted transformer is pickled to
    ``'../data/qtransformer' + suffix + '.pkl'``.

    Parameters
    ----------
    X_train, X_test : DataFrames containing ``target_name``; modified in place.
    suffix : str appended to the pickle file name.
    target_name : name of the column to transform.

    Returns
    -------
    (X_train, X_test) with the target column replaced by its transformed values.
    """
    quantile_transformer = preprocessing.QuantileTransformer(
        output_distribution='normal', random_state=0)

    # scikit-learn works on 2-D input; flatten the (n, 1) result back to 1-D
    # before storing it as a column.
    X_train[target_name] = quantile_transformer.fit_transform(
        X_train[target_name].to_numpy().reshape(-1,1)).ravel()
    X_test[target_name] = quantile_transformer.transform(
        X_test[target_name].to_numpy().reshape(-1,1)).ravel()

    # Context manager so the file is flushed and closed even if pickling fails.
    with open('../data/qtransformer'+suffix+'.pkl','wb') as pickle_file:
        pickle.dump(quantile_transformer, pickle_file)
    return X_train, X_test

# Add a "_qt" (quantile-transformed target) variant of every train/test pair
# currently in `datasets`.
new_modification_name = 'qt'
train_keys = [key for key in list(datasets.keys()) if 'train' in key]
for src_train in train_keys:
    src_test = src_train.replace('train', 'test')
    dst_train = '{}_{}'.format(src_train, new_modification_name)
    dst_test = dst_train.replace('train', 'test')
    # Everything after the "X_train" prefix, e.g. "_qt" or "_dr_qt"; it names
    # the pickled transformer that belongs to this variant.
    suffix = dst_train[dst_train.find('_') + len('_train'):]
    print(suffix)
    datasets[dst_train], datasets[dst_test] = quantile_transformation(
        datasets[src_train].copy(),
        datasets[src_test].copy(),
        suffix,
        'completion_ratio',
    )

datasets.keys()


_qt
_dr_qt


dict_keys(['X_train', 'X_test', 'X_train_dr', 'X_test_dr', 'X_train_qt', 'X_test_qt', 'X_train_dr_qt', 'X_test_dr_qt'])

## 7. Correlation

Let's check whether there are highly correlated features that could harm the performance of the machine learning model.

In [87]:
from string import ascii_letters
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Use seaborn's "white" style for plots.
sns.set(style="white")


# Short alias for the training split (it still contains the target column).
d = X_train

# Compute the correlation matrix
corr = d.corr()

In [88]:
print("Correlation between features")
# Recomputes the correlation matrix of `d`; `corr` is not read again in this cell.
corr = d.corr()
# construct pairs (corr, (feat, feat)) and then sort them
def get_redundant_pairs(df):
    """Return the (column, column) label pairs on or below the diagonal.

    These are the entries of a square correlation matrix that carry no extra
    information: the diagonal and the lower triangle, which mirrors the upper
    one. The result is a set of ``(cols[i], cols[j])`` tuples with ``j <= i``.
    """
    cols = df.columns
    n_cols = df.shape[1]
    return {
        (cols[row], cols[col])
        for row in range(n_cols)
        for col in range(row + 1)
    }

def get_top_abs_correlations(df, n=5):
    """Return the ``n`` column pairs with the largest absolute correlation.

    The absolute correlation matrix of ``df`` is unstacked into a Series
    indexed by (column, column), the diagonal and lower-triangle pairs are
    dropped, and the remaining values are sorted in descending order.
    """
    abs_corr_pairs = df.corr().abs().unstack()
    unique_pairs = abs_corr_pairs.drop(labels=get_redundant_pairs(df))
    ranked = unique_pairs.sort_values(ascending=False)
    return ranked[0:n]

# Show the 15 most strongly correlated column pairs of the training split.
print("Top Absolute Correlations")
print(get_top_abs_correlations(X_train, 15))


Correlation between features
Top Absolute Correlations
feat_06  feat_34    1.000000
feat_36  feat_38    0.999088
feat_18  feat_39    0.930603
feat_19  feat_38    0.905821
         feat_36    0.905540
feat_42  feat_43    0.887645
feat_19  feat_40    0.839575
feat_38  feat_40    0.825255
feat_36  feat_40    0.824710
feat_19  feat_33    0.819799
feat_07  feat_47    0.818301
         feat_30    0.784156
feat_45  feat_46    0.767886
feat_05  feat_43    0.767480
feat_33  feat_38    0.742714
dtype: float64


In [89]:
# Features to drop because of high correlation with another feature; the
# trailing comment lists the features each one is correlated with.
correlated_features = [
    'feat_06', # 34
    'feat_38', # 36, 19, 40, 33
    'feat_36', # 38, 19, 40
    'feat_19', # 38, 36, 40, 33
    #'feat_40', # 19, 38, 36 -> don't remove
    'feat_07', # 47, 30
    'feat_43', # 42, 05
    'feat_46', # 45
]
# NOTE(review): this list previously contained 'feat_43' twice, while the
# feat_18 / feat_39 pair (|r| ~ 0.93 in the table above) is not handled at
# all. One of the two entries may have been meant for that pair - confirm.
non_correlated_vars = [col for col in df.columns if col not in correlated_features]


In [90]:
def setup_correlated_vars(X_train, X_test,non_correlated_vars):
    """Restrict both frames to the columns listed in ``non_correlated_vars``.

    Returns ``(X_train, X_test)`` as independent copies holding only those
    columns, in the order given by ``non_correlated_vars``.
    """
    train_subset, test_subset = (
        frame.loc[:, non_correlated_vars].copy()
        for frame in (X_train, X_test)
    )
    return train_subset, test_subset

# Add a "_nc" (correlated features removed) variant of every train/test pair
# currently in `datasets`.
new_modification_name = 'nc'
for train_key in [key for key in list(datasets.keys()) if 'train' in key]:
    test_key = train_key.replace('train', 'test')
    new_train_key = train_key + '_' + new_modification_name
    new_test_key = new_train_key.replace('train', 'test')
    datasets[new_train_key], datasets[new_test_key] = setup_correlated_vars(
        datasets[train_key].copy(),
        datasets[test_key].copy(),
        non_correlated_vars,
    )

# Number of dataset variants registered so far.
len(datasets)

16

## 8. Feature Scaling

In [36]:
def scaler_minmax(X_train, X_test):
    """Min-max scale every column, fitting the scaler on the training data only.

    Returns ``(X_train_scaled, X_test_scaled)`` as DataFrames. When an input
    is a DataFrame its column names and index are carried over to the scaled
    frame; without them the target column could no longer be looked up by
    name in later steps. Any other array-like input yields a DataFrame with
    default integer labels.

    NOTE(review): every column passed in is scaled. The frames stored in
    ``datasets`` still contain 'completion_ratio', so the target is scaled
    too - confirm this is intended.
    """
    scaler = MinMaxScaler()
    train_scaled = scaler.fit_transform(X_train)
    test_scaled = scaler.transform(X_test)

    def _to_frame(values, like):
        # The scaler returns a bare array; restore the labels of the input frame.
        if isinstance(like, pd.DataFrame):
            return pd.DataFrame(values, columns=like.columns, index=like.index)
        return pd.DataFrame(values)

    return _to_frame(train_scaled, X_train), _to_frame(test_scaled, X_test)

# Add a "_mmxs" (min-max scaled) variant of every train/test pair currently
# in `datasets`.
new_modification_name = 'mmxs'
existing_train_keys = [name for name in list(datasets.keys()) if 'train' in name]
for source_train in existing_train_keys:
    source_test = source_train.replace('train', 'test')
    scaled_train = '_'.join([source_train, new_modification_name])
    scaled_test = scaled_train.replace('train', 'test')
    datasets[scaled_train], datasets[scaled_test] = scaler_minmax(
        datasets[source_train].copy(), datasets[source_test].copy())

print(datasets.keys(), len(datasets))

[[1.         0.         0.         ... 0.         0.00584795 0.535918  ]
 [0.         0.         0.         ... 0.         0.00584795 0.45228417]
 [0.         0.03921569 0.         ... 0.         0.         0.37488955]
 ...
 [1.         0.         0.03333333 ... 0.         0.         0.08025707]
 [1.         0.         0.         ... 0.         0.         0.95917646]
 [0.         0.05882353 0.         ... 0.06015038 0.02046784 0.55924415]]
[[0.         0.         0.         ... 0.         0.00584795 0.535918  ]
 [1.         0.         0.         ... 0.         0.00584795 0.45228417]
 [1.         0.03921569 0.         ... 0.         0.         0.37488955]
 ...
 [0.         0.         0.03333333 ... 0.         0.         0.08025707]
 [0.         0.         0.         ... 0.         0.         0.95917646]
 [1.         0.05882353 0.         ... 0.06015038 0.02046784 0.55924415]]
[[1.         0.         0.         ... 0.         0.00584795 0.50188897]
 [0.         0.         0.         ... 

## 9. Feature Selection

In [91]:
def feature_selection(X_train, X_test):
    """Keep only the features selected by a Lasso model fitted on the training data.

    Both frames must contain a 'completion_ratio' column, which is used as
    the target. A ``SelectFromModel`` wrapper around
    ``Lasso(alpha=0.00005, random_state=7)`` is fitted on the remaining
    training columns, and the columns it supports are kept in both frames.
    The number of selected features is printed.

    Returns ``(X_train, X_test)`` restricted to the selected features, with
    'completion_ratio' appended as the last column.
    """
    target = 'completion_ratio'
    y_train = X_train[target].to_numpy()
    y_test = X_test[target].to_numpy()

    # Fit the selector on the predictors only.
    predictors = X_train.drop(columns=target)
    selector = SelectFromModel(Lasso(alpha=0.00005, random_state=7))
    selector.fit(predictors, y_train)
    selected_feats = predictors.columns[selector.get_support()]

    print('selected features: {}'.format(len(selected_feats)))

    # Apply the same column selection to both splits, then put the target back.
    train_selected = predictors.loc[:, selected_feats].copy()
    test_selected = X_test.loc[:, selected_feats].copy()
    train_selected[target] = y_train
    test_selected[target] = y_test
    return train_selected, test_selected

# Add a "_fs" (Lasso feature selection) variant of every train/test pair
# currently in `datasets`.
new_modification_name = 'fs'
pending_train_keys = [name for name in list(datasets.keys()) if 'train' in name]
for base_train in pending_train_keys:
    base_test = base_train.replace('train', 'test')
    fs_train = base_train + '_' + new_modification_name
    fs_test = fs_train.replace('train', 'test')
    # Progress marker: name of the variant being processed.
    print(base_train)
    datasets[fs_train], datasets[fs_test] = feature_selection(
        datasets[base_train].copy(), datasets[base_test].copy())

print(datasets.keys(), len(datasets))

X_train
selected features: 41
X_train_dr
selected features: 41
X_train_qt


  positive)


selected features: 45
X_train_dr_qt


  positive)


selected features: 45
X_train_nc
selected features: 39
X_train_dr_nc
selected features: 39
X_train_qt_nc
selected features: 40
X_train_dr_qt_nc
selected features: 40
dict_keys(['X_train', 'X_test', 'X_train_dr', 'X_test_dr', 'X_train_qt', 'X_test_qt', 'X_train_dr_qt', 'X_test_dr_qt', 'X_train_nc', 'X_test_nc', 'X_train_dr_nc', 'X_test_dr_nc', 'X_train_qt_nc', 'X_test_qt_nc', 'X_train_dr_qt_nc', 'X_test_dr_qt_nc', 'X_train_fs', 'X_test_fs', 'X_train_dr_fs', 'X_test_dr_fs', 'X_train_qt_fs', 'X_test_qt_fs', 'X_train_dr_qt_fs', 'X_test_dr_qt_fs', 'X_train_nc_fs', 'X_test_nc_fs', 'X_train_dr_nc_fs', 'X_test_dr_nc_fs', 'X_train_qt_nc_fs', 'X_test_qt_nc_fs', 'X_train_dr_qt_nc_fs', 'X_test_dr_qt_nc_fs']) 32


## 10. Saving processed data

In [92]:
# Persist the list of dataset-variant names, then one CSV per variant.
models_list = list(datasets.keys())
# Context manager so the pickle file is flushed and closed deterministically.
with open('../data/models_list.pkl','wb') as models_file:
    pickle.dump(models_list, models_file)
for k,v in datasets.items():
    # index=False: the row index is not written to the CSV.
    v.to_csv('../data/'+k+'.csv', index=False)
    