# Feature engineering

The goal of this part is to find useful features to improve our predictions using the libraries featuretools or autofeat

In [1]:
# import numpy as np
import pandas as pd
from autofeat import AutoFeatClassifier
import joblib
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder

In [2]:
def numerical_impute(data, numerical_list):
    imputer_numerical = SimpleImputer(strategy='constant', fill_value=-1, missing_values=np.nan)
    data_numerical = data.loc[:, numerical_list]
    data_numerical_imputed = imputer_numerical.fit_transform(data_numerical)
    data_numerical_imputed = pd.DataFrame(data_numerical_imputed, columns=numerical_list)
    return data_numerical_imputed

def categorical_impute_encode1(data, categorical_list):
    # Imputing
    imputer_categorical = SimpleImputer(strategy='constant', fill_value='missing', missing_values=np.nan)
    data_categorical = data.loc[:, categorical_list]
    data_categorical = imputer_categorical.fit_transform(data_categorical)
    data_categorical = pd.DataFrame(data_categorical, columns=categorical_list)

    # Ordinal encoding
    ordinal_encoder = OrdinalEncoder()
    data_categorical_encoded = ordinal_encoder.fit_transform(data_categorical)
    data_categorical_encoded = pd.DataFrame(data_categorical_encoded, columns=categorical_list)
    return data_categorical_encoded

def categorical_impute_encode2(data, categorical_list_one_hot, categorical_list_ordinal):
    # Imputing
    imputer_categorical = SimpleImputer(strategy='constant', fill_value='missing', missing_values=np.nan)
    data_categorical = data.loc[:, categorical_list_one_hot + categorical_list_ordinal]
    data_categorical = imputer_categorical.fit_transform(data_categorical)
    data_categorical = pd.DataFrame(data_categorical, columns=categorical_list_one_hot + categorical_list_ordinal)

    # Ordinal encoding
    ordinal_encoder = OrdinalEncoder()
    data_categorical_ordinal = ordinal_encoder.fit_transform(data_categorical.loc[:, categorical_list_ordinal])
    data_categorical_ordinal = pd.DataFrame(data_categorical_ordinal, columns=categorical_list_ordinal)

    # One hot encoding
    data_categorical_one_hot = pd.get_dummies(data_categorical.loc[:, categorical_list_one_hot])

    data_categorical_encoded = pd.merge(data_categorical_ordinal, data_categorical_one_hot, left_index=True, right_index=True)

    return data_categorical_encoded

def data_clean(data, numerical_list, categorical_list_one_hot, categorical_list_ordinal):
    # Changer les listes de features et les fonctions correspondantes
    data = data.drop("respondent_id", axis=1)
    data_categorical_encoded = categorical_impute_encode2(data, categorical_list_one_hot, categorical_list_ordinal)
    data_numerical_imputed = numerical_impute(data, numerical_list)
    data_imputed_encoded = pd.merge(data_numerical_imputed, data_categorical_encoded, left_index=True, right_index=True)

    return data_imputed_encoded

Import des données
=========

In [3]:
FEATURES_TRAINING_PATH = "training_set_features.csv"
LABELS_TRAINING_PATH = "training_set_labels.csv"

In [4]:
features = pd.read_csv(FEATURES_TRAINING_PATH, sep=",", header=0)
labels = pd.read_csv(LABELS_TRAINING_PATH, sep=",", header=0)
data_original = pd.merge(features, labels, on="respondent_id")
respondent_id = data_original['respondent_id']

In [5]:
data = data_original.copy()

In [6]:
arg_list = list(data.keys())
features_list = arg_list.copy()
features_list.remove("h1n1_vaccine")
features_list.remove("seasonal_vaccine")
features_list.remove("respondent_id")

labels_list = ['h1n1_vaccine', 'seasonal_vaccine']

categorical_list = ['age_group', 'education', 'race', 'sex', 'income_poverty', 'marital_status', 'rent_or_own', 'employment_status', 'hhs_geo_region', 'census_msa','employment_industry', 'employment_occupation']

categorical_list_one_hot = ['race', 'sex', 'marital_status', 'rent_or_own', 'employment_status', 'hhs_geo_region', 'census_msa', 'employment_industry', 'employment_occupation']

categorical_list_ordinal = [k for k in categorical_list if k not in categorical_list_one_hot]

numerical_list = [k for k in features_list if k not in categorical_list]

In [8]:
data = data_clean(data, numerical_list, categorical_list_one_hot, categorical_list_ordinal)

In [28]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 26707 entries, 0 to 26706
Data columns (total 38 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   respondent_id                26707 non-null  int64  
 1   h1n1_concern                 26615 non-null  float64
 2   h1n1_knowledge               26591 non-null  float64
 3   behavioral_antiviral_meds    26636 non-null  float64
 4   behavioral_avoidance         26499 non-null  float64
 5   behavioral_face_mask         26688 non-null  float64
 6   behavioral_wash_hands        26665 non-null  float64
 7   behavioral_large_gatherings  26620 non-null  float64
 8   behavioral_outside_home      26625 non-null  float64
 9   behavioral_touch_face        26579 non-null  float64
 10  doctor_recc_h1n1             24547 non-null  float64
 11  doctor_recc_seasonal         24547 non-null  float64
 12  chronic_med_condition        25736 non-null  float64
 13  child_under_6_mo

In [17]:
data = pd.merge(respondent_id, data, right_index=True, left_index=True) # On remet respondent_id pour un test avec featuretools

Unnamed: 0,respondent_id,h1n1_concern,h1n1_knowledge,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,behavioral_outside_home,behavioral_touch_face,...,employment_occupation_qxajmpny,employment_occupation_rcertsgn,employment_occupation_tfqavkke,employment_occupation_ukymxvdu,employment_occupation_uqqtjvyb,employment_occupation_vlluhbov,employment_occupation_xgwztkwe,employment_occupation_xqwwgdyp,employment_occupation_xtkaffoo,employment_occupation_xzmlyyjv
0,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,...,0,0,0,0,0,0,0,0,0,0
1,1,3.0,2.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,...,0,0,0,0,0,0,1,0,0,0
2,2,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,1,0
3,3,1.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
4,4,2.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,...,0,0,0,0,0,0,0,0,0,0


Featuretools
=======

In [24]:
es = ft.EntitySet(id="dataset")

In [25]:
es = es.add_dataframe(dataframe_name="data", dataframe=data, index="respondent_id")



In [23]:
# es = es.normalize_dataframe(base_dataframe_name="data", new_dataframe_name="respondent", index="respondent_id")

ValueError: 'index' must be different from the index column of the base dataframe

In [26]:
default_agg_primitives =  ["sum", "std", "max", "skew", "min", "mean", "count", "percent_true", "num_unique", "mode"]

In [None]:
best_trans_primitives = ['add_numeric', 'multiply_numeric', 'less_than_scalar', 'divide_numeric', 'greater_than_scalar', 'rolling_std', 'cum_mean', 'cum_count']

In [28]:
feature_names = ft.dfs(
    entityset=es,
    target_dataframe_name="data",
    max_depth=2,
    trans_primitives=default_agg_primitives,
    features_only=True,
)
feature_names

  agg_primitives: ['count', 'max', 'mean', 'min', 'mode', 'num_unique', 'percent_true', 'skew', 'std', 'sum']
This may be caused by a using a value of max_depth that is too small, not setting interesting values, or it may indicate no compatible columns for the primitive were found in the data.


[<Feature: h1n1_concern>,
 <Feature: h1n1_knowledge>,
 <Feature: behavioral_antiviral_meds>,
 <Feature: behavioral_avoidance>,
 <Feature: behavioral_face_mask>,
 <Feature: behavioral_wash_hands>,
 <Feature: behavioral_large_gatherings>,
 <Feature: behavioral_outside_home>,
 <Feature: behavioral_touch_face>,
 <Feature: doctor_recc_h1n1>,
 <Feature: doctor_recc_seasonal>,
 <Feature: chronic_med_condition>,
 <Feature: child_under_6_months>,
 <Feature: health_worker>,
 <Feature: health_insurance>,
 <Feature: opinion_h1n1_vacc_effective>,
 <Feature: opinion_h1n1_risk>,
 <Feature: opinion_h1n1_sick_from_vacc>,
 <Feature: opinion_seas_vacc_effective>,
 <Feature: opinion_seas_risk>,
 <Feature: opinion_seas_sick_from_vacc>,
 <Feature: household_adults>,
 <Feature: household_children>,
 <Feature: age_group>,
 <Feature: education>,
 <Feature: income_poverty>,
 <Feature: race_Black>,
 <Feature: race_Hispanic>,
 <Feature: race_Other or Multiple>,
 <Feature: race_White>,
 <Feature: sex_Female>,
 <Fe

In [29]:
ft.primitives.list_primitives()[ft.primitives.list_primitives()['type'] == 'transform']

Unnamed: 0,name,type,dask_compatible,koalas_compatible,description,valid_inputs,return_type
22,less_than_equal_to_scalar,transform,True,True,Determines if values are less than or equal to...,"<ColumnSchema (Logical Type = Ordinal)>, <Colu...",
23,url_to_domain,transform,False,False,Determines the domain of a url.,<ColumnSchema (Logical Type = URL)>,
24,absolute,transform,True,True,Computes the absolute value of a number.,<ColumnSchema (Semantic Tags = ['numeric'])>,
25,num_characters,transform,True,True,Calculates the number of characters in a string.,<ColumnSchema (Logical Type = NaturalLanguage)>,
26,less_than_scalar,transform,True,True,Determines if values are less than a given sca...,"<ColumnSchema (Logical Type = Ordinal)>, <Colu...",
...,...,...,...,...,...,...,...
85,url_to_tld,transform,False,False,Determines the top level domain of a url.,<ColumnSchema (Logical Type = URL)>,
86,week,transform,True,True,Determines the week of the year from a datetime.,<ColumnSchema (Logical Type = Datetime)>,
87,divide_by_feature,transform,True,True,Divide a scalar by each value in the list.,<ColumnSchema (Semantic Tags = ['numeric'])>,
88,rolling_mean,transform,False,False,Calculates the mean of entries over a given wi...,<ColumnSchema (Logical Type = Datetime) (Seman...,


In [45]:
ft.primitives.list_primitives()[ft.primitives.list_primitives()['type'] == 'aggregation']

Unnamed: 0,name,type,dask_compatible,koalas_compatible,description,valid_inputs,return_type
0,std,aggregation,True,True,Computes the dispersion relative to the mean v...,<ColumnSchema (Semantic Tags = ['numeric'])>,
1,mean,aggregation,True,True,Computes the average for a list of values.,<ColumnSchema (Semantic Tags = ['numeric'])>,
2,max,aggregation,True,True,"Calculates the highest value, ignoring `NaN` v...",<ColumnSchema (Semantic Tags = ['numeric'])>,
3,percent_true,aggregation,True,False,Determines the percent of `True` values.,<ColumnSchema (Logical Type = BooleanNullable)...,
4,time_since_first,aggregation,False,False,Calculates the time elapsed since the first da...,<ColumnSchema (Logical Type = Datetime) (Seman...,
5,any,aggregation,True,False,Determines if any value is 'True' in a list.,<ColumnSchema (Logical Type = BooleanNullable)...,
6,sum,aggregation,True,True,"Calculates the total addition, ignoring `NaN`.",<ColumnSchema (Semantic Tags = ['numeric'])>,
7,num_unique,aggregation,True,True,"Determines the number of distinct values, igno...",<ColumnSchema (Semantic Tags = ['category'])>,
8,last,aggregation,False,False,Determines the last value in a list.,<ColumnSchema>,
9,all,aggregation,True,False,Calculates if all values are 'True' in a list.,<ColumnSchema (Logical Type = BooleanNullable)...,


Autofeat
======

In [7]:
from autofeat import AutoFeatClassifier
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score

In [None]:
def categorical_imputing(data, categorical_list):
    # Imputing
    imputer_categorical = SimpleImputer(strategy='constant', fill_value='missing', missing_values=np.nan)
    data_categorical = data.loc[:, categorical_list]
    data_categorical = imputer_categorical.fit_transform(data_categorical)
    data_categorical_imputed = pd.DataFrame(data_categorical, columns=categorical_list)
    return data_categorical_imputed

def autofeat_test(data, labels, numerical_list, categorical_list, featen_steps=2):

    data = data.drop("respondent_id", axis=1)
    labels = labels.to_numpy()

    # Imputing data
    data_numerical_imputed = numerical_impute(data, numerical_list)
    data_categorical_imputed = categorical_imputing(data, categorical_list)

    #Scaling numerical data
    scaler = StandardScaler()
    data_numerical_scaled = scaler.fit_transform(data_numerical_imputed.to_numpy())
    data_numerical_scaled = pd.DataFrame(data_numerical_scaled, columns=numerical_list)

    # Merging data
    data_imputed_scaled = pd.merge(data_numerical_scaled, data_categorical_imputed, left_index=True, right_index=True)

    # Splitting data
    X_train, X_test, y_train, y_test = train_test_split(data_imputed_scaled, labels, test_size=0.2, random_state=1)

    # Autofeat classifier
    model = AutoFeatClassifier(categorical_cols=categorical_list, feateng_steps=featen_steps, verbose=1)
    X_train_tf = model.fit_transform(X_train, np.ravel(y_train))
    X_test_tf = model.transform(X_test)

    X_test_tf.to_csv("new_features.csv", sep=',', header=True, index=False)

    # On sauvegarde le modèle permettant de générer les nouvelles features ainsi que les colonnes générées
    joblib.dump(model, "save_model_test.pkl")
    dic_model = {
        'model' : model,
        'columns' : X_train_tf.columns
    }
    joblib.dump(dic_model, "save_model_and_dict_test.pkl")

    return X_test_tf

In [9]:
# Data with labels included
data_feat = data_original.copy()
labels = data_feat.loc[:, ['h1n1_vaccine']]
data_feat.drop(labels_list, inplace=True, axis=1)

In [27]:
X_new = autofeat_test(data_feat.iloc[:100], labels[:100], ['h1n1_concern', 'h1n1_knowledge'], ['age_group', 'sex'])

[AutoFeat] The 2 step feature engineering process could generate up to 2016 features.
[AutoFeat] With 80 data points this new feature matrix would use about 0.00 gb of space.
[feateng] Step 1: transformation of original features
[feateng] Generated 10 transformed features from 9 original features - done.
[feateng] Step 2: first combination of features
[feateng] Generated 152 feature combinations from 171 original feature tuples - done.
[feateng] Generated altogether 174 new features in 2 steps
[feateng] Removing correlated features, as well as additions at the highest level
[feateng] Generated a total of 147 additional features
[featsel] Scaling data...done.
[featsel] Feature selection run 1/5
[featsel] Feature selection run 2/5
[featsel] Feature selection run 3/5
[featsel] Feature selection run 4/5
[featsel] Feature selection run 5/5
[featsel] 0 features after noise filtering
[AutoFeat] Final dataframe with 9 feature columns (5 new).
[AutoFeat] Training final classification model.
[Au

Résultat du test avec autofeat :
* Avec 2 de profondeur : 307 720 nouvelles features possibles
* Probablement des heures de calcule sur la sélection des features : faire tourner sur zeus avec un script python

In [17]:
def train_data_transform(model, data, numerical_list, categorical_list):

    data = data.drop("respondent_id", axis=1)

    # Imputing data
    data_numerical_imputed = numerical_impute(data, numerical_list)
    data_categorical_imputed = categorical_imputing(data, categorical_list)

    #Scaling numerical data
    scaler = StandardScaler()
    data_numerical_scaled = scaler.fit_transform(data_numerical_imputed.to_numpy())
    data_numerical_scaled = pd.DataFrame(data_numerical_scaled, columns=numerical_list)

    # Merging data
    data_imputed_scaled = pd.merge(data_numerical_scaled, data_categorical_imputed, left_index=True, right_index=True)

    # Autofeat classifier
    X = model.transform(data_imputed_scaled)

    return X

In [10]:
af_clf = joblib.load("autoFeatModel.save")

In [12]:
data_test_tf = data_original.copy()

In [15]:
label = data_test_tf.loc[:, ['h1n1_vaccine']]

In [18]:
new_data_test = train_data_transform(af_clf, data_test_tf, numerical_list, categorical_list)

[AutoFeat] Computing 41 new features.
[AutoFeat]    41/   41 new features ...done.


In [20]:
pred = af_clf.predict(new_data_test)

In [23]:
new_data_test.to_csv("new_features_train.csv", sep=",", header=True, index=False)

In [29]:
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from catboost import CatBoostClassifier
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier
from autofeat import AutoFeatClassifier

In [25]:
X_train, X_test, Y_train, Y_test = train_test_split(new_data_test, label, train_size=0.2, random_state=1)

In [26]:
cat_clf = CatBoostClassifier()
cat_clf.fit(X_train, Y_train)

Learning rate set to 0.021068
0:	learn: 0.6763968	total: 171ms	remaining: 2m 50s
1:	learn: 0.6607216	total: 181ms	remaining: 1m 30s
2:	learn: 0.6456015	total: 190ms	remaining: 1m 3s
3:	learn: 0.6313420	total: 201ms	remaining: 49.9s
4:	learn: 0.6179313	total: 211ms	remaining: 42s
5:	learn: 0.6051167	total: 220ms	remaining: 36.4s
6:	learn: 0.5927792	total: 227ms	remaining: 32.2s
7:	learn: 0.5825314	total: 235ms	remaining: 29.2s
8:	learn: 0.5716520	total: 243ms	remaining: 26.8s
9:	learn: 0.5612392	total: 249ms	remaining: 24.6s
10:	learn: 0.5505252	total: 254ms	remaining: 22.9s
11:	learn: 0.5413016	total: 259ms	remaining: 21.3s
12:	learn: 0.5326230	total: 263ms	remaining: 20s
13:	learn: 0.5252001	total: 268ms	remaining: 18.9s
14:	learn: 0.5169910	total: 273ms	remaining: 18s
15:	learn: 0.5086724	total: 278ms	remaining: 17.1s
16:	learn: 0.5008929	total: 283ms	remaining: 16.3s
17:	learn: 0.4937468	total: 287ms	remaining: 15.7s
18:	learn: 0.4873717	total: 292ms	remaining: 15.1s
19:	learn: 0.48

<catboost.core.CatBoostClassifier at 0x1ceead1c850>

In [28]:
cat_pred = cat_clf.predict_proba(X_test)[:, 1]
roc_auc_score(Y_test, cat_pred)

0.8610343398269936

In [31]:
rnd_clf = RandomForestClassifier(n_estimators=300)
rnd_clf.fit(X_train, np.ravel(Y_train))



RandomForestClassifier(n_estimators=300)

In [33]:
rnd_clf_pred = rnd_clf.predict_proba(X_test)[:, 1]
roc_auc_score(Y_test, rnd_clf_pred)



0.8552646899318992

In [35]:
xgb_clf = XGBClassifier()
xgb_clf.fit(X_train, np.ravel(Y_train))



ValueError: DataFrame.dtypes for data must be int, float, bool or category.  When
categorical type is supplied, DMatrix parameter `enable_categorical` must
be set to `True`. Invalid columns:h1n1_concern, h1n1_knowledge, behavioral_antiviral_meds, behavioral_avoidance, behavioral_face_mask, behavioral_wash_hands, behavioral_large_gatherings, behavioral_outside_home, behavioral_touch_face, doctor_recc_h1n1, doctor_recc_seasonal, chronic_med_condition, child_under_6_months, health_worker, health_insurance, opinion_h1n1_vacc_effective, opinion_h1n1_risk, opinion_h1n1_sick_from_vacc, opinion_seas_vacc_effective, opinion_seas_risk, opinion_seas_sick_from_vacc, household_adults, household_children