In [1]:
# import boto3
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('ggplot')
from scipy import stats

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from imblearn.under_sampling import NearMiss
from imblearn.under_sampling import CondensedNearestNeighbour


from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
import xgboost as xgb
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as IMBPipeline
from sklearn.base import BaseEstimator
from sklearn.base import TransformerMixin
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from imblearn.ensemble import BalancedBaggingClassifier
from imblearn.ensemble import BalancedRandomForestClassifier

from xgboost import plot_importance

from sklearn.experimental import enable_hist_gradient_boosting  # noqa
from sklearn.ensemble import HistGradientBoostingClassifier

In [2]:
CAMEO_DEU_2015_MAP = {
    '1A': 1,
    '1B': 1,
    '1C': 1,
    '1D': 1,
    '1E': 1,
    '2A': 2,
    '2B': 2,
    '2C': 2,
    '2D': 2,
    '3A': 3,
    '3B': 3,
    '3C': 3,
    '3D': 3,
    '4A': 4,
    '4B': 4,
    '4C': 4,
    '4D': 4,
    '4E': 4,
    '5A': 5,
    '5B': 5,
    '5C': 5,
    '5D': 5,
    '5E': 5,
    '5F': 5,
    '6A': 6,
    '6B': 6,
    '6C': 6,
    '6D': 6,
    '6E': 6,
    '6F': 6,
    '7A': 7,
    '7B': 7,
    '7C': 7,
    '7D': 7,
    '7E': 7,
    '8A': 8,
    '8B': 8,
    '8C': 8,
    '8D': 8,
    '9A': 9,
    '9B': 9,
    '9C': 9,
    '9D': 9,
    '9E': 9
}

PRAEGENDE_JUGENDJAHRE_MAP = {
    1: 0,
    2: 1,
    3: 0,
    4: 1,
    5: 0,
    6: 1,
    7: 1,
    8: 0,
    9: 1,
    10: 0,
    11: 1,
    12: 0,
    13: 1,
    14: 0,
    15: 1
}

In [3]:
train_df = pd.read_csv('data/Udacity_MAILOUT_052018_TRAIN.csv', sep=';', index_col='LNR', low_memory=False)
test_df = pd.read_csv('data/Udacity_MAILOUT_052018_TEST.csv', sep=';', index_col='LNR', low_memory=False)
metadata = pd.read_csv('data/metadata.csv')

In [4]:
def reverse_order(val, mx, mn):
    diff_from_low = val - mn 
    return mx - diff_from_low


def default_clean(df, drop_threshold=20, testing=False):
    df_ = df.copy()
    
    print('initial df shape: ', df_.shape)
    keep_features = list(metadata[metadata['keep'] == 1]['feature_name'])
    if 'RESPONSE' in df.columns:
        keep_features.append('RESPONSE')
    df_ = df_[keep_features]
    
    filter_ = df_['CAMEO_DEUG_2015'] != np.nan
    df_.loc[filter_, 'CAMEO_DEUG_2015'] = pd.to_numeric(df_.loc[filter_, 'CAMEO_DEUG_2015'], errors='coerce')
    
    # set zero to negative one where zero means unknown
    unknown_zero_features = list(metadata[metadata['unknown_zero'] == 1]['feature_name'])
    for feature in unknown_zero_features:
        df_.loc[df_[feature] == 0, feature] = -1
        
    # set nine to negative one where nine means unknown
    unknown_nine_features = list(metadata[metadata['unknown_nine'] == 1]['feature_name'])
    for feature in unknown_nine_features:
        df_.loc[df_[feature] == 9, feature] = -1
        
        
    # special cases
    df_['CAMEO_DEUG_2015'].replace('X', np.nan, inplace=True)
    #df_['OST_WEST_KZ'].replace('O', 1, inplace=True)
    #df_['OST_WEST_KZ'].replace('W', 0, inplace=True)    
    df_['CAMEO_DEU_2015'] = df_['CAMEO_DEU_2015'].apply(lambda x: x if x in CAMEO_DEU_2015_MAP else np.nan)
    df_['PRAEGENDE_JUGENDJAHRE'] = df_['PRAEGENDE_JUGENDJAHRE'].apply(lambda x: PRAEGENDE_JUGENDJAHRE_MAP[x] if x in PRAEGENDE_JUGENDJAHRE_MAP else np.nan)
    
    # set -1 (unknown) to np.nan
    df_ = df_.replace(-1, np.nan)
    
    
    # change some numerical columns to categorical for one hot encoding:
    cat_cols = list(metadata.loc[(metadata['type'] == 'categorical') & (metadata['keep'] == 1), 'feature_name'])
    print('cat_cols: ', cat_cols)
    for col in cat_cols:
        if col in df_.columns:
            df_[col] = np.where(df_[col].isnull(), df_[col], df_[col].astype('str'))

    df_ = pd.get_dummies(df_, prefix=cat_cols, columns=cat_cols)
    
    
    # reverse some cols so higher number = higher feature
    reverse_cols = list(metadata.loc[metadata['needs_reverse']==1, 'feature_name'])
    for col in reverse_cols:
        if col in df_.columns:
            series = df_[col]
            df_[col] = df_[col].apply(reverse_order, args=(np.max(series), np.min(series)))
            
            
    percent_missing = df_.isnull().sum() * 100 / len(df)
    mv_df = pd.DataFrame({'column_name': df_.columns, 'percent_missing': percent_missing})
    mv_cols = mv_df.loc[mv_df['percent_missing'] > drop_threshold]['column_name']    
    df_ = df_.drop(list(mv_cols), axis=1)
    
    if False:
        thresh = int(len(df_.columns) * 0.85)
        grouped = df_.groupby(df_.RESPONSE)
        pos = grouped.get_group(1)
        neg = grouped.get_group(0)
        neg = neg.dropna(thresh=thresh)
        df_  = neg.append(pos, verify_integrity=True, ignore_index=False)
        
    #df_ = df_.loc[:, ~df_.columns.str.startswith('KB')]
    
    print('new df shape: ', df_.shape)
    
    return df_

In [5]:
print(np.sum(train_df['RESPONSE'] == 1))
train_df_init = default_clean(train_df, drop_threshold=50)
test_df_init = default_clean(test_df, drop_threshold=50, testing=True)
np.sum(train_df_init['RESPONSE'] == 1)

532
initial df shape:  (42962, 366)
cat_cols:  ['ANREDE_KZ', 'CAMEO_DEU_2015', 'D19_KONSUMTYP', 'GEBAEUDETYP', 'GEBAEUDETYP_RASTER', 'GFK_URLAUBERTYP', 'GREEN_AVANTGARDE', 'HEALTH_TYP', 'KBA05_HERSTTEMP', 'KBA05_MAXHERST', 'KBA05_MODTEMP', 'KBA05_SEG6', 'KONSUMNAEHE', 'NATIONALITAET_KZ', 'OST_WEST_KZ', 'PRAEGENDE_JUGENDJAHRE', 'TITEL_KZ', 'VERS_TYP', 'ZABEOTYP']
new df shape:  (42962, 376)
initial df shape:  (42833, 365)
cat_cols:  ['ANREDE_KZ', 'CAMEO_DEU_2015', 'D19_KONSUMTYP', 'GEBAEUDETYP', 'GEBAEUDETYP_RASTER', 'GFK_URLAUBERTYP', 'GREEN_AVANTGARDE', 'HEALTH_TYP', 'KBA05_HERSTTEMP', 'KBA05_MAXHERST', 'KBA05_MODTEMP', 'KBA05_SEG6', 'KONSUMNAEHE', 'NATIONALITAET_KZ', 'OST_WEST_KZ', 'PRAEGENDE_JUGENDJAHRE', 'TITEL_KZ', 'VERS_TYP', 'ZABEOTYP']
new df shape:  (42833, 375)


532

In [11]:
def get_initial_sets(cols=None):

    if cols:
        inter_cols = list(set.intersection(set(train_df_init.columns), set(test_df_init.columns), set(cols)))
    else:
        inter_cols = list(set.intersection(set(train_df_init.columns), set(test_df_init.columns)))
        

    X_train_full = train_df_init[inter_cols]

    # "Cardinality" means the number of unique values in a column
    # Select categorical columns with relatively low cardinality (convenient but arbitrary)
    categorical_cols = [cname for cname in X_train_full.columns if
                        X_train_full[cname].nunique() < 15 and
                        X_train_full[cname].nunique() >= 2 and
                        X_train_full[cname].dtype == "object"]


    # Select numerical columns
    numerical_cols = [cname for cname in X_train_full.columns if 
                    X_train_full[cname].dtype in ['int64', 'float64', 'uint8']]
    

    # Keep selected columns only
    my_cols = numerical_cols + categorical_cols

    # supervised testing and full datasets
    X_test = test_df_init[my_cols].copy()
    X_total_train = train_df_init[my_cols].copy()
    y_total_train = train_df_init['RESPONSE'].copy()

    assert(list(X_total_train.columns) == list(X_test.columns))
    
    return X_total_train, y_total_train, X_test, numerical_cols, categorical_cols

In [16]:
special_cols = list(set(['LP_STATUS_GROB', 'GREEN_AVANTGARDE', 'EWDICHTE', 'GEBURTSJAHR', 'INNENSTADT',
              'SEMIO_KULT', 'SEMIO_SOZ', 'WOHNLAGE', 'ZABEOTYP', 'HH_EINKOMMEN_SCORE', 'FINANZ_VORSORGER', 
              'D19_VERSAND_ONLINE_DATUM', 'CAMEO_DEUG_2015', 'BALLRAUM', 'ALTER_HH', 'D19_KONSUMTYP', 
              'D19_VERSAND_ANZ_12', 'D19_VERSI_ANZ_24', 'FINANZTYP', 'GEBAEUDETYP', 'GFK_URLAUBERTYP', 'HEALTH_TYP',
              'LP_FAMILIE_GROB', 'LP_LEBENSPHASE_GROB', 'NATIONALITAET_KZ', 'ORTSGR_KLS9', 'SEMIO_MAT', 'TITEL_KZ',
              'VERS_TYP', 'OST_WEST_KZ_0.0', 'OST_WEST_KZ_1.0', 'ANREDE_KZ_1', 'ANREDE_KZ_2', 'PRAEGENDE_JUGENDJAHRE_0.0',
              'PRAEGENDE_JUGENDJAHRE_1.0', 'SEMIO_FAM', 'OST_WEST_KZ_O', 'OST_WEST_KZ_W' 
              ]))


In [25]:
train_cluster_feature = pd.read_csv('experimentation/train_cluster_1_or_5.csv', index_col='LNR')
test_cluster_feature = pd.read_csv('experimentation/test_cluster_1_or_5.csv', index_col='LNR')
train_cluster_feature.head()

Unnamed: 0_level_0,cluster
LNR,Unnamed: 1_level_1
1763,0
1771,0
1776,1
1460,1
1783,0


In [26]:
X_total_train, y_total_train, X_test, numerical_cols, categorical_cols = get_initial_sets(special_cols)
X_total_train = X_total_train.join(train_cluster_feature)
X_test = X_test.join(test_cluster_feature)

numerical_cols.append('cluster')
X_total_train.head()

Unnamed: 0_level_0,WOHNLAGE,GEBURTSJAHR,OST_WEST_KZ_O,LP_STATUS_GROB,HH_EINKOMMEN_SCORE,CAMEO_DEUG_2015,SEMIO_FAM,FINANZTYP,ALTER_HH,PRAEGENDE_JUGENDJAHRE_0.0,...,SEMIO_KULT,PRAEGENDE_JUGENDJAHRE_1.0,BALLRAUM,ANREDE_KZ_1,ANREDE_KZ_2,EWDICHTE,ORTSGR_KLS9,SEMIO_SOZ,SEMIO_MAT,cluster
LNR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1763,6.0,,0,2.0,1.0,5.0,6,1,8.0,1,...,7,0,5.0,0,1,5.0,7.0,3,7,0
1771,2.0,1957.0,0,4.0,6.0,5.0,7,1,13.0,1,...,5,0,5.0,0,1,1.0,2.0,7,3,0
1776,7.0,1929.0,1,5.0,6.0,8.0,4,2,9.0,0,...,4,1,1.0,1,0,6.0,8.0,5,2,1
1460,8.0,1924.0,0,2.0,3.0,8.0,7,2,6.0,0,...,7,1,2.0,0,1,6.0,9.0,3,7,1
1783,6.0,1936.0,0,3.0,3.0,3.0,2,2,9.0,1,...,3,0,4.0,1,0,5.0,7.0,2,2,0


In [27]:
# Preprocessing for numerical data
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', MinMaxScaler())
])

# Preprocessing for categorical data
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
   # ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Bundle preprocessing for numerical and categorical data
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

In [28]:
# Define Benchmark Model
model = LogisticRegression()
# Bundle preprocessing and modeling code in a pipeline
my_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                              ('scaling', MinMaxScaler()),
                              ('pca', PCA(n_components=20)),
                              ('model', model)
                     ])


scores = cross_val_score(my_pipeline, X_total_train, y_total_train,
                              cv=5,
                              scoring='roc_auc')

print('Mean AUC:', scores.mean())



# has AUC of 0.6283

Mean AUC: 0.5664724867824039


In [29]:
models = [xgb.XGBClassifier(scale_pos_weight=180, n_jobs=-1), BalancedRandomForestClassifier(n_jobs=-1), 
          BalancedBaggingClassifier(), HistGradientBoostingClassifier()]

for model in models:

    print(model)
    my_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                                  ('model', model)
                         ])
    
    scores = cross_val_score(my_pipeline, X_total_train, y_total_train,
                              cv=5,
                              scoring='roc_auc')
    
    print('Mean AUC:', scores.mean())

XGBClassifier(base_score=None, booster=None, colsample_bylevel=None,
              colsample_bynode=None, colsample_bytree=None, gamma=None,
              gpu_id=None, importance_type='gain', interaction_constraints=None,
              learning_rate=None, max_delta_step=None, max_depth=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              n_estimators=100, n_jobs=-1, num_parallel_tree=None,
              random_state=None, reg_alpha=None, reg_lambda=None,
              scale_pos_weight=180, subsample=None, tree_method=None,
              validate_parameters=None, verbosity=None)
Mean AUC: 0.5236033860036797
BalancedRandomForestClassifier(n_jobs=-1)
Mean AUC: 0.5696021075175921
BalancedBaggingClassifier()
Mean AUC: 0.5393040153306952
HistGradientBoostingClassifier()
Mean AUC: 0.5452600064923804
