In [107]:
# import boto3
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('ggplot')
from scipy import stats

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from imblearn.under_sampling import NearMiss
from imblearn.under_sampling import CondensedNearestNeighbour


from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
import xgboost as xgb
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as IMBPipeline
from sklearn.base import BaseEstimator
from sklearn.base import TransformerMixin
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from imblearn.ensemble import BalancedBaggingClassifier
from imblearn.ensemble import BalancedRandomForestClassifier

from xgboost import plot_importance

from sklearn.experimental import enable_hist_gradient_boosting  # noqa
from sklearn.ensemble import HistGradientBoostingClassifier

In [96]:
CAMEO_DEU_2015_MAP = {
    '1A': 1,
    '1B': 1,
    '1C': 1,
    '1D': 1,
    '1E': 1,
    '2A': 2,
    '2B': 2,
    '2C': 2,
    '2D': 2,
    '3A': 3,
    '3B': 3,
    '3C': 3,
    '3D': 3,
    '4A': 4,
    '4B': 4,
    '4C': 4,
    '4D': 4,
    '4E': 4,
    '5A': 5,
    '5B': 5,
    '5C': 5,
    '5D': 5,
    '5E': 5,
    '5F': 5,
    '6A': 6,
    '6B': 6,
    '6C': 6,
    '6D': 6,
    '6E': 6,
    '6F': 6,
    '7A': 7,
    '7B': 7,
    '7C': 7,
    '7D': 7,
    '7E': 7,
    '8A': 8,
    '8B': 8,
    '8C': 8,
    '8D': 8,
    '9A': 9,
    '9B': 9,
    '9C': 9,
    '9D': 9,
    '9E': 9
}

PRAEGENDE_JUGENDJAHRE_MAP = {
    1: 0,
    2: 1,
    3: 0,
    4: 1,
    5: 0,
    6: 1,
    7: 1,
    8: 0,
    9: 1,
    10: 0,
    11: 1,
    12: 0,
    13: 1,
    14: 0,
    15: 1
}

In [97]:
train_df = pd.read_csv('data/Udacity_MAILOUT_052018_TRAIN.csv', sep=';', index_col='LNR', low_memory=False)
test_df = pd.read_csv('data/Udacity_MAILOUT_052018_TEST.csv', sep=';', index_col='LNR', low_memory=False)
metadata = pd.read_csv('data/metadata.csv')

In [98]:
def reverse_order(val, mx, mn):
    diff_from_low = val - mn 
    return mx - diff_from_low


def default_clean(df, drop_threshold=20, testing=False):
    df_ = df.copy()
    
    print('initial df shape: ', df_.shape)
    keep_features = list(metadata[metadata['keep'] == 1]['feature_name'])
    if 'RESPONSE' in df.columns:
        keep_features.append('RESPONSE')
    df_ = df_[keep_features]
    
    filter_ = df_['CAMEO_DEUG_2015'] != np.nan
    df_.loc[filter_, 'CAMEO_DEUG_2015'] = pd.to_numeric(df_.loc[filter_, 'CAMEO_DEUG_2015'], errors='coerce')
    
    # set zero to negative one where zero means unknown
    unknown_zero_features = list(metadata[metadata['unknown_zero'] == 1]['feature_name'])
    for feature in unknown_zero_features:
        df_.loc[df_[feature] == 0, feature] = -1
        
    # set nine to negative one where nine means unknown
    unknown_nine_features = list(metadata[metadata['unknown_nine'] == 1]['feature_name'])
    for feature in unknown_nine_features:
        df_.loc[df_[feature] == 9, feature] = -1
        
        
    # special cases
    df_['CAMEO_DEUG_2015'].replace('X', np.nan, inplace=True)
    #df_['OST_WEST_KZ'].replace('O', 1, inplace=True)
    #df_['OST_WEST_KZ'].replace('W', 0, inplace=True)    
    df_['CAMEO_DEU_2015'] = df_['CAMEO_DEU_2015'].apply(lambda x: x if x in CAMEO_DEU_2015_MAP else np.nan)
    df_['PRAEGENDE_JUGENDJAHRE'] = df_['PRAEGENDE_JUGENDJAHRE'].apply(lambda x: PRAEGENDE_JUGENDJAHRE_MAP[x] if x in PRAEGENDE_JUGENDJAHRE_MAP else np.nan)
    
    # set -1 (unknown) to np.nan
    df_ = df_.replace(-1, np.nan)
    
    
    # change some numerical columns to categorical for one hot encoding:
    cat_cols = list(metadata.loc[(metadata['type'] == 'categorical') & (metadata['keep'] == 1), 'feature_name'])
    print('cat_cols: ', cat_cols)
    for col in cat_cols:
        if col in df_.columns:
            df_[col] = np.where(df_[col].isnull(), df_[col], df_[col].astype('str'))
    df_ = pd.get_dummies(df_, prefix=cat_cols, columns=cat_cols)
    
    # reverse some cols so higher number = higher feature
    reverse_cols = list(metadata.loc[metadata['needs_reverse']==1, 'feature_name'])
    for col in reverse_cols:
        if col in df_.columns:
            series = df_[col]
            df_[col] = df_[col].apply(reverse_order, args=(np.max(series), np.min(series)))
            
            
    percent_missing = df_.isnull().sum() * 100 / len(df)
    mv_df = pd.DataFrame({'column_name': df_.columns, 'percent_missing': percent_missing})
    mv_cols = mv_df.loc[mv_df['percent_missing'] > drop_threshold]['column_name']    
    df_ = df_.drop(list(mv_cols), axis=1)
    
    if False:
        thresh = int(len(df_.columns) * 0.85)
        if 'RESPONSE' in df_.columns:
            grouped = df_.groupby(df_.RESPONSE)
            pos = grouped.get_group(1)
            neg = grouped.get_group(0)
            neg = neg.dropna(thresh=thresh)
            df_  = neg.append(pos, verify_integrity=True, ignore_index=False)
        else:
            df_ = df_.dropna(thresh=thresh)
        
    #df_ = df_.loc[:, ~df_.columns.str.startswith('KB')]
    
    print('new df shape: ', df_.shape)
    
    return df_

In [99]:
print(np.sum(train_df['RESPONSE'] == 1))
train_df_init = default_clean(train_df, drop_threshold=80)
test_df_init = default_clean(test_df, drop_threshold=80, testing=True)
np.sum(train_df_init['RESPONSE'] == 1)

532
initial df shape:  (42962, 366)
cat_cols:  ['ANREDE_KZ', 'CAMEO_DEU_2015', 'D19_KONSUMTYP', 'GEBAEUDETYP', 'GEBAEUDETYP_RASTER', 'GFK_URLAUBERTYP', 'GREEN_AVANTGARDE', 'HEALTH_TYP', 'KBA05_HERSTTEMP', 'KBA05_MAXHERST', 'KBA05_MODTEMP', 'KBA05_SEG6', 'KONSUMNAEHE', 'NATIONALITAET_KZ', 'OST_WEST_KZ', 'PRAEGENDE_JUGENDJAHRE', 'TITEL_KZ', 'VERS_TYP', 'ZABEOTYP']
new df shape:  (42962, 377)
initial df shape:  (42833, 365)
cat_cols:  ['ANREDE_KZ', 'CAMEO_DEU_2015', 'D19_KONSUMTYP', 'GEBAEUDETYP', 'GEBAEUDETYP_RASTER', 'GFK_URLAUBERTYP', 'GREEN_AVANTGARDE', 'HEALTH_TYP', 'KBA05_HERSTTEMP', 'KBA05_MAXHERST', 'KBA05_MODTEMP', 'KBA05_SEG6', 'KONSUMNAEHE', 'NATIONALITAET_KZ', 'OST_WEST_KZ', 'PRAEGENDE_JUGENDJAHRE', 'TITEL_KZ', 'VERS_TYP', 'ZABEOTYP']
new df shape:  (42833, 376)


532

In [100]:
df = pd.read_csv('experimentation/interactions.csv', low_memory=False)
df.head()

Unnamed: 0,feature,composition,mu0,mu1,se0,se1,sd,mu_diff,stds_between
0,AGER_TYP,1,1.698736,1.712991,0.00423,0.03551,0.035761,0.014255,0.398631
1,ALTERSKATEGORIE_GROB,1,3.513077,3.235405,0.004203,0.045459,0.045653,0.277672,6.082289
2,ALTER_HH,1,12.431288,11.65847,0.025466,0.210338,0.211874,0.772818,3.647532
3,ANZ_HAUSHALTE_AKTIV,1,6.652886,5.831435,0.081977,0.56953,0.5754,0.821451,1.427617
4,ANZ_HH_TITEL,1,0.048276,0.066667,0.002037,0.014012,0.014159,0.018391,1.298842


In [101]:
df['scaled_diff'] = df['stds_between'] 
df.loc[df['composition']==2, 'scaled_diff'] = np.sqrt(df.loc[df['composition']==2,'scaled_diff'])
df = df.sort_values('scaled_diff', ascending=False)
scaled_df = df.reset_index(drop=True)

In [102]:
scaled_df.head()

Unnamed: 0,feature,composition,mu0,mu1,se0,se1,sd,mu_diff,stds_between,scaled_diff
0,D19_KONSUMTYP_9.0,1,0.179843,0.035714,0.002094,0.008053,0.008321,0.144129,17.320914,17.320914
1,KBA05_SEG6_0.0,1,0.86358,0.642857,0.001871,0.020794,0.020878,0.220722,10.57217,10.57217
2,NATIONALITAET_KZ_1.0,1,0.956607,0.802632,0.001111,0.017272,0.017308,0.153975,8.896222,8.896222
3,FINANZ_UNAUFFAELLIGER,1,4.324496,3.766917,0.005152,0.062659,0.06287,0.557579,8.868726,8.868726
4,SEMIO_REL,1,5.228081,4.582707,0.008403,0.090337,0.090727,0.645374,7.113352,7.113352


In [103]:
def get_top_n(n):
    return scaled_df.loc[0:n,['feature', 'composition']].copy()

In [118]:
def get_initial_sets(top_n=None):

    X_train_full = train_df_init.copy()
    y_train_full = X_train_full['RESPONSE']

    inter_cols = list(set.intersection(set(X_train_full.columns), set(test_df_init.columns)))

    X_train_full = X_train_full[inter_cols]

    # "Cardinality" means the number of unique values in a column
    # Select categorical columns with relatively low cardinality (convenient but arbitrary)
    categorical_cols = [cname for cname in X_train_full.columns if
                        X_train_full[cname].nunique() < 15 and
                        X_train_full[cname].nunique() >= 2 and
                        X_train_full[cname].dtype == "object"]


    # Select numerical columns
    numerical_cols = [cname for cname in X_train_full.columns if 
                    X_train_full[cname].dtype in ['int64', 'float64', 'uint8']]
    
    if top_n:
        features = get_top_n(top_n)
        numerical_cols = []
        for ix, row in features.iterrows():
            comp = row['composition']
            f = row['feature']
            numerical_cols.append(f)
            if comp == 2:
                fs = f.split(':')
                f0 = fs[0]
                f1 = fs[1]
                test_df_init[f] = test_df_init[f0] * test_df_init[f1]
                train_df_init[f] = train_df_init[f0] * train_df_init[f1]
                

                

    # Keep selected columns only
    my_cols = numerical_cols + categorical_cols

    # supervised testing and full datasets
    X_test = test_df_init[my_cols].copy()
    X_total_train = train_df_init[my_cols].copy()
    y_total_train = train_df_init['RESPONSE'].copy()

    assert(list(X_total_train.columns) == list(X_test.columns))
    
    return X_total_train, y_total_train, X_test, numerical_cols, categorical_cols

In [125]:
X_total_train, y_total_train, X_test, numerical_cols, categorical_cols = get_initial_sets(500)

# Preprocessing for numerical data
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
])

# Preprocessing for categorical data
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
   # ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Bundle preprocessing for numerical and categorical data
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

In [120]:
model = LogisticRegression(class_weight='auto', max_iter=300)

# Bundle preprocessing and modeling code in a pipeline
my_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                              ('scaling', MinMaxScaler()),
                              ('pca', PCA(n_components=450)),
                              ('model', model)
                     ])

# Multiply by -1 since sklearn calculates *negative* MAE
scores = cross_val_score(my_pipeline, X_total_train, y_total_train,
                              cv=5,
                              scoring='roc_auc')

print('Mean AUC:', scores.mean())

# has AUC of 0.6283

Mean AUC: 0.7319654249784922


In [121]:
models = [xgb.XGBClassifier(scale_pos_weight=180, n_jobs=-1), BalancedRandomForestClassifier(n_jobs=-1), 
          BalancedBaggingClassifier(), HistGradientBoostingClassifier()]

for model in models:

    print(model)
    my_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                                  ('model', model)
                         ])
    
    scores = cross_val_score(my_pipeline, X_total_train, y_total_train,
                              cv=5,
                              scoring='roc_auc')
    
    print('Mean AUC:', scores.mean())

XGBClassifier(base_score=None, booster=None, colsample_bylevel=None,
              colsample_bynode=None, colsample_bytree=None, gamma=None,
              gpu_id=None, importance_type='gain', interaction_constraints=None,
              learning_rate=None, max_delta_step=None, max_depth=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              n_estimators=100, n_jobs=-1, num_parallel_tree=None,
              random_state=None, reg_alpha=None, reg_lambda=None,
              scale_pos_weight=180, subsample=None, tree_method=None,
              validate_parameters=None, verbosity=None)
Mean AUC: 0.6705251220666832
BalancedRandomForestClassifier(n_jobs=-1)
Mean AUC: 0.6540242731989659
BalancedBaggingClassifier()
Mean AUC: 0.6028139982486117
HistGradientBoostingClassifier()
Mean AUC: 0.6371716370170076


In [127]:
model = xgb.XGBClassifier(scale_pos_weight=180,
                          n_jobs=-1,
                          alpha=440,
                          reg_lambda=313,
                          min_child_weight=3,
                          max_depth=15,
                          learning_rate=0.3,
                          subsample=0.87,
                          max_delta_step=7,
                          n_estimators=110,
                          gamma=6.83,
                          colsample_bylevel=0.35,
                          colsample_bynode=0.13,
                          colsample_bytree=0.56)
                          


my_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                              ('model', model)
                     ])

scores = cross_val_score(my_pipeline, X_total_train, y_total_train,
                          cv=5,
                          scoring='roc_auc')

print('Mean AUC:', scores.mean())

Mean AUC: 0.657969803116966


In [128]:
model = xgb.XGBClassifier(scale_pos_weight=180,
                          n_jobs=-1,
                          alpha=440,
                          reg_lambda=313,
                          min_child_weight=3,
                          max_depth=15,
                          learning_rate=0.3,
                          subsample=0.87,
                          max_delta_step=7,
                          n_estimators=110,
                          gamma=6.83,
                          colsample_bylevel=0.35,
                          colsample_bynode=0.13,
                          colsample_bytree=0.56)


my_pipeline = Pipeline(steps=[('preprocessor', preprocessor)])
X_total_fit = my_pipeline.fit_transform(X_total_train)
model.fit(X_total_fit, y_total_train)
                       
X_test_fit = my_pipeline.transform(X_test)
preds = model.predict_proba(X_test_fit)
print(preds.shape)

(42833, 2)


In [129]:
preds_0 = preds[:,1]
tester_df = pd.DataFrame(preds_0, index=X_test.index, columns=['RESPONSE'])
print(tester_df.head())
tester_df.to_csv('predictions.csv')

      RESPONSE
LNR           
1754  0.708098
1770  0.797928
1465  0.192424
1470  0.188132
1478  0.711915
