In [1]:
# import boto3
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('ggplot')

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from imblearn.under_sampling import NearMiss
from imblearn.under_sampling import CondensedNearestNeighbour


from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
import xgboost as xgb
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as IMBPipeline
from sklearn.base import BaseEstimator
from sklearn.base import TransformerMixin
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from imblearn.ensemble import BalancedBaggingClassifier

from xgboost import plot_importance

from sklearn.experimental import enable_hist_gradient_boosting  # noqa
from sklearn.ensemble import HistGradientBoostingClassifier

In [2]:
# Lookup of the accepted CAMEO_DEU_2015 codes. Each two-character code maps
# to its leading digit. Within this notebook only key membership is used
# (to decide which raw values are valid); the digit values are kept as-is.
CAMEO_DEU_2015_MAP = {
    **dict.fromkeys(('1A', '1B', '1C', '1D', '1E'), 1),
    **dict.fromkeys(('2A', '2B', '2C', '2D'), 2),
    **dict.fromkeys(('3A', '3B', '3C', '3D'), 3),
    **dict.fromkeys(('4A', '4B', '4C', '4D', '4E'), 4),
    **dict.fromkeys(('5A', '5B', '5C', '5D', '5E', '5F'), 5),
    **dict.fromkeys(('6A', '6B', '6C', '6D', '6E', '6F'), 6),
    **dict.fromkeys(('7A', '7B', '7C', '7D', '7E'), 7),
    **dict.fromkeys(('8A', '8B', '8C', '8D'), 8),
    **dict.fromkeys(('9A', '9B', '9C', '9D', '9E'), 9),
}

# Collapses the 15 PRAEGENDE_JUGENDJAHRE codes into a binary flag: the codes
# listed in the tuple map to 1, every other code in 1..15 maps to 0.
# NOTE(review): presumably 1 = "avantgarde" and 0 = "mainstream" per the
# dataset's data dictionary -- confirm against the attribute documentation.
PRAEGENDE_JUGENDJAHRE_MAP = {
    code: (1 if code in (2, 4, 6, 7, 9, 11, 13, 15) else 0)
    for code in range(1, 16)
}

In [3]:
# Load the mailout train/test splits: semicolon-delimited files, indexed by
# their 'LNR' column.
train_df = pd.read_csv('data/Udacity_MAILOUT_052018_TRAIN.csv', sep=';', index_col='LNR')
test_df = pd.read_csv('data/Udacity_MAILOUT_052018_TEST.csv', sep=';', index_col='LNR')
# Feature metadata table; default_clean reads its 'feature_name', 'keep',
# 'type', 'unknown_zero', 'unknown_nine' and 'needs_reverse' columns.
metadata = pd.read_csv('data/metadata.csv')

  interactivity=interactivity, compiler=compiler, result=result)


In [4]:
def reverse_order(val, mx, mn):
    """Mirror ``val`` inside the range ``[mn, mx]``.

    The distance of ``val`` above ``mn`` is subtracted from ``mx``, so
    ``mn`` becomes ``mx``, ``mx`` becomes ``mn`` and the midpoint is
    unchanged.

    Args:
        val: Value to flip.
        mx: Upper end of the range.
        mn: Lower end of the range.

    Returns:
        ``mx - (val - mn)``.
    """
    return mx - (val - mn)


def default_clean(df, drop_threshold=20, testing=False, row_fill_threshold=0.85):
    """Clean a raw mailout frame using the module-level ``metadata`` table.

    Steps, in order:
      1. Keep only the features flagged ``keep == 1`` in ``metadata`` (plus
         ``RESPONSE`` when present).
      2. Coerce ``CAMEO_DEUG_2015`` to numeric; non-numeric entries such as
         ``'X'`` become NaN.
      3. Recode "unknown" markers (0 or 9, per ``metadata``) to -1, blank out
         ``CAMEO_DEU_2015`` codes that are not keys of ``CAMEO_DEU_2015_MAP``,
         map ``PRAEGENDE_JUGENDJAHRE`` through ``PRAEGENDE_JUGENDJAHRE_MAP``,
         then turn every -1 into NaN.
      4. One-hot encode the categorical features listed in ``metadata``.
      5. Flip the features flagged ``needs_reverse`` with ``reverse_order``.
      6. Drop columns whose percentage of missing values exceeds
         ``drop_threshold``.
      7. Drop sparse rows. When ``RESPONSE`` is present only rows with
         ``RESPONSE == 0`` are subject to dropping; rows with
         ``RESPONSE == 1`` are always kept.

    Relies on the globals ``metadata``, ``CAMEO_DEU_2015_MAP``,
    ``PRAEGENDE_JUGENDJAHRE_MAP`` and ``reverse_order``. The input frame is
    not modified.

    Args:
        df: Raw DataFrame to clean.
        drop_threshold: Columns with more than this percentage (0-100) of
            missing values are dropped.
        testing: Currently unused; accepted for backward compatibility.
            Sparse rows are dropped regardless of its value.
        row_fill_threshold: A row survives only if it has at least
            ``int(n_columns * row_fill_threshold)`` non-missing values.

    Returns:
        The cleaned DataFrame. When ``RESPONSE`` is present the surviving
        ``RESPONSE == 0`` rows come first, followed by the ``RESPONSE == 1``
        rows.

    Raises:
        ValueError: If the recombined frame would contain duplicate index
            labels.
    """
    print('initial df shape: ', df.shape)

    keep_features = list(metadata.loc[metadata['keep'] == 1, 'feature_name'])
    if 'RESPONSE' in df.columns:
        keep_features.append('RESPONSE')
    # Explicit copy: the caller's frame stays untouched and the .loc writes
    # below operate on an independent object rather than a slice.
    df_ = df[keep_features].copy()

    # Coerce the whole column. Comparing against np.nan can never filter
    # anything (NaN != NaN is always True), and errors='coerce' already turns
    # non-numeric placeholders such as 'X' into NaN.
    if 'CAMEO_DEUG_2015' in df_.columns:
        df_['CAMEO_DEUG_2015'] = pd.to_numeric(df_['CAMEO_DEUG_2015'], errors='coerce')

    # Recode the per-feature "unknown" marker (0 or 9) to -1 so that the
    # global -1 -> NaN replacement below covers them as well.
    for flag_column, unknown_marker in (('unknown_zero', 0), ('unknown_nine', 9)):
        flagged_features = metadata.loc[metadata[flag_column] == 1, 'feature_name']
        for feature in flagged_features:
            if feature in df_.columns:
                df_.loc[df_[feature] == unknown_marker, feature] = -1

    # Special cases: keep only recognised CAMEO_DEU_2015 codes, and collapse
    # PRAEGENDE_JUGENDJAHRE to the binary flag defined by its map. Anything
    # outside the maps (including NaN and -1) becomes NaN.
    if 'CAMEO_DEU_2015' in df_.columns:
        df_['CAMEO_DEU_2015'] = df_['CAMEO_DEU_2015'].apply(
            lambda code: code if code in CAMEO_DEU_2015_MAP else np.nan)
    if 'PRAEGENDE_JUGENDJAHRE' in df_.columns:
        df_['PRAEGENDE_JUGENDJAHRE'] = df_['PRAEGENDE_JUGENDJAHRE'].apply(
            lambda code: PRAEGENDE_JUGENDJAHRE_MAP.get(code, np.nan))

    # set -1 (unknown) to np.nan
    df_ = df_.replace(-1, np.nan)

    # One-hot encode the categorical features. Non-null values are cast to
    # str first; nulls are left as NaN so they get no dummy column.
    cat_cols = list(metadata.loc[(metadata['type'] == 'categorical') & (metadata['keep'] == 1), 'feature_name'])
    print('cat_cols: ', cat_cols)
    present_cat_cols = [col for col in cat_cols if col in df_.columns]
    for col in present_cat_cols:
        df_[col] = np.where(df_[col].isnull(), df_[col], df_[col].astype('str'))
    df_ = pd.get_dummies(df_, prefix=present_cat_cols, columns=present_cat_cols)

    # reverse some cols so higher number = higher feature
    reverse_cols = list(metadata.loc[metadata['needs_reverse'] == 1, 'feature_name'])
    for col in reverse_cols:
        if col in df_.columns:
            series = df_[col]
            # reverse_order is plain arithmetic, so it applies to the whole
            # Series at once; max()/min() skip NaN.
            df_[col] = reverse_order(series, series.max(), series.min())

    # Drop columns that are missing in more than drop_threshold percent of rows.
    percent_missing = df_.isnull().sum() * 100 / len(df_)
    sparse_columns = list(percent_missing[percent_missing > drop_threshold].index)
    df_ = df_.drop(sparse_columns, axis=1)

    # Drop rows with too few populated columns. With labels available, only
    # RESPONSE == 0 rows are filtered so that no RESPONSE == 1 row is lost.
    thresh = int(len(df_.columns) * row_fill_threshold)
    if 'RESPONSE' in df_.columns:
        # Boolean masks tolerate a missing class, whereas
        # groupby(...).get_group() raises KeyError.
        pos = df_[df_['RESPONSE'] == 1]
        neg = df_[df_['RESPONSE'] == 0].dropna(thresh=thresh)
        # DataFrame.append was removed in pandas 2.0; pd.concat is the
        # equivalent and still verifies that index labels stay unique.
        df_ = pd.concat([neg, pos], verify_integrity=True)
    else:
        df_ = df_.dropna(thresh=thresh)

    print('new df shape: ', df_.shape)

    return df_

In [5]:
# Print the number of positive responders before cleaning; the cell's final
# expression shows the same count after cleaning for comparison.
print((train_df['RESPONSE'] == 1).sum())
train_df_init = default_clean(train_df, drop_threshold=80)
test_df_init = default_clean(test_df, drop_threshold=80, testing=True)
(train_df_init['RESPONSE'] == 1).sum()

532
initial df shape:  (42962, 366)
cat_cols:  ['ANREDE_KZ', 'CAMEO_DEU_2015', 'D19_KONSUMTYP', 'GEBAEUDETYP', 'GEBAEUDETYP_RASTER', 'GFK_URLAUBERTYP', 'GREEN_AVANTGARDE', 'HEALTH_TYP', 'KBA05_HERSTTEMP', 'KBA05_MAXHERST', 'KBA05_MODTEMP', 'KBA05_SEG6', 'KONSUMNAEHE', 'NATIONALITAET_KZ', 'OST_WEST_KZ', 'PRAEGENDE_JUGENDJAHRE', 'TITEL_KZ', 'VERS_TYP', 'ZABEOTYP']
new df shape:  (34178, 377)
initial df shape:  (42833, 365)
cat_cols:  ['ANREDE_KZ', 'CAMEO_DEU_2015', 'D19_KONSUMTYP', 'GEBAEUDETYP', 'GEBAEUDETYP_RASTER', 'GFK_URLAUBERTYP', 'GREEN_AVANTGARDE', 'HEALTH_TYP', 'KBA05_HERSTTEMP', 'KBA05_MAXHERST', 'KBA05_MODTEMP', 'KBA05_SEG6', 'KONSUMNAEHE', 'NATIONALITAET_KZ', 'OST_WEST_KZ', 'PRAEGENDE_JUGENDJAHRE', 'TITEL_KZ', 'VERS_TYP', 'ZABEOTYP']
new df shape:  (34044, 376)


532

In [6]:
# Names of the cleaned training columns whose dtype is int64, float64 or
# uint8, in their original column order.
numeric_cols = [
    cname
    for cname, col_dtype in train_df_init.dtypes.items()
    if col_dtype in ('int64', 'float64', 'uint8')
]

len(numeric_cols)

377

In [7]:
# Start the statistics file from scratch with its header row. Opening in 'w'
# mode (instead of appending) keeps a re-run of the notebook from stacking a
# second header and stale rows onto an existing file; the cells that follow
# append their data rows below this header.
with open('experimentation/interactions.csv', 'w') as f:
    f.write('feature,composition,mu0,mu1,se0,se1,sd,mu_diff,stds_between\n')

In [10]:
def compute_data(temp0, temp1, col, composition=1):
    """Build one CSV row comparing a feature between two samples.

    For each sample the mean and the standard error of the mean
    (``std / sqrt(count)``) are computed. The two standard errors are
    combined as ``sqrt(se0**2 + se1**2)`` and the absolute difference of the
    means is expressed in units of that combined error.

    Args:
        temp0: Values of the first sample; must provide ``mean()``, ``std()``
            and ``count()`` (e.g. a pandas Series).
        temp1: Values of the second sample, same requirements.
        col: Label written to the ``feature`` field.
        composition: Value written to the ``composition`` field.

    Returns:
        A newline-terminated string with the comma-separated fields
        ``feature,composition,mu0,mu1,se0,se1,sd,mu_diff,stds_between``.
    """
    def _mean_and_se(sample):
        # Standard error of the mean over the non-null observations.
        center = sample.mean()
        spread = sample.std()
        n_obs = sample.count()
        return center, spread / np.sqrt(n_obs)

    mu0, se0 = _mean_and_se(temp0)
    mu1, se1 = _mean_and_se(temp1)

    mu_diff = abs(mu0 - mu1)
    sd = np.sqrt(se0**2 + se1**2)
    stds_between = mu_diff / sd

    return '{},{},{},{},{},{},{},{},{}\n'.format(
        col, composition, mu0, mu1, se0, se1, sd, mu_diff, stds_between)

In [11]:
def write_stats_one_dim():
    """Append per-feature comparison rows to ``experimentation/interactions.csv``.

    Reads the module-level ``train_df_init``, keeps its int64/float64/uint8
    columns, splits the rows by ``RESPONSE`` (0 vs 1), prints the shape of
    each split, and appends one ``compute_data`` row (composition ``1``) for
    every feature column other than ``RESPONSE``.
    """
    numeric_cols = [cname for cname in train_df_init.columns if
                    train_df_init[cname].dtype in ['int64', 'float64', 'uint8']]
    df = train_df_init[numeric_cols]
    temp0 = df.loc[df['RESPONSE'] == 0]
    temp1 = df.loc[df['RESPONSE'] == 1]
    print(temp0.shape)
    print(temp1.shape)

    # dict.fromkeys drops repeated names while preserving column order.
    feature_cols = [col for col in dict.fromkeys(numeric_cols) if col != 'RESPONSE']

    # Open the output once for the whole run instead of once per column.
    with open('experimentation/interactions.csv', 'a') as f:
        for col in feature_cols:
            # Row layout: feature,composition,mu0,mu1,se0,se1,sd,mu_diff,stds_between
            f.write(compute_data(temp0[col], temp1[col], col, 1))
        
# Append the single-feature statistics rows to experimentation/interactions.csv.
write_stats_one_dim()

(33646, 377)
(532, 377)


In [13]:
def write_stats_two_dim():
    """Append pairwise-interaction rows to ``experimentation/interactions.csv``.

    Reads the module-level ``train_df_init``, keeps its int64/float64/uint8
    columns and splits the rows by ``RESPONSE`` (0 vs 1). For every unordered
    pair of distinct feature columns (``RESPONSE`` excluded) the element-wise
    product of the two columns is computed within each split and one
    ``compute_data`` row (composition ``2``) is appended. The row is labelled
    with the two column names, sorted and joined by ``':'``.
    """
    numeric_cols = [cname for cname in train_df_init.columns if
                    train_df_init[cname].dtype in ['int64', 'float64', 'uint8']]
    df = train_df_init[numeric_cols]
    temp0 = df.loc[df['RESPONSE'] == 0]
    temp1 = df.loc[df['RESPONSE'] == 1]

    # dict.fromkeys drops repeated names while preserving column order.
    feature_cols = [col for col in dict.fromkeys(numeric_cols) if col != 'RESPONSE']

    # Open the output once for the whole run instead of once per pair.
    with open('experimentation/interactions.csv', 'a') as f:
        # Pairing each column only with the columns after it visits every
        # unordered pair exactly once, in the same order a full double loop
        # with a "seen" set would first encounter them.
        for position, first in enumerate(feature_cols):
            for second in feature_cols[position + 1:]:
                key = ':'.join(sorted([first, second]))

                series0 = temp0[first] * temp0[second]
                series1 = temp1[first] * temp1[second]

                # Row layout: feature,composition,mu0,mu1,se0,se1,sd,mu_diff,stds_between
                f.write(compute_data(series0, series1, key, 2))

        
# Append the pairwise (two-feature product) statistics rows to
# experimentation/interactions.csv.
write_stats_two_dim()

  from ipykernel import kernelapp as app
