In [1]:
import lightgbm as lgb


In [2]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, StratifiedKFold, GridSearchCV, RandomizedSearchCV

from sklearn.decomposition import PCA, KernelPCA


In [3]:
from src.helpers import *
from src.feature_selection import var_lists, get_feats
from src.read_data import *
from src.cleaner import clean_data

from src.dimensionality_reduction import *



In [4]:
%%time 

# azdias    = read_data('Udacity_AZDIAS_052018.csv')
# customers = read_data('Udacity_CUSTOMERS_052018.csv')
train     = read_data('Udacity_MAILOUT_052018_TRAIN.csv')
test      = read_data('Udacity_MAILOUT_052018_TEST.csv')

  call = lambda f, *a, **k: f(*a, **k)


Reducing memory usage...
Mem. usage decreased to 27.86 Mb (76.8% reduction)

Completed. Shape of the data:  (42962, 367)
Reducing memory usage...
Mem. usage decreased to 27.74 Mb (76.8% reduction)

Completed. Shape of the data:  (42833, 366)
CPU times: user 7.63 s, sys: 2.08 s, total: 9.71 s
Wall time: 9.74 s


In [5]:
clean_data_dict = load_dict('clean_data')

azdias_clean    = clean_data_dict['azdias_data']
customers_clean = clean_data_dict['customers_data']
RF_vars         = clean_data_dict['RF_vars']
eli5_vars       = clean_data_dict['eli5_vars']

azdias_clean.shape, customers_clean.shape, len(RF_vars), len(eli5_vars)

((178244, 296), (38330, 300), 20, 20)

In [6]:
SEED = 0

In [7]:
train_clean = clean_data(train)

Initial df shape: (42962, 366)
Variables with missing values...
Your selected dataframe has 366 columns.
There are 273 columns that have missing values.
	Dropped 5 variables
Highly correlated variables...
	Dropped 67 variables
Constant variables
	Dropped 0 variables
Final df shape: (42962, 297)


In [8]:
train_clean.head()

Unnamed: 0_level_0,AGER_TYP,AKT_DAT_KL,ALTER_HH,ALTERSKATEGORIE_FEIN,ANZ_HAUSHALTE_AKTIV,ANZ_HH_TITEL,ANZ_KINDER,ANZ_PERSONEN,ANZ_TITEL,ARBEIT,...,VERDICHTUNGSRAUM,VHA,VHN,VK_DHT4A,W_KEIT_KIND_HH,WOHNDAUER_2008,WOHNLAGE,ZABEOTYP,RESPONSE,ALTERSKATEGORIE_GROB
LNR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1763,2,1.0,8.0,8.0,15.0,0.0,0.0,1.0,0.0,3.0,...,4.0,1.0,2.0,5.0,6.0,9.0,3.0,3,0,4
1771,1,4.0,13.0,13.0,1.0,0.0,0.0,2.0,0.0,2.0,...,0.0,1.0,3.0,1.0,4.0,9.0,7.0,1,0,3
1776,1,1.0,9.0,7.0,0.0,,0.0,0.0,0.0,4.0,...,10.0,4.0,1.0,6.0,,9.0,2.0,3,0,4
1460,2,1.0,6.0,6.0,4.0,0.0,0.0,2.0,0.0,4.0,...,5.0,1.0,4.0,8.0,6.0,9.0,1.0,3,0,4
1783,2,1.0,9.0,9.0,53.0,0.0,0.0,1.0,0.0,3.0,...,4.0,0.0,4.0,2.0,6.0,9.0,3.0,3,0,3


In [9]:
categorical_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(drop = 'first'))
])

numerical_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', MinMaxScaler())
])
 
def create_preprocessing(df):
    numerical_columns, categorical_columns = var_lists(df)

    preprocessing = ColumnTransformer(
        [('cat', categorical_pipe, categorical_columns),
         ('num', numerical_pipe, numerical_columns)])

    return preprocessing


In [10]:
def get_feats(model, cat_cols, num_cols): 
    '''Fitted rf as input'''
    
    
    ohe = (model.named_transformers_['cat']
         .named_steps['onehot'])
    
    feature_names = ohe.get_feature_names(input_features=cat_cols)
    feature_names = np.r_[feature_names, num_cols]
    
    return feature_names

In [11]:
X_train = train_clean.drop('RESPONSE', axis=1)
y_train = train_clean.RESPONSE

In [12]:
# # prepare models
# models = []
# models.append(('RF', RandomForestClassifier(random_state = SEED)))
# # models.append(('KNN', KNeighborsClassifier()))
# # models.append(('CART', DecisionTreeClassifier()))
# # models.append(('NB', GaussianNB()))
# # models.append(('SVM', SVC()))

In [13]:
# %%time

# from sklearn.model_selection import KFold
# from sklearn.metrics import roc_auc_score

# kf = KFold(n_splits=5)
# i = 0
# for train_idx, val_idx in kf.split(X):
#     i+=1
#     print(f'Fold {i}')
#     print('='*30)
#     X_train = X.iloc[train_idx, :]
#     X_val = X.iloc[val_idx, :]
#     y_train = y.iloc[train_idx]
#     y_val = y.iloc[val_idx]
    
# #     print('Train shape: ', X_train.shape, y_train.shape)
# #     print('Valid shape: ', X_val.shape, y_val.shape)
    
# #     print('Fitting preprocessor...')
#     numerical_columns, categorical_columns = var_lists(X_train)

#     preprocessing = ColumnTransformer(
#     [('cat', categorical_pipe, categorical_columns),
#      ('num', numerical_pipe, numerical_columns)])
    
#     X_train_t = preprocessing.fit_transform(X_train)
    
#     numerical_columns, categorical_columns = var_lists(X_train)

#     preprocessing = ColumnTransformer(
#     [('cat', categorical_pipe, categorical_columns),
#      ('num', numerical_pipe, numerical_columns)])
    
#     X_val_t = preprocessing.fit_transform(X_val)
    
    
#     for name, model in models:
#         model.fit(X_train_t, y_train)
#         y_pred = model.predict(X_val_t)
#         score = roc_auc_score(y_val, y_pred)
#         print(name, score)
        
    
    
#     print()

NameError: name 'X' is not defined

In [14]:
def create_pipeline(df, model):
    
    categorical_pipe = Pipeline([
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
        ('onehot', OneHotEncoder(drop = 'first'))
    ])

    numerical_pipe = Pipeline([
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', MinMaxScaler())
    ])

    numerical_columns, categorical_columns = var_lists(df)

    preprocessing = ColumnTransformer(
        [('cat', categorical_pipe, categorical_columns),
         ('num', numerical_pipe, numerical_columns)])
    
    classifier_pipe = Pipeline([
        ('preprocess', preprocessing),
        ('classifier', model)
    ])

    return classifier_pipe

In [15]:
classifier_pipe = create_pipeline(X_train, RandomForestClassifier(random_state = SEED))

In [16]:
scores = cross_val_score(classifier_pipe, X_train, y_train, cv=5, scoring = 'roc_auc')
scores

array([0.55351916, 0.63829375, 0.5906343 , 0.58116754, 0.62192779])

In [17]:
scores = cross_val_score(classifier_pipe, X_train, y_train, cv=StratifiedKFold(3), scoring = 'roc_auc')
scores

array([0.6218496 , 0.59537215, 0.61730864])

In [18]:
scores.mean()

0.6115101272284319

# Improvements

## Best variables

In [19]:
X_train[RF_vars]

Unnamed: 0_level_0,D19_SOZIALES,D19_KONSUMTYP,ALTERSKATEGORIE_GROB,CJT_TYP_5,CJT_TYP_6,RT_SCHNAEPPCHEN,ALTERSKATEGORIE_FEIN,LP_STATUS_FEIN,CJT_TYP_1,HH_EINKOMMEN_SCORE,CJT_TYP_3,LP_FAMILIE_FEIN,RT_KEIN_ANREIZ,AKT_DAT_KL,GFK_URLAUBERTYP,FINANZ_MINIMALIST,EINGEZOGENAM_HH_JAHR,AGER_TYP,ANZ_HAUSHALTE_AKTIV,CJT_GESAMTTYP
LNR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
1763,1.0,3.0,4,5.0,5.0,5.0,8.0,3.0,2.0,6.0,5.0,1.0,2.0,1.0,8.0,3,2004.0,2,15.0,2.0
1771,5.0,1.0,3,5.0,4.0,1.0,13.0,9.0,2.0,1.0,4.0,2.0,3.0,4.0,8.0,5,1994.0,1,1.0,2.0
1776,2.0,2.0,4,5.0,5.0,5.0,7.0,10.0,1.0,1.0,5.0,0.0,1.0,1.0,3.0,5,1997.0,1,0.0,4.0
1460,1.0,3.0,4,5.0,4.0,5.0,6.0,3.0,2.0,4.0,5.0,2.0,2.0,1.0,5.0,4,1994.0,2,4.0,2.0
1783,1.0,2.0,3,5.0,5.0,5.0,9.0,6.0,1.0,4.0,5.0,1.0,1.0,1.0,4.0,3,1994.0,2,53.0,6.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
66338,3.0,1.0,4,5.0,4.0,2.0,10.0,10.0,1.0,1.0,5.0,11.0,1.0,1.0,10.0,5,2010.0,2,1.0,3.0
67629,1.0,2.0,3,4.0,5.0,5.0,14.0,9.0,2.0,4.0,4.0,10.0,1.0,1.0,10.0,5,2001.0,-1,1.0,3.0
68273,1.0,2.0,4,5.0,5.0,5.0,10.0,9.0,1.0,4.0,5.0,10.0,4.0,1.0,6.0,5,1994.0,1,2.0,5.0
68581,0.0,4.0,4,3.0,5.0,2.0,13.0,1.0,3.0,5.0,4.0,10.0,4.0,1.0,2.0,2,1994.0,2,3.0,5.0


In [20]:
classifier_pipe = create_pipeline(X_train[RF_vars], RandomForestClassifier(random_state = SEED))

scores = cross_val_score(classifier_pipe, X_train, y_train, cv=StratifiedKFold(3), scoring = 'roc_auc')
scores.mean()

0.6782882921859069

## Tuning hyperparameters

### Regular tuning

In [28]:
n_estimators = list(range(20,100,40))
max_depth = list(range(2,20,5))
max_features = list(range(2,20,5))

In [29]:
parameters = dict(classifier__n_estimators=n_estimators, 
                  classifier__max_depth=max_depth,
                  classifier__max_features=max_features)
                
#                  classifier__class_weight=class_weight

In [30]:
%%time

classifier_pipe = create_pipeline(X_train[RF_vars], RandomForestClassifier(random_state = SEED))

# Create a grid search object
clf = GridSearchCV(classifier_pipe, parameters)

# Fit the grid search
clf.fit(X_train, y_train)

CPU times: user 2min 34s, sys: 2.94 s, total: 2min 37s
Wall time: 2min 37s


GridSearchCV(cv=None, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('preprocess',
                                        ColumnTransformer(n_jobs=None,
                                                          remainder='drop',
                                                          sparse_threshold=0.3,
                                                          transformer_weights=None,
                                                          transformers=[('cat',
                                                                         Pipeline(memory=None,
                                                                                  steps=[('imputer',
                                                                                          SimpleImputer(add_indicator=False,
                                                                                                        copy=True,
                                   

In [31]:
# View The Best Parameters
print('Best n_estimators:', clf.best_estimator_.get_params()['classifier__n_estimators'])
print('Best max_depth:', clf.best_estimator_.get_params()['classifier__max_depth'])
print('Best max_features:', clf.best_estimator_.get_params()['classifier__max_features'])
print('Best min_samples_split:', clf.best_estimator_.get_params()['classifier__min_samples_split'])

Best n_estimators: 20
Best max_depth: 2
Best max_features: 2
Best min_samples_split: 2


In [32]:
# Fit the grid search using 3-Fold cross validation
scores = cross_val_score(clf, X_train, y_train, cv=StratifiedKFold(3), scoring = 'roc_auc')
scores.mean()

0.706542445967434

#### Randomized

In [33]:
n_estimators = list(range(10,150,10))
max_depth = list(range(1,20,3))
max_features = list(range(1,20,3))
min_samples_split = [2,3,4,5]
# class_weight = ['balanced', 'balanced_subsample']

In [34]:
parameters = dict(classifier__n_estimators=n_estimators, 
                  classifier__max_depth=max_depth,
                  classifier__max_features=max_features,
                 classifier__min_samples_split=min_samples_split)
                
#                  classifier__class_weight=class_weight

In [35]:
%%time

classifier_pipe = create_pipeline(X_train[RF_vars], RandomForestClassifier(random_state = SEED))

# Create a grid search object
clf = RandomizedSearchCV(classifier_pipe, parameters, n_iter = 10, random_state = SEED)

# Fit the grid search
clf.fit(X_train, y_train)

CPU times: user 1min 21s, sys: 910 ms, total: 1min 21s
Wall time: 1min 22s


RandomizedSearchCV(cv=None, error_score=nan,
                   estimator=Pipeline(memory=None,
                                      steps=[('preprocess',
                                              ColumnTransformer(n_jobs=None,
                                                                remainder='drop',
                                                                sparse_threshold=0.3,
                                                                transformer_weights=None,
                                                                transformers=[('cat',
                                                                               Pipeline(memory=None,
                                                                                        steps=[('imputer',
                                                                                                SimpleImputer(add_indicator=False,
                                                                                    

In [36]:
# View The Best Parameters
print('Best n_estimators:', clf.best_estimator_.get_params()['classifier__n_estimators'])
print('Best max_depth:', clf.best_estimator_.get_params()['classifier__max_depth'])
print('Best max_features:', clf.best_estimator_.get_params()['classifier__max_features'])
print('Best min_samples_split:', clf.best_estimator_.get_params()['classifier__min_samples_split'])

Best n_estimators: 30
Best max_depth: 19
Best max_features: 19
Best min_samples_split: 5


In [37]:
# Fit the grid search using 3-Fold cross validation
scores = cross_val_score(clf, X_train, y_train, cv=StratifiedKFold(5), scoring = 'roc_auc')
scores.mean()

0.7077893956097594

## Pipeline with PCA

In [38]:
def create_pipeline_PCA(df, pca_model, classifier):
    
    categorical_pipe = Pipeline([
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
        ('onehot', OneHotEncoder(drop = 'first'))
    ])

    numerical_pipe = Pipeline([
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', MinMaxScaler())
    ])

    numerical_columns, categorical_columns = var_lists(df)

    preprocessing = ColumnTransformer(
        [('cat', categorical_pipe, categorical_columns),
         ('num', numerical_pipe, numerical_columns)])
    
    classifier_pipe = Pipeline([
        ('preprocess', preprocessing),
        ('pca', pca_model),
        ('classifier', classifier)
    ])

    return classifier_pipe

In [None]:
classifier_pipe_pca = create_pipeline_PCA(X_train, PCA(50), RandomForestClassifier(random_state = SEED))

In [None]:
scores = cross_val_score(classifier_pipe_pca, X_train, y_train, cv=5, scoring = 'roc_auc')
scores.mean()

In [39]:
classifier_pipe_pca = create_pipeline_PCA(X_train, KernelPCA(5), RandomForestClassifier(random_state = SEED))

In [40]:
%%time 

scores = cross_val_score(classifier_pipe_pca, X_train, y_train, cv=5, scoring = 'roc_auc')
scores.mean()

CPU times: user 17min 12s, sys: 7min 50s, total: 25min 3s
Wall time: 16min 2s


0.4997928605676332

# Lightgbm

In [41]:
lgb_pipe = create_pipeline(X_train, 
                                  lgb.LGBMClassifier(n_estimators = 15,
                                                     class_weight = 'balanced', 
                                                     random_state = SEED))


In [42]:
scores = cross_val_score(lgb_pipe, X_train, y_train, cv=5, scoring = 'roc_auc')
scores.mean()

0.7445071821178351

In [44]:
lgb_pipe = create_pipeline(X_train[RF_vars], 
                                  lgb.LGBMClassifier(n_estimators = 15,
                                                     class_weight = 'balanced', 
                                                     random_state = SEED))


In [45]:
scores = cross_val_score(lgb_pipe, X_train, y_train, cv=5, scoring = 'roc_auc')
scores.mean()

0.7574308414165658

## Hyperparameter tuning

In [64]:
n_estimators = list(range(10,150,10))
max_depth = list(range(1,20,3))
max_features = list(range(1,20,3))
num_leaves = [2,3,4,5]

In [65]:
parameters = dict(classifier__n_estimators=n_estimators, 
                  classifier__max_depth=max_depth,
                  classifier__max_features=max_features,
                  classifier__num_leaves=num_leaves)

In [66]:
%%time

lgb_pipe = create_pipeline(X_train[RF_vars], lgb.LGBMClassifier(random_state = SEED))

# Create a grid search object
clf = RandomizedSearchCV(lgb_pipe, parameters, n_iter = 20, random_state = SEED, scoring = 'roc_auc', refit = 'roc_auc')

# Fit the grid search
clf.fit(X_train, y_train)

CPU times: user 4min 55s, sys: 7.95 s, total: 5min 3s
Wall time: 26.6 s


RandomizedSearchCV(cv=None, error_score=nan,
                   estimator=Pipeline(memory=None,
                                      steps=[('preprocess',
                                              ColumnTransformer(n_jobs=None,
                                                                remainder='drop',
                                                                sparse_threshold=0.3,
                                                                transformer_weights=None,
                                                                transformers=[('cat',
                                                                               Pipeline(memory=None,
                                                                                        steps=[('imputer',
                                                                                                SimpleImputer(add_indicator=False,
                                                                                    

In [67]:
# View The Best Parameters
print('Best n_estimators:', clf.best_estimator_.get_params()['classifier__n_estimators'])
print('Best max_depth:', clf.best_estimator_.get_params()['classifier__max_depth'])
print('Best max_features:', clf.best_estimator_.get_params()['classifier__max_features'])
print('Best min_samplnum_leaveses_split:', clf.best_estimator_.get_params()['classifier__num_leaves'])

Best n_estimators: 30
Best max_depth: 19
Best max_features: 1
Best min_samplnum_leaveses_split: 4


In [68]:
# Fit the grid search using 5-Fold cross validation
scores = cross_val_score(clf, X_train, y_train, cv=StratifiedKFold(5), scoring = 'roc_auc')
scores.mean()

0.7693842333403556

In [70]:
# Fit the grid search using 5-Fold cross validation
scores = cross_val_score(clf.best_estimator_, X_train, y_train, cv=StratifiedKFold(5), scoring = 'roc_auc')
scores.mean()

0.7710602707092367

# Prepare submission

In [97]:
test_clean = clean_data(test)

Initial df shape: (42833, 365)
Variables with missing values...
Your selected dataframe has 365 columns.
There are 273 columns that have missing values.
	Dropped 5 variables
Highly correlated variables...
	Dropped 68 variables
Constant variables
	Dropped 0 variables
Final df shape: (42833, 296)


In [98]:
final_model = clf.best_estimator_
final_model.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('preprocess',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('cat',
                                                  Pipeline(memory=None,
                                                           steps=[('imputer',
                                                                   SimpleImputer(add_indicator=False,
                                                                                 copy=True,
                                                                                 fill_value='missing',
                                                                                 missing_values=nan,
                                                                                 strategy='constant',
                                                        

In [99]:
test_clean = final_model[0].fit_transform(test_clean)

In [100]:
test_clean.shape, train_clean.shape

((42833, 20), (42962, 296))

In [101]:
sub = final_model[1].predict_proba(test_clean)[:,1]

In [102]:
submission = pd.DataFrame({'LNR':test.LNR, 'RESPONSE':sub})
submission.head(10)

Unnamed: 0,LNR,RESPONSE
0,1754,0.029099
1,1770,0.026021
2,1465,0.004303
3,1470,0.007838
4,1478,0.003802
5,1782,0.003858
6,1485,0.003037
7,1519,0.022407
8,1835,0.029174
9,1522,0.004409


In [105]:
submission.to_csv('data/submission_lgb_1.csv', index = False)

Oversampling