In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import model_selection
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, StratifiedKFold, GridSearchCV, RandomizedSearchCV

from sklearn.decomposition import PCA

import lightgbm as lgb

from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE, RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler

import warnings
warnings.filterwarnings('ignore')

In [2]:
from src.helpers import *
from src.feature_selection import var_lists, get_feats
from src.read_data import *
from src.cleaner import clean_data

from src.dimensionality_reduction import *

from src.modeling import *

In [3]:
%%time 

train     = read_data('Udacity_MAILOUT_052018_TRAIN.csv')
test      = read_data('Udacity_MAILOUT_052018_TEST.csv')

Reducing memory usage...
Mem. usage decreased to 27.86 Mb (76.8% reduction)

Completed. Shape of the data:  (42962, 367)
Reducing memory usage...
Mem. usage decreased to 27.74 Mb (76.8% reduction)

Completed. Shape of the data:  (42833, 366)
CPU times: user 7.73 s, sys: 2.5 s, total: 10.2 s
Wall time: 10.2 s


In [4]:
clean_data_dict = load_dict('clean_data')

azdias_clean    = clean_data_dict['azdias_data']
customers_clean = clean_data_dict['customers_data']
RF_vars         = clean_data_dict['RF_vars']
eli5_vars       = clean_data_dict['eli5_vars']

azdias_clean.shape, customers_clean.shape, len(RF_vars), len(eli5_vars)

data/clean_data.pickle


((178244, 296), (38330, 300), 20, 20)

In [5]:
SEED = 0

In [6]:
%%time

train_clean = clean_data(train)
print()
test_clean = clean_data(test)

Initial df shape: (42962, 366)
Variables with missing values...
Your selected dataframe has 366 columns.
There are 273 columns that have missing values.
	Dropped 5 variables
Highly correlated variables...
	Dropped 67 variables
Constant variables
	Dropped 0 variables
Final df shape: (42962, 297)

Initial df shape: (42833, 365)
Variables with missing values...
Your selected dataframe has 365 columns.
There are 273 columns that have missing values.
	Dropped 5 variables
Highly correlated variables...
	Dropped 68 variables
Constant variables
	Dropped 0 variables
Final df shape: (42833, 296)
CPU times: user 26.9 s, sys: 526 ms, total: 27.5 s
Wall time: 27.5 s


In [7]:
final_vars = list(set(test_clean.columns).intersection(set(train_clean.columns)))
test_clean = test_clean[final_vars]
final_vars.extend(['RESPONSE'])
train_clean = train_clean[final_vars]

train_clean.shape, test_clean.shape

((42962, 295), (42833, 294))

In [8]:
X_train = train_clean.drop('RESPONSE', axis=1)
y_train = train_clean.RESPONSE

# Baseline model

In [9]:
classifier_pipe = create_pipeline(X_train, RandomForestClassifier(random_state = SEED))

In [10]:
scores = cross_val_score(classifier_pipe, X_train, y_train, cv=StratifiedKFold(5), scoring = 'roc_auc')
scores.mean()

0.6119295816113446

# Improvements

## Best variables
----
Select the most important variables for the Random Forest trained for customers data.

In [11]:
X_train[RF_vars]

Unnamed: 0_level_0,D19_SOZIALES,D19_KONSUMTYP,ALTERSKATEGORIE_GROB,CJT_TYP_5,CJT_TYP_6,RT_SCHNAEPPCHEN,ALTERSKATEGORIE_FEIN,LP_STATUS_FEIN,CJT_TYP_1,HH_EINKOMMEN_SCORE,CJT_TYP_3,LP_FAMILIE_FEIN,RT_KEIN_ANREIZ,AKT_DAT_KL,GFK_URLAUBERTYP,FINANZ_MINIMALIST,EINGEZOGENAM_HH_JAHR,AGER_TYP,ANZ_HAUSHALTE_AKTIV,CJT_GESAMTTYP
LNR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
1763,1.0,3.0,4,5.0,5.0,5.0,8.0,3.0,2.0,6.0,5.0,1.0,2.0,1.0,8.0,3,2004.0,2,15.0,2.0
1771,5.0,1.0,3,5.0,4.0,1.0,13.0,9.0,2.0,1.0,4.0,2.0,3.0,4.0,8.0,5,1994.0,1,1.0,2.0
1776,2.0,2.0,4,5.0,5.0,5.0,7.0,10.0,1.0,1.0,5.0,0.0,1.0,1.0,3.0,5,1997.0,1,0.0,4.0
1460,1.0,3.0,4,5.0,4.0,5.0,6.0,3.0,2.0,4.0,5.0,2.0,2.0,1.0,5.0,4,1994.0,2,4.0,2.0
1783,1.0,2.0,3,5.0,5.0,5.0,9.0,6.0,1.0,4.0,5.0,1.0,1.0,1.0,4.0,3,1994.0,2,53.0,6.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
66338,3.0,1.0,4,5.0,4.0,2.0,10.0,10.0,1.0,1.0,5.0,11.0,1.0,1.0,10.0,5,2010.0,2,1.0,3.0
67629,1.0,2.0,3,4.0,5.0,5.0,14.0,9.0,2.0,4.0,4.0,10.0,1.0,1.0,10.0,5,2001.0,-1,1.0,3.0
68273,1.0,2.0,4,5.0,5.0,5.0,10.0,9.0,1.0,4.0,5.0,10.0,4.0,1.0,6.0,5,1994.0,1,2.0,5.0
68581,0.0,4.0,4,3.0,5.0,2.0,13.0,1.0,3.0,5.0,4.0,10.0,4.0,1.0,2.0,2,1994.0,2,3.0,5.0


In [12]:
classifier_pipe = create_pipeline(X_train[RF_vars], RandomForestClassifier(random_state = SEED))

scores = cross_val_score(classifier_pipe, X_train, y_train, cv=StratifiedKFold(3), scoring = 'roc_auc')
scores.mean()

0.6645869426598469

Quite an improvement!

## Tuning hyperparameters

### Regular tuning

In [13]:
n_estimators = list(range(20,100,40))
max_depth = list(range(2,20,5))
max_features = list(range(2,20,5))

In [14]:
parameters = dict(classifier__n_estimators=n_estimators, 
                  classifier__max_depth=max_depth,
                  classifier__max_features=max_features)

In [15]:
%%time

classifier_pipe = create_pipeline(X_train[RF_vars], RandomForestClassifier(random_state = SEED))

# Create a grid search object
clf = GridSearchCV(classifier_pipe, parameters)

# Fit the grid search
clf.fit(X_train, y_train)

CPU times: user 2min 28s, sys: 2.24 s, total: 2min 30s
Wall time: 2min 30s


GridSearchCV(cv=None, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('preprocess',
                                        ColumnTransformer(n_jobs=None,
                                                          remainder='drop',
                                                          sparse_threshold=0.3,
                                                          transformer_weights=None,
                                                          transformers=[('cat',
                                                                         Pipeline(memory=None,
                                                                                  steps=[('imputer',
                                                                                          SimpleImputer(add_indicator=False,
                                                                                                        copy=True,
                                   

In [16]:
# View The Best Parameters
print('Best n_estimators:', clf.best_estimator_.get_params()['classifier__n_estimators'])
print('Best max_depth:', clf.best_estimator_.get_params()['classifier__max_depth'])
print('Best max_features:', clf.best_estimator_.get_params()['classifier__max_features'])
print('Best min_samples_split:', clf.best_estimator_.get_params()['classifier__min_samples_split'])

Best n_estimators: 20
Best max_depth: 2
Best max_features: 2
Best min_samples_split: 2


In [17]:
# Fit the grid search using 3-Fold cross validation
scores = cross_val_score(clf, X_train, y_train, cv=StratifiedKFold(3), scoring = 'roc_auc')
scores.mean()

0.7184531143623717

### Randomized

In [18]:
n_estimators = list(range(10,150,10))
max_depth = list(range(1,20,3))
max_features = list(range(1,20,3))
min_samples_split = [2,3,4,5]
# class_weight = ['balanced', 'balanced_subsample']

In [19]:
parameters = dict(classifier__n_estimators=n_estimators, 
                  classifier__max_depth=max_depth,
                  classifier__max_features=max_features,
                 classifier__min_samples_split=min_samples_split)
                
#                  classifier__class_weight=class_weight

In [20]:
%%time

classifier_pipe = create_pipeline(X_train[RF_vars], RandomForestClassifier(random_state = SEED))

# Create a grid search object
clf = RandomizedSearchCV(classifier_pipe, parameters, n_iter = 10, random_state = SEED)

# Fit the grid search
clf.fit(X_train, y_train)

CPU times: user 1min 21s, sys: 795 ms, total: 1min 22s
Wall time: 1min 22s


RandomizedSearchCV(cv=None, error_score=nan,
                   estimator=Pipeline(memory=None,
                                      steps=[('preprocess',
                                              ColumnTransformer(n_jobs=None,
                                                                remainder='drop',
                                                                sparse_threshold=0.3,
                                                                transformer_weights=None,
                                                                transformers=[('cat',
                                                                               Pipeline(memory=None,
                                                                                        steps=[('imputer',
                                                                                                SimpleImputer(add_indicator=False,
                                                                                    

In [21]:
# View The Best Parameters
print('Best n_estimators:', clf.best_estimator_.get_params()['classifier__n_estimators'])
print('Best max_depth:', clf.best_estimator_.get_params()['classifier__max_depth'])
print('Best max_features:', clf.best_estimator_.get_params()['classifier__max_features'])
print('Best min_samples_split:', clf.best_estimator_.get_params()['classifier__min_samples_split'])

Best n_estimators: 40
Best max_depth: 19
Best max_features: 13
Best min_samples_split: 4


In [22]:
# Fit the grid search using 3-Fold cross validation
scores = cross_val_score(clf, X_train, y_train, cv=StratifiedKFold(5), scoring = 'roc_auc')
scores.mean()

0.7041720858149552

## Pipeline with PCA

In [23]:
classifier_pipe_pca = create_pipeline_PCA(X_train, PCA(80), RandomForestClassifier(random_state = SEED))

In [24]:
scores = cross_val_score(classifier_pipe_pca, X_train, y_train, cv=5, scoring = 'roc_auc')
scores.mean()

0.5503777244194417

# Lightgbm

--------

Let's try with Lightgbm to see if we can improve the score.

In [25]:
lgb_pipe = create_pipeline(X_train, 
                                  lgb.LGBMClassifier(n_estimators = 15,
                                                     class_weight = 'balanced', 
                                                     random_state = SEED))


In [26]:
scores = cross_val_score(lgb_pipe, X_train, y_train, cv=5, scoring = 'roc_auc')
scores.mean()

0.7450951649886234

In [27]:
lgb_pipe = create_pipeline(X_train[RF_vars], 
                                  lgb.LGBMClassifier(n_estimators = 15,
                                                     class_weight = 'balanced', 
                                                     random_state = SEED))


In [28]:
scores = cross_val_score(lgb_pipe, X_train, y_train, cv=5, scoring = 'roc_auc')
scores.mean()

0.7574310616803978

## Hyperparameter tuning

In [29]:
n_estimators = list(range(10,150,10))
max_depth = list(range(1,20,3))
max_features = list(range(1,20,3))
num_leaves = [2,3,4,5]

In [30]:
parameters = dict(classifier__n_estimators=n_estimators, 
                  classifier__max_depth=max_depth,
                  classifier__max_features=max_features,
                  classifier__num_leaves=num_leaves)

In [31]:
%%time

lgb_pipe = create_pipeline(X_train[RF_vars], lgb.LGBMClassifier(random_state = SEED))

# Create a grid search object
clf = RandomizedSearchCV(lgb_pipe, parameters, n_iter = 20, random_state = SEED, scoring = 'roc_auc', refit = 'roc_auc')

# Fit the grid search
clf.fit(X_train, y_train)

CPU times: user 4min 58s, sys: 9.87 s, total: 5min 8s
Wall time: 27.1 s


RandomizedSearchCV(cv=None, error_score=nan,
                   estimator=Pipeline(memory=None,
                                      steps=[('preprocess',
                                              ColumnTransformer(n_jobs=None,
                                                                remainder='drop',
                                                                sparse_threshold=0.3,
                                                                transformer_weights=None,
                                                                transformers=[('cat',
                                                                               Pipeline(memory=None,
                                                                                        steps=[('imputer',
                                                                                                SimpleImputer(add_indicator=False,
                                                                                    

In [32]:
# View The Best Parameters
print('Best n_estimators:', clf.best_estimator_.get_params()['classifier__n_estimators'])
print('Best max_depth:', clf.best_estimator_.get_params()['classifier__max_depth'])
print('Best max_features:', clf.best_estimator_.get_params()['classifier__max_features'])
print('Best min_samplnum_leaveses_split:', clf.best_estimator_.get_params()['classifier__num_leaves'])

Best n_estimators: 30
Best max_depth: 19
Best max_features: 1
Best min_samplnum_leaveses_split: 4


In [33]:
# Fit the grid search using 5-Fold cross validation
scores = cross_val_score(clf, X_train, y_train, cv=StratifiedKFold(5), scoring = 'roc_auc')
scores.mean()

0.7693842333403556

In [34]:
# Fit the grid search using 5-Fold cross validation
scores = cross_val_score(clf.best_estimator_, X_train, y_train, cv=StratifiedKFold(5), scoring = 'roc_auc')
scores.mean()

0.7710602707092367

## Prepare submission

In [35]:
# select model and train with training data
final_model_1 = clf.best_estimator_
final_model_1.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('preprocess',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('cat',
                                                  Pipeline(memory=None,
                                                           steps=[('imputer',
                                                                   SimpleImputer(add_indicator=False,
                                                                                 copy=True,
                                                                                 fill_value='missing',
                                                                                 missing_values=nan,
                                                                                 strategy='constant',
                                                        

In [36]:
test.LNR

0         1754
1         1770
2         1465
3         1470
4         1478
         ...  
42828    67615
42829    67938
42830    67942
42831    67949
42832    68898
Name: LNR, Length: 42833, dtype: int32

In [37]:
create_submission(test = test_clean, model = final_model_1, filename = 'submission_lgb_1.csv', index = test.LNR)

Submission stored in data/submission_lgb_1.csv


## Oversampling

----

As the data is highly imbalaced, let's use a oversampling technique to see if the results improve

In [39]:
train.RESPONSE.value_counts()

0    42430
1      532
Name: RESPONSE, dtype: int64

In [41]:
sm_pipe = create_pipeline_smote(X_train[RF_vars], lgb.LGBMClassifier(random_state = SEED), 1)

In [42]:
# Fit the grid search using 5-Fold cross validation
scores = cross_val_score(sm_pipe, X_train, y_train, cv=StratifiedKFold(5), scoring = 'roc_auc')
scores.mean()

0.7413126812163533

In [43]:
sm_pipe.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('preprocessing',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('cat',
                                                  Pipeline(memory=None,
                                                           steps=[('imputer',
                                                                   SimpleImputer(add_indicator=False,
                                                                                 copy=True,
                                                                                 fill_value='missing',
                                                                                 missing_values=nan,
                                                                                 strategy='constant',
                                                     

In [44]:
create_submission(test_clean, sm_pipe, filename = 'submission_lgb_imb.csv', index = test.LNR)

Submission stored in data/submission_lgb_imb.csv


## Interaction terms
----
Try to compute interaction terms for some of the most important variables. If the score improves, then keep those terms.

In [45]:
interaction_vars = RF_vars[:10]
interaction_vars

['D19_SOZIALES',
 'D19_KONSUMTYP',
 'ALTERSKATEGORIE_GROB',
 'CJT_TYP_5',
 'CJT_TYP_6',
 'RT_SCHNAEPPCHEN',
 'ALTERSKATEGORIE_FEIN',
 'LP_STATUS_FEIN',
 'CJT_TYP_1',
 'HH_EINKOMMEN_SCORE']

In [48]:
interactions_df = determine_interactions(X_train, y_train, interaction_vars = interaction_vars)

X_train_fi = create_fi(X_train, var_list = RF_vars, 
                       feats_a = interactions_df.feature_A, feats_b = interactions_df.feature_B)

Baseline: 0.7510151627544001

Top 10 interactions
       feature_A           feature_B  metric
0   D19_SOZIALES  HH_EINKOMMEN_SCORE   0.761
2   D19_SOZIALES       D19_KONSUMTYP   0.760
3  D19_KONSUMTYP      LP_STATUS_FEIN   0.757
1   D19_SOZIALES     RT_SCHNAEPPCHEN   0.756


NameError: name 'RF_vars' is not defined

In [None]:
%%time

lgb_pipe_fi = create_pipeline(X_train_fi, lgb.LGBMClassifier(random_state = SEED))

# Create a grid search object
clf_fi = RandomizedSearchCV(lgb_pipe_fi, parameters, n_iter = 10, random_state = SEED, scoring = 'roc_auc', refit = 'roc_auc')

# Fit the grid search
clf_fi.fit(X_train_fi, y_train)


# View The Best Parameters
print('Best n_estimators:', clf_fi.best_estimator_.get_params()['classifier__n_estimators'])
print('Best max_depth:', clf_fi.best_estimator_.get_params()['classifier__max_depth'])
print('Best max_features:', clf_fi.best_estimator_.get_params()['classifier__max_features'])
print('Best num_leaveses_split:', clf_fi.best_estimator_.get_params()['classifier__num_leaves'])
print()

In [None]:
# Fit the grid search using 5-Fold cross validation
scores = cross_val_score(clf_fi.best_estimator_, X_train_fi, y_train, cv=StratifiedKFold(5), scoring = 'roc_auc')
scores.mean()

In [None]:
final_model_fi = clf_fi.best_estimator_
final_model_fi.fit(X_train_fi, y_train)

In [None]:
test_fi = create_fi(test_clean, var_list = RF_vars, 
                       feats_a = interactions_df.feature_A, feats_b = interactions_df.feature_B)

In [None]:
create_submission(test_fi, final_model_fi, filename = 'submission_lgb_fi.csv', index = test.LNR)