# Arvato Customer segmentation and Classification

In this notebook we will work on the following task:
- Logistic Regression
- Decision Tree
- Random Forest
- AdaBoostClassifier
- GradientBoostingClassifier
- XGBoost
- LGBM

In [22]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import plotly.graph_objects as go
from plotly.subplots import make_subplots
from plotly.offline import init_notebook_mode, iplot

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingRegressor, AdaBoostRegressor
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
import xgboost as xgb
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix
from sklearn.model_selection import train_test_split, cross_val_score
import lightgbm as lgb
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/arvato-cleaned/Customers_cleaned.csv
/kaggle/input/arvato-cleaned/Azdias_cleaned.csv
/kaggle/input/arvato-cleaned/Customer_Additional_cleaned.csv
/kaggle/input/arvato/Udacity_AZDIAS_052018.csv
/kaggle/input/arvato/Udacity_MAILOUT_052018_TRAIN.csv
/kaggle/input/arvato/DIAS Attributes - Values 2017.xlsx
/kaggle/input/arvato/Udacity_MAILOUT_052018_TEST.csv
/kaggle/input/arvato/Udacity_CUSTOMERS_052018.csv
/kaggle/input/arvato/DIAS Information Levels - Attributes 2017.xlsx
/kaggle/input/attribute/attribute_cleaned.csv


In [2]:
!pip install openpyxl

Collecting openpyxl
  Downloading openpyxl-3.0.6-py2.py3-none-any.whl (242 kB)
[K     |████████████████████████████████| 242 kB 868 kB/s eta 0:00:01
[?25hCollecting jdcal
  Downloading jdcal-1.4.1-py2.py3-none-any.whl (9.5 kB)
Collecting et-xmlfile
  Downloading et_xmlfile-1.0.1.tar.gz (8.4 kB)
Building wheels for collected packages: et-xmlfile
  Building wheel for et-xmlfile (setup.py) ... [?25ldone
[?25h  Created wheel for et-xmlfile: filename=et_xmlfile-1.0.1-py3-none-any.whl size=8913 sha256=fc7f9f8de9b3c202bd21c6e829a455dc707359e4777790da4fe00a30bff93f04
  Stored in directory: /root/.cache/pip/wheels/e2/bd/55/048b4fd505716c4c298f42ee02dffd9496bb6d212b266c7f31
Successfully built et-xmlfile
Installing collected packages: jdcal, et-xmlfile, openpyxl
Successfully installed et-xmlfile-1.0.1 jdcal-1.4.1 openpyxl-3.0.6


In [3]:
# Reading the data that we cleaned earlier
df_azdias = pd.read_csv('../input/arvato-cleaned/Azdias_cleaned.csv')
df_customers = pd.read_csv('../input/arvato-cleaned/Customers_cleaned.csv')

# attribute: contains data about columns description
ignore_unamed_cols = lambda x:'Unnamed' not in x
attribute = pd.read_excel('../input/arvato/DIAS Attributes - Values 2017.xlsx',header=1,usecols=ignore_unamed_cols
                         ,engine='openpyxl')


# Reading train and test data
df_mailout_train = pd.read_csv('../input/arvato/Udacity_MAILOUT_052018_TRAIN.csv',sep=';')
df_mailout_test = pd.read_csv('../input/arvato/Udacity_MAILOUT_052018_TEST.csv',sep=';')


Columns (18,19) have mixed types.Specify dtype option on import or set low_memory=False.



In [4]:
attribute.head()

Unnamed: 0,Attribute,Description,Value,Meaning
0,AGER_TYP,best-ager typology,-1,unknown
1,,,0,no classification possible
2,,,1,passive elderly
3,,,2,cultural elderly
4,,,3,experience-driven elderly


In [7]:
# We will first scale the data
scaler = StandardScaler()
scaler.fit(df_azdias)
df_azdias = pd.DataFrame(scaler.transform(df_azdias), columns = df_azdias.columns)
df_customers = pd.DataFrame(scaler.transform(df_customers), columns = df_customers.columns)

In [6]:
# The below class is used for preprocessing the data

class DataProcessing:
    def column_fill(self,df,column_name):
        '''
        input
        df: dataframe
        column_name: column that need to be filled
        
        output
        df:df filled with values for missing cells
        '''
        df[column_name] = df[column_name].ffill()
        return df
    
    def replace_with_nan(self,df,cols):
        '''
        Input
        df: Dataframe
        cols: columns
        replacing column values having 'X'or 'XX' with nan
        output: df
        '''
        df[cols] = df[cols].replace({"X": np.nan, "XX": np.nan})
        df[cols] = df[cols].astype(float)
        return df
    
    def replace_zero_nan(self,df,cols):
        '''
        Input
        df: Dataframe
        cols: columns
        replacing 0 with nan
        Output
        df
        '''
        df[cols] = df[cols].replace({0: np.nan})
        df[cols] = df[cols].astype(float)
        return df
    
    def convert_to_date(self,df,cols):
        '''
        Input
        df: dataframe
        cols:columns
        desc: convert to date
        Output
        df
        '''
        df[cols] = pd.to_datetime(df[cols])
        df[cols] = df[cols].map(lambda x: x.year)
        return df
    
    

    def get_unknown_repr(self,attrib, unknown_attributes_values):
        '''
        Input
        attrib: dataframe
        unknown_attribute_values: unknown values for attributes
        
        Output
        Returns a list of unknown values
        '''
        unknown = unknown_attributes_values[unknown_attributes_values["Attribute"] == attrib]["Value"]
        unknown = unknown.astype(str).str.cat(sep=",")
        unknown = [int(x) for x in unknown.split(",")]

        return [unknown]
    
    def replace_unknown_with_nan(self,val, unknown):
        '''
        Input
        val:values
        unknown: list of unknown values
        Output
        return nan values in case of unknown values
        '''
        if val in unknown:
            return np.nan
        else:
            return val
        
    def replace_unknowns(self,df, unknown_attributes_values, verbose=False):
        '''
        Input
        df: dataframe
        Output
        Replaces unknown values to 'np.nan' in all the columns provided in unknown_attributes_values list.
        '''
        for attrib in unknown_attributes_values.Attribute:
            unknown = self.get_unknown_repr(attrib, unknown_attributes_values)
            if verbose:
                print("Replacing {} to NaN in Attribute {}".format(unknown, attrib))
            if attrib in df.columns:
                df[attrib] = df[attrib].apply(self.replace_unknown_with_nan, args=(unknown))
        return df
    
    def get_missing_report(self,df):
        '''
        Input
        df: dataframe
        Output
        returns a dataframe with information about column-wise missing values percentages.
        '''
        missing_percen = df.isna().sum() * 100/ len(df)

        missing_percen_df = pd.DataFrame({"Attribute": df.columns,
                                         "Missing_Percentage": missing_percen}).reset_index(drop=True)
        return missing_percen_df
    
    def remove_columns(self,df, remove_cols):
        '''
        Input
        df: dataframe
        remove_cols: column list
        Drops given list of columns from df
        Output
        df:dataframe
        '''
        df = df.drop(remove_cols, axis = 1)
        return df
    
    def remove_missing_columns(self,df1, df2, df1_missing, df2_missing, threshold=30):
        '''
        Input
        df1: dataframe
        df2: dataframe
        df1: dataframe containing columns having missing values above a certain threshold
        df2: dataframe containing columns having missing values above a certain threshold
        Output
        Drops columns from df1 and df2 with given threshold.
        Uses df1_missing and df2_missing to determing which columns to remove.
        If df1_missing has more missing columns (missing_percentage > threshold),
        then df1_missing is taken as reference and vice versa.
        '''

        removable_cols1 = df1_missing[df1_missing.Missing_Percentage > threshold]
        removable_cols2 = df2_missing[df2_missing.Missing_Percentage > threshold]

        if len(removable_cols1) > len(removable_cols2):
            remove_cols = removable_cols1.Attribute.tolist()
        else:
            remove_cols = removable_cols2.Attribute.tolist()

        df1 = self.remove_columns(df1, remove_cols)
        df2 = self.remove_columns(df2, remove_cols)
        print(f"\t\tRemoved {len(remove_cols)} columns from given dataframes")

        return (df1, df2, remove_cols)
    
    def remove_missing_rows(self,df, threshold, name=""):
        '''
        Input
        df: dataframe
        threshold: threshold on number of missing features
        Output
        Drops rows with number of missing features 
        as per given threshold.
        '''
        total_rows = df.shape[0]

        df = df.dropna(thresh=df.shape[1]-threshold)

        removed_rows = total_rows - df.shape[0]

        print(f"\t\tRemoved {removed_rows} rows from {name} dataframe")

        # Reset index
        df = df.reset_index()
        del df['index']

        return df
    
    def fix_ost_west_col(self,df):
        '''
        Function to label encode the feature "OST_WEST_KZ"
        '''
        df["OST_WEST_KZ"] = df["OST_WEST_KZ"].replace({"W": 0, "O": 1})

        return df
    
    def fix_anrede_col(self,df):
        '''
        Input
        df:dataframe
        Output
        Returns df with label encoding of the feature "ANREDE_KZ"
        '''
        df["ANREDE_KZ"] = df["ANREDE_KZ"].replace({1: 0, 2: 1})

        return df
    
    def fix_cameo_intl_col(self,df):
        '''
        Input
        df: dataframe
        Output
        Returns df with  additional columns containing information from 'CAMEO_INTL_2015'
        '''
        df['CAMEO_INTL_2015_WEALTH'] = df['CAMEO_INTL_2015'].apply(lambda x: np.floor_divide(float(x), 10) if float(x) else np.nan)
        df['CAMEO_INTL_2015_FAMILY'] = df['CAMEO_INTL_2015'].apply(lambda x: np.mod(float(x), 10) if float(x) else np.nan)

        df.drop("CAMEO_INTL_2015", axis=1, inplace=True)
        return df
    
    def fix_wohnlage_col(self,df):
        '''
        Input
        df: dataframe
        Output
        Returns df after replacing '0' with np.nan from "WOHNLAGE" 
        '''
        df["WOHNLAGE"] = df["WOHNLAGE"].replace({0: np.nan})

        return df
    
    def impute_values(self,df,strategy="most_frequent"):
        '''
        Input
        df: dataframe
        strategy: imutation strategy
        Output
        Returns df after imputing values
        '''
        imputer = SimpleImputer(strategy=strategy)
        df = pd.DataFrame(imputer.fit_transform(df), columns = df.columns)
        return df
    
    
    def map_lp(self,df):
        '''
        Input
        df: dataframe
        Output
        Returns df after fixing the LP* columns as LP columns contains lot of redundant information
        '''
        convert_1 = {1: 'single', 2: 'couple', 3: 'singleparent', 4: 'singleparent', 5: 'singleparent', 
                  6: 'family', 7: 'family', 8: 'family', 9: 'multihousehold', 10: 'multihousehold', 11: 'multihousehold'}
        convert_2 = {'single':0, 'couple':1, 'singleparent':2, 'family':3, 'multihousehold':4}
        df["LP_FAMILIE_GROB"] = df["LP_FAMILIE_GROB"].map(convert_1)
        df["LP_FAMILIE_GROB"] = df["LP_FAMILIE_GROB"].map(convert_2)
    
        # LP_STATUS_GROB    
        convert_1 = {1: 'lowincome', 2: 'lowincome', 3: 'avgincome', 4: 'avgincome', 5: 'avgincome', 
                      6: 'independant', 7: 'independant', 8: 'houseowner', 9: 'houseowner', 10: 'topearner'}
        convert_2 = {'lowincome':0, 'avgincome':1, 'independant':2, 'houseowner':3, 'topearner':4}

        df["LP_STATUS_GROB"] = df["LP_STATUS_GROB"].map(convert_1)
        df["LP_STATUS_GROB"] = df["LP_STATUS_GROB"].map(convert_2)


        # LP_LEBENSPHASE_FEIN
        life_stages = {1: 'younger_age', 2: 'middle_age', 3: 'younger_age',
                  4: 'middle_age', 5: 'advanced_age', 6: 'retirement_age',
                  7: 'advanced_age', 8: 'retirement_age', 9: 'middle_age',
                  10: 'middle_age', 11: 'advanced_age', 12: 'retirement_age',
                  13: 'advanced_age', 14: 'younger_age', 15: 'advanced_age',
                  16: 'advanced_age', 17: 'middle_age', 18: 'younger_age',
                  19: 'advanced_age', 20: 'advanced_age', 21: 'middle_age',
                  22: 'middle_age', 23: 'middle_age', 24: 'middle_age',
                  25: 'middle_age', 26: 'middle_age', 27: 'middle_age',
                  28: 'middle_age', 29: 'younger_age', 30: 'younger_age',
                  31: 'advanced_age', 32: 'advanced_age', 33: 'younger_age',
                  34: 'younger_age', 35: 'younger_age', 36: 'advanced_age',
                  37: 'advanced_age', 38: 'retirement_age', 39: 'middle_age',
                  40: 'retirement_age'}

        wealth_scale = {1: 'low', 2: 'low', 3: 'average', 4: 'average', 5: 'low', 6: 'low',
                  7: 'average', 8: 'average', 9: 'average', 10: 'wealthy', 11: 'average',
                  12: 'average', 13: 'top', 14: 'average', 15: 'low', 16: 'average',
                  17: 'average', 18: 'wealthy', 19: 'wealthy', 20: 'top', 21: 'low',
                  22: 'average', 23: 'wealthy', 24: 'low', 25: 'average', 26: 'average',
                  27: 'average', 28: 'top', 29: 'low', 30: 'average', 31: 'low',
                  32: 'average', 33: 'average', 34: 'average', 35: 'top', 36: 'average',
                  37: 'average', 38: 'average', 39: 'top', 40: 'top'}

        df["Temp"] = df["LP_LEBENSPHASE_FEIN"]

        df["LP_LEBENSPHASE_FEIN"] = df["LP_LEBENSPHASE_FEIN"].map(life_stages)
        df["LP_LEBENSPHASE_GROB"] = df["Temp"].map(wealth_scale)

        life_stages = {'younger_age': 1, 'middle_age': 2, 'advanced_age': 3,
                'retirement_age': 4}
        wealth_scale = {'low': 1, 'average': 2, 'wealthy': 3, 'top': 4}

        df["LP_LEBENSPHASE_FEIN"] = df["LP_LEBENSPHASE_FEIN"].map(life_stages)
        df["LP_LEBENSPHASE_GROB"] = df["LP_LEBENSPHASE_GROB"].map(wealth_scale)
        return df

In [8]:
data_process = DataProcessing()

Let's see the RESPONSE label distribution across classes

In [9]:
df_label_count = df_mailout_train['RESPONSE'].value_counts()
df_label_count = df_label_count.reset_index()
df_label_count.columns = ['label','count']
trace1 = go.Bar(x = df_label_count['label'],y = df_label_count['count'],marker=dict(color='#ffdc51'),name='')
layout = go.Layout(title = "Distribution of binary labels"
                   ,xaxis=dict(title="Labels"),
                   yaxis=dict(title="Number of data points"))
fig = go.Figure(data=[trace1],layout=layout)
iplot(fig)


## Evaluation metric
As we can see the class labels are highly imbalanced, so  The usual metric used for imbalanced classification are Precision and Recall or Area under Receiver Operating Curve (AUROC).

In [10]:
# Fixing Attributes
attribute =data_process.column_fill(attribute,'Attribute')

We will do the same data processing that we did on azdias and customers dataframe.

In [11]:
# This is a utility function to perform data processing
def clean_data(azdias,attribute,df):
    '''
    Input
    azdias: population demographic df
    attribute: column description df
    df: dataframe that needs cleaning
    Output
    df after performing data cleaning
    '''
    warn_cols = list(df.columns[18:20])
    df = data_process.replace_with_nan(df,warn_cols)
    cols = ["LP_FAMILIE_FEIN", "LP_FAMILIE_GROB", "LP_LEBENSPHASE_FEIN",
           "LP_LEBENSPHASE_GROB", "LP_STATUS_FEIN", "LP_STATUS_GROB"]
    
    # Fixing LP columns
    df = data_process.replace_zero_nan(df,cols)
    
    # Fixing EINGEFUEGT_AM column
    df = data_process.convert_to_date(df,'EINGEFUEGT_AM')
    
    unknown_attribute_values = attribute[attribute["Meaning"] == "unknown"]
    
    df = data_process.replace_unknowns(df, unknown_attribute_values)
    
    df = data_process.fix_ost_west_col(df)
    df = data_process.fix_anrede_col(df)
    df = data_process.fix_cameo_intl_col(df)
    df = data_process.fix_wohnlage_col(df)
    
    remove_cols = [col for col in df.columns if col not in azdias.columns]
    df = data_process.remove_columns(df, remove_cols)
    df = data_process.impute_values(df)
    return df

In [12]:
labels = df_mailout_train["RESPONSE"]

In [13]:
mailout_train_LNR = df_mailout_train["LNR"]

In [14]:
# Data cleaning on training data
df_mailout_train = clean_data(df_azdias,attribute,df_mailout_train)

In [15]:
df_mailout_train.shape

(42962, 352)

In [16]:
# Scaling the training data
scaler = StandardScaler()
df_mailout_train = pd.DataFrame(scaler.fit_transform(df_mailout_train), columns = df_mailout_train.columns)

In [19]:
random_seed = 22

In [18]:
# creating train and validation splits
X_train, X_val, y_train, y_val = train_test_split(df_mailout_train, labels, stratify=labels, test_size=0.2, random_state=randome_seed)

In [33]:
# Let's clean and scale the testing data also 
mailout_test_LNR = df_mailout_test["LNR"]
df_mailout_test = clean_data(df_azdias,attribute,df_mailout_test)
df_mailout_test = pd.DataFrame(scaler.transform(df_mailout_test), columns = df_mailout_test.columns)

In [20]:
def train_model(model, X_train, y_train, X_test, y_test):
    '''
    Input
    model: ML model
    X_train,y_train: training data
    X_test,y_test: validation data
    Output
    Model is trained on training data
    and 
    accuracy(AUROC score) on validation data and training and validation time is returned 
    '''
    start = time.time()
    model = model.fit(X_train, y_train)
    
    roc_score = roc_auc_score(y_test, model.predict_proba(X_test)[:,1])
    
    end = time.time()
    time_elapsed = end - start
    
    return roc_score, time_elapsed

In [24]:
models = [("LogisticRegression", LogisticRegression(random_state=random_seed)),
          ("Naive Bayes", GaussianNB()),
         ("DecisionTreeClassifier", DecisionTreeClassifier(random_state=random_seed)),
         ("RandomForestClassifier", RandomForestClassifier(random_state=random_seed)),
         ("GradientBoostingClassifier", GradientBoostingClassifier(random_state=random_seed)),
         ("AdaBoostClassifier", AdaBoostClassifier(random_state=random_seed)),
         ("LGBMClassifier",lgb.LGBMClassifier(random_state=random_seed)),
         ("XGBClassifier",xgb.XGBClassifier(random_state=random_seed))]

In [25]:
import time

In [26]:
results = {"Model":[],
          "AUCROC_score":[],
          "Time_in_sec":[]}

for name, model in models:
    roc, time_ = train_model(model, X_train, y_train, X_val, y_val)
    results["Model"].append(name)
    results["AUCROC_score"].append(roc)
    results["Time_in_sec"].append(time_)


lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression







In [27]:
results = pd.DataFrame.from_dict(results, orient='index').transpose()
results

Unnamed: 0,Model,AUCROC_score,Time_in_sec
0,LogisticRegression,0.662067,3.745256
1,Naive Bayes,0.57564,0.228377
2,DecisionTreeClassifier,0.502246,2.85884
3,RandomForestClassifier,0.640842,10.675977
4,GradientBoostingClassifier,0.784496,56.422587
5,AdaBoostClassifier,0.751232,12.363351
6,LGBMClassifier,0.72055,3.823862
7,XGBClassifier,0.694666,19.61293


Now Let's train and tune the models having top 4 best accuracy scores

## Adaboost

In [29]:
# Adaboost
param_grid = {"n_estimators": [20,50,60],
              "learning_rate": [0.01,0.1,0.5,0.9,1.],
              "algorithm":["SAMME.R"]
              }

adaboost_grid = GridSearchCV(estimator = AdaBoostClassifier(random_state=random_seed), 
                           param_grid = param_grid, 
                           scoring = "roc_auc", 
                           cv = 5, n_jobs = -1, verbose=2)

In [30]:
adaboost_grid.fit(X_train, y_train)

best_adaboost = adaboost_grid.best_estimator_

print("Best Score: ", adaboost_grid.best_score_)
print("Best Params: ", adaboost_grid.best_params_)

Fitting 5 folds for each of 15 candidates, totalling 75 fits
Best Score:  0.759819560231783
Best Params:  {'algorithm': 'SAMME.R', 'learning_rate': 0.1, 'n_estimators': 50}


In [31]:
preds_adaboost = best_adaboost.predict_proba(X_val)[:,1]
print("ROC score on validation data: {:.4f}".format(roc_auc_score(y_val, preds_adaboost)))

ROC score on validation data: 0.7797


Kaggle submission for adaboost

In [35]:
preds_test_adaboost = best_adaboost.predict_proba(df_mailout_test)[:,1]

In [36]:
kaggle_adaboost = pd.DataFrame(index=mailout_test_LNR, data=preds_test_adaboost)
kaggle_adaboost.rename(columns={0: "RESPONSE"}, inplace=True)

In [37]:
kaggle_adaboost.to_csv("submission_adaboost.csv")

## GradientBoosting Classifier


In [38]:
gradient_boost_model = GradientBoostingClassifier(random_state=random_seed)

In [45]:
gradient_boost_model.fit(X_train, y_train)
print(gradient_boost_model.get_params())

{'ccp_alpha': 0.0, 'criterion': 'friedman_mse', 'init': None, 'learning_rate': 0.1, 'loss': 'deviance', 'max_depth': 3, 'max_features': None, 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_impurity_split': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 100, 'n_iter_no_change': None, 'random_state': 22, 'subsample': 1.0, 'tol': 0.0001, 'validation_fraction': 0.1, 'verbose': 0, 'warm_start': False}


In [46]:
preds_gradient_boost_model = gradient_boost_model.predict_proba(X_val)[:,1]
print("ROC score on validation data: {:.4f}".format(roc_auc_score(y_val, preds_gradient_boost_model)))

ROC score on validation data: 0.7845


Kaggle submission for gradient boosting machine

In [47]:
preds_test_gradient_boost_model = gradient_boost_model.predict_proba(df_mailout_test)[:,1]

In [48]:
kaggle_gradient_boost_model = pd.DataFrame(index=mailout_test_LNR, data=preds_test_gradient_boost_model)
kaggle_gradient_boost_model.rename(columns={0: "RESPONSE"}, inplace=True)

In [49]:
kaggle_gradient_boost_model.to_csv("submission_gradient_boost.csv")

## LGBMClassifier

In [59]:
lgb_param_grid = {"max_depth": [5,10,20],
              "learning_rate": [0.01,0.1,0.5,1.],
              "gamma":[0.1,0.5,1.0],
              "n_estimators":[50,100,150]
              }

lgb_grid = GridSearchCV(estimator = lgb.LGBMClassifier(objective="binary", 
                                                       boosting_type='gbdt',
                                                            n_jobs=-1, eval_metric="auc",
                                                            silent=1,random_state=random_seed), 
                           param_grid = lgb_param_grid, 
                           scoring = "roc_auc",
                           cv = 3, n_jobs = -1, verbose=2)

In [61]:
lgb_grid.fit(X_train,y_train)
best_lgb = lgb_grid.best_estimator_

print("Best Score: ", lgb_grid.best_score_)
print("Best Params: ", lgb_grid.best_params_)

Best Score:  0.7562544306286826
Best Params:  {'gamma': 0.1, 'learning_rate': 0.01, 'max_depth': 20, 'n_estimators': 50}


In [62]:
preds_lgb = best_lgb.predict_proba(X_val)[:,1]
print("ROC score on validation data: {:.4f}".format(roc_auc_score(y_val, preds_lgb)))

ROC score on validation data: 0.7751


Kaggle submission for lgbm

In [63]:
preds_test_lgb = best_lgb.predict_proba(df_mailout_test)[:,1]

In [64]:
kaggle_lgb = pd.DataFrame(index=mailout_test_LNR, data=preds_test_lgb)
kaggle_lgb.rename(columns={0: "RESPONSE"}, inplace=True)

In [65]:
kaggle_lgb.to_csv("submission_lgb.csv")

## XGBoost

In [68]:
# XGB classifier
xgb_param_grid = {"max_depth": [10,20,30],
              "learning_rate": [0.01],
              "gamma":[0.1],
              "n_estimators":[50,100]
              }

xgb_grid = GridSearchCV(estimator = xgb.XGBClassifier(objective="binary:logistic", 
                                                            n_jobs=-1, eval_metric="auc",
                                                            silent=1,random_state=random_seed), 
                           param_grid = xgb_param_grid, 
                           scoring = "roc_auc",
                           cv = 3, n_jobs = -1, verbose=2)


In [69]:
xgb_grid.fit(X_train,y_train)
best_xgb = xgb_grid.best_estimator_

Fitting 3 folds for each of 6 candidates, totalling 18 fits






Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




In [70]:
print("Best Score: ", xgb_grid.best_score_)
print("Best Params: ", xgb_grid.best_params_)

Best Score:  0.7612332805327325
Best Params:  {'gamma': 0.1, 'learning_rate': 0.01, 'max_depth': 10, 'n_estimators': 100}


Kaggle submission for xgboost

In [71]:
preds_test_xgb_grid = best_xgb.predict_proba(df_mailout_test)[:,1]
kaggle_xgb_grid = pd.DataFrame(index=mailout_test_LNR, data=preds_test_xgb_grid)
kaggle_xgb_grid.rename(columns={0: "RESPONSE"}, inplace=True)

In [72]:
kaggle_xgb_grid.to_csv("submission_xgb.csv")