In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")

# PreProcessing
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder,RobustScaler,MinMaxScaler
import category_encoders as ce
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

# Splitting Data
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score

# Resampling
from imblearn.over_sampling import RandomOverSampler, SMOTE
from imblearn.under_sampling import RandomUnderSampler, NearMiss
from imblearn.combine import SMOTETomek
from imblearn.pipeline import Pipeline

# Modeling, Fitting and Evaluation
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, precision_score, roc_auc_score, plot_roc_curve,recall_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from datetime import datetime
import datetime
from sklearn import metrics

# Boosting
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier
from xgboost.sklearn import XGBClassifier

#feature Selection
from sklearn.feature_selection import SelectPercentile, RFE

#saving
import pickle


# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        file_path = os.path.join(dirname, filename)
        print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Path of the raw marketing data inside the Kaggle input directory.
DATA_PATH = '/kaggle/input/marketing-data/marketing_data.csv'
data = pd.read_csv(DATA_PATH)

In [None]:
# Most successful campaign: count, per campaign column, how many rows hold 0 and 1.
campaign_columns = ['Response', 'AcceptedCmp1', 'AcceptedCmp2',
                    'AcceptedCmp3', 'AcceptedCmp4', 'AcceptedCmp5']
campaign = data[campaign_columns].melt()
campaign = pd.crosstab(campaign['variable'], campaign['value']).sort_values(0)

# Put the "1" column first, then label the two columns for the legend.
campaign = campaign[[1, 0]]
campaign.columns = ['Yes', 'No']

ax = campaign.plot.bar(stacked=True)
ax.set_title('Acceptance of Marketing Campaigns')
ax.set_xlabel('Campaign')
ax.set_ylabel('Acceptance')
ax.legend(title='Response', loc='upper right')
plt.savefig('Campaign.png')
plt.show()

**Analysis**

The company wants to run a campaign whose goal is to increase the number of member customers. The campaign method is based on the last campaign, which was more successful than the previous ones. Customer response will be predicted from the customer profile and the time since the customer's last purchase (Recency; -1 if the customer has never made a purchase).

* *0 = No response*
* *1 = yes*

        - TN: Customers predicted not to respond to the campaign who in fact do not respond
        - TP: Customers predicted to respond to the campaign who in fact do respond
        - FP: Customers predicted to respond to the campaign who in fact do not respond
        - FN: Customers predicted not to respond to the campaign who in fact do respond

Errors that can occur:
* FN: Wrong prediction; the company only loses a prospective customer, with no direct financial loss.
* FP: The company loses more (time, energy and money), because it prepared the campaign for people who then did not respond.

**So the most costly mistake in terms of financial loss is FP.**

**The selected evaluation metric is precision, because maximizing it keeps the number of false positives (FP) low.**

In [None]:
# Print the distinct values of every column.
for column_name, column_values in data.items():
    print(column_name, '\n', column_values.unique(), '\n')

**Data Cleansing**

In [None]:
# Build the Customer_Age column: year of the enrolment date minus year of birth.
enrolment_dates = pd.to_datetime(data['Dt_Customer'])
data['Dt_Customer'] = enrolment_dates
data['Customer_Age'] = enrolment_dates.dt.year - data['Year_Birth']

In [None]:
# Replace the customer's join date with the number of days since they joined.
# The reference date (last calculation date, 28 February 2021) is built from
# explicit components: the string '28/2/21' is ambiguous between day-first and
# month-first conventions and depends on the parser's guess.
todayy = pd.Timestamp(year=2021, month=2, day=28)
data['Dt_Customer'] = (todayy - data['Dt_Customer']).dt.days

In [None]:
# Rename the income column (the raw header is padded with spaces) and convert its
# values to float after removing the '$' and ',' characters.
data.rename(columns={' Income ':'Income'},inplace=True)
# regex=True is passed explicitly: '[$,]' is a character class, and pandas >= 2.0
# treats the pattern as a literal string by default, which would leave the symbols
# in place and make the float conversion fail.
data['Income'] = data['Income'].str.replace(r'[$,]', '', regex=True).astype(float)

In [None]:
# Collapse the Marital_Status categories into 'Single', 'Together' and 'Other'.
marital_status_groups = {
    'Widow': 'Single',
    'Divorced': 'Single',
    'Alone': 'Single',
    'Married': 'Together',
    'Absurd': 'Other',
    'YOLO': 'Other',
}
data['Marital_Status'] = data['Marital_Status'].replace(marital_status_groups)

**Preprocessing**

preprocessing scheme:
>     one hot : education,marital_status
>     binary : country
>     drop : ID, Year_Birth, AcceptedCmp3, AcceptedCmp4, AcceptedCmp5, AcceptedCmp1, AcceptedCmp2, Complain, Dt_Customer (because they have no correlation with the response when the goal is to find new customers)

In [None]:
# Income: fill missing values with the column mean, then apply robust scaling.
mean_scale = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='mean')),
    ('scaling', RobustScaler()),
])

# Column-wise preprocessing; columns not listed below are passed through unchanged.
transformer = ColumnTransformer(
    transformers=[
        ('impute', mean_scale, ['Income']),
        ('encoder', OneHotEncoder(handle_unknown='ignore'), ['Education', 'Marital_Status']),
        ('binary', ce.BinaryEncoder(), ['Country']),
        ('scale', RobustScaler(), ['Customer_Age', 'Recency']),
    ],
    remainder='passthrough',
)

# Columns that are not used as model features.
unused_columns = [
    'MntWines', 'MntFruits', 'MntMeatProducts', 'MntFishProducts',
    'MntSweetProducts', 'MntGoldProds',
    'NumDealsPurchases', 'NumWebPurchases', 'NumCatalogPurchases',
    'NumStorePurchases', 'NumWebVisitsMonth',
    'Complain', 'Dt_Customer', 'ID', 'Year_Birth',
    'AcceptedCmp3', 'AcceptedCmp4', 'AcceptedCmp5', 'AcceptedCmp1', 'AcceptedCmp2',
]
data = data.drop(columns=unused_columns)

In [None]:
# Separate the features (X) from the target column (y).
target_column = 'Response'
X = data.drop(columns=[target_column])
y = data[target_column]

In [None]:
# Check the transform.
# Fit on the feature matrix X rather than the full frame: `data` still contains the
# target column 'Response', which remainder='passthrough' would copy into the output.
transformer.fit_transform(X)

# **Data Splitting**

In [None]:
# Stratified 70/30 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=2020,
)

# **Model Benchmark**

*Check Data Balance*

In [None]:
# Share of each Response value as a percentage of all rows.
data['Response'].value_counts().div(data.shape[0]).mul(100)

The data is imbalanced, so if the precision score is low we can try balancing the dataset before choosing the best model.

In [None]:
# Baseline classifiers; the tree-based models get a fixed seed.
seed = 2020
logreg, knn = LogisticRegression(), KNeighborsClassifier()
tree = DecisionTreeClassifier(random_state=seed)
rf = RandomForestClassifier(random_state=seed)

In [None]:
# One pipeline per baseline classifier: shared preprocessing followed by the model.
# NOTE(review): every pipeline references the same `transformer` object, so fitting
# one pipeline refits the transformer the others also point to.
logreg_pipe = Pipeline(steps=[('transform', transformer), ('logreg', logreg)])
tree_pipe = Pipeline(steps=[('transform', transformer), ('tree', tree)])
knn_pipe = Pipeline(steps=[('transform', transformer), ('knn', knn)])
rf_pipe = Pipeline(steps=[('transform', transformer), ('rf', rf)])

In [None]:
def model_evaluation(model, metric):
    """Cross-validate ``model`` on the training split.

    Runs 5-fold stratified cross-validation on the notebook-level
    ``X_train`` / ``y_train`` and returns the per-fold scores for the
    scorer named by ``metric``.
    """
    return cross_val_score(
        model,
        X_train,
        y_train,
        cv=StratifiedKFold(n_splits=5),
        scoring=metric,
    )

# Cross-validated precision of each baseline pipeline on the training split.
baseline_pipes = [logreg_pipe, tree_pipe, knn_pipe, rf_pipe]
logreg_pipe_cv, tree_pipe_cv, knn_pipe_cv, rf_pipe_cv = (
    model_evaluation(pipe, 'precision') for pipe in baseline_pipes
)
baseline_cv = [logreg_pipe_cv, tree_pipe_cv, knn_pipe_cv, rf_pipe_cv]

# Fit every pipeline on the full training split before scoring on the test split.
for model in baseline_pipes:
    model.fit(X_train, y_train)

score_mean = [scores.mean() for scores in baseline_cv]
score_std = [scores.std() for scores in baseline_cv]
score_precision_score = [
    precision_score(y_test, pipe.predict(X_test)) for pipe in baseline_pipes
]
method_name = [
    'Logistic Regression',
    'Decision Tree Classifier',
    'KNN Classifier',
    'Random Forest Classifier',
]
cv_result = pd.DataFrame({
    'method': method_name,
    'mean score': score_mean,
    'std score': score_std,
    'precision score': score_precision_score,
})
cv_result

Next, I will try handling the imbalanced dataset.

# **Handling Imbalance Dataset**

*Random Under Sampling*

In [None]:
# Random under-sampler with a fixed seed.
rus = RandomUnderSampler(random_state=2020)
# Resampled copy of the training split. The pipelines in this section do not use
# X_under / y_under; they apply `rus` themselves as a pipeline step.
under_sampled = rus.fit_resample(X_train, y_train)
X_under, y_under = under_sampled

In [None]:
# Same classifiers as the benchmark, with random under-sampling applied between
# preprocessing and the model.
logreg_pipe_under = Pipeline(steps=[('transformer', transformer), ('rus', rus), ('logreg', logreg)])
tree_pipe_under = Pipeline(steps=[('transformer', transformer), ('rus', rus), ('tree', tree)])
knn_pipe_under = Pipeline(steps=[('transformer', transformer), ('rus', rus), ('knn', knn)])
rf_pipe_under = Pipeline(steps=[('transformer', transformer), ('rus', rus), ('rf', rf)])

In [None]:
# model_evaluation is already defined in the Model Benchmark section with an
# identical body, so it is reused here instead of being redefined.

# Cross-validated precision of each under-sampling pipeline on the training split.
under_pipes = [logreg_pipe_under, tree_pipe_under, knn_pipe_under, rf_pipe_under]
logreg_under_cv, tree_under_cv, knn_under_cv, rf_under_cv = (
    model_evaluation(pipe, 'precision') for pipe in under_pipes
)
under_cv = [logreg_under_cv, tree_under_cv, knn_under_cv, rf_under_cv]

# Fit every pipeline on the full training split before scoring on the test split.
for model in under_pipes:
    model.fit(X_train, y_train)

score_mean = [scores.mean() for scores in under_cv]
score_std = [scores.std() for scores in under_cv]
score_precision_score = [
    precision_score(y_test, pipe.predict(X_test)) for pipe in under_pipes
]
method_name = ['Logistic Regression UnderSampling', 'Decision Tree Classifier UnderSampling',
              'KNN Classifier UnderSampling', 'Random Forest Classifier UnderSampling']
under_result = pd.DataFrame({
    'method': method_name,
    'mean score': score_mean,
    'std score': score_std,
    'precision score': score_precision_score
})
under_result

*Random Over Sampling*

In [None]:
# Random over-sampler with a fixed seed.
ros = RandomOverSampler(random_state=2020)
# Resampled copy of the training split. The pipelines in this section do not use
# X_over / y_over; they apply `ros` themselves as a pipeline step.
over_sampled = ros.fit_resample(X_train, y_train)
X_over, y_over = over_sampled

In [None]:
# Same classifiers as the benchmark, with random over-sampling applied between
# preprocessing and the model.
logreg_pipe_over = Pipeline(steps=[('transformer', transformer), ('ros', ros), ('logreg', logreg)])
tree_pipe_over = Pipeline(steps=[('transformer', transformer), ('ros', ros), ('tree', tree)])
knn_pipe_over = Pipeline(steps=[('transformer', transformer), ('ros', ros), ('knn', knn)])
rf_pipe_over = Pipeline(steps=[('transformer', transformer), ('ros', ros), ('rf', rf)])

In [None]:
# model_evaluation is already defined in the Model Benchmark section with an
# identical body, so it is reused here instead of being redefined.

# Cross-validated precision of each over-sampling pipeline on the training split.
over_pipes = [logreg_pipe_over, tree_pipe_over, knn_pipe_over, rf_pipe_over]
logreg_over_cv, tree_over_cv, knn_over_cv, rf_over_cv = (
    model_evaluation(pipe, 'precision') for pipe in over_pipes
)
over_cv = [logreg_over_cv, tree_over_cv, knn_over_cv, rf_over_cv]

# Fit every pipeline on the full training split before scoring on the test split.
for model in over_pipes:
    model.fit(X_train, y_train)

score_mean = [scores.mean() for scores in over_cv]
score_std = [scores.std() for scores in over_cv]
score_precision_score = [
    precision_score(y_test, pipe.predict(X_test)) for pipe in over_pipes
]
method_name = ['Logistic Regression OverSampling', 'Decision Tree Classifier OverSampling',
              'KNN Classifier OverSampling', 'Random Forest Classifier OverSampling']
over_summary = pd.DataFrame({
    'method': method_name,
    'mean score': score_mean,
    'std score': score_std,
    'precision score': score_precision_score
})
over_summary

*NearMiss*

In [None]:
# NearMiss sampler (version 1), used as the resampling step of the pipelines below.
nm = NearMiss(version = 1)

In [None]:
# Same classifiers as the benchmark, with NearMiss resampling applied between
# preprocessing and the model.
logreg_pipe_nm = Pipeline(steps=[('transformer', transformer), ('nm', nm), ('logreg', logreg)])
tree_pipe_nm = Pipeline(steps=[('transformer', transformer), ('nm', nm), ('tree', tree)])
knn_pipe_nm = Pipeline(steps=[('transformer', transformer), ('nm', nm), ('knn', knn)])
rf_pipe_nm = Pipeline(steps=[('transformer', transformer), ('nm', nm), ('rf', rf)])

In [None]:
# model_evaluation is already defined in the Model Benchmark section with an
# identical body, so it is reused here instead of being redefined.

# Cross-validated precision of each NearMiss pipeline on the training split.
nm_pipes = [logreg_pipe_nm, tree_pipe_nm, knn_pipe_nm, rf_pipe_nm]
logreg_nm_cv, tree_nm_cv, knn_nm_cv, rf_nm_cv = (
    model_evaluation(pipe, 'precision') for pipe in nm_pipes
)
nm_cv = [logreg_nm_cv, tree_nm_cv, knn_nm_cv, rf_nm_cv]

# Fit every pipeline on the full training split before scoring on the test split.
for model in nm_pipes:
    model.fit(X_train, y_train)

score_mean = [scores.mean() for scores in nm_cv]
score_std = [scores.std() for scores in nm_cv]
score_precision_score = [
    precision_score(y_test, pipe.predict(X_test)) for pipe in nm_pipes
]
method_name = ['Logistic Regression NearMiss', 'Decision Tree Classifier NearMiss',
              'KNN Classifier NearMiss', 'Random Forest Classifier NearMiss']
nm_summary = pd.DataFrame({
    'method': method_name,
    'mean score': score_mean,
    'std score': score_std,
    'precision score': score_precision_score
})
nm_summary

In [None]:
# Summary of the three resampling strategies, stacked row-wise.
balancing_tables = [under_result, over_summary, nm_summary]
resume_balancing = pd.concat(balancing_tables)
resume_balancing

**After balancing the dataset the precision score decreases, so the model is used without balancing the dataset. Based on the summary above, the stable models are KNN and Logistic Regression. Next, boosting models will be tried.**

# **BOOSTING**

In [None]:
# Settings shared by the three boosting models.
boost_params = dict(n_estimators=50, learning_rate=0.1, random_state=2020)

# AdaBoost built on the decision tree defined earlier in the notebook.
adaboost = AdaBoostClassifier(tree, **boost_params)
pipe_ada = Pipeline(steps=[('transformer', transformer), ('adaboost', adaboost)])

gradboost = GradientBoostingClassifier(max_depth=3, **boost_params)
pipe_grad = Pipeline(steps=[('transformer', transformer), ('gradboost', gradboost)])

XGBOOST = XGBClassifier(max_depth=3, **boost_params)
pipe_XGB = Pipeline(steps=[('transformer', transformer), ('XGBOOST', XGBOOST)])

In [None]:
def model_evaluation(model, metric):
    """Cross-validate ``model`` on the training split using all CPU cores.

    Runs 5-fold stratified cross-validation on the notebook-level
    ``X_train`` / ``y_train`` with ``n_jobs=-1`` and returns the per-fold
    scores for the scorer named by ``metric``.
    """
    return cross_val_score(
        model,
        X_train,
        y_train,
        cv=StratifiedKFold(n_splits=5),
        scoring=metric,
        n_jobs=-1,
    )

# Cross-validated precision of each boosting pipeline on the training split.
boost_pipes = [pipe_ada, pipe_grad, pipe_XGB]
pipe_ada_cv, pipe_grad_cv, pipe_XGB_cv = (
    model_evaluation(pipe, 'precision') for pipe in boost_pipes
)
boost_cv = [pipe_ada_cv, pipe_grad_cv, pipe_XGB_cv]

# Fit every pipeline on the full training split before scoring on the test split.
for model in boost_pipes:
    model.fit(X_train, y_train)

score_mean = [scores.mean() for scores in boost_cv]
score_std = [scores.std() for scores in boost_cv]
score_precision_score = [
    precision_score(y_test, pipe.predict(X_test)) for pipe in boost_pipes
]
method_name = ['Ada Boost Classifier', 'Gradient Boost Classifier', 'XGB Classifier']
boost_summary = pd.DataFrame({
    'method': method_name,
    'mean score': score_mean,
    'std score': score_std,
    'precision score': score_precision_score,
})
boost_summary

In [None]:
# Benchmark and boosting results, stacked row-wise.
model_tables = [cv_result, boost_summary]
resume_model = pd.concat(model_tables)
resume_model

In [None]:
# Check the classification report and confusion matrix for logistic regression.
ypred = logreg_pipe.fit(X_train, y_train).predict(X_test)
print(
    classification_report(y_test, ypred),
    metrics.confusion_matrix(y_test, ypred),
    sep='\n',
)

In [None]:
# Check the classification report and confusion matrix for XGBoost.
ypred = pipe_XGB.fit(X_train, y_train).predict(X_test)
print(
    classification_report(y_test, ypred),
    metrics.confusion_matrix(y_test, ypred),
    sep='\n',
)

**Based on the summary, we choose Logistic Regression because it has the highest precision score and its precision is balanced between classes 1 and 0. It will then proceed to hyperparameter tuning.**

# Hyperparameter Tuning


In [None]:
# Fresh logistic regression wrapped with the shared preprocessing for tuning.
logreg = LogisticRegression()
estimator = Pipeline(steps=[('transformer', transformer), ('model', logreg)])

# Grid over the regularisation strength C and the solver of the 'model' step.
hyperparam_space = dict(
    model__C=[100, 10, 1, 0.1, 0.01, 0.001],
    model__solver=['liblinear', 'newton-cg'],
)

In [None]:
# Exhaustive search over hyperparam_space, scored by precision with 5-fold
# stratified cross-validation on all CPU cores.
grid_search = GridSearchCV(
    estimator=estimator,
    param_grid=hyperparam_space,
    scoring='precision',
    cv=StratifiedKFold(n_splits=5),
    n_jobs=-1,
)

In [None]:
# Run the grid search on the training split.
grid_search.fit(X=X_train, y=y_train)

In [None]:
# Report the best cross-validated score and the parameters that produced it.
print(f'best score {grid_search.best_score_}')
print(f'best param {grid_search.best_params_}')

In [None]:
# Test-set precision of the untuned logistic regression pipeline.
y_pred_estimator = logreg_pipe.fit(X_train, y_train).predict(X_test)
precision_estimator = precision_score(y_test, y_pred_estimator)

# Test-set precision of the best pipeline found by the grid search.
tuned_pipe = grid_search.best_estimator_
y_pred_grid = tuned_pipe.fit(X_train, y_train).predict(X_test)
precision_best_estimator = precision_score(y_test, y_pred_grid)

# Side-by-side comparison of precision before and after tuning.
method_name = ['Logistic Regression Before', 'Logistic Regression After']
score_list = [precision_estimator, precision_best_estimator]
best_summary = pd.DataFrame({'method': method_name, 'score': score_list})
best_summary

**The precision score before tuning is better than after tuning, so the chosen model is Logistic Regression without tuning.**