In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")

# PreProcessing
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder,RobustScaler,MinMaxScaler
import category_encoders as ce
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

# Splitting Data
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score

# Resampling
from imblearn.over_sampling import RandomOverSampler, SMOTE
from imblearn.under_sampling import RandomUnderSampler, NearMiss
from imblearn.combine import SMOTETomek
from imblearn.pipeline import Pipeline

# Modeling, Fitting and Evaluation
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, precision_score, roc_auc_score, plot_roc_curve,recall_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from datetime import datetime
import datetime
from sklearn import metrics

# Boosting
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier
from xgboost.sklearn import XGBClassifier

#feature Selection
from sklearn.feature_selection import SelectPercentile, RFE

#clustering
from scipy.spatial.distance import cdist,pdist
from sklearn.metrics import silhouette_score
from sklearn.datasets import make_blobs
from sklearn.cluster import KMeans

#saving
import joblib
import pickle


In [2]:
data = pd.read_csv('marketing_data.csv')

In [3]:
data

Unnamed: 0,ID,Year_Birth,Education,Marital_Status,Income,Kidhome,Teenhome,Dt_Customer,Recency,MntWines,...,NumStorePurchases,NumWebVisitsMonth,AcceptedCmp3,AcceptedCmp4,AcceptedCmp5,AcceptedCmp1,AcceptedCmp2,Response,Complain,Country
0,1826,1970,Graduation,Divorced,"$84,835.00",0,0,6/16/14,0,189,...,6,1,0,0,0,0,0,1,0,SP
1,1,1961,Graduation,Single,"$57,091.00",0,0,6/15/14,0,464,...,7,5,0,0,0,0,1,1,0,CA
2,10476,1958,Graduation,Married,"$67,267.00",0,1,5/13/14,0,134,...,5,2,0,0,0,0,0,0,0,US
3,1386,1967,Graduation,Together,"$32,474.00",1,1,5/11/14,0,10,...,2,7,0,0,0,0,0,0,0,AUS
4,5371,1989,Graduation,Single,"$21,474.00",1,0,4/8/14,0,6,...,2,7,1,0,0,0,0,1,0,SP
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2235,10142,1976,PhD,Divorced,"$66,476.00",0,1,3/7/13,99,372,...,11,4,0,0,0,0,0,0,0,US
2236,5263,1977,2n Cycle,Married,"$31,056.00",1,0,1/22/13,99,5,...,3,8,0,0,0,0,0,0,0,SP
2237,22,1976,Graduation,Divorced,"$46,310.00",1,0,12/3/12,99,185,...,5,8,0,0,0,0,0,0,0,SP
2238,528,1978,Graduation,Married,"$65,819.00",0,0,11/29/12,99,267,...,10,3,0,0,0,0,0,0,0,IND


In [4]:
#unique dari setiap column
for i in data.columns:
    result = data[i].unique()
    print (i,'\n',result,'\n')

ID 
 [ 1826     1 10476 ...    22   528  4070] 

Year_Birth 
 [1970 1961 1958 1967 1989 1954 1947 1979 1959 1981 1969 1977 1960 1966
 1976 1965 1956 1975 1971 1986 1972 1974 1990 1987 1984 1968 1955 1983
 1973 1978 1952 1962 1964 1982 1963 1957 1980 1945 1949 1948 1953 1946
 1985 1992 1944 1951 1988 1950 1994 1993 1991 1893 1996 1995 1899 1943
 1941 1940 1900] 

Education 
 ['Graduation' 'PhD' '2n Cycle' 'Master' 'Basic'] 

Marital_Status 
 ['Divorced' 'Single' 'Married' 'Together' 'Widow' 'YOLO' 'Alone' 'Absurd'] 

 Income  
 ['$84,835.00 ' '$57,091.00 ' '$67,267.00 ' ... '$46,310.00 ' '$65,819.00 '
 '$94,871.00 '] 

Kidhome 
 [0 1 2] 

Teenhome 
 [0 1 2] 

Dt_Customer 
 ['6/16/14' '6/15/14' '5/13/14' '5/11/14' '4/8/14' '3/17/14' '1/29/14'
 '1/18/14' '1/11/14' '12/27/13' '12/9/13' '12/7/13' '10/16/13' '10/5/13'
 '9/11/13' '8/1/13' '7/23/13' '7/1/13' '5/28/13' '3/26/13' '3/15/13'
 '2/12/13' '11/23/12' '10/13/12' '9/14/12' '6/29/14' '5/31/14' '5/30/14'
 '4/27/14' '4/11/14' '10/29/13' '1

**Data Cleansing**

In [5]:
#menambahkan column umur berdasarkan pengurangan thun bergabung dengan tahun lahir
data['Dt_Customer']= pd.to_datetime(data['Dt_Customer'])
data['Customer_Age'] = data['Dt_Customer'].dt.year - data['Year_Birth']

In [6]:
#merubah tanggal cust bergabung menjadi berapa lama cust telah bergabung 
todayy = pd.Timestamp('28/2/21') #tanggal perhitungan terakhir
data['Dt_Customer'] = (todayy - data['Dt_Customer']).dt.days

In [7]:
#rename col income untuk memudahkan dlm pemanggilan dan merubah mjd numeric
data.rename(columns={' Income ':'Income'},inplace=True)
data['Income']=data['Income'].str.replace('[$,]','').astype(float)

In [8]:
data['Dt_Customer']

0       2449
1       2450
2       2483
3       2485
4       2518
        ... 
2235    2915
2236    2959
2237    3009
2238    3013
2239    3102
Name: Dt_Customer, Length: 2240, dtype: int64

In [9]:
data['Marital_Status'] = data['Marital_Status'].replace(['Widow','Divorced','Alone'],'Single')
data['Marital_Status'] = data['Marital_Status'].replace(['Married'],'Together')
data['Marital_Status'] = data['Marital_Status'].replace(['Absurd','YOLO'],'Other')

In [10]:
data['Marital_Status'].unique()

array(['Single', 'Together', 'Other'], dtype=object)

*Missing Value*

In [11]:
data.isna().sum()

ID                      0
Year_Birth              0
Education               0
Marital_Status          0
Income                 24
Kidhome                 0
Teenhome                0
Dt_Customer             0
Recency                 0
MntWines                0
MntFruits               0
MntMeatProducts         0
MntFishProducts         0
MntSweetProducts        0
MntGoldProds            0
NumDealsPurchases       0
NumWebPurchases         0
NumCatalogPurchases     0
NumStorePurchases       0
NumWebVisitsMonth       0
AcceptedCmp3            0
AcceptedCmp4            0
AcceptedCmp5            0
AcceptedCmp1            0
AcceptedCmp2            0
Response                0
Complain                0
Country                 0
Customer_Age            0
dtype: int64

Pengisian missing value dimasukkan kedalam column transformer

*Outlier*

In [12]:
q1 = data.quantile(0.25)
q3 = data.quantile(0.75)
IQR = q3-q1

In [13]:
((data < (q1-1.5*IQR)) | (data > (q3+1.5*IQR))).sum()/len(data)*100

AcceptedCmp1            6.428571
AcceptedCmp2            1.339286
AcceptedCmp3            7.276786
AcceptedCmp4            7.455357
AcceptedCmp5            7.276786
Complain                0.937500
Country                 0.000000
Customer_Age            0.133929
Dt_Customer             0.000000
Education               0.000000
ID                      0.000000
Income                  0.357143
Kidhome                 0.000000
Marital_Status          0.000000
MntFishProducts         9.955357
MntFruits              10.133929
MntGoldProds            9.241071
MntMeatProducts         7.812500
MntSweetProducts       11.071429
MntWines                1.562500
NumCatalogPurchases     1.026786
NumDealsPurchases       3.839286
NumStorePurchases       0.000000
NumWebPurchases         0.178571
NumWebVisitsMonth       0.357143
Recency                 0.000000
Response               14.910714
Teenhome                0.000000
Year_Birth              0.133929
dtype: float64

Terdapat nilai outlier diatas 10% di beberapa column, jadi jika menggunakan scalling maka akan menggunakan Robust Scaller

**Preprocessing**

In [14]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2240 entries, 0 to 2239
Data columns (total 29 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   ID                   2240 non-null   int64  
 1   Year_Birth           2240 non-null   int64  
 2   Education            2240 non-null   object 
 3   Marital_Status       2240 non-null   object 
 4   Income               2216 non-null   float64
 5   Kidhome              2240 non-null   int64  
 6   Teenhome             2240 non-null   int64  
 7   Dt_Customer          2240 non-null   int64  
 8   Recency              2240 non-null   int64  
 9   MntWines             2240 non-null   int64  
 10  MntFruits            2240 non-null   int64  
 11  MntMeatProducts      2240 non-null   int64  
 12  MntFishProducts      2240 non-null   int64  
 13  MntSweetProducts     2240 non-null   int64  
 14  MntGoldProds         2240 non-null   int64  
 15  NumDealsPurchases    2240 non-null   i

In [15]:
mean_scale = Pipeline([
    ('impute', SimpleImputer(strategy = 'mean')),
    ('scaling', RobustScaler()),
])

transformer = ColumnTransformer([
    ('impute',mean_scale,['Income']),
    ('encoder',OneHotEncoder(handle_unknown='ignore'),['Education','Marital_Status']),
    ('binary',ce.BinaryEncoder(),['Country']),
    ('scale',RobustScaler(),['Customer_Age','Recency'])
],remainder='passthrough')

data=data.drop(['MntWines','MntFruits','MntMeatProducts','MntFishProducts','MntSweetProducts','MntGoldProds','NumDealsPurchases','NumWebPurchases','NumCatalogPurchases','NumStorePurchases','NumWebVisitsMonth','Complain','Dt_Customer','ID','Year_Birth','AcceptedCmp3','AcceptedCmp4','AcceptedCmp5','AcceptedCmp1','AcceptedCmp2'],axis=1)

In [16]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2240 entries, 0 to 2239
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Education       2240 non-null   object 
 1   Marital_Status  2240 non-null   object 
 2   Income          2216 non-null   float64
 3   Kidhome         2240 non-null   int64  
 4   Teenhome        2240 non-null   int64  
 5   Recency         2240 non-null   int64  
 6   Response        2240 non-null   int64  
 7   Country         2240 non-null   object 
 8   Customer_Age    2240 non-null   int64  
dtypes: float64(1), int64(5), object(3)
memory usage: 157.6+ KB


In [17]:
X=data.drop(['Response'],axis=1)
y=data['Response']

In [18]:
#cek transform
transformer.fit_transform(data)

array([[ 1.0104577 ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  1.        ],
       [ 0.16333852,  0.        ,  0.        , ...,  0.        ,
         0.        ,  1.        ],
       [ 0.47404659,  0.        ,  0.        , ...,  0.        ,
         1.        ,  0.        ],
       ...,
       [-0.16584226,  0.        ,  0.        , ...,  1.        ,
         0.        ,  0.        ],
       [ 0.4298342 ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 1.31689109,  0.        ,  0.        , ...,  0.        ,
         2.        ,  1.        ]])

**Data Splitting**

In [19]:
X.shape

(2240, 8)

In [20]:
X_train,X_test,y_train,y_test=train_test_split(X,y,stratify=y,test_size=0.3,random_state=2020)

**Analysis**

1. Company ingin melakukan Campaign dengan tujuan menarik konsumen yang belum menjadi member/Customer tetap. Dengan meningkatnya jumlah customer member maka akan memudahkan company dalam hal branding serta evaluasi berdasarkan saran-saran dari customer. Campaign yang dilakukan berdasarkan dengan metode campaign terakhir yang memiliki kesuksesan dibandingkan campaign sebelumnya. dan Respon konsumen akan di prediksi berdasarkan profil serta kapan terakhir konsumen tersebut membeli product kita(Recency, jika blm pernah beli maka (-1))

*Customer = Konsumen yang telah menjadi member*

* *0 = No respon*
* *1 = yes*

        - TN: Konsumen yang diprediksi tidak akan merespon campaign, actualnya memang tidak merespon
        - TP: Konsumen yang diprediksi akan merespon campaign, actualnya memang merespon
        - FP: Konsumen yang diprediksi akan merespon campaign, actualnya tidak merespon
        - FN: Konsumen yang diprediksi tidak merespon campaign, actualnya merespon

2 Kesalahan yang terjadi:
* FN: Salah prediksi, company hanya kehilangan calon customer, tetapi tidak rugi financial
* FP: company lebih rugi waktu, tenaga dan financial, karena telah menyiapkan segala sesuatu untuk campaign ke orang tsb, namun ternyata tidak ada respon.

**Jadi kesalahan yang paling berpengaruh untuk kerugian financial adalah FP**

**Metric evaluasi yang dipilih adalah Precision karena akan menekan nilai FP, Nilai FP dan Precision berbanding terbalik**

# **Model BenchMark**

*Cek Balancing Data*

In [21]:
data['Response'].value_counts()/data.shape[0]*100

0    85.089286
1    14.910714
Name: Response, dtype: float64

data imbalance, jd ketika nilai masih rendah bisa dilkukan balancing untuk memilih model terbaik

In [22]:
logreg = LogisticRegression()
tree = DecisionTreeClassifier(random_state = 2020)
knn = KNeighborsClassifier()
rf = RandomForestClassifier(random_state = 2020)

In [23]:
logreg_pipe = Pipeline([
    ('transform',transformer),
    ('logreg',logreg)
])

tree_pipe= Pipeline([
    ('transform',transformer),
    ('tree',tree)
])

knn_pipe =Pipeline([
    ('transform',transformer),
    ('knn',knn)
])

rf_pipe = Pipeline([
    ('transform',transformer),
    ('rf',rf)
])

In [24]:
def model_evaluation(model, metric):
    skfold = StratifiedKFold(n_splits = 5)
    model_cv = cross_val_score(model, X_train, y_train, cv = skfold, scoring = metric)
    return model_cv

logreg_pipe_cv = model_evaluation(logreg_pipe, 'precision')
tree_pipe_cv = model_evaluation(tree_pipe, 'precision')
knn_pipe_cv = model_evaluation(knn_pipe, 'precision')
rf_pipe_cv = model_evaluation(rf_pipe, 'precision')

for model in [logreg_pipe,tree_pipe, knn_pipe,rf_pipe]:
    model.fit(X_train, y_train)
    
score_mean = [logreg_pipe_cv.mean(),tree_pipe_cv.mean(),knn_pipe_cv.mean(),rf_pipe_cv.mean()]
score_std = [logreg_pipe_cv.std(),tree_pipe_cv.std(),knn_pipe_cv.std(),rf_pipe_cv.std()]
score_precision_score = [precision_score(y_test, logreg_pipe.predict(X_test)),
            precision_score(y_test, tree_pipe.predict(X_test)),
            precision_score(y_test, knn_pipe.predict(X_test)),
            precision_score(y_test, rf_pipe.predict(X_test))]
method_name = ['Logistic Regression','Decision Tree Classifier','KNN Classifier', 'Random Forest Classifier']
cv_result = pd.DataFrame({
    'method': method_name,
    'mean score': score_mean,
    'std score': score_std,
    'precision score': score_precision_score
})
cv_result

Unnamed: 0,method,mean score,std score,precision score
0,Logistic Regression,0.528766,0.199787,0.823529
1,Decision Tree Classifier,0.252752,0.064072,0.407407
2,KNN Classifier,0.369554,0.057294,0.466667
3,Random Forest Classifier,0.513016,0.09092,0.724138


In [25]:
#cek confusion matrix dr model terbaik
logreg_pipe.fit(X_train, y_train)
ypred=logreg_pipe.predict(X_test)
print(classification_report(y_test,ypred))
print(metrics.confusion_matrix(y_test,ypred))

              precision    recall  f1-score   support

           0       0.87      0.99      0.93       572
           1       0.82      0.14      0.24       100

    accuracy                           0.87       672
   macro avg       0.85      0.57      0.58       672
weighted avg       0.86      0.87      0.83       672

[[569   3]
 [ 86  14]]


**Nilai Precision dan meanscore terbaik pada model Logistic Regression, namun akan dicoba untuk Handling Imbalance terlebih dahulu untuk mendapatkan model dengan nilai stabil**

# **Handling Imbalance Dataset**

*Random Under Sampling*

In [26]:
rus = RandomUnderSampler(random_state = 2020)
X_under, y_under = rus.fit_resample(X_train, y_train) 

In [27]:
logreg_pipe_under = Pipeline([
    ('transformer', transformer),
    ('rus', rus),
    ('logreg', logreg)
])

tree_pipe_under = Pipeline([
    ('transformer', transformer),
    ('rus', rus),
    ('tree', tree)
])

knn_pipe_under = Pipeline([
    ('transformer', transformer),
    ('rus', rus),
    ('knn', knn)
])

rf_pipe_under = Pipeline([
    ('transformer', transformer),
    ('rus', rus),
    ('rf', rf)
])

In [28]:
def model_evaluation(model, metric):
    skfold = StratifiedKFold(n_splits = 5)
    model_cv = cross_val_score(model, X_train, y_train, cv = skfold, scoring = metric) 
    return model_cv

logreg_under_cv = model_evaluation(logreg_pipe_under, 'precision') 
tree_under_cv = model_evaluation(tree_pipe_under, 'precision')
knn_under_cv = model_evaluation(knn_pipe_under, 'precision')
rf_under_cv = model_evaluation(rf_pipe_under, 'precision')

for model in [logreg_pipe_under, tree_pipe_under, knn_pipe_under, rf_pipe_under]:
    model.fit(X_train, y_train)

score_mean = [logreg_under_cv.mean(), tree_under_cv.mean(), knn_under_cv.mean(),
              rf_under_cv.mean()]
score_std = [logreg_under_cv.std(), tree_under_cv.std(), knn_under_cv.std(),
             rf_under_cv.std()]
score_precision_score = [precision_score(y_test, logreg_pipe_under.predict(X_test)),
            precision_score(y_test, tree_pipe_under.predict(X_test)), 
            precision_score(y_test, knn_pipe_under.predict(X_test)), 
            precision_score(y_test, rf_pipe_under.predict(X_test))]
method_name = ['Logistic Regression UnderSampling', 'Decision Tree Classifier UnderSampling',
              'KNN Classifier UnderSampling', 'Random Forest Classifier UnderSampling']
under_result = pd.DataFrame({
    'method': method_name,
    'mean score': score_mean,
    'std score': score_std,
    'precision score': score_precision_score
})
under_result

Unnamed: 0,method,mean score,std score,precision score
0,Logistic Regression UnderSampling,0.266582,0.015287,0.286996
1,Decision Tree Classifier UnderSampling,0.219884,0.022217,0.212329
2,KNN Classifier UnderSampling,0.232455,0.014492,0.214035
3,Random Forest Classifier UnderSampling,0.256854,0.02425,0.283465


In [29]:
#cek confusion matrix dr model terbaik
logreg_pipe_under.fit(X_train, y_train)
ypred=logreg_pipe_under.predict(X_test)
print(classification_report(y_test,ypred))
print(metrics.confusion_matrix(y_test,ypred))

              precision    recall  f1-score   support

           0       0.92      0.72      0.81       572
           1       0.29      0.64      0.40       100

    accuracy                           0.71       672
   macro avg       0.60      0.68      0.60       672
weighted avg       0.83      0.71      0.75       672

[[413 159]
 [ 36  64]]


*Random Over Sampling*

In [30]:
ros = RandomOverSampler(random_state = 2020)
X_over, y_over = ros.fit_resample(X_train, y_train)

In [31]:
logreg_pipe_over = Pipeline([
    ('transformer', transformer),
    ('ros', ros), 
    ('logreg', logreg)
])

tree_pipe_over = Pipeline([
    ('transformer', transformer),
    ('ros', ros), 
    ('tree', tree)
])

knn_pipe_over = Pipeline([
    ('transformer', transformer),
    ('ros', ros), 
    ('knn', knn)
])

rf_pipe_over = Pipeline([
    ('transformer', transformer),
    ('ros', ros),
    ('rf', rf)
])

In [32]:
def model_evaluation(model, metric):
    skfold = StratifiedKFold(n_splits = 5)
    model_cv = cross_val_score(model, X_train, y_train, cv = skfold, scoring = metric)
    return model_cv

logreg_over_cv = model_evaluation(logreg_pipe_over, 'precision') 
tree_over_cv = model_evaluation(tree_pipe_over, 'precision')
knn_over_cv = model_evaluation(knn_pipe_over, 'precision')
rf_over_cv = model_evaluation(rf_pipe_over, 'precision')

for model in [logreg_pipe_over, tree_pipe_over, knn_pipe_over, rf_pipe_over]:
    model.fit(X_train, y_train)

score_mean = [logreg_over_cv.mean(), tree_over_cv.mean(), knn_over_cv.mean(),
              rf_over_cv.mean()]
score_std = [logreg_over_cv.std(), tree_over_cv.std(), knn_over_cv.std(),
             rf_over_cv.std()]
score_precision_score = [precision_score(y_test, logreg_pipe_over.predict(X_test)),
            precision_score(y_test, tree_pipe_over.predict(X_test)), 
            precision_score(y_test, knn_pipe_over.predict(X_test)), 
            precision_score(y_test, rf_pipe_over.predict(X_test))]
method_name = ['Logistic Regression OverSampling', 'Decision Tree Classifier OverSampling',
              'KNN Classifier OverSampling', 'Random Forest Classifier OverSampling']
over_summary = pd.DataFrame({
    'method': method_name,
    'mean score': score_mean,
    'std score': score_std,
    'precision score': score_precision_score
})
over_summary


Unnamed: 0,method,mean score,std score,precision score
0,Logistic Regression OverSampling,0.274843,0.026533,0.296804
1,Decision Tree Classifier OverSampling,0.322504,0.046616,0.387755
2,KNN Classifier OverSampling,0.212357,0.014576,0.237209
3,Random Forest Classifier OverSampling,0.43178,0.025124,0.5


In [33]:
#cek confusion matrix dr model terbaik
rf_pipe_over.fit(X_train, y_train)
ypred=rf_pipe_over.predict(X_test)
print(classification_report(y_test,ypred))
print(metrics.confusion_matrix(y_test,ypred))

              precision    recall  f1-score   support

           0       0.88      0.95      0.92       572
           1       0.50      0.29      0.37       100

    accuracy                           0.85       672
   macro avg       0.69      0.62      0.64       672
weighted avg       0.83      0.85      0.83       672

[[543  29]
 [ 71  29]]


*NearMiss*

In [34]:
nm = NearMiss(version = 1)

In [35]:
logreg_pipe_nm = Pipeline([
    ('transformer', transformer),
    ('nm', nm),
    ('logreg', logreg)
])

tree_pipe_nm = Pipeline([
    ('transformer', transformer),
    ('nm', nm),
    ('tree', tree)
])

knn_pipe_nm = Pipeline([
    ('transformer', transformer),
    ('nm', nm),
    ('knn', knn)
])

rf_pipe_nm = Pipeline([
    ('transformer', transformer),
    ('nm', nm),
    ('rf', rf)
])

In [36]:
def model_evaluation(model, metric):
    skfold = StratifiedKFold(n_splits = 5)
    model_cv = cross_val_score(model, X_train, y_train, cv = skfold, scoring = metric)
    return model_cv

logreg_nm_cv = model_evaluation(logreg_pipe_nm, 'precision') 
tree_nm_cv = model_evaluation(tree_pipe_nm, 'precision')
knn_nm_cv = model_evaluation(knn_pipe_nm, 'precision')
rf_nm_cv = model_evaluation(rf_pipe_nm, 'precision')

for model in [logreg_pipe_nm, tree_pipe_nm, knn_pipe_nm, rf_pipe_nm]:
    model.fit(X_train, y_train)
    
score_mean = [logreg_nm_cv.mean(), tree_nm_cv.mean(), knn_nm_cv.mean(),
              rf_nm_cv.mean()]
score_std = [logreg_nm_cv.std(), tree_nm_cv.std(), knn_nm_cv.std(),
             rf_nm_cv.std()]
score_precision_score = [precision_score(y_test, logreg_pipe_nm.predict(X_test)),
            precision_score(y_test, tree_pipe_nm.predict(X_test)), 
            precision_score(y_test, knn_pipe_nm.predict(X_test)), 
            precision_score(y_test, rf_pipe_nm.predict(X_test))]
method_name = ['Logistic Regression NearMiss', 'Decision Tree Classifier NearMiss',
              'KNN Classifier NearMiss', 'Random Forest Classifier NearMiss']
nm_summary = pd.DataFrame({
    'method': method_name,
    'mean score': score_mean,
    'std score': score_std,
    'precision score': score_precision_score
})
nm_summary

Unnamed: 0,method,mean score,std score,precision score
0,Logistic Regression NearMiss,0.179701,0.019814,0.163978
1,Decision Tree Classifier NearMiss,0.165203,0.012071,0.179348
2,KNN Classifier NearMiss,0.196628,0.014884,0.202429
3,Random Forest Classifier NearMiss,0.175692,0.015804,0.176179


In [37]:
#cek confusion matrix dr model terbaik
knn_pipe_nm.fit(X_train, y_train)
ypred=knn_pipe_nm.predict(X_test)
print(classification_report(y_test,ypred))
print(metrics.confusion_matrix(y_test,ypred))

              precision    recall  f1-score   support

           0       0.88      0.66      0.75       572
           1       0.20      0.50      0.29       100

    accuracy                           0.63       672
   macro avg       0.54      0.58      0.52       672
weighted avg       0.78      0.63      0.68       672

[[375 197]
 [ 50  50]]


In [38]:
#Summary Balancing Dataset
resume_balancing = pd.concat([under_result,over_summary,nm_summary], axis=0)
resume_balancing

Unnamed: 0,method,mean score,std score,precision score
0,Logistic Regression UnderSampling,0.266582,0.015287,0.286996
1,Decision Tree Classifier UnderSampling,0.219884,0.022217,0.212329
2,KNN Classifier UnderSampling,0.232455,0.014492,0.214035
3,Random Forest Classifier UnderSampling,0.256854,0.02425,0.283465
0,Logistic Regression OverSampling,0.274843,0.026533,0.296804
1,Decision Tree Classifier OverSampling,0.322504,0.046616,0.387755
2,KNN Classifier OverSampling,0.212357,0.014576,0.237209
3,Random Forest Classifier OverSampling,0.43178,0.025124,0.5
0,Logistic Regression NearMiss,0.179701,0.019814,0.163978
1,Decision Tree Classifier NearMiss,0.165203,0.012071,0.179348


**setelah dilakukan balancing dataset ternyata nilai precision dan accuracy turun, dan Nilai precicion antara kelas 1 & 0 yang balance ada di model tanpa balancing dataset, sehingga digunakan model tanpa balancing dataset.Dan berdasarkan resume diatas model yang stabil adalah KNN dan Logistic regression. kemudian akan mencoba model Boosting**

# **BOOSTING**

In [39]:
adaboost = AdaBoostClassifier(
            tree,
            n_estimators = 50,
            learning_rate = 0.1,
            random_state = 2020)

pipe_ada = Pipeline([
    ('transformer', transformer),
    ('adaboost', adaboost)
])

gradboost = GradientBoostingClassifier(
            n_estimators = 50,
            learning_rate = 0.1,
            max_depth = 3,
            random_state = 2020)

pipe_grad = Pipeline([
    ('transformer', transformer),
    ('gradboost', gradboost)
])

XGBOOST = XGBClassifier(
            n_estimators = 50,
            learning_rate = 0.1,
            max_depth = 3,
            random_state = 2020)

pipe_XGB = Pipeline([
    ('transformer', transformer),
    ('XGBOOST', XGBOOST)
])

In [40]:
def model_evaluation(model, metric):
    skfold = StratifiedKFold(n_splits = 5)
    model_cv = cross_val_score(model, X_train, y_train, cv = skfold, scoring = metric, n_jobs = -1)
    return model_cv

pipe_ada_cv = model_evaluation(pipe_ada, 'precision')
pipe_grad_cv = model_evaluation(pipe_grad, 'precision')
pipe_XGB_cv = model_evaluation(pipe_XGB, 'precision')

for model in [pipe_ada, pipe_grad, pipe_XGB]:
    model.fit(X_train, y_train)
    
score_mean = [pipe_ada_cv.mean(), pipe_grad_cv.mean(), pipe_XGB_cv.mean()]
score_std = [pipe_ada_cv.std(), pipe_grad_cv.std(), pipe_XGB_cv.std()]
score_precision_score = [precision_score(y_test, pipe_ada.predict(X_test)),
            precision_score(y_test, pipe_grad.predict(X_test)), 
            precision_score(y_test, pipe_XGB.predict(X_test))]
method_name = ['Ada Boost Classifier', 'Gradient Boost Classifier',
              'XGB Classifier']
boost_summary = pd.DataFrame({
    'method': method_name,
    'mean score': score_mean,
    'std score': score_std,
    'precision score': score_precision_score
})
boost_summary



Unnamed: 0,method,mean score,std score,precision score
0,Ada Boost Classifier,0.320199,0.057693,0.363636
1,Gradient Boost Classifier,0.542337,0.126316,0.714286
2,XGB Classifier,0.554471,0.125104,0.772727


In [41]:
resume_model = pd.concat([cv_result,boost_summary], axis=0)
resume_model

Unnamed: 0,method,mean score,std score,precision score
0,Logistic Regression,0.528766,0.199787,0.823529
1,Decision Tree Classifier,0.252752,0.064072,0.407407
2,KNN Classifier,0.369554,0.057294,0.466667
3,Random Forest Classifier,0.513016,0.09092,0.724138
0,Ada Boost Classifier,0.320199,0.057693,0.363636
1,Gradient Boost Classifier,0.542337,0.126316,0.714286
2,XGB Classifier,0.554471,0.125104,0.772727


In [42]:
#cek confusion matrix dr model Logreg
logreg_pipe.fit(X_train, y_train)
ypred=logreg_pipe.predict(X_test)
print(classification_report(y_test,ypred))
print(metrics.confusion_matrix(y_test,ypred))

              precision    recall  f1-score   support

           0       0.87      0.99      0.93       572
           1       0.82      0.14      0.24       100

    accuracy                           0.87       672
   macro avg       0.85      0.57      0.58       672
weighted avg       0.86      0.87      0.83       672

[[569   3]
 [ 86  14]]


In [43]:
#cek confusion matrix dr model XGB boost(terbaik kedua)
pipe_XGB.fit(X_train, y_train)
ypred=pipe_XGB.predict(X_test)
print(classification_report(y_test,ypred))
print(metrics.confusion_matrix(y_test,ypred))

              precision    recall  f1-score   support

           0       0.87      0.99      0.93       572
           1       0.77      0.17      0.28       100

    accuracy                           0.87       672
   macro avg       0.82      0.58      0.60       672
weighted avg       0.86      0.87      0.83       672

[[567   5]
 [ 83  17]]


**Berdasarkan resume diatas maka akan digunakan Logistic Regresi, karena memiliki nilai precision tertinggi dan nilai precision antara kelas 1 dan 0 tidak beda jauh. kemudian akan lanjut ke proses hyperparameter tunning**

# Hyperparameter Tunning


In [44]:
logreg=LogisticRegression()

estimator = Pipeline([
    ('transformer', transformer),
    ('model', logreg)
])

hyperparam_space =  {
    'model__C': [100, 10, 1, 0.1, 0.01, 0.001],
    'model__solver': ['liblinear', 'newton-cg']
}

In [45]:
grid_search = GridSearchCV(
                estimator,
                param_grid = hyperparam_space,
                cv = StratifiedKFold(n_splits = 5),
                scoring = 'precision',
                n_jobs = -1)

In [46]:
grid_search.fit(X_train, y_train)

GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=None, shuffle=False),
             estimator=Pipeline(steps=[('transformer',
                                        ColumnTransformer(remainder='passthrough',
                                                          transformers=[('impute',
                                                                         Pipeline(steps=[('impute',
                                                                                          SimpleImputer()),
                                                                                         ('scaling',
                                                                                          RobustScaler())]),
                                                                         ['Income']),
                                                                        ('encoder',
                                                                         OneHotEncoder(handle_unknown='ignore'),

In [47]:
print('best score', grid_search.best_score_)
print('best param', grid_search.best_params_)

best score 0.599047619047619
best param {'model__C': 0.1, 'model__solver': 'newton-cg'}


In [48]:
logreg_pipe.fit(X_train, y_train)
y_pred_estimator = logreg_pipe.predict(X_test)
precision_estimator = precision_score(y_test, y_pred_estimator)

grid_search.best_estimator_.fit(X_train, y_train)
y_pred_grid = grid_search.best_estimator_.predict(X_test)
precision_best_estimator = precision_score(y_test, y_pred_grid)

score_list = [precision_estimator, precision_best_estimator]
method_name = ['Logistic Regression Before', 'Logistic Regression After']
best_summary = pd.DataFrame({
    'method': method_name,
    'score': score_list
})
best_summary

Unnamed: 0,method,score
0,Logistic Regression Before,0.823529
1,Logistic Regression After,0.666667


**Nilai precision score sebelum dan sesudah tunning lebih baik sebelum tunning, sehingga akan diambil model tanpa tunning**

### Saving Model

In [68]:
marketing= data

In [69]:
marketing.columns

Index(['Education', 'Marital_Status', 'Income', 'Kidhome', 'Teenhome',
       'Recency', 'Response', 'Country', 'Customer_Age'],
      dtype='object')

In [70]:
marketing.to_csv('marketing.csv')

In [71]:
X=marketing.drop(['Response'],axis=1)
y=marketing['Response']

In [72]:
X_train,X_test,y_train,y_test=train_test_split(X,y,stratify=y,test_size=0.3,random_state=2020)

In [73]:
model=LogisticRegression()

mean_scale = Pipeline([
    ('impute', SimpleImputer(strategy = 'mean')),
    ('scaling', RobustScaler()),
])

transformer = ColumnTransformer([
    ('impute',mean_scale,['Income']),
    ('encoder',OneHotEncoder(handle_unknown='ignore'),['Education','Marital_Status']),
    ('binary',ce.BinaryEncoder(),['Country']),
    ('scale',RobustScaler(),['Customer_Age','Recency'])
],remainder='passthrough')


model_final = Pipeline([
    ('transformer',transformer),
    ('model', model)
])


In [74]:
model_final.fit(X_train, y_train)
y_pred_estimator = model_final.predict(X_test)
print(precision_score(y_test, y_pred_estimator))

0.8235294117647058


In [75]:
model_final.fit(X,y) #Final model, fit ke X dan y untuk memaksimalkan jumlh data yg diplajari oleh model

#with pickle
file_name='Model Final.sav'
pickle.dump(model_final,open(file_name,'wb'))

**Predict with Saved Model**

In [76]:
loaded_model = pickle.load(open(file_name,'rb'))

In [77]:
loaded_model.predict(X_test)

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [78]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2240 entries, 0 to 2239
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Education       2240 non-null   object 
 1   Marital_Status  2240 non-null   object 
 2   Income          2216 non-null   float64
 3   Kidhome         2240 non-null   int64  
 4   Teenhome        2240 non-null   int64  
 5   Recency         2240 non-null   int64  
 6   Country         2240 non-null   object 
 7   Customer_Age    2240 non-null   int64  
dtypes: float64(1), int64(4), object(3)
memory usage: 140.1+ KB


In [79]:
df_predict=pd.DataFrame({
    'Education':['Master'],
    'Marital_Status':['Single'],
    'Income':[21888],
    'Kidhome':[1],
    'Teenhome':[0],
    'Recency':[15],
    'Country':['ME'],
    'Customer_Age':[30]
})

In [80]:
loaded_model.predict(df_predict)

array([1], dtype=int64)

In [81]:
loaded_model.predict_proba(df_predict)

array([[0.46744927, 0.53255073]])

In [83]:
data['Country'].unique()

array(['SP', 'CA', 'US', 'AUS', 'GER', 'IND', 'SA', 'ME'], dtype=object)