In [1]:
# import required packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import chi2
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import plot_tree
from sklearn.linear_model import RidgeClassifier
from sklearn.svm import SVC
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import balanced_accuracy_score
from collections import Counter
from imblearn.over_sampling import SMOTE

%matplotlib inline

In [2]:
# load data
df = pd.read_excel("Dataset.xlsx")

In [3]:
# look at data
df

Unnamed: 0,ID,Year_Birth,Education,Marital_Status,Income,Kidhome,Teenhome,Dt_Customer,Recency,MntWines,...,AcceptedCmp3,AcceptedCmp4,AcceptedCmp5,AcceptedCmp1,AcceptedCmp2,Complain,Z_CostContact,Z_Revenue,AcceptedCmp6,Response
0,5524,1957,Graduation,Single,58138.0,0,0,2012-09-04,58,635,...,0,0,0,0,0,0,3,11,1,1
1,2174,1954,Graduation,Single,46344.0,1,1,2014-03-08,38,11,...,0,0,0,0,0,0,3,11,0,0
2,4141,1965,Graduation,Together,71613.0,0,0,2013-08-21,26,426,...,0,0,0,0,0,0,3,11,0,0
3,6182,1984,Graduation,Together,26646.0,1,0,2014-02-10,26,11,...,0,0,0,0,0,0,3,11,0,0
4,5324,1981,PhD,Married,58293.0,1,0,2014-01-19,94,173,...,0,0,0,0,0,0,3,11,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2235,10870,1967,Graduation,Married,61223.0,0,1,2013-06-13,46,709,...,0,0,0,0,0,0,3,11,0,0
2236,4001,1946,PhD,Together,64014.0,2,1,2014-06-10,56,406,...,0,0,0,1,0,0,3,11,0,1
2237,7270,1981,Graduation,Divorced,56981.0,0,0,2014-01-25,91,908,...,0,1,0,0,0,0,3,11,0,1
2238,8235,1956,Master,Together,69245.0,0,1,2014-01-24,8,428,...,0,0,0,0,0,0,3,11,0,0


In [4]:
# remove campaign variables so that response is our only target varaiable
df = df.drop(columns=["AcceptedCmp3", "AcceptedCmp4", "AcceptedCmp5", "AcceptedCmp1", "AcceptedCmp2", "AcceptedCmp6"])

In [5]:
# view data to ensure changes are correct
df

Unnamed: 0,ID,Year_Birth,Education,Marital_Status,Income,Kidhome,Teenhome,Dt_Customer,Recency,MntWines,...,MntGoldProds,NumDealsPurchases,NumWebPurchases,NumCatalogPurchases,NumStorePurchases,NumWebVisitsMonth,Complain,Z_CostContact,Z_Revenue,Response
0,5524,1957,Graduation,Single,58138.0,0,0,2012-09-04,58,635,...,88,3,8,10,4,7,0,3,11,1
1,2174,1954,Graduation,Single,46344.0,1,1,2014-03-08,38,11,...,6,2,1,1,2,5,0,3,11,0
2,4141,1965,Graduation,Together,71613.0,0,0,2013-08-21,26,426,...,42,1,8,2,10,4,0,3,11,0
3,6182,1984,Graduation,Together,26646.0,1,0,2014-02-10,26,11,...,5,2,2,0,4,6,0,3,11,0
4,5324,1981,PhD,Married,58293.0,1,0,2014-01-19,94,173,...,15,5,5,3,6,5,0,3,11,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2235,10870,1967,Graduation,Married,61223.0,0,1,2013-06-13,46,709,...,247,2,9,3,4,5,0,3,11,0
2236,4001,1946,PhD,Together,64014.0,2,1,2014-06-10,56,406,...,8,7,8,2,5,7,0,3,11,1
2237,7270,1981,Graduation,Divorced,56981.0,0,0,2014-01-25,91,908,...,24,1,2,3,13,6,0,3,11,1
2238,8235,1956,Master,Together,69245.0,0,1,2014-01-24,8,428,...,61,2,6,5,10,3,0,3,11,0


In [6]:
# Complain, Z_CostContact, and Z_Revenue all have the same value for each record. we will remove those as well
df = df.drop(columns=["Complain", "Z_CostContact", "Z_Revenue"])

In [7]:
# view data to ensure changes are correct
df

Unnamed: 0,ID,Year_Birth,Education,Marital_Status,Income,Kidhome,Teenhome,Dt_Customer,Recency,MntWines,...,MntMeatProducts,MntFishProducts,MntSweetProducts,MntGoldProds,NumDealsPurchases,NumWebPurchases,NumCatalogPurchases,NumStorePurchases,NumWebVisitsMonth,Response
0,5524,1957,Graduation,Single,58138.0,0,0,2012-09-04,58,635,...,546,172,88,88,3,8,10,4,7,1
1,2174,1954,Graduation,Single,46344.0,1,1,2014-03-08,38,11,...,6,2,1,6,2,1,1,2,5,0
2,4141,1965,Graduation,Together,71613.0,0,0,2013-08-21,26,426,...,127,111,21,42,1,8,2,10,4,0
3,6182,1984,Graduation,Together,26646.0,1,0,2014-02-10,26,11,...,20,10,3,5,2,2,0,4,6,0
4,5324,1981,PhD,Married,58293.0,1,0,2014-01-19,94,173,...,118,46,27,15,5,5,3,6,5,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2235,10870,1967,Graduation,Married,61223.0,0,1,2013-06-13,46,709,...,182,42,118,247,2,9,3,4,5,0
2236,4001,1946,PhD,Together,64014.0,2,1,2014-06-10,56,406,...,30,0,0,8,7,8,2,5,7,1
2237,7270,1981,Graduation,Divorced,56981.0,0,0,2014-01-25,91,908,...,217,32,12,24,1,2,3,13,6,1
2238,8235,1956,Master,Together,69245.0,0,1,2014-01-24,8,428,...,214,80,30,61,2,6,5,10,3,0


In [8]:
# drop ID column
df = df.drop(columns="ID")

In [9]:
# view data to ensure changes are correct
df

Unnamed: 0,Year_Birth,Education,Marital_Status,Income,Kidhome,Teenhome,Dt_Customer,Recency,MntWines,MntFruits,MntMeatProducts,MntFishProducts,MntSweetProducts,MntGoldProds,NumDealsPurchases,NumWebPurchases,NumCatalogPurchases,NumStorePurchases,NumWebVisitsMonth,Response
0,1957,Graduation,Single,58138.0,0,0,2012-09-04,58,635,88,546,172,88,88,3,8,10,4,7,1
1,1954,Graduation,Single,46344.0,1,1,2014-03-08,38,11,1,6,2,1,6,2,1,1,2,5,0
2,1965,Graduation,Together,71613.0,0,0,2013-08-21,26,426,49,127,111,21,42,1,8,2,10,4,0
3,1984,Graduation,Together,26646.0,1,0,2014-02-10,26,11,4,20,10,3,5,2,2,0,4,6,0
4,1981,PhD,Married,58293.0,1,0,2014-01-19,94,173,43,118,46,27,15,5,5,3,6,5,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2235,1967,Graduation,Married,61223.0,0,1,2013-06-13,46,709,43,182,42,118,247,2,9,3,4,5,0
2236,1946,PhD,Together,64014.0,2,1,2014-06-10,56,406,0,30,0,0,8,7,8,2,5,7,1
2237,1981,Graduation,Divorced,56981.0,0,0,2014-01-25,91,908,48,217,32,12,24,1,2,3,13,6,1
2238,1956,Master,Together,69245.0,0,1,2014-01-24,8,428,30,214,80,30,61,2,6,5,10,3,0


In [10]:
# create age column
df['Age'] =  2022 - df['Year_Birth']
# remove Year_Birth column
df = df.drop(columns="Year_Birth")

In [11]:
# view data to ensure changes are correct
df

Unnamed: 0,Education,Marital_Status,Income,Kidhome,Teenhome,Dt_Customer,Recency,MntWines,MntFruits,MntMeatProducts,MntFishProducts,MntSweetProducts,MntGoldProds,NumDealsPurchases,NumWebPurchases,NumCatalogPurchases,NumStorePurchases,NumWebVisitsMonth,Response,Age
0,Graduation,Single,58138.0,0,0,2012-09-04,58,635,88,546,172,88,88,3,8,10,4,7,1,65
1,Graduation,Single,46344.0,1,1,2014-03-08,38,11,1,6,2,1,6,2,1,1,2,5,0,68
2,Graduation,Together,71613.0,0,0,2013-08-21,26,426,49,127,111,21,42,1,8,2,10,4,0,57
3,Graduation,Together,26646.0,1,0,2014-02-10,26,11,4,20,10,3,5,2,2,0,4,6,0,38
4,PhD,Married,58293.0,1,0,2014-01-19,94,173,43,118,46,27,15,5,5,3,6,5,0,41
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2235,Graduation,Married,61223.0,0,1,2013-06-13,46,709,43,182,42,118,247,2,9,3,4,5,0,55
2236,PhD,Together,64014.0,2,1,2014-06-10,56,406,0,30,0,0,8,7,8,2,5,7,1,76
2237,Graduation,Divorced,56981.0,0,0,2014-01-25,91,908,48,217,32,12,24,1,2,3,13,6,1,41
2238,Master,Together,69245.0,0,1,2014-01-24,8,428,30,214,80,30,61,2,6,5,10,3,0,66


In [12]:
# encoding the education 

labels = df['Education'].astype('category').cat.categories.tolist()
edu = {'Education' : {k: v for k,v in zip(labels,list(range(1,len(labels)+1)))}}

print(edu)

{'Education': {'2n Cycle': 1, 'Basic': 2, 'Graduation': 3, 'Master': 4, 'PhD': 5}}


In [13]:
df.replace(edu, inplace=True)
display(df['Education'].unique())

array([3, 5, 4, 2, 1])

In [14]:
# view data to ensure changes are correct
df

Unnamed: 0,Education,Marital_Status,Income,Kidhome,Teenhome,Dt_Customer,Recency,MntWines,MntFruits,MntMeatProducts,MntFishProducts,MntSweetProducts,MntGoldProds,NumDealsPurchases,NumWebPurchases,NumCatalogPurchases,NumStorePurchases,NumWebVisitsMonth,Response,Age
0,3,Single,58138.0,0,0,2012-09-04,58,635,88,546,172,88,88,3,8,10,4,7,1,65
1,3,Single,46344.0,1,1,2014-03-08,38,11,1,6,2,1,6,2,1,1,2,5,0,68
2,3,Together,71613.0,0,0,2013-08-21,26,426,49,127,111,21,42,1,8,2,10,4,0,57
3,3,Together,26646.0,1,0,2014-02-10,26,11,4,20,10,3,5,2,2,0,4,6,0,38
4,5,Married,58293.0,1,0,2014-01-19,94,173,43,118,46,27,15,5,5,3,6,5,0,41
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2235,3,Married,61223.0,0,1,2013-06-13,46,709,43,182,42,118,247,2,9,3,4,5,0,55
2236,5,Together,64014.0,2,1,2014-06-10,56,406,0,30,0,0,8,7,8,2,5,7,1,76
2237,3,Divorced,56981.0,0,0,2014-01-25,91,908,48,217,32,12,24,1,2,3,13,6,1,41
2238,4,Together,69245.0,0,1,2014-01-24,8,428,30,214,80,30,61,2,6,5,10,3,0,66


In [15]:
labels = df['Marital_Status'].astype('category').cat.categories.tolist()
ms = {'Marital_Status' : {k: v for k,v in zip(labels,list(range(1,len(labels)+1)))}}

df.replace(ms, inplace=True)

In [16]:
# view data to ensure changes are correct
df

Unnamed: 0,Education,Marital_Status,Income,Kidhome,Teenhome,Dt_Customer,Recency,MntWines,MntFruits,MntMeatProducts,MntFishProducts,MntSweetProducts,MntGoldProds,NumDealsPurchases,NumWebPurchases,NumCatalogPurchases,NumStorePurchases,NumWebVisitsMonth,Response,Age
0,3,5,58138.0,0,0,2012-09-04,58,635,88,546,172,88,88,3,8,10,4,7,1,65
1,3,5,46344.0,1,1,2014-03-08,38,11,1,6,2,1,6,2,1,1,2,5,0,68
2,3,6,71613.0,0,0,2013-08-21,26,426,49,127,111,21,42,1,8,2,10,4,0,57
3,3,6,26646.0,1,0,2014-02-10,26,11,4,20,10,3,5,2,2,0,4,6,0,38
4,5,4,58293.0,1,0,2014-01-19,94,173,43,118,46,27,15,5,5,3,6,5,0,41
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2235,3,4,61223.0,0,1,2013-06-13,46,709,43,182,42,118,247,2,9,3,4,5,0,55
2236,5,6,64014.0,2,1,2014-06-10,56,406,0,30,0,0,8,7,8,2,5,7,1,76
2237,3,3,56981.0,0,0,2014-01-25,91,908,48,217,32,12,24,1,2,3,13,6,1,41
2238,4,6,69245.0,0,1,2014-01-24,8,428,30,214,80,30,61,2,6,5,10,3,0,66


In [17]:
# Dt_Customer variable is a date format, this could be more useful if it was an integer representing how long
# a customer has shopped with the company
df.dtypes

Education                       int64
Marital_Status                  int64
Income                        float64
Kidhome                         int64
Teenhome                        int64
Dt_Customer            datetime64[ns]
Recency                         int64
MntWines                        int64
MntFruits                       int64
MntMeatProducts                 int64
MntFishProducts                 int64
MntSweetProducts                int64
MntGoldProds                    int64
NumDealsPurchases               int64
NumWebPurchases                 int64
NumCatalogPurchases             int64
NumStorePurchases               int64
NumWebVisitsMonth               int64
Response                        int64
Age                             int64
dtype: object

In [18]:
# define the current date used to calculate how long user has been a customer
from datetime import date
currDate = np.datetime64("2022-04-26")

In [19]:
# print current date
currDate

numpy.datetime64('2022-04-26')

In [20]:
# subtract Dt_Customer from currDate to get time delta
delta = currDate - df["Dt_Customer"]

In [21]:
# check data
delta

0      3521 days
1      2971 days
2      3170 days
3      2997 days
4      3019 days
          ...   
2235   3239 days
2236   2877 days
2237   3013 days
2238   3014 days
2239   3480 days
Name: Dt_Customer, Length: 2240, dtype: timedelta64[ns]

In [22]:
# convert to int
delta.dt.days

0       3521
1       2971
2       3170
3       2997
4       3019
        ... 
2235    3239
2236    2877
2237    3013
2238    3014
2239    3480
Name: Dt_Customer, Length: 2240, dtype: int64

In [23]:
# add timedelta to df
df["Days_as_Customer"] = delta.dt.days

In [24]:
# check df
df

Unnamed: 0,Education,Marital_Status,Income,Kidhome,Teenhome,Dt_Customer,Recency,MntWines,MntFruits,MntMeatProducts,...,MntSweetProducts,MntGoldProds,NumDealsPurchases,NumWebPurchases,NumCatalogPurchases,NumStorePurchases,NumWebVisitsMonth,Response,Age,Days_as_Customer
0,3,5,58138.0,0,0,2012-09-04,58,635,88,546,...,88,88,3,8,10,4,7,1,65,3521
1,3,5,46344.0,1,1,2014-03-08,38,11,1,6,...,1,6,2,1,1,2,5,0,68,2971
2,3,6,71613.0,0,0,2013-08-21,26,426,49,127,...,21,42,1,8,2,10,4,0,57,3170
3,3,6,26646.0,1,0,2014-02-10,26,11,4,20,...,3,5,2,2,0,4,6,0,38,2997
4,5,4,58293.0,1,0,2014-01-19,94,173,43,118,...,27,15,5,5,3,6,5,0,41,3019
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2235,3,4,61223.0,0,1,2013-06-13,46,709,43,182,...,118,247,2,9,3,4,5,0,55,3239
2236,5,6,64014.0,2,1,2014-06-10,56,406,0,30,...,0,8,7,8,2,5,7,1,76,2877
2237,3,3,56981.0,0,0,2014-01-25,91,908,48,217,...,12,24,1,2,3,13,6,1,41,3013
2238,4,6,69245.0,0,1,2014-01-24,8,428,30,214,...,30,61,2,6,5,10,3,0,66,3014


In [25]:
# drop Dt_Customer
df = df.drop(columns="Dt_Customer")

In [26]:
# check df
df

Unnamed: 0,Education,Marital_Status,Income,Kidhome,Teenhome,Recency,MntWines,MntFruits,MntMeatProducts,MntFishProducts,MntSweetProducts,MntGoldProds,NumDealsPurchases,NumWebPurchases,NumCatalogPurchases,NumStorePurchases,NumWebVisitsMonth,Response,Age,Days_as_Customer
0,3,5,58138.0,0,0,58,635,88,546,172,88,88,3,8,10,4,7,1,65,3521
1,3,5,46344.0,1,1,38,11,1,6,2,1,6,2,1,1,2,5,0,68,2971
2,3,6,71613.0,0,0,26,426,49,127,111,21,42,1,8,2,10,4,0,57,3170
3,3,6,26646.0,1,0,26,11,4,20,10,3,5,2,2,0,4,6,0,38,2997
4,5,4,58293.0,1,0,94,173,43,118,46,27,15,5,5,3,6,5,0,41,3019
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2235,3,4,61223.0,0,1,46,709,43,182,42,118,247,2,9,3,4,5,0,55,3239
2236,5,6,64014.0,2,1,56,406,0,30,0,0,8,7,8,2,5,7,1,76,2877
2237,3,3,56981.0,0,0,91,908,48,217,32,12,24,1,2,3,13,6,1,41,3013
2238,4,6,69245.0,0,1,8,428,30,214,80,30,61,2,6,5,10,3,0,66,3014


In [27]:
# calculate IQR
q1=df["Income"].quantile(0.25)
q3=df["Income"].quantile(0.75)
iqr=q3-q1

In [28]:
# get locations of outliers in Income
np.where(df["Income"] > (q3 + 1.5*iqr))

(array([ 164,  617,  655,  687, 1300, 1653, 2132, 2233]),)

In [29]:
# drop income outlier records
test_df = df.drop([164,  617,  655,  687, 1300, 1653, 2132, 2233])

In [30]:
# check new test_df
test_df

Unnamed: 0,Education,Marital_Status,Income,Kidhome,Teenhome,Recency,MntWines,MntFruits,MntMeatProducts,MntFishProducts,MntSweetProducts,MntGoldProds,NumDealsPurchases,NumWebPurchases,NumCatalogPurchases,NumStorePurchases,NumWebVisitsMonth,Response,Age,Days_as_Customer
0,3,5,58138.0,0,0,58,635,88,546,172,88,88,3,8,10,4,7,1,65,3521
1,3,5,46344.0,1,1,38,11,1,6,2,1,6,2,1,1,2,5,0,68,2971
2,3,6,71613.0,0,0,26,426,49,127,111,21,42,1,8,2,10,4,0,57,3170
3,3,6,26646.0,1,0,26,11,4,20,10,3,5,2,2,0,4,6,0,38,2997
4,5,4,58293.0,1,0,94,173,43,118,46,27,15,5,5,3,6,5,0,41,3019
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2235,3,4,61223.0,0,1,46,709,43,182,42,118,247,2,9,3,4,5,0,55,3239
2236,5,6,64014.0,2,1,56,406,0,30,0,0,8,7,8,2,5,7,1,76,2877
2237,3,3,56981.0,0,0,91,908,48,217,32,12,24,1,2,3,13,6,1,41,3013
2238,4,6,69245.0,0,1,8,428,30,214,80,30,61,2,6,5,10,3,0,66,3014


In [31]:
# get list of column names
column_names = list(test_df.columns)

In [32]:
test_df

Unnamed: 0,Education,Marital_Status,Income,Kidhome,Teenhome,Recency,MntWines,MntFruits,MntMeatProducts,MntFishProducts,MntSweetProducts,MntGoldProds,NumDealsPurchases,NumWebPurchases,NumCatalogPurchases,NumStorePurchases,NumWebVisitsMonth,Response,Age,Days_as_Customer
0,3,5,58138.0,0,0,58,635,88,546,172,88,88,3,8,10,4,7,1,65,3521
1,3,5,46344.0,1,1,38,11,1,6,2,1,6,2,1,1,2,5,0,68,2971
2,3,6,71613.0,0,0,26,426,49,127,111,21,42,1,8,2,10,4,0,57,3170
3,3,6,26646.0,1,0,26,11,4,20,10,3,5,2,2,0,4,6,0,38,2997
4,5,4,58293.0,1,0,94,173,43,118,46,27,15,5,5,3,6,5,0,41,3019
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2235,3,4,61223.0,0,1,46,709,43,182,42,118,247,2,9,3,4,5,0,55,3239
2236,5,6,64014.0,2,1,56,406,0,30,0,0,8,7,8,2,5,7,1,76,2877
2237,3,3,56981.0,0,0,91,908,48,217,32,12,24,1,2,3,13,6,1,41,3013
2238,4,6,69245.0,0,1,8,428,30,214,80,30,61,2,6,5,10,3,0,66,3014


In [33]:
# split data into data and target data
X = test_df.iloc[:, 0:17]

age = test_df["Age"]
days = test_df["Days_as_Customer"]
X["Age"] = age
X["Days_as_Customer"] = days

T = test_df.iloc[:, -3]

In [34]:
# data
X

Unnamed: 0,Education,Marital_Status,Income,Kidhome,Teenhome,Recency,MntWines,MntFruits,MntMeatProducts,MntFishProducts,MntSweetProducts,MntGoldProds,NumDealsPurchases,NumWebPurchases,NumCatalogPurchases,NumStorePurchases,NumWebVisitsMonth,Age,Days_as_Customer
0,3,5,58138.0,0,0,58,635,88,546,172,88,88,3,8,10,4,7,65,3521
1,3,5,46344.0,1,1,38,11,1,6,2,1,6,2,1,1,2,5,68,2971
2,3,6,71613.0,0,0,26,426,49,127,111,21,42,1,8,2,10,4,57,3170
3,3,6,26646.0,1,0,26,11,4,20,10,3,5,2,2,0,4,6,38,2997
4,5,4,58293.0,1,0,94,173,43,118,46,27,15,5,5,3,6,5,41,3019
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2235,3,4,61223.0,0,1,46,709,43,182,42,118,247,2,9,3,4,5,55,3239
2236,5,6,64014.0,2,1,56,406,0,30,0,0,8,7,8,2,5,7,76,2877
2237,3,3,56981.0,0,0,91,908,48,217,32,12,24,1,2,3,13,6,41,3013
2238,4,6,69245.0,0,1,8,428,30,214,80,30,61,2,6,5,10,3,66,3014


In [35]:
# target
T

0       1
1       0
2       0
3       0
4       0
       ..
2235    0
2236    1
2237    1
2238    0
2239    1
Name: Response, Length: 2232, dtype: int64

In [36]:
# check for nan values
np.where(np.isnan(test_df))

(array([  10,   27,   43,   48,   58,   71,   90,   91,   92,  128,  133,
         311,  318, 1374, 1377, 1378, 1381, 2053, 2055, 2072, 2073, 2075,
        2078, 2221]),
 array([2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2]))

In [37]:
# drop nan values
test_dfv2 = test_df.dropna()

In [38]:
test_dfv2

Unnamed: 0,Education,Marital_Status,Income,Kidhome,Teenhome,Recency,MntWines,MntFruits,MntMeatProducts,MntFishProducts,MntSweetProducts,MntGoldProds,NumDealsPurchases,NumWebPurchases,NumCatalogPurchases,NumStorePurchases,NumWebVisitsMonth,Response,Age,Days_as_Customer
0,3,5,58138.0,0,0,58,635,88,546,172,88,88,3,8,10,4,7,1,65,3521
1,3,5,46344.0,1,1,38,11,1,6,2,1,6,2,1,1,2,5,0,68,2971
2,3,6,71613.0,0,0,26,426,49,127,111,21,42,1,8,2,10,4,0,57,3170
3,3,6,26646.0,1,0,26,11,4,20,10,3,5,2,2,0,4,6,0,38,2997
4,5,4,58293.0,1,0,94,173,43,118,46,27,15,5,5,3,6,5,0,41,3019
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2235,3,4,61223.0,0,1,46,709,43,182,42,118,247,2,9,3,4,5,0,55,3239
2236,5,6,64014.0,2,1,56,406,0,30,0,0,8,7,8,2,5,7,1,76,2877
2237,3,3,56981.0,0,0,91,908,48,217,32,12,24,1,2,3,13,6,1,41,3013
2238,4,6,69245.0,0,1,8,428,30,214,80,30,61,2,6,5,10,3,0,66,3014


In [39]:
# split data
X = test_dfv2.iloc[:, 0:17]

age = test_dfv2["Age"]
days = test_dfv2["Days_as_Customer"]
X["Age"] = age
X["Days_as_Customer"] = days

T = test_dfv2.iloc[:, -3]

In [40]:
# explore X
X

Unnamed: 0,Education,Marital_Status,Income,Kidhome,Teenhome,Recency,MntWines,MntFruits,MntMeatProducts,MntFishProducts,MntSweetProducts,MntGoldProds,NumDealsPurchases,NumWebPurchases,NumCatalogPurchases,NumStorePurchases,NumWebVisitsMonth,Age,Days_as_Customer
0,3,5,58138.0,0,0,58,635,88,546,172,88,88,3,8,10,4,7,65,3521
1,3,5,46344.0,1,1,38,11,1,6,2,1,6,2,1,1,2,5,68,2971
2,3,6,71613.0,0,0,26,426,49,127,111,21,42,1,8,2,10,4,57,3170
3,3,6,26646.0,1,0,26,11,4,20,10,3,5,2,2,0,4,6,38,2997
4,5,4,58293.0,1,0,94,173,43,118,46,27,15,5,5,3,6,5,41,3019
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2235,3,4,61223.0,0,1,46,709,43,182,42,118,247,2,9,3,4,5,55,3239
2236,5,6,64014.0,2,1,56,406,0,30,0,0,8,7,8,2,5,7,76,2877
2237,3,3,56981.0,0,0,91,908,48,217,32,12,24,1,2,3,13,6,41,3013
2238,4,6,69245.0,0,1,8,428,30,214,80,30,61,2,6,5,10,3,66,3014


In [41]:
# explore T
T

0       1
1       0
2       0
3       0
4       0
       ..
2235    0
2236    1
2237    1
2238    0
2239    1
Name: Response, Length: 2208, dtype: int64

In [42]:
# perform chi2 test
from sklearn.feature_selection import chi2
scores, pvalues = chi2(X, T)
pvalues

array([3.52486832e-002, 9.29230600e-001, 0.00000000e+000, 2.20625346e-010,
       3.32972735e-006, 2.83398763e-088, 0.00000000e+000, 0.00000000e+000,
       0.00000000e+000, 0.00000000e+000, 0.00000000e+000, 0.00000000e+000,
       6.65286246e-002, 4.86413228e-044, 6.31210038e-154, 1.13394274e-020,
       3.73886803e-004, 1.33321401e-002, 3.94109264e-046])

In [43]:
# remove marital status
X = X.drop(columns="Marital_Status")

In [44]:
X

Unnamed: 0,Education,Income,Kidhome,Teenhome,Recency,MntWines,MntFruits,MntMeatProducts,MntFishProducts,MntSweetProducts,MntGoldProds,NumDealsPurchases,NumWebPurchases,NumCatalogPurchases,NumStorePurchases,NumWebVisitsMonth,Age,Days_as_Customer
0,3,58138.0,0,0,58,635,88,546,172,88,88,3,8,10,4,7,65,3521
1,3,46344.0,1,1,38,11,1,6,2,1,6,2,1,1,2,5,68,2971
2,3,71613.0,0,0,26,426,49,127,111,21,42,1,8,2,10,4,57,3170
3,3,26646.0,1,0,26,11,4,20,10,3,5,2,2,0,4,6,38,2997
4,5,58293.0,1,0,94,173,43,118,46,27,15,5,5,3,6,5,41,3019
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2235,3,61223.0,0,1,46,709,43,182,42,118,247,2,9,3,4,5,55,3239
2236,5,64014.0,2,1,56,406,0,30,0,0,8,7,8,2,5,7,76,2877
2237,3,56981.0,0,0,91,908,48,217,32,12,24,1,2,3,13,6,41,3013
2238,4,69245.0,0,1,8,428,30,214,80,30,61,2,6,5,10,3,66,3014


In [45]:
# resample data using SMOTE
sm = SMOTE(random_state=42)
X_res, T_res = sm.fit_resample(X, T)

In [48]:
# split data
X_train, X_test, t_train, t_test = train_test_split(X_res, T_res, test_size=0.2)

In [51]:
# logistic regression with standard scaler
clf = Pipeline([('scaler', StandardScaler()), 
                ('classifier', LogisticRegression(random_state=0))])
clf.fit(X_train, t_train)
pred = clf.predict(X_test)
precision = precision_score(t_test, pred)
recall = recall_score(t_test, pred)
print(f"Test Score: {clf.score(X_test, t_test)}")
print(f"Train Score: {clf.score(X_train, t_train)}")
print(f"Precision Score: {precision}")
print(f"Recall Score: {recall}")
print(f"Balanced Accuracy Score: {balanced_accuracy_score(t_test, pred)}")

Test Score: 0.764797507788162
Train Score: 0.7644305772230889
Precision Score: 0.8273381294964028
Recall Score: 0.6906906906906907
Balanced Accuracy Score: 0.7676754424327239


In [52]:
# naive bayes with standard scaler
clf = Pipeline([('scaler', StandardScaler()), 
                ('classifier', GaussianNB())])
clf.fit(X_train, t_train)
pred = clf.predict(X_test)
precision = precision_score(t_test, pred)
recall = recall_score(t_test, pred)
print(f"Test Score: {clf.score(X_test, t_test)}")
print(f"Train Score: {clf.score(X_train, t_train)}")
print(f"Precision Score: {precision}")
print(f"Recall Score: {recall}")
print(f"Balanced Accuracy Score: {balanced_accuracy_score(t_test, pred)}")

Test Score: 0.661993769470405
Train Score: 0.6692667706708268
Precision Score: 0.7148148148148148
Recall Score: 0.5795795795795796
Balanced Accuracy Score: 0.6651943205341264


In [53]:
# ridge with standard scaler
clf = Pipeline([('scaler', StandardScaler()), 
                ('classifier', RidgeClassifier())])
clf.fit(X_train, t_train)
pred = clf.predict(X_test)
precision = precision_score(t_test, pred)
recall = recall_score(t_test, pred)
print(f"Test Score: {clf.score(X_test, t_test)}")
print(f"Train Score: {clf.score(X_train, t_train)}")
print(f"Precision Score: {precision}")
print(f"Recall Score: {recall}")
print(f"Balanced Accuracy Score: {balanced_accuracy_score(t_test, pred)}")

Test Score: 0.7663551401869159
Train Score: 0.7531201248049922
Precision Score: 0.8452830188679246
Recall Score: 0.6726726726726727
Balanced Accuracy Score: 0.769993294265139


In [54]:
# svc with standard scaler
clf = Pipeline([('scaler', StandardScaler()), 
                ('classifier', SVC(kernel='rbf', tol=1e-3, random_state=0))])
clf.fit(X_train, t_train)
pred = clf.predict(X_test)
precision = precision_score(t_test, pred)
recall = recall_score(t_test, pred)
print(f"Test Score: {clf.score(X_test, t_test)}")
print(f"Train Score: {clf.score(X_train, t_train)}")
print(f"Precision Score: {precision}")
print(f"Recall Score: {recall}")
print(f"Balanced Accuracy Score: {balanced_accuracy_score(t_test, pred)}")

Test Score: 0.8208722741433022
Train Score: 0.8677847113884556
Precision Score: 0.8562091503267973
Recall Score: 0.7867867867867868
Balanced Accuracy Score: 0.8221959823901572
