In [None]:
# Importing the Libraries

import datetime
import warnings

import numpy as np
import pandas as pd
import seaborn as sb
import matplotlib as mpl
import matplotlib.pyplot as plt # we only need pyplot
sb.set() # set the default Seaborn style for graphics

from scipy import stats

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
#pip install cufflinks
import cufflinks as cf
cf.go_offline()
cf.set_config_file(offline=True, world_readable=True)

warnings.filterwarnings('ignore')

In [None]:
# Loading the dataset
# The export is tab-separated. `main_df` keeps the frame exactly as read;
# all cleaning below is done on the `custdata` copy.
main_df = pd.read_csv('marketing_campaign.csv', sep='\t')
custdata = main_df.copy()
custdata.head()

# Data Preparation
## 1. Data Cleaning

In [None]:
# Information on features: column names, non-null counts and dtypes.
# A non-null count below the row count means the column has missing values.
custdata.info()

In [None]:
# Per-column flag: True when the column contains at least one missing value
custdata.isna().any()

In [None]:
# Remove rows with missing values.
# dropna() discards a row if ANY column is missing, not only Income, so this
# matches "remove NA incomes" only while Income is the sole column with gaps.
# The index is renumbered from 0 so later positional lookups line up with labels.
custdata = custdata.dropna().reset_index(drop=True)
print("The total number of data-points after removing the rows with missing values are:", len(custdata))

In [None]:
# Age of customers
# "Age" is the enrolment year minus the birth year, i.e. the customer's
# approximate age when they joined, not their age today.

# Parse the day-month-year enrolment strings in one vectorised pass and keep
# only the year. This replaces a per-row loop over a hard-coded range(2216)
# that assigned through chained indexing (custdata['Dt_Customer'][i] = ...):
# that form depends on the exact row count and does not update the frame when
# pandas Copy-on-Write is active.
# Dt_Customer now holds the enrolment year as an integer column.
custdata["Dt_Customer"] = pd.to_datetime(custdata["Dt_Customer"], format="%d-%m-%Y").dt.year

custdata["Age"] = (custdata["Dt_Customer"] - custdata["Year_Birth"]).astype(int)
custdata.info()

In [None]:
# Summary statistics, used to spot implausible ranges before the outlier checks below
custdata.describe()

**1. Age**

# Drop outliers of age by calculating Z-Score

# Absolute z-score of Age (distance from the mean in standard deviations),
# stored as a column so outlier rows can be filtered on it later.
custdata["z_value_age"] = np.abs(stats.zscore(custdata["Age"]))
custdata["z_value_age"]

In [None]:
# Positions of the customers whose Age lies more than `threshold` standard
# deviations from the mean. The cutoff is read from `threshold` instead of
# repeating the literal, so changing it in one place is enough.
threshold = 3
z1 = np.abs(stats.zscore(custdata["Age"]))
np.where(z1 > threshold)

In [None]:
# Show the rows whose absolute Age z-score exceeds 3.
# np.where yields row positions, hence the positional .iloc lookup.
custdata.iloc[np.where(z1>3)]

In [None]:
# Keep only the customers that are NOT Age outliers (absolute z-score above 3),
# then renumber the rows from 0 so labels and positions agree again.
custdata = custdata.loc[~(custdata["z_value_age"] > 3)].reset_index(drop=True)
custdata.shape

**2. Income**

In [None]:
# Flag Income outliers: store the absolute z-score of Income as a column
custdata["z_value_income"] = stats.zscore(custdata["Income"])
custdata["z_value_income"] = custdata["z_value_income"].abs()
custdata["z_value_income"]

In [None]:
# Positions of the customers whose Income lies more than `threshold` standard
# deviations from the mean. The cutoff is read from `threshold` instead of
# repeating the literal, so changing it in one place is enough.
threshold = 3
z2 = np.abs(stats.zscore(custdata["Income"]))
np.where(z2 > threshold)

In [None]:
# Show the rows whose absolute Income z-score exceeds 3.
# np.where yields row positions, hence the positional .iloc lookup.
custdata.iloc[np.where(z2>3)]

Incomes of roughly 150k–160k plausibly belong to genuinely high-income customers, so those rows are kept. However, the single customer with an income of 666666.0 is clearly an outlier and is removed.

In [None]:
# Keep only the customers whose absolute Income z-score is NOT above 5.
# The cutoff is deliberately looser than the 3 used for Age, so that only the
# extreme income is discarded. Rows are then renumbered from 0.
custdata = custdata.loc[~(custdata["z_value_income"] > 5)].reset_index(drop=True)
custdata.shape

In [None]:
# set a grey background (use sb.set_theme() if seaborn version 0.11.0 or above)
sb.set(style="darkgrid")

# Two stacked axes sharing the Age axis: a slim box plot above a histogram
f, (ax_box, ax_hist) = plt.subplots(2, sharex=True, gridspec_kw={"height_ratios": (.15, .85)})

# Age is passed as x= explicitly. Given positionally, newer seaborn releases
# treat the first argument as `data` and draw a vertical box, which no longer
# lines up with the histogram's shared Age axis.
sb.boxplot(x=custdata["Age"], ax=ax_box)
sb.histplot(data=custdata, x="Age", ax=ax_hist)

# Remove x axis name for the boxplot (the histogram below already labels it)
ax_box.set(xlabel='')
plt.show()

In [None]:
# Checking for correlation by unstacking data

# Correlate the numeric columns only. Selecting them explicitly keeps this cell
# working on pandas versions where DataFrame.corr() raises on text columns
# instead of silently skipping them.
corr = custdata.select_dtypes(include="number").corr()
c1 = corr.abs().unstack()

# List each pair of features once: keep the strict upper triangle (no
# self-correlations, no mirrored duplicates) instead of skipping a fixed number
# of sorted entries, which only works for one particular column count.
# dropna() removes the masked cells and pairs involving constant columns.
upper_triangle = np.triu(np.ones(corr.shape, dtype=bool), k=1)
c1_pairs = corr.abs().where(upper_triangle).stack().dropna()
c1_pairs.sort_values(ascending=False).head(13)

## 2. Feature Engineering

In [None]:
# Total spending across all product categories
custdata["Spent"] = (
    custdata["MntWines"]
    + custdata["MntFruits"]
    + custdata["MntMeatProducts"]
    + custdata["MntFishProducts"]
    + custdata["MntSweetProducts"]
    + custdata["MntGoldProds"]
)

# Derive the living situation from marital status.
# Statuses not listed here (e.g. a literal "Alone") pass through unchanged.
custdata["Living_With"] = custdata["Marital_Status"].replace({
    "Married": "Partner",
    "Together": "Partner",
    "Absurd": "Alone",
    "Widow": "Alone",
    "YOLO": "Alone",
    "Divorced": "Alone",
    "Single": "Alone",
})

# Total children living in the household
custdata["Children"] = custdata["Kidhome"] + custdata["Teenhome"]

# Total members in the household: adults (1 or 2) plus children.
# map() gives a numeric result directly; replace() with string->int values
# depends on implicit object-to-int downcasting that pandas has deprecated.
household_adults = custdata["Living_With"].map({"Alone": 1, "Partner": 2})
# Any status the mapping above did not cover shows up as NaN here; fail loudly
# rather than carrying NaN household sizes forward.
assert household_adults.notna().all(), "Living_With contains a value other than 'Alone' or 'Partner'"
custdata["Family_Size"] = household_adults.astype(int) + custdata["Children"]

# 1 if the customer has at least one child at home, else 0
custdata["Is_Parent"] = np.where(custdata["Children"] > 0, 1, 0)

# Segment education levels into three groups
custdata["Education"] = custdata["Education"].replace({
    "Basic": "Undergraduate",
    "2n Cycle": "Undergraduate",
    "Graduation": "Graduate",
    "Master": "Postgraduate",
    "PhD": "Postgraduate",
})

# Drop columns that are now redundant: identifiers, the raw inputs of the
# derived features, and the helper z-score columns
to_drop = ["ID", "Year_Birth", "Marital_Status", "Dt_Customer", "Z_CostContact", "Z_Revenue", "z_value_age", "z_value_income"]
custdata = custdata.drop(columns=to_drop)

In [None]:
# Check the resulting columns and dtypes after feature engineering
custdata.info()