### 2. Data Cleaning & Wrangling

#### 2.1. Initial review - data shape, type & content

In [None]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Show every column and every row when a dataframe is displayed (None removes
# the display limit), so wide or long outputs are not truncated
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [None]:
# The CSV file ships without a header row, so the column names are listed here
# and attached to the dataframe once it has been loaded
columns = [
    'customer_number', 'offer_accepted', 'reward', 'mailer_type',
    'income_level', 'bank_accounts_open', 'overdraft_protection',
    'credit_rating', 'credit_cards_held', 'homes_owned', 'household_size',
    'home_owner', 'average_balance', 'balance_Q1', 'balance_Q2', 'balance_Q3',
    'balance_Q4',
]

data = pd.read_csv('data/creditcardmarketing.csv', header=None)
data.columns = columns

In [None]:
# Check characteristics of the data, starting with its dimensions
# as a (rows, columns) tuple
data.shape

In [None]:
# Data type of each column
data.dtypes

In [None]:
# Per-column dtype and non-null count, used to spot missing values
data.info()

We can already see that in the balance columns there are 24 null values.

In [None]:
# Summary statistics of the numerical columns, rounded to whole numbers
round(data.describe())

In [None]:
# Preview the first five rows
data.head(5)

We notice the `customer_number` column, which we can use as our index column after ensuring it doesn't have any duplicate values:

In [None]:
# Share of distinct customer numbers among all rows
print(data['customer_number'].unique().size / len(data)) # equal to 1, so no duplicates

In [None]:
# Use the (unique) customer number as the row index and drop it as a column
data = data.set_index('customer_number', drop=True)

In [None]:
# Inspect the distinct values of every categorical (object dtype) column
cat = data.select_dtypes(include=object)

for col in cat:
    print(col, cat[col].unique(), sep=" : ") # No odd entries, so no cleaning required

In [None]:
# Check the unique values in the discrete numerical columns

# Split the numerical data into discrete & continuous data
def split_numericals(data, threshold=10):
    """Split the numerical columns of a dataframe into continuous and discrete.

    A numerical column is treated as continuous when it holds more than
    ``threshold`` distinct values (as reported by ``Series.unique``) and as
    discrete otherwise. Non-numerical columns are ignored.

    Parameters
    ----------
    data : pandas.DataFrame
        Dataframe whose numerical columns should be split.
    threshold : int, default 10
        Largest number of distinct values a column may have to still count
        as discrete.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(num_cont, num_disc)``: the continuous and the discrete columns,
        each in their original column order.
    """
    numeric = data.select_dtypes(np.number)
    # Classify every numerical column once, then build both column lists
    is_continuous = {
        name: len(numeric[name].unique()) > threshold
        for name in numeric.columns
    }
    continuous = [name for name, flag in is_continuous.items() if flag]
    discrete = [name for name, flag in is_continuous.items() if not flag]
    return data.loc[:, continuous], data.loc[:, discrete]

num_cont, num_disc = split_numericals(data)

# Inspect the distinct values of every discrete numerical column
for col in num_disc:
    print(col, num_disc[col].unique(), sep=" : ") # No odd entries, so no cleaning required

#### 2.2. Data Cleaning

As the data set is already relatively clean and the data types are correct, we only need to handle the null values in the balance columns. We will start by checking the maximum percentage of nulls within a row, to decide whether it would be more beneficial to remove the affected rows from the dataset.

In [None]:
def max_nulls_rows(df):
    """Return the highest percentage of null values found in any single row.

    The percentage is the number of null cells in a row relative to the
    number of columns, rounded to one decimal place.

    The nulls are counted row-wise in one vectorised pass instead of looking
    up every index label separately, so the function also works when the
    index contains duplicated labels.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataframe to inspect.

    Returns
    -------
    float
        Largest per-row null percentage; ``0.0`` when the dataframe has no
        rows or no columns.
    """
    # Nothing to count (and nothing to divide by) in an empty dataframe
    if df.empty:
        return 0.0
    nulls_per_row = df.isna().sum(axis=1)
    nulls_percentage = (nulls_per_row * 100 / df.shape[1]).round(1)
    return nulls_percentage.max()

# Highest percentage of null values found in any single row of the data
max_nulls_rows(data)

Given some rows have 31% null values, it might be better to drop them:

In [None]:
data = data.dropna()

# The helper frames used in the EDA were derived before the rows with nulls
# were dropped; restrict them to the remaining customers so that their plots
# describe the same rows as `data`
cat = cat.loc[data.index]
num_cont = num_cont.loc[data.index]
num_disc = num_disc.loc[data.index]

In [None]:
data.shape # We can see we dropped the customers with NaN values across
           # all balance columns

### 3. Exploratory Data Analysis

#### 3.1. Categorical & discrete numerical columns

In [None]:
# Number of columns (and therefore plots) in each helper frame
print("cat column number:", len(cat.columns))
print("num_disc column number:", len(num_disc.columns))

In [None]:
def column_plotter(cat_or_num_disc, plot_type=sns.countplot):
    """Plot every column of a dataframe, two columns side by side per figure.

    Columns are taken in pairs; each pair is drawn on a 1x2 figure. When the
    dataframe has an odd number of columns, the last column is drawn on its
    own, smaller figure. Every figure is shown as soon as it is drawn.

    The leftover column is detected explicitly rather than by catching the
    failed lookup of a second column, so an error raised while plotting is
    no longer swallowed and no unused figure is left behind.

    Parameters
    ----------
    cat_or_num_disc : pandas.DataFrame
        Dataframe whose columns should be plotted.
    plot_type : callable, default ``sns.countplot``
        Plotting function called as ``plot_type(x=..., data=..., ax=...,
        color=...)``.
    """
    columns = list(cat_or_num_disc.columns)
    for start in range(0, len(columns), 2):
        pair = columns[start:start + 2]
        if len(pair) == 2:
            fig, axes = plt.subplots(1, 2, figsize=(12, 4))
            plot_type(x=pair[0], data=cat_or_num_disc, ax=axes[0], color='c')
            plot_type(x=pair[1], data=cat_or_num_disc, ax=axes[1], color='b')
        else:
            # Odd number of columns: the last one gets a figure of its own
            fig, ax = plt.subplots(1, 1, figsize=(6, 4))
            plot_type(x=pair[0], data=cat_or_num_disc, ax=ax, color='c')
        plt.show()

In [None]:
# Count plots (the default plot type) for every categorical column
column_plotter(cat)

We notice that the `offer_accepted`, `overdraft_protection`, and `home_owner` columns are imbalanced.

In [None]:
# Count plots (the default plot type) for every discrete numerical column
column_plotter(num_disc)

We notice we have a very small representation of customers that:
- have 3 open bank accounts
- have 4 credit card accounts
- own 3 houses
- belong to a household of more than 6 people

#### 3.2. Numerical continuous data

In [None]:
# Histograms of every continuous numerical column
column_plotter(num_cont, plot_type=sns.histplot)

We note that the average, Q2 & Q3 balances have relatively normal distributions, whereas the Q1 & Q4 balances generally tend to be on the lower side.

In [None]:
# Boxplots of every continuous numerical column
column_plotter(num_cont, plot_type=sns.boxplot)

Check how the `average_balance` is distributed depending on various categories:

In [None]:
# One boxplot per categorical column: average balance on the x-axis,
# split by the column's categories on the y-axis
for col in cat:
    sns.boxplot(data=data, x='average_balance', y=col)
    plt.show()

In [None]:
# One boxplot per discrete numerical column: the column's values on the
# x-axis, average balance on the y-axis
for col in num_disc:
    sns.boxplot(data=data, x=col, y='average_balance')
    plt.show()

We notice that the `average_balance` has a similar distribution across all categorical & discrete numerical attributes, except for `household_size` (n=8, 9), where there were only two data points present.

We can also look at how categorical & discrete numerical features correlate to each other:

In [None]:
# Cross-tabulate offer acceptance (rows) against home ownership (columns)
offer_v_home_owner = pd.crosstab(
    index=data['offer_accepted'],
    columns=data['home_owner'],
)
print(offer_v_home_owner)

In [None]:
# Cross-tabulate offer acceptance (rows) against credit rating (columns)
offer_v_credit_rating = pd.crosstab(
    index=data['offer_accepted'],
    columns=data['credit_rating'],
)
print(offer_v_credit_rating)

In [None]:
# Calculate the correlation matrix once, from the numerical columns only:
# the categorical (object) columns cannot be correlated, and pandas >= 2.0
# no longer leaves them out by default
corr = data.select_dtypes(np.number).corr()

# Create heatmap, hiding the upper triangle (including the diagonal) because
# the matrix is symmetric
mask = np.triu(np.ones(corr.shape, dtype=bool))
fig, ax = plt.subplots(figsize=(10, 10))
ax = sns.heatmap(corr, mask=mask, annot=True, ax=ax)
plt.show()

We notice that the `average_balance` is highly correlated with all the quarterly balances (which is to be expected), and that each quarterly balance is highly correlated with the previous quarter's balance, which is again to be expected. In further modelling, we might want to use the `average_balance` rather than all the quarterly balances.

In [None]:
# Persist the cleaned dataframe, including its index as the first column,
# to a new .csv file to be used in further analysis
data.to_csv('creditcardmarketing_post_cleaning.csv', index=True)