In [None]:
%load_ext autoreload
%autoreload 2


In [None]:
import shap
import random
import datetime
import numpy as np
import pandas as pd  
import seaborn as sns
from sklearn.svm import SVC
from math import floor, ceil
import matplotlib.pyplot as plt
import matplotlib.style as style
import matplotlib.dates as mdates
from itertools import combinations
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from sklearn.feature_selection import RFECV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import silhouette_score, roc_auc_score
from sklearn.preprocessing import StandardScaler, PowerTransformer
from yellowbrick.cluster import KElbowVisualizer, SilhouetteVisualizer
from sklearn.metrics import plot_confusion_matrix, classification_report, plot_precision_recall_curve
from sklearn.model_selection import train_test_split, GridSearchCV, train_test_split, cross_val_score, StratifiedKFold

In [None]:
# General setup
plt.style.use('fivethirtyeight')

plt.rcParams.update({'figure.figsize': (8, 4)})

# To be able to reproduce results: seed both the stdlib RNG and NumPy's global
# RNG (the latter is what NumPy-based libraries fall back to when no
# random_state is passed)
random.seed(42)
np.random.seed(42)

# Today variable, useful for calculating ages
# NOTE(review): Age and Tenure are derived from the wall-clock date, so they
# change between runs — confirm whether a fixed reference date should be used.
today = datetime.datetime.now()

In [None]:
# Create an array with the colors you want to use
colors = ['#004792', '#AFC4DB']
# Set your custom color palette
sns.set_palette(sns.color_palette(colors))

In [None]:
# MATPLOTLIB CONFIGS
plt.rcParams['axes.facecolor']='white'
plt.rcParams['figure.facecolor']='white'
plt.rcParams['savefig.facecolor']='white'

In [None]:
default_color='#004792'

# Helper Functions


In [None]:
def corr_plot(df, figsize=(11, 9), cmap='Blues'):
    """Draw an annotated heatmap of the pairwise correlations of ``df``.

    Only the strict lower triangle is shown; the diagonal and the upper
    triangle are masked out because the matrix is symmetric.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose ``.corr()`` matrix is plotted.
    figsize : tuple
        Size of the matplotlib figure created for the heatmap.
    cmap : str or colormap
        Colormap forwarded to ``sns.heatmap``.
    """
    correlations = df.corr()

    # True marks the cells to hide (diagonal and above)
    hidden_cells = np.triu(np.ones(correlations.shape, dtype=bool))

    # New figure of the requested size; the heatmap is drawn on its axes
    plt.subplots(figsize=figsize)

    heatmap_options = dict(
        mask=hidden_cells,
        cmap=cmap,
        vmin=-1,
        vmax=1,
        center=0,
        square=False,
        linewidths=.5,
        cbar_kws={"shrink": .7},
        annot=True,
        fmt=".1%",
    )
    sns.heatmap(correlations, **heatmap_options)

    plt.xticks(rotation=60)
    plt.tight_layout()

def plot_cluster_var_comparison(df, exclude, cluster_var='Cluster'):
    """Scatter every pair of columns of ``df``, coloured by cluster.

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing the variables to compare and the ``cluster_var`` column.
    exclude : list
        Column names to leave out of the comparison. The list is not modified.
    cluster_var : str
        Column used as ``hue``; it is always excluded from the plotted pairs.

    Raises
    ------
    ValueError
        If fewer than two columns remain after the exclusions.
    """
    # Copy so that the caller's list is not silently extended
    excluded = list(exclude)
    if cluster_var not in excluded:
        excluded.append(cluster_var)

    columns = df.columns.difference(excluded)
    var_combinations = list(combinations(columns, 2))
    if not var_combinations:
        raise ValueError('At least two columns are needed to build a comparison grid')

    rows = ceil(len(var_combinations)/2)

    # squeeze=False keeps a 2-D array of axes whatever the grid size is
    fig, axes = plt.subplots(rows, 2, figsize=(14, rows*5), squeeze=False)
    axes = axes.ravel()

    for (x_axis, y_axis), ax in zip(var_combinations, axes):
        sns.scatterplot(data=df, x=x_axis, y=y_axis, hue=cluster_var,
                        legend='brief', ax=ax, s=70)

    # With an odd number of pairs the last panel stays empty: hide it
    for unused_ax in axes[len(var_combinations):]:
        unused_ax.axis('off')

    plt.tight_layout()

def drop_highly_correlated(df, threshold=.95):
    """Drop columns whose absolute correlation with an earlier column exceeds ``threshold``.

    Correlations are computed on the numeric and boolean columns only; other
    columns are never dropped. For every pair above the threshold, the column
    that appears later in the frame is the one removed.

    Parameters
    ----------
    df : pandas.DataFrame
    threshold : float
        Absolute-correlation cut-off.

    Returns
    -------
    pandas.DataFrame
        A new frame without the dropped columns. The dropped names and the
        shapes before/after are printed.
    """
    init_shape = df.shape

    # Restrict to numeric/bool data explicitly: recent pandas versions raise
    # instead of silently skipping non-numeric columns in DataFrame.corr()
    corr_matrix = df.select_dtypes(include=['number', 'bool']).corr().abs()

    # Keep only the strict upper triangle so each pair is inspected once
    # (builtin bool: the np.bool alias no longer exists in current NumPy)
    upper = corr_matrix.where(
        np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1))

    # Columns correlated above the threshold with any column before them
    to_drop = [column for column in upper.columns
               if (upper[column] > threshold).any()]

    df = df.drop(columns=to_drop)

    print(f'Dropping columns: {to_drop}')
    print(f'Initial df Shape: {init_shape}\nFinal df Shape: {df.shape}')

    return df

# Compute and plot the empirical cumulative distribution function (ECDF)
def plot_ecdf(data, ax, xlabel='Data Values', ylabel='FDEC', color='#FF3030'):
    """
    Plot the ECDF of ``data`` on ``ax``.

    Parameters
    ----------
    data : array-like
        One column of values. Missing values are ignored, so the curve always
        ends at 1.
    ax : matplotlib.axes.Axes
        Axes to draw on.
    xlabel, ylabel : str
        Axis labels.
    color : str
        Marker colour.
    """
    values = np.asarray(data)
    # Drop missing values: they cannot be ranked and would otherwise inflate
    # the denominator
    values = np.sort(values[~pd.isna(values)])
    n_obs = len(values)
    yaxis = np.arange(1, n_obs+1)/n_obs

    ax.plot(values, yaxis, linestyle='none', marker='.', color=color)
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)
    # Set the margins on the axes we were given, not on pyplot's current axes
    ax.margins(0.02)

def plot_dist(df, columns, hist=True, ecdf=True, figsize=(8, 4), color=default_color):
    """
    Show one two-panel figure per column: distribution plot on the left,
    ECDF on the right.

    ``hist`` and ``ecdf`` switch the respective panel on or off; a panel that
    is switched off is left empty. ``figsize`` and ``color`` are forwarded to
    the figure and to both plots.
    """
    for name in columns:
        fig, (left_ax, right_ax) = plt.subplots(1, 2, figsize=figsize)
        plt.suptitle(f'Histograma e FDEC da variável {name}', y=1.035)

        if hist:
            sns.distplot(df[name], ax=left_ax, color=color)

        if ecdf:
            plot_ecdf(df[name], ax=right_ax, xlabel=name, color=color)

        plt.tight_layout()
        plt.show()

def accpt_bar_plot(df, column, title='', xlabel='', figsize=(10, 5), ylim_offset=5, ax=None):
    """Bar plot of the share of customers per ``column`` value, split by acceptance.

    Each bar is the number of distinct ``ID`` values in an (``accpt``,
    ``column``) group divided by the number of rows of ``df``, in percent.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the ``ID``, ``accpt`` and ``column`` columns.
    column : str
        Grouping variable shown on the x axis.
    title, xlabel : str
        Title and x label of the plot.
    figsize : tuple
        Size of the figure created when ``ax`` is None; ignored otherwise.
    ylim_offset : float
        Head-room, in percentage points, left above the tallest bar.
    ax : matplotlib.axes.Axes, optional
        Axes to draw on. When given, the plot becomes one panel of the
        caller's figure and nothing is saved here. When omitted, a new figure
        is created and saved to ``./images/{column}_accpt.png``.
    """
    df_bar = (df.groupby(['accpt', column])[
              'ID'].nunique().to_frame('perc_customer')/df.shape[0])*100
    df_bar = df_bar.reset_index().sort_values(by=column, ascending=True)

    # Only create (and later save) a figure when the caller did not supply axes
    owns_figure = ax is None
    if owns_figure:
        _, ax = plt.subplots(figsize=figsize)

    sns.barplot(x=column, y='perc_customer', hue='accpt',
                hue_order=["Não aceitou nenhuma campanha",
                           'Aceitou pelo menos uma campanha'],
                data=df_bar, alpha=1, saturation=1, edgecolor='k', linewidth=.7,
                ax=ax)

    # Write the percentage just above each bar
    for p in ax.patches:
        width = p.get_width()
        height = p.get_height()
        x, y = p.get_xy()
        ax.annotate('%.1f' % height + '%', (x + width /
                                            1.8, y + height*1.01), ha='center')

    ax.xaxis.set_tick_params(labelsize=15)
    ax.yaxis.set_tick_params(labelsize=15)
    ax.legend(fontsize='medium', loc='best', title='')
    ax.set_title(title)
    ax.set_xlabel(xlabel)
    ax.set_ylabel('% de clientes')
    ax.set_ylim(0, df_bar.perc_customer.max() + ylim_offset)

    if owns_figure:
        ax.figure.tight_layout()
        ax.figure.savefig(f'./images/{column}_accpt.png', dpi=300)
    
    #plt.show()
# f, ax = plt.subplots(2, 2, figsize=(16, 8), constrained_layout=True)
# plt.grid(False)
# accpt_bar_plot(df, 'Education', xlabel='Escolaridade', ax=ax[0][1])
# accpt_bar_plot(df, 'Marital_Status', xlabel='Estado civil', ylim_offset=5, ax=ax[0][0])

# accpt_bar_plot(df, 'AgeGroup', xlabel='Faixa etária', ylim_offset=10, ax=ax[1][0])
# accpt_bar_plot(df, 'Kidhome', xlabel='Número de crianças em casa', ylim_offset=5, ax=ax[1][1])

# plt.savefig('images/com_profile.png', dpi=300, bbox_inches='tight')
#plt.tight_layout()

# f, ax = plt.subplots(1, 3, figsize=(18, 4.5), constrained_layout=True)
# plt.grid(False)

# accpt_bar_plot(df, 'r_score', title='Recency', xlabel='Score', ylim_offset=8, ax=ax[0])
# accpt_bar_plot(df, 'f_score', title='Frequency', xlabel='Score', ylim_offset=8, ax=ax[1])
# accpt_bar_plot(df, 'm_score', title='Monetary', xlabel='Score', ylim_offset=8, ax=ax[2])

# No figure is saved here: this cell only defines helpers, and 'images/com_rfm.png'
# is written by the cell that actually draws the RFM acceptance grid.
    
def campaign_success(df_, index):
    """Summarise cost, revenue and acceptance of a campaign in a one-row frame.

    Parameters
    ----------
    df_ : pandas.DataFrame
        Contacted customers; must contain ``ID``, ``Response``,
        ``Z_CostContact`` and ``Z_Revenue``.
    index : hashable
        Label used as the index of the returned row.

    Returns
    -------
    pandas.DataFrame
        One row with the number of customers, the number that accepted, the
        acceptance rate, total cost, revenue, profit and ROI. The two rates
        are strings such as ``'15.0%'`` (``'nan%'`` when they cannot be
        computed because the frame is empty or the cost is zero).
    """
    def _as_pct(ratio):
        # Format AFTER scaling so float artefacts such as 28.999999999999996
        # never reach the label
        return f'{round(ratio, 2) * 100:.1f}%'

    responders = df_[df_.Response == 1]

    custo_total = df_['Z_CostContact'].sum()
    revenue = responders['Z_Revenue'].sum()
    lucro = revenue - custo_total

    n_contacted = df_.shape[0]
    roi_ratio = lucro / custo_total if custo_total else float('nan')
    success_ratio = responders.shape[0] / n_contacted if n_contacted else float('nan')

    customers = df_.ID.nunique()
    accepted = responders.ID.nunique()

    return pd.DataFrame([[customers, accepted, _as_pct(success_ratio), custo_total, revenue, lucro, _as_pct(roi_ratio)]],
                        columns=['# total de clientes', '# que aceitaram', 'Pct. Adesão', 'Custo total', 'Receita', 'Lucro', 'ROI'], index=[index])

# EDA

## To look for:

* Profiling the respondents:
    * Education 
    * Income
    * Age
    * Time as customer
    * Buying behavior
    * RFM score of respondents
    

* As we don't know the kind of campaign that the marketing team will run, we will focus on understanding which customers are more likely to accept any campaign. If we knew the scope of the campaign, we could do a more specific investigation: for example, if the next campaign focused on alcoholic beverages, we could search for patterns and identify customers who are more likely to buy alcoholic products when they are offered at discounted prices or in special deals.

In [None]:
# Reading data
df = pd.read_csv('data/ml_project1_data.csv')

* Creating age column from Year_Birth 

In [None]:
# Age column
df['Age'] = today.year - df['Year_Birth']

In [None]:
# Identifying out of scope variables
fig, ax = plt.subplots(1, 2, figsize=(12, 4))

plt.suptitle('Distribution of Z annotated variables')
sns.distplot(df['Z_CostContact'], ax=ax[0], kde=False, color=default_color)
sns.distplot(df['Z_Revenue'], ax=ax[1], kde=False, color=default_color)
plt.show()

## Z variables
* From the case .pdf file the Z variables are related to the campaign costs and success rates
* The campaign success rate and return of investment can be calculated through them

In [None]:
# ROI is kept as a ratio and success_rate as a percentage, as before
total_cost = df['Z_CostContact'].sum()
total_revenue = df[df.Response==1]['Z_Revenue'].sum()
roi = round((total_revenue - total_cost)/total_cost, 2)
success_rate = round(df[df.Response==1].shape[0]/df.shape[0], 2)*100
# roi is a ratio, so it must be scaled when printed next to a '%' sign;
# success_rate is formatted to hide float artefacts of the *100
print(f"Campaign success rate: {success_rate:.0f}%\nReturn on investment (ROI): {roi:.0%}")

In [None]:
df.Response.value_counts()

In [None]:
# Non dummy variables like accepted campaign and response
non_dummy = ['Age', 'Income', 'Kidhome', 'Teenhome', 
              'Recency', 'MntWines', 'MntFruits',
              'MntMeatProducts', 'MntFishProducts', 'MntSweetProducts',
              'MntGoldProds', 'NumDealsPurchases', 'NumWebPurchases',
              'NumCatalogPurchases', 'NumStorePurchases', 'NumWebVisitsMonth']

In [None]:
corr_plot(df[non_dummy], figsize=(18, 7), cmap='coolwarm')
# Checking for sanity if any correlation above .9 threshold
_ = drop_highly_correlated(df[df.columns.difference(['ID', 'Year_Birth'])], threshold=.9)

* There were no variables with correlation above .9

## Duplicates

In [None]:
# Checking for customers with multiple rows
df['ID'].value_counts().sort_values(ascending=False).to_frame('Customer Frequency').head()

* There are no duplicates in this dataset

## Missing Values

In [None]:
df.isnull().sum()

* Only the income column has missing values:
    * Values will be imputed using the mean of their group represented by: ['Education', 'Marital_Status', 'AgeGroup']
    * Age category is divided as:
        * 20-30
        * 30-50
        * 50 >

## Outlier detection

* Using histograms and ecdf plots to identify possible outliers in numerical columns
* Checking possible outlier in categorical columns like Marital Status, Education

### Numeric outlier

In [None]:
plot_dist(df, 
          columns=['Teenhome', 'Kidhome', 'Recency', 'MntWines', 'MntFruits',
                       'MntMeatProducts', 'MntFishProducts', 'MntSweetProducts',
                       'MntGoldProds', 'NumDealsPurchases', 'NumWebPurchases',
                       'NumCatalogPurchases', 'NumStorePurchases', 'NumWebVisitsMonth', 'Age', 'Income'], 
          figsize=(12, 4), color=default_color)

* Most of the variables seem to be skewed. This is a point of concern if we are going to use K-Means, which assumes that the data is not skewed.
* The Age and Income columns have outliers, clearly seen in their ECDF plots.

* We will not remove customers with outlying values; instead we will treat those values, since all customers are eligible to receive any kind of deal.
* Transformations:
    * Age will be clipped at an upper bound of 80, which is the maximum age before outliers occur.
    * The Income outlier (> 600000) will be set to missing, so that it is imputed together with the other missing incomes.

In [None]:
# Clip Age at 80; no lower bound is applied (the original note states that the
# minimum age is 24)
df['Age'] = df['Age'].clip(upper=80)
# Incomes above 600k are set to NaN here (they are not replaced by the median),
# so from this point on they count as missing values
df.loc[df['Income'] > 6e5, 'Income'] = np.nan

In [None]:
plot_dist(df, columns=['Age', 'Income'], figsize=(12, 4), color=default_color)

### Categorical outlier

In [None]:
df['Marital_Status'].value_counts().sort_values().plot(kind = 'barh', color=default_color, alpha=.95, figsize=(8, 5), fontsize=13)
plt.tight_layout()
plt.savefig('images/marti_outlier.png', dpi=300)
plt.show()

In [None]:
df['Education'].value_counts().sort_values().plot(kind = 'barh', color=default_color, alpha=.95, figsize=(8, 5), fontsize=13)
plt.tight_layout()
plt.show()

* It seems that only the Marital Status column has odd values, such as Alone, Absurd, and YOLO.
* These values will be categorized as Other

In [None]:
# Other replace
df.loc[df['Marital_Status'].isin(['Alone',
       'Absurd', 'YOLO']), 'Marital_Status'] = 'Other'


In [None]:
df['Marital_Status'].value_counts().sort_values().plot(kind = 'barh', color=default_color, alpha=.95, figsize=(8, 5), fontsize=13)
plt.tight_layout()
plt.show()

* Notice that Other marital status has a lower frequency in the customer dataset


## Campaign success rate

In [None]:
# Number of campaigns (including the last one, 'Response') accepted by each customer
freq_campaign = df[['Response', 'AcceptedCmp3', 'AcceptedCmp4',
                    'AcceptedCmp5', 'AcceptedCmp1',
                    'AcceptedCmp2']].sum(axis=1). \
    value_counts().sort_values(ascending=False).to_frame('freq')

# Draw on the axes created here; without ax=ax pandas opens a second figure and
# this one is left behind empty
f, ax = plt.subplots(1, 1, figsize=(8, 5))
freq_campaign.freq.plot(kind='bar', color=default_color,
                        alpha=.95, fontsize=13, ax=ax)

plt.title('Number of campaigns accepted per customer')
plt.xlabel('Number of campaigns accepted')
plt.ylabel('Number of customers')

# Annotate each bar with its share of all customers
for p in ax.patches:
    width = p.get_width()
    height = p.get_height()
    perc = p.get_height()/df.shape[0]
    x, y = p.get_xy()
    ax.annotate(f'{perc:.02%}', (x + width/2, y + height*1.01), ha='center')


plt.xticks(rotation=0)
plt.ylim(0, 1800)

plt.tight_layout()
plt.show()

* From the plot above we can notice that:
 * ~73% of the customers didn't accept any campaign
 * ~27% of the customers accepted at least one campaign
 * A small number of customers accepted all of the campaigns, 10 customers to be exact

In [None]:
# Customers who accepted each campaign; the 'Response' column (last campaign)
# is labelled as campaign 6. The columns are already listed in campaign order,
# so no sorting is needed.
df_cmp = df[['AcceptedCmp1', 'AcceptedCmp2', 'AcceptedCmp3', 'AcceptedCmp4', 'AcceptedCmp5', 'Response']].sum()
df_cmp.index = [f'{i}' for i in range(1, 7)]
ax = df_cmp.plot(kind='bar', color=default_color,
                        alpha=.95, fontsize=13,
                        figsize=(8, 5))

plt.title('Campaign success rate')
plt.ylabel('Number of customers')

# Annotate each bar with its share of all customers
for p in ax.patches:
    width = p.get_width()
    height = p.get_height()
    perc = p.get_height()/df.shape[0]
    x, y = p.get_xy()
    ax.annotate(f'{perc:.02%}', (x + width/2, y + height*1.01), ha='center')

plt.xlabel('Campaign')
plt.xticks(rotation=0)
plt.ylim(0, 370)
plt.tight_layout()
plt.show()

* From the plot above:
    * The last campaign was the most successful in terms of customer adhesion, with about twice the success rate of the previous campaigns.
    * Campaign 2 was the worst one with 1.34% customer adhesion

## Profiling respondents

* In this section an analysis will be made to better understand the customers who responded positively to the campaigns

In [None]:
df['accpt'] = "Não aceitou nenhuma campanha"

accpt_mask = df[['Response', 'AcceptedCmp3', 
                 'AcceptedCmp4', 'AcceptedCmp5', 
                 'AcceptedCmp1', 'AcceptedCmp2']].sum(axis=1) > 0

df.loc[accpt_mask, 'accpt'] = 'Aceitou pelo menos uma campanha'

In [None]:
accpt_bar_plot(df, 'Education', title='Education')

In [None]:
# Creating age groups
labels = ['20-40 Anos', '41-60 Anos', '60+ Anos']
bins = [20, 40, 60, 100]
df['AgeGroup'] = pd.cut(df['Age'], bins=bins, labels=labels,)
accpt_bar_plot(df, 'AgeGroup', xlabel='Faixa etária', ylim_offset=10)


In [None]:
# Tenure, i.e, time as consumer
df['Tenure'] = (today - pd.to_datetime(df.Dt_Customer)).dt.days/365
plot_dist(df, columns=['Tenure'], figsize=(12, 4), color=default_color)


* The time as customer (tenure) is well spread across customers


In [None]:
# Creating tenure groups (years as a customer)
# NOTE(review): the bin edges are hard-coded while Tenure is computed from
# `today`; tenures outside (6, 9] fall in no bin and become NaN — confirm the
# edges still fit the data when the notebook is re-run.
labels = ['6-7 Anos', '7-9 Anos']
bins = [6, 7, 9]
df['TenureGroup'] = pd.cut(df['Tenure'], bins=bins, labels=labels,)
accpt_bar_plot(df, 'TenureGroup', xlabel='Tempo como consumidor', ylim_offset=10)

In [None]:
accpt_bar_plot(df, 'Kidhome', xlabel='Número de crianças em casa', ylim_offset=5)

In [None]:
accpt_bar_plot(df, 'Teenhome', xlabel='Número de adolescentes em casa', ylim_offset=5)

In [None]:
# Number of children at home
df['Number_Children'] = df['Kidhome'] + df['Teenhome']

In [None]:
accpt_bar_plot(df, 'Number_Children', xlabel='Number of children at home', ylim_offset=5)

In [None]:
accpt_bar_plot(df, 'Marital_Status', ylim_offset=5)


In [None]:
f, ax = plt.subplots(1, 2, figsize=(16, 5))

(df[df.Response == 1][['MntWines', 'MntFruits', 
                     'MntMeatProducts', 'MntFishProducts',
                     'MntSweetProducts', 'MntGoldProds']]\
                    .sum()/1e3).sort_values().plot(kind='barh', 
                                              color=default_color, 
                                              alpha=.95, 
                                              ax=ax[1])
ax[1].xaxis.set_tick_params(labelsize=14)
ax[1].yaxis.set_tick_params(labelsize=14)
ax[1].set_xlabel('Quantity in thousands')
ax[1].set_title('Quantity that respondents bought from a product category')

# Second plot

df[df.Response == 1][['NumWebPurchases', 'NumCatalogPurchases', 'NumStorePurchases']]\
                .sum().sort_values().plot(kind='barh', 
                                          color=default_color, 
                                          alpha=.95, 
                                          ax=ax[0])
ax[0].xaxis.set_tick_params(labelsize=14)
ax[0].yaxis.set_tick_params(labelsize=14)
ax[0].set_xlabel('Quantity')
ax[0].set_title("Respondent's usage of different sales channels")
plt.tight_layout()
plt.savefig('images/buying_behavior.png', dpi=300, bbox_inches='tight')
plt.show()

In [None]:
plot_dist(df[df.Response==1], columns=['Income'], figsize=(12, 4), color=default_color)

In [None]:
df[df.Response==1]['Income'].agg([min, max, 'mean', 'median'])

In [None]:
# Adding Tenure to the non dummy variables; guarded so that re-running the
# cell does not append it a second time
if 'Tenure' not in non_dummy:
    non_dummy.append('Tenure')

In [None]:
variation_df = df.groupby('accpt')[non_dummy].mean().T
variation_df.columns = ['mean_accpt', 'mean_non_accpt']

In [None]:
# Relative difference between the two group means, stored as a ratio
# (e.g. 0.5 means +50%)
variation_df['mean_variation'] = (
    (variation_df['mean_accpt'] - variation_df['mean_non_accpt'])
    / variation_df['mean_non_accpt']
).round(2)

# Keep only the spending / purchase variables; errors='ignore' lets the cell be
# re-run after the rows have already been dropped
variation_df = variation_df.drop(
    ['Kidhome', 'Teenhome', 'Age', 'Recency', 'Tenure', 'NumWebVisitsMonth'],
    errors='ignore')

In [None]:
variation_df


In [None]:
# mean_variation is stored as a ratio; scale it so the bars match the
# '%' unit of the x axis label
ax = variation_df['mean_variation'].mul(100).plot(kind='barh', color=default_color, alpha=.95, figsize=(10, 7), legend=None)
ax.xaxis.set_tick_params(labelsize=15)
ax.yaxis.set_tick_params(labelsize=15)
#plt.title('Comparison between the means of respondents and non respondents', loc='left', x=-0.18049, y=1.005)
plt.title('Comparação entre as médias dos clientes positivos e negativos', loc='left', x=-0.18049, y=1.005)


plt.xlabel(r'Variação (%)')
plt.tight_layout()
plt.savefig('images/comparison_of_means.png', dpi=300, bbox_inches='tight')
plt.show()

### Insights over respondents:

* Looking over the education of respondents, most of them have higher education, is the campaign somehow discriminating?
* However most of the customer in the dataset have higher education, why is that?
* Education: majority of them have higher education 
* Income: average income of 60000
* Age: the biggest age group with respondent is 41-60 Years
* Time as customer: the majority of them have been a customer for at least 7 years 
* People at home: most of the respondents don't have children or teenagers at home
* Marital Status: majority is either married or living together
* Buying behavior: biggest sale channel is the physical one and the most bought product is wine
* RFM score of respondents: expected to be really high

# RFM Segmentation

* Recency, frequency, monetary, and tenure segmentation of customers
* Recency: days since last purchase
* Frequency: how many times has the customer used any of the sales channels
* Monetary: In this case we don't have the price spent so we are going to assume a price unit for every kind of product except for gold products which will have a price unit of 2

In [None]:
# Gold products have higher "monetary" value: their amount is doubled
# NOTE(review): this overwrites the column in place, so running the cell again
# doubles the values again, and every later use of MntGoldProds sees the
# doubled figures — confirm that is intended.
df['MntGoldProds'] *= 2


In [None]:
df['Monetary'] = df[['MntWines', 'MntFruits', 
                    'MntMeatProducts', 'MntFishProducts', 
                    'MntSweetProducts', 'MntGoldProds']].sum(axis=1)

In [None]:
# +1 because if customer is in dataset he must have bought something at least one time
df['Frequency'] = 1 + df[['NumWebPurchases', 'NumCatalogPurchases', 'NumStorePurchases']].sum(axis=1)

## Checking for outliers

In [None]:
plot_dist(df, columns=['Recency', 'Frequency', 'Monetary'], figsize=(12, 4), color=default_color)

* No apparent outliers in the RFM variables, so we continue with the segmentation

## Quantiles cutting
* The RFM variables will be cut into quartiles
* 1 - Low Score
* 4 - High Score

In [None]:
# Assigning quartile scores (1 = low, 4 = high) to the RFM variables.
# Recency labels run backwards, so the lowest Recency quartile gets score 4.
# All three scores are cast to int so they share one dtype and can be plotted
# and summed as numbers.
df['r_score'] = pd.qcut(df['Recency'], q=4, labels=range(4, 0, -1)).astype(int)
df['f_score'] = pd.qcut(df['Frequency'], q=4, labels=range(1, 5)).astype(int)
df['m_score'] = pd.qcut(df['Monetary'], q=4, labels=range(1, 5)).astype(int)

In [None]:
plot_dist(df, columns=['r_score', 'f_score', 'm_score'], figsize=(12, 4), color=default_color)

## Analyzing acceptance over RFM variables

In [None]:
f, ax = plt.subplots(1, 3, figsize=(16, 5), constrained_layout=True, sharey=True)
plt.grid(False)

accpt_bar_plot(df, 'r_score', title='Recency', xlabel='Score', ylim_offset=8, ax=ax[0])
accpt_bar_plot(df, 'f_score', title='Frequency', xlabel='Score', ylim_offset=8, ax=ax[1])
accpt_bar_plot(df, 'm_score', title='Monetary', xlabel='Score', ylim_offset=8, ax=ax[2])

plt.grid(True, axis='y')

plt.savefig('images/com_rfm.png', dpi=300, bbox_inches='tight')

In [None]:
accpt_bar_plot(df, 'r_score', title='Recency', xlabel='Score', ylim_offset=8)

In [None]:
accpt_bar_plot(df, 'f_score', title='Frequency', xlabel='Score', ylim_offset=8)

In [None]:
accpt_bar_plot(df, 'm_score', title='Monetary', xlabel='Score', ylim_offset=8)

* We can notice that higher scores tend to have higher acceptance. 
* Knowing this we could formulate a simple approach for the marketing team which uses these **RFM** scores.

## Segmentation using scores
* Usually at this stage we would define segments based on business knowledge and the sum of the RFM scores.
* A different approach using K-Means to find clusters will be followed.

## K-Means Clustering

* The data will be preprocessed so K-Means have a better chance to converge to an optimal solution.
* The number of clusters will be found using the elbow-method with the silhouette score as the metric of evaluation.
* An analysis of the clusters and how they behave will be done later on

### K-Means assumption and data preprocessing

* Data will be processed to satisfy K-Means assumptions which are:
    * Symmetric distributions of variables (not skewed)
    * Variables with the same average values. This ensures that each metric receives an equal weight in the K-Means calculation.
    * Variables with same variance, this also ensures equal importance in the clustering calculation.

In [None]:
rfm_columns = ['Recency', 'Frequency', 'Monetary']

df_rfm = df.set_index('ID')[rfm_columns].copy()

In [None]:
# Before log transform plot
f, ax = plt.subplots(1, len(rfm_columns), figsize=(16, 4))

plt.suptitle('RFM distributions before skewness treatment', y=1.035)
for col, i in zip(rfm_columns, range(len(rfm_columns))):
    sns.distplot(df_rfm[col], ax=ax[i], color=default_color)
    
plt.tight_layout()



plt.show()

In [None]:
# Before log transform plot
f, ax = plt.subplots(1, len(rfm_columns), figsize=(16, 4))

plt.suptitle('RFM distributions before skewness treatment', y=1.035)
for col, i in zip(rfm_columns, range(len(rfm_columns))):
    sns.distplot(df_rfm[col], ax=ax[i], color=default_color)
    
plt.tight_layout()


plt.show()

In [None]:
rfm_columns = ['Recency', 'Frequency', 'Monetary']

# RFM columns taken from df, indexed by customer ID
df_rfm = df.set_index('ID')[rfm_columns].copy()

# 2 x 3 grid: top row before the power transform, bottom row after it
f, ax = plt.subplots(2, len(rfm_columns), figsize=(16, 8))

plt.subplots_adjust(wspace=0.2, hspace=0.4) 

for col, i in zip(rfm_columns, range(len(rfm_columns))):
    sns.distplot(df_rfm[col], ax=ax[0][i], color=default_color)
    
# (disabled) separate standard scaling of Recency
#df_rfm[['Recency']] = StandardScaler().fit_transform(df_rfm[['Recency']])

# Skewness treatment: a Yeo-Johnson power transform (not a log transform) is
# fitted and applied to all three columns
# NOTE(review): this overwrites df_rfm in place, and the next code cell applies
# the same transform to df_rfm again — confirm the double application is intended.
df_rfm[['Frequency', 'Monetary', 'Recency']] = PowerTransformer(method='yeo-johnson').fit_transform(df_rfm[['Frequency', 'Monetary', 'Recency']])

for col, i in zip(rfm_columns, range(len(rfm_columns))):
    sns.distplot(df_rfm[col], ax=ax[1][i], color=default_color)

# Row captions, positioned by hand in data coordinates
ax[0][0].text(134., ax[0][0].get_ylim()[1] + .001, 'Variáveis RFM antes do processamento', color='k',fontsize=20)
ax[1][0].text(3., .4, 'Variáveis RFM após o processamento', color='k',fontsize=20)
#plt.tight_layout()


#plt.savefig('images/tratamento_rfm.png', dpi=300)

* The Frequency and Monetary distributions are skewed; this will be treated.


In [None]:
# Skewness treatment: a Yeo-Johnson power transform (not a log transform) is
# fitted and applied to the three RFM columns
# NOTE(review): df_rfm was already transformed in the previous code cell, so
# this is a second application on the same data — confirm it is intended.
df_rfm[['Frequency', 'Monetary', 'Recency']] = PowerTransformer(method='yeo-johnson').fit_transform(df_rfm[['Frequency', 'Monetary', 'Recency']])

# (disabled) plain log transform alternative
#df_rfm[['Frequency', 'Monetary']].transform(lambda v: np.log(v))

# Distributions after the transform
f, ax = plt.subplots(1, len(rfm_columns), figsize=(16, 4))

plt.suptitle('RFM distributions after skewness treatment', y=1.035)
for col, i in zip(rfm_columns, range(len(rfm_columns))):
    sns.distplot(df_rfm[col], ax=ax[i], color=default_color)
    
plt.tight_layout()
plt.show()

In [None]:
df_rfm.skew()

In [None]:
# Scaling data so mean and variance of variables are the same
# Use StandardScaler
df_rfm.describe().round(3)

In [None]:
# Scaling the variables so they have same avg and std values, only Recency because the other ones were powertransformed
#df_rfm[['Recency']] = StandardScaler().fit_transform(df_rfm[['Recency']])

In [None]:
# Describe after scaling
# Variables have the same avg and std values
# The skewness is added as an extra row; pd.concat is used because
# DataFrame.append no longer exists in current pandas
pd.concat([df_rfm.describe().round(5), df_rfm.skew().to_frame('Skewness').T])

### Finding K

* Using elbow-method to find the K number of clusters

In [None]:
# Instantiate the clustering model and visualizer
kmeans = KMeans(random_state=42)
visualizer = KElbowVisualizer(kmeans, k=(2, 10))


visualizer.fit(df_rfm[rfm_columns].values)        # Fit the data to the visualizer
ax = visualizer.show()        # Finalize and render the figure

* Using the elbow method, the best number of clusters (K) is 4

In [None]:
kmeans = KMeans(random_state=42, n_clusters=int(visualizer.elbow_value_))

In [None]:
df_rfm_copy = df_rfm.copy()

In [None]:
df_rfm = df_rfm_copy.copy()
df_rfm['Cluster'] = kmeans.fit_predict(df_rfm[rfm_columns].values)

df_rfm['Cluster'] = df_rfm['Cluster'].astype(int)
df_rfm = df_rfm.reset_index()

# Assigning labels to original dataframe
df['Cluster'] = df_rfm['Cluster']

In [None]:
# Silhouette score to see how well K-Means performed
s_score = silhouette_score(df_rfm[rfm_columns].values, df_rfm['Cluster'].values, random_state=42)
print(f'Silhoutte Score: {s_score}')

In [None]:
# Melt the normalized dataset and reset the index
df_melt = pd.melt(
                df_rfm, 
# Assign ID and Cluster as ID variables                  
                id_vars=['ID', 'Cluster'],

# Assign RFMT values as value variables
                value_vars=['Recency', 'Frequency', 'Monetary'], 
# Name the variable and value
                var_name='Metric', value_name='Value'
                )

df_piv = df_melt.pivot_table("Value", "Metric", "Cluster")
df_piv = df_piv.reindex(['Recency', 'Frequency', 'Monetary'])
ax = df_piv.plot(marker="o", figsize=(10, 4.8), colormap='seismic')

ax.xaxis.set_tick_params(labelsize=15)
ax.yaxis.set_tick_params(labelsize=15)
plt.xlabel('Metric')
plt.ylabel('Value')
plt.title('Snake plot das variáveis RFM')
plt.savefig('images/snake_plot.png', dpi=300)
plt.show()

In [None]:
# Calculate average RFM values for each cluster
cluster_avg = df.groupby(['Cluster'])[rfm_columns].mean() 

# Calculate average RFM values for the total customer population
population_avg = df[rfm_columns].mean()

# Calculate relative importance of cluster's attribute value compared to population
relative_imp = cluster_avg / population_avg - 1

# Initialize a plot with a figure size of 8 by 2 inches 
plt.figure(figsize=(8, 4))


# Add the plot title
plt.title('Relative importance of attributes')

# Plot the heatmap
ax = sns.heatmap(data=relative_imp, annot=True, cmap='coolwarm', linewidths=.5, cbar_kws={"shrink": 1}, fmt=".1%")
ax.xaxis.set_tick_params(labelsize=15)
ax.yaxis.set_tick_params(labelsize=15)

plt.show()

* Using the plot above we can see which RFM characteristics influence on the cluster score

### RFM Cluster characteristics

* From a graphical analysis using the snake plot and the importance heatmap, we have the following cluster profiles:
    * **Lowest**: High Recency and low Frequency and Monetary values (Non active and non money spenders)
    * **Medium**: Low Recency, Frequency, and Monetary values (Active but are not money spenders)
    * **High**: High Recency, Frequency, and Monetary Value (Non active but are money spenders)
    * **Highest**: Low Recency and high Frequency and Monetary values (Active and money spenders) 

In [None]:
# Assigning each cluster to its respective profile
# (previous mapping, kept for reference)
#df_rfm['Cluster'] = df_rfm['Cluster'].map({2: 'Alto', 1: 'Melhor', 3: 'Medio', 0: 'Pior'})
# NOTE(review): the integer labels come from kmeans.fit_predict and the mapping
# below is hard-coded; check it against the snake plot and the heatmap whenever
# the clustering is re-run, since the numbering may differ. A label missing
# from the dict would become NaN.
df_rfm['Cluster'] = df_rfm['Cluster'].map({3: 'Alto', 1: 'Melhor', 2: 'Medio', 0: 'Pior'})

In [None]:
# Same plot but with renaming of clusters
# Melt the normalized dataset and reset the index
df_melt = pd.melt(
                df_rfm, 
# Assign ID and Cluster as ID variables                  
                id_vars=['ID', 'Cluster'],

# Assign RFMT values as value variables
                value_vars=['Recency', 'Frequency', 'Monetary'], 
# Name the variable and value
                var_name='Metric', value_name='Value'
                )

df_piv = df_melt.pivot_table("Value", "Metric", "Cluster")
df_piv = df_piv.reindex(['Recency', 'Frequency', 'Monetary'])
ax = df_piv.plot(marker="o", figsize=(10, 5), colormap='seismic')

ax.xaxis.set_tick_params(labelsize=15)
ax.yaxis.set_tick_params(labelsize=15)
ax.legend(fontsize=15)
#plt.xlabel('Metric')
plt.ylabel('Valor')
plt.title('Snake plot das variáveis RFM')
#plt.tight_layout()
plt.savefig('images/snakeplot_rfm.png', dpi=300, bbox_inches='tight')

plt.show()

In [None]:
# For reassigning purposes after mapping
df['Cluster'] = df_rfm['Cluster']

In [None]:
# Assigning each cluster to its respective profile
#df['Cluster'] = df['Cluster'].map({0: 'High', 2: 'Highest', 1: 'Medium', 3: 'Lowest'})

In [None]:
# Ordered categorical dtype for the RFM segments, from worst ('Pior') to best ('Melhor')
cat_dtype = pd.CategoricalDtype(
    categories=['Pior', 'Medio', 'Alto', 'Melhor'],
    ordered=True,
)

In [None]:
# Cast the segment labels to the ordered categorical dtype
df_rfm['Cluster'] = df_rfm['Cluster'].astype(cat_dtype)

In [None]:
# Same ordered categorical cast for the main dataframe
df['Cluster'] = df['Cluster'].astype(cat_dtype)

In [None]:
# Campaign acceptance broken down by RFM segment.
# accpt_bar_plot is a helper defined outside this section of the notebook.
accpt_bar_plot(df, 'Cluster', title='Aceitação por segmento RFM')

* From the plot above we can notice that clusters that hold the customers with higher RFM values tend to accept more campaigns than those with low RFM scores.
* Knowing this, we could use this simple segmentation to choose customers from the High and Highest clusters to contact.

## Using a simple RFM segmentation in order to increase ROI

In [None]:
# Compare the pilot campaign with campaigns restricted to the best RFM segments.
# pd.concat replaces the chained DataFrame.append calls: append was deprecated in
# pandas 1.4 and removed in pandas 2.0, while concat stacks the rows the same way.
pd.concat([
    campaign_success(df, 'Piloto'),
    campaign_success(df[df.Cluster.isin(['Melhor', 'Alto'])], 'Somente segmentos RFM Alto e Melhor'),
    campaign_success(df[df.Cluster.isin(['Melhor'])], 'Somente o Melhor segmento RFM'),
])

### RFM approach key takeaways 

* Using the RFM segmentation proposed as we select better customers the ROI and Success Rates tend to increase, however the number of customers decreases, i.e, the reach of the campaign decreases.
* It could be a safer and simpler approach to reduce campaign costs.
* With better business insights the RFM segmentation can improve.

# Predictive Model - Binary Classification
* This section is intended to show the improvements of using a classifier over the simple RFM approach

* Section Divided in:
    * Data Cleaning and Feature Engineering
    * Feature Selection
    * Model Selection
    * Model Evaluation

## Data Cleaning and Feature Engineering

* The Income variable will be imputed as discussed before.
* Variables that have an absolute skewness value above the threshold (`skew_threshold=.3` in `process_data`) will be transformed using the yeo-johnson transform
* Variables that weren't transformed will be scaled using StandardScaler
* Logistic vs SVM
* To prevent data leakage a preprocessing function will be built and run separately on the training and test sets.
* Also, to prevent any leaks of information, variables such as AcceptedCmp will be removed. Besides, new customers or other customers may not have this information.

In [None]:
 df['Response'].replace({0: 'Não aceitou', 1: 'Aceitou'}).value_counts().to_frame(name='Response Count').transpose()

In [None]:
 df['Response'].replace({0: 'Não aceitou', 1: 'Aceitou'}).value_counts()

In [None]:
# Response counts as a two-column frame (label, count)
(
    df['Response']
    .replace({0: 'Não aceitou', 1: 'Aceitou'})
    .value_counts()
    .to_frame(name='Response Count')
    .reset_index()
)

In [None]:
# Share of customers per response label, as a percentage of all rows in df
imb = (
    df['Response']
    .replace({0: 'Não aceitou', 1: 'Aceitou'})
    .value_counts()
    .to_frame(name='Response Count')
    .div(df.shape[0])
    .round(2)
    .mul(100)
)

## ARRANJAR TITULO MELHOR PRA FIGURA

In [None]:
# Bar chart of the class shares, to check how imbalanced the target is
ax = imb.plot(kind='bar', figsize=(6, 4), legend=False)
plt.xticks(rotation=0)
ax.set_title('')
ax.set_ylabel('% de clientes')
ax.tick_params(axis='both', labelsize=15)
ax.figure.savefig('images/imb.png', dpi=300, bbox_inches='tight')

### Imbalanced Dataset

* Only 15% (last campaign's success rate) of the data is labeled with 1.
* To overcome this problem the data will be split in a stratified manner, so that the training and test sets have the same percentage of classes occurrences.
* Also an oversampling technique called SMOTE will be used during training.

In [None]:
# Columns kept for the classification dataset: the features, the 'Response'
# target, and 'AgeGroup' (used later when filling missing income values).
data_columns = [
    # customer profile
    'Education', 'Marital_Status', 'Income', 'Kidhome', 'Teenhome',
    # amount spent per product category
    'MntWines', 'MntFruits', 'MntMeatProducts', 'MntFishProducts',
    'MntSweetProducts', 'MntGoldProds',
    # purchases per channel and web visits
    'NumDealsPurchases', 'NumWebPurchases', 'NumCatalogPurchases',
    'NumStorePurchases', 'NumWebVisitsMonth',
    # remaining attributes, RFM segment and target
    'Complain', 'Age', 'Tenure', 'Recency', 'Cluster', 'Response', 'AgeGroup',
]

In [None]:
# Modelling frame: indexed by customer ID, restricted to the modelling columns
df_clean = df.set_index('ID').loc[:, data_columns].copy()

In [None]:
# Encoding RFM clusters: the greater the number, the better the client is in terms of RFM.
# 'Cluster' is an ordered categorical at this point, and mapping a categorical
# one-to-one yields another categorical rather than a numeric column. The explicit
# cast makes it a plain numeric feature, so that the skew/scaling logic of
# process_data treats it like every other column.
df_clean['Cluster'] = (
    df_clean['Cluster']
    .map({'Medio': 2, 'Alto': 3, 'Melhor': 4, 'Pior': 1})
    .astype(float)
)

In [None]:
# Keep untouched copies of the two categorical columns: they are the grouping
# keys used to impute missing income values. They are dropped again before
# training, where the dummy variables take their place.
df_clean = df_clean.assign(
    raw_Education=df_clean['Education'],
    raw_Marital_Status=df_clean['Marital_Status'],
)

In [None]:
# One-hot encode the categorical columns (the originals are replaced by dummies)
df_clean = pd.get_dummies(data=df_clean, columns=['Education', 'Marital_Status'])

In [None]:
# Drop one dummy per categorical variable: it is redundant given the others.
# Any single level of each variable could have been chosen.
df_clean = df_clean.drop(columns=['Marital_Status_Other', 'Education_2n Cycle'])

In [None]:
def process_data(df, target='Response', skew_threshold=.3):
    """Impute, transform and scale a modelling frame and split it into ``(X, y)``.

    Steps, all applied to a copy of ``df``:

    1. Missing ``Income`` values are filled with the median income of the row's
       (raw_Education, raw_Marital_Status, AgeGroup) group, falling back to the
       overall median of ``df`` when the group median is missing too.
    2. The three helper columns used for the imputation are dropped.
    3. Features whose absolute skewness is above ``skew_threshold`` are
       yeo-johnson power-transformed; the remaining ones are standard-scaled.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``Income``, ``raw_Education``, ``raw_Marital_Status``,
        ``AgeGroup`` and the ``target`` column.
    target : str
        Name of the label column; it is never transformed.
    skew_threshold : float
        Absolute skewness above which a feature is power-transformed.

    Returns
    -------
    (pandas.DataFrame, pandas.Series)
        The processed features (columns sorted by ``Index.difference``) and the
        untouched target column.

    Notes
    -----
    The medians, the skewness-based column split and both transformers are
    estimated from the frame passed in, so two calls (e.g. train and test) use
    independently fitted transformations.
    """
    df_proc = df.copy()

    # Imputing income: try the most granular group first, then the overall median
    group_median = df_proc.groupby(['raw_Education', 'raw_Marital_Status', 'AgeGroup'])\
                        ['Income'].transform('median')
    df_proc.loc[:, 'Income'] = df_proc['Income'].fillna(group_median).fillna(df_proc['Income'].median())

    # Dropping the columns that were only needed for the income imputation
    df_proc = df_proc.drop(['raw_Education', 'raw_Marital_Status', 'AgeGroup'], axis=1)

    # Split the features into "power transform" and "standard scale" groups.
    # `<=` (instead of a strict `<`) guarantees that a feature whose skewness
    # equals the threshold is still scaled rather than silently left raw.
    feature_columns = df_proc.columns.difference([target])
    abs_skew = df_proc[feature_columns].skew().abs()
    pt_columns = abs_skew[abs_skew > skew_threshold].index
    scale_columns = abs_skew[abs_skew <= skew_threshold].index

    # The transformers cannot be fitted on an empty selection, so skip empty groups
    if len(pt_columns) > 0:
        df_proc[pt_columns] = PowerTransformer(method='yeo-johnson').fit_transform(df_proc[pt_columns])
    if len(scale_columns) > 0:
        df_proc[scale_columns] = StandardScaler().fit_transform(df_proc[scale_columns])

    # Quick sanity report of the result
    print(f'Mean Skew value of DF: {df_proc.skew().mean()}\nMedian Skew value of DF: {df_proc.skew().median()}')
    display(df_proc.agg(['mean', 'std']).round(4))

    return df_proc[feature_columns], df_proc[target]

In [None]:
# Stratified 70/30 split, because the classes are imbalanced.
# The split happens before any scaling or sampling to avoid data leakage.
df_train, df_test = train_test_split(
    df_clean,
    test_size=0.3,
    stratify=df_clean['Response'].values,
    random_state=42,
)

In [None]:
# Class proportions in the training set
(
    df_train['Response']
    .replace({0: 'Não aceitou', 1: 'Aceitou'})
    .value_counts()
    .div(df_train.shape[0])
    .to_frame(name='Conjunto de Treino')
    .transpose()
)

In [None]:
# Class proportions in the test set
(
    df_test['Response']
    .replace({0: 'Não aceitou', 1: 'Aceitou'})
    .value_counts()
    .div(df_test.shape[0])
    .to_frame(name='Conjunto de Teste')
    .transpose()
)

In [None]:
# Impute, transform and scale the training split, then separate features and target
X_train, y_train = process_data(df_train)

## Feature Selection

* Variables will be selected using a recursive approach.
* RFECV with logistic regression.
* RFECV: recursive feature elimination and cross-validated selection of the best number of features.
* To prevent any leakage this will be done using the training set.
* This helps to prevent overfitting and reduces dimensionality, resulting in less time to fit our estimator.

In [None]:
# Recursive feature elimination with 5-fold stratified cross-validation,
# using a liblinear logistic regression as the ranking estimator (step=2)
rfe_selector = RFECV(
    estimator=LogisticRegression(solver='liblinear'),
    step=2,
    cv=StratifiedKFold(n_splits=5),
    n_jobs=-1,
)
rfe_selector.fit(X_train, y_train)

# Boolean mask of the kept features, and their names
rfe_support = rfe_selector.get_support()
rfe_feature = X_train.columns[rfe_support].tolist()
print(f'Number of selected features: {len(rfe_feature)}')
rfe_feature

In [None]:
# Impute, transform and scale the test split, then separate features and target.
# NOTE(review): process_data fits a new PowerTransformer/StandardScaler and
# re-derives the skew-based column split on the frame it receives, so the test
# set is transformed with parameters estimated from the test set itself rather
# than from the training set — confirm this is intended.
X_test, y_test = process_data(df_test)

In [None]:
# Features that were selected
rfe_feature

In [None]:
# Restrict both splits to the features chosen by RFECV
X_train_slc = X_train.loc[:, rfe_feature]
X_test_slc = X_test.loc[:, rfe_feature]

## Model Selection

* The model chosen was LogisticRegression because of its simplicity and efficiency with binary classification tasks.
* GridSearchCV (an exhaustive search over the parameter grid) is used to find the best parameters of the estimator, i.e., hyperparameter tuning.
* The training is done using StratifiedKFold to prevent overfitting.
* Within every fold the training portion is oversampled using SMOTE so that both classes have the same number of samples.

In [None]:
# Candidate models and their GridSearch parameter grids.
# Grid keys carry the 'clf__' prefix because the classifier is the 'clf' step
# of the pipeline built in model_selection.
models = [
    {
        'name': 'logreg',
        'label': 'Logistic Regression',
        'classifier': LogisticRegression(random_state=42),
        'grid': {
            "clf__C": np.logspace(-3, 3, 7),
            "clf__penalty": ["l1", "l2"],
            "clf__solver": ['liblinear'],
            'clf__random_state': [42],
        },
    },
]

In [None]:
# from sklearn.metrics import balanced_accuracy_score, average_precision_score, make_scorer

# scorer = make_scorer(balanced_accuracy_score)
# p_scorer = make_scorer(average_precision_score, average='weighted')

In [None]:
def model_selection(classifier, name, grid, X_train, y_train, X_test, y_test,
                    scoring, features_info='all_features', cv=StratifiedKFold(n_splits=5), n_jobs=-1):
    """Tune ``classifier`` with a SMOTE + grid-search pipeline and score it on the test set.

    Parameters
    ----------
    classifier : estimator
        Final step of the pipeline (named ``'clf'``).
    name : str
        Identifier stored in the result under ``'classifier_name'``.
    grid : dict
        GridSearchCV parameter grid; keys must use the ``clf__`` prefix.
    X_train, y_train : training features and labels used by the grid search.
    X_test, y_test : held-out features and labels used only for the final score.
    scoring : str or callable
        Scoring passed to GridSearchCV.
    features_info : str
        Free-text tag describing the feature set, stored in the result.
    cv : cross-validation splitter passed to GridSearchCV.
    n_jobs : int
        Parallelism passed to GridSearchCV.

    Returns
    -------
    dict
        Keys, in this order: ``classifier_name``, ``classifier`` (the refitted
        best pipeline), ``best_params``, ``ROC_AUC_TRAIN`` (the mean
        cross-validated ``scoring`` of the best candidate, i.e. ``best_score_``),
        ``features_info``, ``refit_time``, ``ROC_AUC_TEST``.
    """
    # SMOTE sits inside the pipeline so that it is re-fitted on every training fold
    pipeline = Pipeline([('sampling', SMOTE(random_state=42)), ('clf', classifier)])

    gridsearch_cv = GridSearchCV(pipeline,
                                 grid,
                                 cv=cv,
                                 scoring=scoring,
                                 n_jobs=n_jobs,
                                 verbose=2)
    gridsearch_cv.fit(X_train, y_train)
    best_estimator = gridsearch_cv.best_estimator_

    # ROC AUC is a ranking metric: it has to be computed from continuous scores.
    # Hard predict() labels collapse the curve to a single threshold and make the
    # value incomparable with the cross-validated 'roc_auc' score.
    if hasattr(best_estimator, 'decision_function'):
        y_score = best_estimator.decision_function(X_test)
    else:
        y_score = best_estimator.predict_proba(X_test)[:, 1]

    # Key order matters: downstream cells may address the columns by position
    results_dict = {
        'classifier_name': name,
        'classifier': best_estimator,
        'best_params': gridsearch_cv.best_params_,
        'ROC_AUC_TRAIN': gridsearch_cv.best_score_,
        'features_info': features_info,
        'refit_time': gridsearch_cv.refit_time_,
        'ROC_AUC_TEST': roc_auc_score(y_test, y_score),
    }

    return results_dict

# Every model is tuned twice: on all features and on the RFECV-selected ones
feature_sets = [
    ('all_features', X_train, X_test),
    ('selected_features', X_train_slc, X_test_slc),
]

results = []
for m in models:
    for feature_label, X_tr, X_te in feature_sets:
        results.append(model_selection(m['classifier'],
                                       m['name'],
                                       m['grid'],
                                       X_tr,
                                       y_train,
                                       X_te,
                                       y_test,
                                       'roc_auc', feature_label))

# One row per (model, feature set) run
results = pd.DataFrame.from_dict(results)
results

In [None]:
# Do not truncate cell contents, so the best_params dicts are fully visible
pd.set_option('display.max_colwidth', None)
results.loc[
    :,
    ['classifier_name', 'best_params', 'features_info', 'ROC_AUC_TEST', 'ROC_AUC_TRAIN', 'refit_time'],
].round(3)

* With fewer, more relevant features we have almost the same score.

In [None]:
# Getting the best estimator.
# Candidates are ranked by their cross-validated score ('ROC_AUC_TRAIN' holds
# GridSearchCV's best_score_). The test-set score is deliberately left out of the
# ranking: choosing the model by its test score would leak the test set into
# model selection and make the final evaluation optimistic.
best = results.sort_values(by='ROC_AUC_TRAIN', ascending=False).head(1)

# Address the columns by name rather than by position
clf = best['classifier'].iloc[0]['clf']   # fitted classifier step of the pipeline
feature = best['features_info'].iloc[0]   # 'all_features' or 'selected_features'
display(clf)

In [None]:
# Test features matching the feature set the chosen classifier was trained on
if feature == 'selected_features':
    X_test_final = X_test_slc
else:
    X_test_final = X_test

## Model Evaluation

In [None]:
# Predicting whether each customer in the held-out test set will accept the campaign
# (class labels returned by clf.predict on the test features)
y_pred = clf.predict(X_test_final)

In [None]:
def report_to_df(report):
    """Parse the text of a classification report into a DataFrame.

    The first line supplies the metric names, and 'Resposta' is prepended as
    the name of the label column. The last five lines of the text are ignored
    and blank lines are skipped. All cells are kept as strings.
    """
    # Tokenise every line on single spaces, discarding the empty tokens
    tokenized = [[token for token in line.split(' ') if token != '']
                 for line in report.split('\n')]

    columns = ['Resposta'] + tokenized[0]
    rows = [tokens for tokens in tokenized[1:-5] if tokens != []]

    return pd.DataFrame(data=rows, columns=columns)

In [None]:
# Per-class metrics on the test set, with the class ids replaced by readable labels
report = report_to_df(classification_report(y_test, y_pred)).assign(
    Resposta=lambda frame: frame['Resposta'].replace({'0': 'Não aceitou', '1': 'Aceitou'})
)

In [None]:
report

In [None]:
from sklearn.metrics import plot_roc_curve

In [None]:
# ROC curve of the chosen classifier on the test set.
# NOTE(review): plot_roc_curve was deprecated in scikit-learn 1.0 and removed in
# 1.2 (RocCurveDisplay.from_estimator is its replacement) — confirm the pinned version.
plot_roc_curve(clf, X_test_final, y_test)


In [None]:
# Confusion matrix of the final model, normalised over the true labels (rows)
f, ax = plt.subplots(1, 1, figsize=(7, 5))
plot_confusion_matrix(
    clf,
    X_test_final,
    y_test,
    normalize='true',
    display_labels=["Não aceitou", 'Aceitou'],
    cmap='Blues',
    ax=ax,
    values_format='.1%',
)
ax.set_title('Matriz de confusão do modelo final')
ax.tick_params(axis='both', labelsize=14)
ax.set_ylabel('Real')
ax.set_xlabel('Previsto')
plt.grid(None)
plt.tight_layout()
f.savefig('images/cmatrix_model.png', dpi=300, bbox_inches='tight')
plt.show()

* Model performance:
    * The model does a good job of predicting customers who would accept the campaign.
    * Correctly predicted that 82% accepted the campaign and 76.9% did not.
    * This model is good enough for a pilot run over the current approach.

### Evaluating ROI and Success Rate for test set

In [None]:
# Attach the predictions to the test features.
# assign() builds a new frame instead of adding the column in place: X_test_final
# may be the very same object as X_test (when the all-features model was chosen),
# and an in-place write would silently add 'accepted' to the matrix that is fed
# to the classifier when the evaluation cells are re-run.
X_test = X_test.assign(accepted=y_pred)

In [None]:
# Rows of the original dataframe for the customers the model predicted as accepting.
# X_test is indexed by customer ID, so its index is matched against df.ID.
accepted_ids = X_test.index[(X_test['accepted'] == 1).to_numpy()]
df_model_accepted = df.loc[df['ID'].isin(accepted_ids)].copy()

In [None]:
# Campaign results on the test set: contacting every test customer versus
# contacting only the customers selected by the model.
# pd.concat replaces the chained DataFrame.append call (append was removed in pandas 2.0).
pd.concat([
    campaign_success(df[df.ID.isin(df_test.index)], index='Piloto com todos clientes do conjunto de teste'),
    campaign_success(df_model_accepted, index='Clientes selecionados pelo modelo'),
])

* If we used this simple linear model on the test set we would increase our ROI by 85% and have a 38% campaign success rate.

## Feature Importance using SHAP

In [None]:
# Show every column when displaying dataframes
pd.set_option("display.max_columns",None)
# Initialise SHAP's JavaScript support for notebook visualisations
shap.initjs()

In [None]:
# Linear SHAP explainer for the chosen classifier, built from the RFECV-selected
# training features.
# NOTE(review): this assumes clf was fitted on the selected features
# (feature == 'selected_features'); if the all-features model was chosen,
# X_train would be the matching frame — confirm.
explainer = shap.LinearExplainer(clf, X_train_slc)

In [None]:
# SHAP values for the test rows, using the same RFECV-selected columns that were
# passed to the explainer
shap_values = explainer.shap_values(X_test_slc)

In [None]:
# Start a fresh figure, then draw the SHAP summary plot for the test rows
plt.figure()
shap.summary_plot(shap_values, X_test_slc)

* From the summary plot we have great insights on how the variables are affecting our model, for example:
    * Lower values of recency tend to increase the chance of predicting that a customer will accept the campaign
    * A longer time as a customer also increases the chances of predicting that the customer will accept
    * A relation between the number of catalog purchases and store purchases can be noticed: customers that have few purchases in store and many purchases by catalog may be more susceptible to campaigns.