In [2]:
run load_data.py

### Evaluating Model Pipelines

We will evaluate a total of 24 model pipelines:

1. the original data
1. the original data with outliers removed
1. the original data transformed by a PCA with 2 components 
1. the original data with outliers removed transformed by a PCA with 2 components 
1. the original data transformed by a PCA with 3 components 
1. the original data with outliers removed transformed by a PCA with 3 components  
1. scaled data
1. scaled data with outliers removed
1. scaled data transformed by a PCA with 2 components 
1. scaled data with outliers removed transformed by a PCA with 2 components
1. scaled data transformed by a PCA with 3 components  
1. scaled data with outliers removed transformed by a PCA with 3 components  
1. log transformed, scaled data
1. log transformed, scaled data with outliers removed
1. log transformed, scaled data transformed by a PCA with 2 components
1. log transformed, scaled data with outliers removed transformed by a PCA with 2 components 
1. log transformed, scaled data transformed by a PCA with 3 components  
1. log transformed, scaled data with outliers removed transformed by a PCA with 3 components  
1. box-cox transformed, scaled data
1. box-cox transformed, scaled data with outliers removed
1. box-cox transformed, scaled data transformed by a PCA with 2 components 
1. box-cox transformed, scaled data with outliers removed transformed by a PCA with 2 components 
1. box-cox transformed, scaled data transformed by a PCA with 3 components  
1. box-cox transformed, scaled data with outliers removed transformed by a PCA with 3 components  

### Experiment Design

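We will pass each of these data sets to a Gaussian Mixture Model and then assess the fitted model using the Bayesian Information Criterion (BIC). A lower BIC indicates a better trade-off between goodness of fit and model complexity. Note that the BIC is computed from the likelihood of the data the model was fit on, so values obtained on differently transformed or differently sized data sets should be compared with care.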

In [3]:
from sklearn.mixture import GaussianMixture # Clustering model. Each data set is passed to it whole (see fit_and_score), e.g. all 6 columns of the unreduced data.

In [4]:
# Each list below groups the six variants of one preprocessing family as
# (display name, data set) pairs. None of the data sets are defined in the
# visible cells of this notebook -- presumably they are created by
# load_data.py, which is run in the first cell (confirm there).
# NOTE: the order of the entries matters. Results are appended in this order
# further down, so it determines the row index shown in the results tables.

# Untransformed features.
original_data = [
    ('original', customer_features),
    ('original - no outliers', customer_features_outliers_removed),
    ('original - pca, 2 components', customer_features_pca_2),
    ('original - pca, 3 components', customer_features_pca_3),
    ('original - no outliers, pca, 2 components', customer_features_outliers_removed_pca_2),
    ('original - no outliers, pca, 3 components', customer_features_outliers_removed_pca_3)
]

# Scaled features.
scaled_data = [
    ('scaled', customer_sc),
    ('scaled - no outliers', customer_sc_outliers_removed),
    ('scaled - pca, 2 components', customer_sc_pca_2),
    ('scaled - pca, 3 components', customer_sc_pca_3),
    ('scaled - no outliers, pca, 2 components', customer_sc_outliers_removed_pca_2),
    ('scaled - no outliers, pca, 3 components', customer_sc_outliers_removed_pca_3),
]

# Log-transformed, then scaled features.
log_transformed_data = [
    ('log transformed, scaled', customer_log_sc),
    ('log transformed, scaled - no outliers', customer_log_sc_outliers_removed),
    ('log transformed, scaled - pca, 2 components', customer_log_sc_pca_2),
    ('log transformed, scaled - pca, 3 components', customer_log_sc_pca_3),
    ('log transformed, scaled - no outliers, pca, 2 components', customer_log_sc_outliers_removed_pca_2),
    ('log transformed, scaled - no outliers, pca, 3 components', customer_log_sc_outliers_removed_pca_3),
]

# Box-Cox-transformed, then scaled features.
box_cox_transformed_data = [
    ('box-cox transformed, scaled', customer_box_cox_sc),
    ('box-cox transformed, scaled - no outliers', customer_box_cox_sc_outliers_removed),
    ('box-cox transformed, scaled - pca, 2 components', customer_box_cox_sc_pca_2),
    ('box-cox transformed, scaled - pca, 3 components', customer_box_cox_sc_pca_3),
    ('box-cox transformed, scaled - no outliers, pca, 2 components', customer_box_cox_sc_outliers_removed_pca_2),
    ('box-cox transformed, scaled - no outliers, pca, 3 components', customer_box_cox_sc_outliers_removed_pca_3),
]

In [17]:
# Echo the (name, array) pairs of the untransformed family for inspection.
# NOTE(review): this prints every array in the list; consider showing only
# names and shapes if the output becomes unwieldy.
original_data

[('original', array([[12669,  9656,  7561,   214,  2674,  1338],
         [ 7057,  9810,  9568,  1762,  3293,  1776],
         [ 6353,  8808,  7684,  2405,  3516,  7844],
         ..., 
         [14531, 15488, 30243,   437, 14841,  1867],
         [10290,  1981,  2232,  1038,   168,  2125],
         [ 2787,  1698,  2510,    65,   477,    52]])),
 ('original - no outliers', array([[12669,  9656,  7561,   214,  2674,  1338],
         [ 7057,  9810,  9568,  1762,  3293,  1776],
         [ 6353,  8808,  7684,  2405,  3516,  7844],
         ..., 
         [39228,  1431,   764,  4510,    93,  2346],
         [10290,  1981,  2232,  1038,   168,  2125],
         [ 2787,  1698,  2510,    65,   477,    52]])),
 ('original - pca, 2 components', array([[  6.50022122e+02,   1.58551909e+03],
         [ -4.42680498e+03,   4.04245151e+03],
         [ -4.84199871e+03,   2.57876218e+03],
         [  9.90346437e+02,  -6.27980600e+03],
         [  1.06579987e+04,  -2.15972582e+03],
         [ -2.76596159e

In [5]:
def fit_and_score(data, n_components=2, random_state=0):
    """Fit a Gaussian mixture to ``data`` and return its BIC on that same data.

    Parameters
    ----------
    data : array-like
        Samples to cluster. Passed unchanged to ``GaussianMixture.fit`` and
        ``GaussianMixture.bic``.
    n_components : int, default 2
        Number of mixture components (clusters) to fit.
    random_state : int or None, default 0
        Seed forwarded to ``GaussianMixture``. The fit starts from a random
        initialisation, so without a fixed seed the reported BIC can change
        from run to run. Pass ``None`` to get the unseeded behaviour.

    Returns
    -------
    float
        The Bayesian Information Criterion of the fitted model, evaluated on
        ``data`` (lower is better).
    """
    model = GaussianMixture(n_components=n_components, random_state=random_state)
    model.fit(data)
    # The BIC is a method of the fitted GaussianMixture model itself.
    return model.bic(data)

In [6]:
n = 2

results_2_clusters = []

# Score every pipeline in one loop instead of one copy-pasted loop per family.
# The four lists are concatenated in the same order the separate loops used,
# so the row order (and therefore the DataFrame index) is unchanged.
for name, data in (
    original_data
    + scaled_data
    + log_transformed_data
    + box_cox_transformed_data
):
    results_2_clusters.append({
        'name' : name,
        'n' : n,
        'BIC' : fit_and_score(data, n)
    })


In [34]:
# Rank the pipelines by BIC in ascending order, so the lowest-BIC fits come first.
# NOTE(review): `pd` is not imported in the visible cells of this notebook --
# presumably it is provided by load_data.py; confirm.
pd.DataFrame(results_2_clusters).sort_values('BIC')

Unnamed: 0,BIC,n,name
10,1885.492263,2,"scaled - no outliers, pca, 2 components"
11,2392.132209,2,"scaled - no outliers, pca, 3 components"
7,2448.488723,2,scaled - no outliers
8,2586.635476,2,"scaled - pca, 2 components"
16,3042.141635,2,"log transformed, scaled - no outliers, pca, 2 ..."
22,3135.999109,2,"box-cox transformed, scaled - no outliers, pca..."
14,3154.867206,2,"log transformed, scaled - pca, 2 components"
20,3185.304232,2,"box-cox transformed, scaled - pca, 2 components"
9,3415.356073,2,"scaled - pca, 3 components"
17,4060.888354,2,"log transformed, scaled - no outliers, pca, 3 ..."


In [8]:
n = 3

results_3_clusters = []

# Score every pipeline in one loop instead of one copy-pasted loop per family.
# The four lists are concatenated in the same order the separate loops used,
# so the row order (and therefore the DataFrame index) is unchanged.
for name, data in (
    original_data
    + scaled_data
    + log_transformed_data
    + box_cox_transformed_data
):
    results_3_clusters.append({
        'name' : name,
        'n' : n,
        'BIC' : fit_and_score(data, n)
    })



In [9]:
# Rank the 3-component fits by BIC in ascending order (lowest BIC first).
pd.DataFrame(results_3_clusters).sort_values('BIC')

Unnamed: 0,BIC,n,name
10,1727.030687,3,"scaled - no outliers, pca, 2 components"
11,2257.567486,3,"scaled - no outliers, pca, 3 components"
7,2260.810283,3,scaled - no outliers
8,2474.559794,3,"scaled - pca, 2 components"
16,3078.041671,3,"log transformed, scaled - no outliers, pca, 2 ..."
22,3152.035867,3,"box-cox transformed, scaled - no outliers, pca..."
9,3152.932549,3,"scaled - pca, 3 components"
14,3158.495694,3,"log transformed, scaled - pca, 2 components"
20,3193.482551,3,"box-cox transformed, scaled - pca, 2 components"
6,3925.28339,3,scaled


### One More Thing ... What About Those Labels?

In [24]:
# Cast the Channel column to int and subtract 1 to build the label vector used
# below; the displayed result contains the values 0 and 1.
channel = customers.Channel.astype(int) - 1
# Region is not used as a label here (left commented out):
# region = customers.Region
channel

0      1
1      1
2      1
3      0
4      1
5      1
6      1
7      1
8      0
9      1
10     1
11     1
12     1
13     1
14     1
15     0
16     1
17     0
18     1
19     0
20     1
21     0
22     0
23     1
24     1
25     1
26     0
27     0
28     1
29     0
      ..
410    0
411    0
412    0
413    0
414    0
415    1
416    1
417    0
418    1
419    0
420    0
421    1
422    0
423    1
424    1
425    0
426    0
427    0
428    0
429    0
430    0
431    0
432    0
433    0
434    0
435    0
436    0
437    1
438    0
439    0
Name: Channel, dtype: int64

In [31]:
from sklearn.metrics import accuracy_score

def fit_and_score_predictions(data, labels, n_components=2, random_state=0):
    """Cluster ``data`` with a Gaussian mixture and score it against ``labels``.

    Cluster ids are arbitrary: cluster 0 may correspond to label 1 and vice
    versa. The predictions are therefore compared with both the labels as
    given and the labels with 0 and 1 swapped, and the better of the two
    accuracies is returned. Only these two labelings are tried, so the score
    is meaningful for 0/1 labels and ``n_components=2``.

    Parameters
    ----------
    data : array-like
        Samples to cluster. Passed unchanged to ``GaussianMixture.fit`` and
        ``GaussianMixture.predict``.
    labels : pandas Series or numpy array of 0/1 values
        Reference labels, one per row of ``data``. Must support elementwise
        ``== 0`` and ``.astype``.
    n_components : int, default 2
        Number of mixture components (clusters) to fit.
    random_state : int or None, default 0
        Seed forwarded to ``GaussianMixture``. The fit starts from a random
        initialisation, so without a fixed seed the accuracy can change from
        run to run. Pass ``None`` to get the unseeded behaviour.

    Returns
    -------
    float
        The larger of the two accuracy scores.
    """
    model = GaussianMixture(n_components=n_components, random_state=random_state)
    model.fit(data)
    predictions = model.predict(data)
    labels_pos = labels
    # `labels == 0` yields booleans; cast to int to get the 0/1-swapped labels.
    labels_neg = (labels == 0).astype(int)
    return max(accuracy_score(labels_pos, predictions), accuracy_score(labels_neg, predictions))

# Sanity check of the label swap used in fit_and_score_predictions: show the
# swapped labels first, then the original labels, so they can be compared.
display((channel == 0).astype(int))
display(channel)

0      0
1      0
2      0
3      1
4      0
5      0
6      0
7      0
8      1
9      0
10     0
11     0
12     0
13     0
14     0
15     1
16     0
17     1
18     0
19     1
20     0
21     1
22     1
23     0
24     0
25     0
26     1
27     1
28     0
29     1
      ..
410    1
411    1
412    1
413    1
414    1
415    0
416    0
417    1
418    0
419    1
420    1
421    0
422    1
423    0
424    0
425    1
426    1
427    1
428    1
429    1
430    1
431    1
432    1
433    1
434    1
435    1
436    1
437    0
438    1
439    1
Name: Channel, dtype: int64

0      1
1      1
2      1
3      0
4      1
5      1
6      1
7      1
8      0
9      1
10     1
11     1
12     1
13     1
14     1
15     0
16     1
17     0
18     1
19     0
20     1
21     0
22     0
23     1
24     1
25     1
26     0
27     0
28     1
29     0
      ..
410    0
411    0
412    0
413    0
414    0
415    1
416    1
417    0
418    1
419    0
420    0
421    1
422    0
423    1
424    1
425    0
426    0
427    0
428    0
429    0
430    0
431    0
432    0
433    0
434    0
435    0
436    0
437    1
438    0
439    0
Name: Channel, dtype: int64

In [12]:
# Check the size of the scaled data after outlier removal. The label vector
# paired with it below must have the same number of rows.
customer_sc_outliers_removed.shape

(399, 6)

In [13]:
# Same four families as the BIC experiment, but each entry is now a
# (display name, data set, labels) triple. Full data sets are paired with
# `channel`; each "no outliers" data set is paired with a matching
# `channel_*_outliers_removed` vector.
# NOTE(review): the `channel_*_outliers_removed` vectors are not defined in the
# visible cells -- presumably they come from load_data.py; confirm that each one
# was filtered with the same rows as the data set it is paired with.
# NOTE: entry order determines the row index shown in the accuracy table.

# Untransformed features.
original_data_with_labels = [
    ('original', customer_features, channel),
    ('original - no outliers', customer_features_outliers_removed, channel_original_outliers_removed),
    ('original - pca, 2 components', customer_features_pca_2, channel),
    ('original - pca, 3 components', customer_features_pca_3, channel),
    ('original - no outliers, pca, 2 components', customer_features_outliers_removed_pca_2, channel_original_outliers_removed),
    ('original - no outliers, pca, 3 components', customer_features_outliers_removed_pca_3, channel_original_outliers_removed)
]

# Scaled features.
scaled_data_with_labels = [
    ('scaled', customer_sc, channel),
    ('scaled - no outliers', customer_sc_outliers_removed, channel_scaled_outliers_removed),
    ('scaled - pca, 2 components', customer_sc_pca_2, channel),
    ('scaled - pca, 3 components', customer_sc_pca_3, channel),
    ('scaled - no outliers, pca, 2 components', customer_sc_outliers_removed_pca_2, channel_scaled_outliers_removed),
    ('scaled - no outliers, pca, 3 components', customer_sc_outliers_removed_pca_3, channel_scaled_outliers_removed),
]

# Log-transformed, then scaled features.
log_transformed_data_with_labels = [
    ('log transformed, scaled', customer_log_sc, channel),
    ('log transformed, scaled - no outliers', customer_log_sc_outliers_removed, channel_log_outliers_removed),
    ('log transformed, scaled - pca, 2 components', customer_log_sc_pca_2, channel),
    ('log transformed, scaled - pca, 3 components', customer_log_sc_pca_3, channel),
    ('log transformed, scaled - no outliers, pca, 2 components', customer_log_sc_outliers_removed_pca_2, channel_log_outliers_removed),
    ('log transformed, scaled - no outliers, pca, 3 components', customer_log_sc_outliers_removed_pca_3, channel_log_outliers_removed),
]

# Box-Cox-transformed, then scaled features.
box_cox_transformed_data_with_labels = [
    ('box-cox transformed, scaled', customer_box_cox_sc, channel),
    ('box-cox transformed, scaled - no outliers', customer_box_cox_sc_outliers_removed, channel_box_cox_outliers_removed),
    ('box-cox transformed, scaled - pca, 2 components', customer_box_cox_sc_pca_2, channel),
    ('box-cox transformed, scaled - pca, 3 components', customer_box_cox_sc_pca_3, channel),
    ('box-cox transformed, scaled - no outliers, pca, 2 components', customer_box_cox_sc_outliers_removed_pca_2, channel_box_cox_outliers_removed),
    ('box-cox transformed, scaled - no outliers, pca, 3 components', customer_box_cox_sc_outliers_removed_pca_3, channel_box_cox_outliers_removed),
]

In [14]:
n = 2

results_2_accuracy = []

# Score every labelled pipeline in one loop instead of one copy-pasted loop per
# family. The four lists are concatenated in the same order the separate loops
# used, so the row order (and therefore the DataFrame index) is unchanged.
for name, data, label in (
    original_data_with_labels
    + scaled_data_with_labels
    + log_transformed_data_with_labels
    + box_cox_transformed_data_with_labels
):
    results_2_accuracy.append({
        'name' : name,
        'n' : n,
        'accuracy' : fit_and_score_predictions(data, label, n)
    })



In [15]:
# Rank the pipelines by accuracy in descending order (best agreement with the labels first).
pd.DataFrame(results_2_accuracy).sort_values('accuracy', ascending=False)

Unnamed: 0,accuracy,n,name
12,0.895455,2,"log transformed, scaled"
13,0.891954,2,"log transformed, scaled - no outliers"
18,0.890909,2,"box-cox transformed, scaled"
19,0.890411,2,"box-cox transformed, scaled - no outliers"
23,0.881279,2,"box-cox transformed, scaled - no outliers, pca..."
16,0.878161,2,"log transformed, scaled - no outliers, pca, 2 ..."
22,0.876712,2,"box-cox transformed, scaled - no outliers, pca..."
21,0.872727,2,"box-cox transformed, scaled - pca, 3 components"
14,0.870455,2,"log transformed, scaled - pca, 2 components"
20,0.870455,2,"box-cox transformed, scaled - pca, 2 components"


![](complex_pipe_1.png)

![](complex_pipe_2.png)

![](complex_pipe_3.png)

![](complex_pipe_4.png)

![](complex_pipe_5.png)
