In [None]:
# Data management
import pandas as pd

# Math and Stat modules
import numpy as np

# Data preprocessing and transformation (ETL)
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, MaxAbsScaler, FunctionTransformer, Binarizer, OneHotEncoder, OrdinalEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.base import BaseEstimator, TransformerMixin

#Imputer
from sklearn.impute import SimpleImputer

#Supervised Learning
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict, KFold, StratifiedKFold, RepeatedKFold, ShuffleSplit, StratifiedShuffleSplit, learning_curve, validation_curve, cross_validate
from sklearn.linear_model import Perceptron, LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score, precision_recall_curve, roc_curve
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import VotingClassifier, BaggingClassifier, RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, ExtraTreesClassifier
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor

# Visualization
import matplotlib.pyplot as plt
from pandas.plotting import scatter_matrix
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.tree import export_graphviz
import seaborn as sns

In [None]:
# Load the raw marketing-campaign table; the file uses ';' as field delimiter.
dataset = pd.read_csv("data/marketing_campaign.csv", delimiter=";")

In [None]:
# Preview the first five rows (five is the default of DataFrame.head).
dataset.head()

In [None]:
# Column names, dtypes and non-null counts of the raw table.
dataset.info()

Droppiamo le 3 colonne che non ci servono:
- ID
- Z_CostContact
- Z_Revenue

In [None]:
# Remove, in place, the three columns that are not used by the analysis,
# then print the schema again to confirm they are gone.
dataset.drop(['ID', 'Z_CostContact', 'Z_Revenue'], axis='columns', inplace=True)
dataset.info()

Prendo le Label

In [None]:
# Pull the target column out of the feature table: ``pop`` removes 'Response'
# from ``dataset`` in place and hands back its values as the label array.
dataset_label = dataset.pop('Response').values

## Pipeline

In [None]:
# One histogram per column that DataFrame.hist can plot, arranged on a 10x3 grid.
dataset.hist(figsize=(22,42), layout=(10, 3))

In [None]:
class DateTransformer(BaseEstimator, TransformerMixin):
    """Turn a date column into the time elapsed before a reference date.

    Only the first column of the input is used. Each date is converted to
    ``(startDate - date)`` expressed in "months" of 30 days, so dates that
    precede ``startDate`` give positive values.

    Parameters
    ----------
    startDate : datetime-like, default=pd.Timestamp("2015-01-01")
        Reference date the elapsed time is measured from.
    """

    # Number of seconds in the conventional 30-day month used as time unit.
    _SECONDS_PER_MONTH = 60 * 60 * 24 * 30

    def __init__(self, startDate = pd.to_datetime("2015-01-01")):
        self.startDate = startDate

    def fit(self, X, y=None):
        """Do nothing and return ``self``: the transformer is stateless."""
        return self

    def transform(self, X):
        """Return a one-column DataFrame with the elapsed 30-day months.

        Parameters
        ----------
        X : DataFrame or 2-D array-like
            Input whose first column holds values parseable by
            ``pd.to_datetime``.

        Returns
        -------
        DataFrame
            Single column (same label as the input column) of floats.
        """
        # Accept plain arrays as well as DataFrames.
        frame = X if isinstance(X, pd.DataFrame) else pd.DataFrame(X)
        dates = pd.to_datetime(frame.iloc[:, 0])
        elapsed = pd.to_datetime(self.startDate) - dates
        # Vectorised conversion of the timedeltas instead of a per-element lambda.
        months = elapsed.dt.total_seconds() / self._SECONDS_PER_MONTH
        return months.to_frame()

    def get_feature_names_out(self, input_features=None):
        """Return the name of the single output feature.

        Defined so that an enclosing Pipeline / ColumnTransformer can resolve
        its own ``get_feature_names_out()``.

        Parameters
        ----------
        input_features : array-like of str or None
            Names of the input columns; only the first one is kept because
            ``transform`` only reads the first column. When missing, the
            generic name ``"x0"`` is returned.
        """
        if input_features is None or len(input_features) == 0:
            return np.array(["x0"], dtype=object)
        return np.asarray(input_features, dtype=object)[:1]

In [None]:
# Date column: convert to elapsed time with DateTransformer, then standardise.
date_pipeline = Pipeline(steps=[
    ('transformer', DateTransformer()),
    ('scaler', StandardScaler()),
])

# Income column: fill missing values with the median, then standardise.
income_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

In [None]:
# Columns scaled with RobustScaler and columns scaled with StandardScaler.
robust_features = ['Year_Birth', 'NumWebVisitsMonth', 'NumWebPurchases', 'NumCatalogPurchases', 'NumDealsPurchases']
standard_features = ['Recency', 'MntWines', 'MntFruits', 'MntMeatProducts', 'MntFishProducts', 'MntSweetProducts', 'MntGoldProds', 'NumStorePurchases']

# Per-column preprocessing. The step names now match the scaler they hold
# (previously 'robust' labelled the StandardScaler and 'standard' the
# RobustScaler); the transformer order, and therefore the order of the output
# columns, is unchanged. Columns not listed here are passed through untouched.
data_preprocessing = ColumnTransformer([
    ('standard', StandardScaler(), standard_features),
    ('cat', OneHotEncoder(), ['Education', 'Marital_Status']),
    ('robust', RobustScaler(), robust_features),
    ('income', income_pipeline, ['Income']),
    ('dateSubscription', date_pipeline, ["Dt_Customer"])
],
    remainder = 'passthrough'
)

In [None]:
# Fit the preprocessing on the whole table and build the model input matrix.
# NOTE(review): this is fitted before the train/test split performed later in
# the notebook, so the scaling/imputation statistics also see the test rows —
# confirm this is acceptable for the analysis.
feature_matrix = data_preprocessing.fit_transform(dataset)

Ricreo nomi colonne

In [None]:
# columns_name = robust_features
# for c in ['Education', 'Marital_Status']:
#     cat_inc_name = [c+f"_cat{i}" for i in range(1,len(dataset[c].unique()))]
#     columns_name.extend(cat_inc_name)
# columns_name.extend(standard_features)
# columns_name.extend(dataset.columns.difference(columns_name))
# columns_name

## Test and Training set

Divido il dataset (feature già preprocessate e label estratte in precedenza) in training set e test set.

In [None]:
# Hold out 30% of the rows as test set; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    feature_matrix,
    dataset_label,
    test_size=0.3,
    random_state=42,
)
X_train.shape, X_test.shape

In [None]:
# Learn on the training split, predict the test split and report the
# fraction of correct predictions (hold-out accuracy).
perceptron = Perceptron().fit(X_train, y_train)
predicted_test = perceptron.predict(X_test)
np.mean(predicted_test == y_test)

In [None]:
# Mean score of a fresh Perceptron over 5 cross-validation folds of the training split.
p = Perceptron()
p_score = np.mean(cross_val_score(p, X_train, y_train, cv=5))
p_score

In [None]:
# Ci dice per ogni record che prediction ha ricevuto quando faceva parte del validation set
#cross_val_predict(perceptron, X_train, y_train, cv = 5)

In [None]:
# Same estimator, evaluated with an explicit 10-fold splitter.
kf = KFold(n_splits=10)
kf_score = np.mean(cross_val_score(p, X_train, y_train, cv=kf))
kf_score

In [None]:
# 5-fold CV repeated 4 times. The splitter is seeded so that the folds, and
# therefore the reported score, are the same on every run of the notebook.
rkf = RepeatedKFold(n_splits=5, n_repeats=4, random_state=42)
rkf_score = cross_val_score(p, X_train, y_train, cv = rkf).mean()
rkf_score

In [None]:
# 10 random 75/25 splits. The splitter is seeded so that the splits, and
# therefore the reported score, are the same on every run of the notebook.
spf = ShuffleSplit(n_splits = 10, test_size = 0.25, random_state = 42)
spf_score = cross_val_score(p, X_train, y_train, cv = spf, n_jobs=-1).mean()
spf_score

Abbiamo un dataset molto sbilanciato. Provo con StratifiedKFold.

In [None]:
# Number of rows per label value, to show how unbalanced the target is.
pd.DataFrame(dataset_label).value_counts()

In [None]:
# 10-fold CV whose folds keep the label proportions of the training split.
skf = StratifiedKFold(n_splits=10)
skf_score = np.mean(cross_val_score(p, X_train, y_train, cv=skf, n_jobs=-1))
skf_score

In [None]:
# 10 random stratified splits. The splitter is seeded so that the splits, and
# therefore the reported score, are the same on every run of the notebook.
ssf = StratifiedShuffleSplit(n_splits=10, random_state=42)
ssf_score = cross_val_score(p, X_train, y_train, cv = ssf, n_jobs=-1).mean()
ssf_score

Provo a confrontare con dei DummyClassifier il Perceptron.

In [None]:
# Three trivial baselines: always the majority class, uniformly random
# labels, and random labels drawn with the training class frequencies.
mf_dum_cls, uni_dum_cls, st_dum_cls = (
    DummyClassifier(strategy=strategy)
    for strategy in ('most_frequent', 'uniform', 'stratified')
)

# Mean 10-fold accuracy of each baseline on the training split.
mf_score, uni_score, st_score = (
    cross_val_score(dummy, X_train, y_train, cv=10, scoring='accuracy').mean()
    for dummy in (mf_dum_cls, uni_dum_cls, st_dum_cls)
)

# Side-by-side report: baselines first, then the Perceptron scores obtained
# with the different cross-validation schemes of the previous cells.
print("---------- Dummy Classifiers ----------\n")
for caption, score in (
    ("Most frequent score:", mf_score),
    ("Uniform score:", uni_score),
    ("Stratified score:", st_score),
):
    print(caption, score)
print("\n---------- Serious Classifiers ----------\n")
for caption, score in (
    ("Perceptron score:", p_score),
    ("KFold score:", kf_score),
    ("Repeated KFold score:", rkf_score),
    ("Shuffle Split score:", spf_score),
    ("Stratified KFold score:", skf_score),
    ("Stratified Shuffle Split score:", ssf_score),
):
    print(caption, score)

Analizziamo le performance con matrice di confusione, precision, recall e f1.

In [None]:
# Out-of-fold predictions of the Perceptron (10 folds), summarised as a
# confusion matrix against the true training labels.
y_train_predicted = cross_val_predict(p, X_train, y_train, cv=10)
cm = confusion_matrix(y_train, y_train_predicted)
cm_display = ConfusionMatrixDisplay(confusion_matrix=cm).plot()

Vediamo le performance del perceptron confrontate ai dummy classifier.

In [None]:
# Out-of-fold predictions (10 folds) of the three dummy baselines.
y_train_mf = cross_val_predict(mf_dum_cls, X_train, y_train, cv=10)
y_train_un = cross_val_predict(uni_dum_cls, X_train, y_train, cv=10)
y_train_st = cross_val_predict(st_dum_cls, X_train, y_train, cv=10)

# Precision / recall / f1 of the Perceptron followed by each baseline.
for header, predictions in (
    ("------------ Perceptron ------------", y_train_predicted),
    ("\n------------ Most frequent ------------", y_train_mf),
    ("\n------------ Uniform ------------", y_train_un),
    ("\n------------ Stratified ------------", y_train_st),
):
    print(header,
          "\nPrecision:", precision_score(y_train, predictions),
          "\nRecall:", recall_score(y_train, predictions),
          "\nf1:", f1_score(y_train, predictions))

In [None]:
# Use max_iter=100 and see the link about preprocessing.
logit_cls = LogisticRegression(max_iter=100)

# Out-of-fold decision-function values (5 folds) for every training row.
y_scores = cross_val_predict(
    logit_cls,
    X_train,
    y_train,
    cv=5,
    method='decision_function',
)

In [None]:
# Precision and recall of the logistic-regression scores at every candidate
# threshold ("soglia"); the plotting cells drop the last precision/recall
# entry ([:-1]) to align them with the thresholds.
prec, recall, soglia = precision_recall_curve(y_train, y_scores)

In [None]:
# Precision (red) and recall (blue) as functions of the decision threshold.
fig_prc, ax = plt.subplots(figsize=(16,9))
for values, colour, name in ((prec, 'r', 'precision'), (recall, 'b', 'recall')):
    ax.plot(soglia, values[:-1], colour, label=name)
ax.legend(fontsize=20)

In [None]:
# Precision plotted directly against recall (one point per threshold).
fig_prf, ax = plt.subplots(figsize=(16,12))
ax.plot(recall[:-1], prec[:-1], 'r', lw=4, label='precision')
ax.set_xlabel("Recall")
ax.set_ylabel("Precision")
In [None]:
# Pick the first threshold at which precision reaches the target, then
# re-score the out-of-fold predictions obtained with that threshold.
# NOTE(review): the variable is named ``soglia_prec_90`` but the target
# precision actually used is 0.5 — confirm which one is intended.
target_precision = 0.5

# ``prec`` has one more entry than ``soglia``; that trailing entry has no
# threshold, so it is excluded before searching. Without this, a target that
# is never reached would index past the end of ``soglia``.
reached = np.flatnonzero(prec[:-1] >= target_precision)
if reached.size == 0:
    raise ValueError(
        "no decision threshold reaches a precision of {}".format(target_precision)
    )
soglia_prec_90 = soglia[reached[0]]

y_predicted_score = y_scores >= soglia_prec_90
precision_score(y_train, y_predicted_score), recall_score(y_train, y_predicted_score), f1_score(y_train, y_predicted_score)

In [None]:
# ROC curve of the logistic-regression scores. The results get their own
# names (``tpr``, ``soglia_roc``) so that the ``recall`` / ``soglia`` arrays of
# the precision-recall analysis are not overwritten and the earlier cells can
# still be re-run safely.
fpr, tpr, soglia_roc = roc_curve(y_train, y_scores)

fig_roc = plt.figure(figsize=(16,12))
ax = fig_roc.add_subplot()
# The true-positive rate is the recall, hence the axis/legend wording.
ax.plot(fpr, tpr, 'r', label = 'recall', lw = 7)
ax.set_xlabel("FPR")
ax.set_ylabel("Recall")

Learning curve

In [None]:
# Learning curve of the logistic regression: 10-fold CV scores for growing
# fractions of the data. The shuffle is seeded so the curve is reproducible.
train_sizes, train_scores, test_scores = learning_curve(logit_cls,
                                                       X=feature_matrix,
                                                       y=dataset_label,
                                                       train_sizes= [0.01, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
                                                       cv = 10,
                                                       n_jobs = -1,
                                                       shuffle = True,
                                                       random_state = 42)

In [None]:
# Mean and standard deviation across the CV folds, for each training size.
train_mean, train_std = train_scores.mean(axis=1), train_scores.std(axis=1)
test_mean, test_std = test_scores.mean(axis=1), test_scores.std(axis=1)

fig, ax = plt.subplots(figsize=(12,7))

# Training accuracy: first fold alone, then the fold mean with a +/- 1 std band.
ax.plot(train_sizes, train_scores[:,0], color='blue', marker='o',
        markersize=5, label='Training accuracy - fold 1')
ax.plot(train_sizes, train_mean, color='blue', marker='+',
        markersize=5, label='Training accuracy')
ax.fill_between(train_sizes, train_mean + train_std, train_mean - train_std,
                alpha=0.15, color='blue')

# Validation accuracy: same layout, drawn in dashed green.
ax.plot(train_sizes, test_scores[:,0], color='green', linestyle='--',
        marker='s', markersize=5, label='Validation accuracy - fold 1')
ax.plot(train_sizes, test_mean, color='green', linestyle='--',
        marker='d', markersize=5, label='Validation accuracy')
ax.fill_between(train_sizes, test_mean + test_std, test_mean - test_std,
                alpha=0.15, color='green')

ax.grid()
ax.set_xlabel('Dimensione del training set')
ax.set_ylabel('Accuracy')
ax.legend(loc='lower right')
ax.set_ylim([0.6, 1.03])

In [None]:
# Validation curve of the logistic regression over the regularisation
# parameter C (10-fold CV for each candidate value).
range_C = [0.001, 0.01, 0.1, 1, 10, 100]
train_scores, test_scores = validation_curve(
    logit_cls,
    X=feature_matrix,
    y=dataset_label,
    param_name='C',
    param_range=range_C,
    cv=10,
    n_jobs=-1,
)

In [None]:
# Mean and standard deviation across the CV folds, for each value of C.
train_mean = np.mean(train_scores, axis=1)
train_std = np.std(train_scores, axis=1)
test_mean = np.mean(test_scores, axis=1)
test_std = np.std(test_scores, axis=1)

fig=plt.figure(figsize=(12,7))
ax = fig.add_subplot()
ax.plot(range_C, train_mean,
         color='blue', marker='o',
         markersize=5, label='Training accuracy')

ax.fill_between(range_C,
                 train_mean + train_std,
                 train_mean - train_std,
                 alpha=0.15, color='blue')

ax.plot(range_C, test_mean,
         color='green', linestyle='--',
         marker='s', markersize=5,
         label='Validation accuracy')

ax.fill_between(range_C,
                 test_mean + test_std,
                 test_mean - test_std,
                 alpha=0.15, color='green')

ax.grid()
# The C values span several orders of magnitude, so a logarithmic axis is
# used; the previous linear limit of [0, 1.03] hid the points for C=10 and
# C=100 and squeezed the small values together.
ax.set_xscale('log')
ax.set_xlabel('Parametro C')
ax.set_ylabel('Accuracy')
ax.legend(loc='lower right')
ax.set_ylim([0.8, 0.9])

## Support Vector Machine

In [None]:
# Set of C values to compare, chosen taking into account the previous
# observations on the effect of C.
Cs = [0.01, 0.1, 1, 10, 100]
fig = plt.figure(figsize=(18,3.2))
for i, c in enumerate(Cs):
    print('Training SVM per C =', c, i)
    # Both the solver and the shuffle are seeded so the curves are reproducible.
    svm_cls = LinearSVC(C = c, max_iter=50000, random_state=42)
    train_sizes, train_scores, test_scores = learning_curve(svm_cls, X = feature_matrix, y = dataset_label, train_sizes=np.linspace(0.1,1,10), cv = 5, n_jobs=-1, shuffle = True, random_state=42)

    print('Training per {} finito'.format(c))
    # Mean and standard deviation across the CV folds, per training size.
    train_mean = np.mean(train_scores, axis=1)
    train_std = np.std(train_scores, axis=1)
    test_mean = np.mean(test_scores, axis=1)
    test_std = np.std(test_scores, axis=1)
    # One panel per C value, laid out on a single row sized from ``Cs``.
    ax = fig.add_subplot(1, len(Cs), i + 1)
    ax.plot(train_sizes, train_mean,
         color='blue', marker='o',
         markersize=5, label='Training accuracy')
    ax.fill_between(train_sizes,
                 train_mean + train_std,
                 train_mean - train_std,
                 alpha=0.15, color='blue')
    ax.plot(train_sizes, test_mean,
         color='green', linestyle='--',
         marker='s', markersize=5,
         label='Validation accuracy')
    ax.fill_between(train_sizes,
                 test_mean + test_std,
                 test_mean - test_std,
                 alpha=0.15, color='green')
    ax.grid()
    ax.set_ylim((0.8,1))
    # Title each panel so the C values can be told apart.
    ax.set_title('C = {}'.format(c))
    ax.set_xlabel('Dimensione del training set')
    ax.set_ylabel('Accuracy')
    ax.legend(loc='lower right')

In [None]:
# Two candidate values for gamma and two for C, combined into the four
# (gamma, C) pairs to compare.
gamma1, gamma2 = 0.1, 2
C1, C2 = 0.01, 5
hyperparams = tuple((g, c) for g in (gamma1, gamma2) for c in (C1, C2))

# One entry per (gamma, C) pair is appended to each of these lists.
train_sizes, train_means, test_means, test_stds, train_stds = ([] for _ in range(5))
for gamma, C in hyperparams:
    rbf_kernel_svm_clf = SVC(kernel="rbf", gamma=gamma, C=C)
    train_size, train_scores, test_scores = learning_curve(
        rbf_kernel_svm_clf,
        X=feature_matrix,
        y=dataset_label,
        train_sizes=np.linspace(0.1,1.0,10),
        cv=5,
        n_jobs=-1,
    )
    print('fatto {},{}'.format(gamma,C))
    # Keep the training sizes plus the fold mean / std of both score sets.
    train_sizes.append(train_size)
    train_means.append(train_scores.mean(axis=1))
    train_stds.append(train_scores.std(axis=1))
    test_means.append(test_scores.mean(axis=1))
    test_stds.append(test_scores.std(axis=1))

In [None]:
# 2x2 grid with one learning curve per (gamma, C) pair.
fig = plt.figure(figsize=(12, 8))
for i in range(4):
    sizes = train_sizes[i]
    ax = fig.add_subplot(2, 2, i + 1)
    # Training accuracy with a +/- 1 std band.
    ax.plot(sizes, train_means[i], color='blue', marker='o',
            markersize=5, label='Training accuracy')
    ax.fill_between(sizes,
                    train_means[i] + train_stds[i],
                    train_means[i] - train_stds[i],
                    alpha=0.15, color='blue')
    # Validation accuracy with a +/- 1 std band.
    ax.plot(sizes, test_means[i], color='green', linestyle='--',
            marker='s', markersize=5, label='Validation accuracy')
    ax.fill_between(sizes,
                    test_means[i] + test_stds[i],
                    test_means[i] - test_stds[i],
                    alpha=0.15, color='green')
    ax.grid()
    ax.set_ylim((0.8,1))
    ax.set_ylabel('Accuracy')
    ax.legend(loc='lower right')
    ax.set_title(r"$\gamma={}, C={}$".format(*hyperparams[i]), fontsize=18)

## Alberi di decisione

In [None]:
# Candidate values of min_samples_leaf for the decision tree.
min_leaf = [5, 10, 100, 200, 350]

# One entry per candidate value is appended to each of these lists.
train_sizes, train_means, test_means, test_stds, train_stds = ([] for _ in range(5))
for mlf in min_leaf:
    dt_mlf = DecisionTreeClassifier(max_depth=15, min_samples_leaf=mlf, random_state=42)
    train_size, train_scores, test_scores = learning_curve(
        dt_mlf,
        X=feature_matrix,
        y=dataset_label,
        train_sizes=np.linspace(0.1,1.0,10),
        cv=10,
        n_jobs=-1,
    )
    print('fatto {}'.format(mlf))
    # Keep the training sizes plus the fold mean / std of both score sets.
    train_sizes.append(train_size)
    train_means.append(train_scores.mean(axis=1))
    train_stds.append(train_scores.std(axis=1))
    test_means.append(test_scores.mean(axis=1))
    test_stds.append(test_scores.std(axis=1))

In [None]:
# 2x3 grid (last slot unused) with one learning curve per min_samples_leaf value.
fig = plt.figure(figsize=(12, 8))
for i in range(5):
    sizes = train_sizes[i]
    ax = fig.add_subplot(2, 3, i + 1)
    # Training accuracy with a +/- 1 std band.
    ax.plot(sizes, train_means[i], color='blue', marker='o',
            markersize=5, label='Training accuracy')
    ax.fill_between(sizes,
                    train_means[i] + train_stds[i],
                    train_means[i] - train_stds[i],
                    alpha=0.15, color='blue')
    # Validation accuracy with a +/- 1 std band.
    ax.plot(sizes, test_means[i], color='green', linestyle='--',
            marker='s', markersize=5, label='Validation accuracy')
    ax.fill_between(sizes,
                    test_means[i] + test_stds[i],
                    test_means[i] - test_stds[i],
                    alpha=0.15, color='green')
    ax.grid()
    ax.set_ylim((0.8,1))
    ax.set_ylabel('Accuracy')
    ax.legend(loc='lower right')
    ax.set_title(r"min_sam_leaf:{}".format(min_leaf[i]), fontsize=18)

## Ensemble methods

In [None]:
def visualize_ensemble_performance(ax, X, y, scores, scoring):
    """Box-plot the CV scores of an ensemble next to those of its members.

    Each member estimator taken from the first fitted ensemble in
    ``scores['estimator']`` is cross-validated on ``(X, y)`` with 20 seeded
    stratified shuffle splits; its train/test scores for every metric in
    ``scoring`` are drawn as horizontal boxes, followed by the ensemble's own
    scores read from ``scores``.

    Parameters
    ----------
    ax : matplotlib Axes
        Axes the box plot is drawn on.
    X, y : array-like
        Data passed to ``cross_validate`` for the member estimators.
    scores : dict
        Result of ``cross_validate`` on the ensemble; must contain the key
        ``'estimator'`` and the ``train_<metric>`` / ``test_<metric>`` keys
        for every metric in ``scoring``.
    scoring : list of str
        Metric names to display.
    """
    # Result keys in plotting order: the train/test pair of each metric.
    score_keys = [split + metric for metric in scoring for split in ['train_','test_']]
    splitter = StratifiedShuffleSplit(n_splits=20, test_size=0.2, random_state=42)

    data_score, labels = [], []
    for clf in scores['estimator'][0].estimators_:
        scores_clf = cross_validate(clf, X, y,
                                    cv=splitter,
                                    return_train_score=True,
                                    scoring=scoring,
                                    n_jobs=-1)
        clf_name = clf.__class__.__name__
        for key in score_keys:
            data_score.append(scores_clf[key])
            labels.append(clf_name + '_' + key)

    # The ensemble itself goes last, using the scores computed by the caller.
    for key in score_keys:
        data_score.append(scores[key])
        labels.append('Voting_' + key)

    sns.boxplot(ax=ax,
                data=data_score,
                whis=[5, 95],
                palette="vlag",
                orient='h')
    ax.set(yticklabels=labels)

In [None]:
# Three seeded base learners combined by majority ("hard") voting.
log_clf, svm_clf, per_clf = (
    LogisticRegression(random_state=42),
    SVC(random_state=42),
    Perceptron(random_state=42),
)

voting_clf = VotingClassifier(
    voting='hard',
    estimators=[('lr', log_clf), ('pc', per_clf), ('svc', svm_clf)],
)

In [None]:
# Cross-validate the voting ensemble on 20 seeded stratified shuffle splits,
# keeping the fitted estimators and the training scores for later inspection.
scores = cross_validate(
    voting_clf,
    feature_matrix,
    dataset_label,
    scoring=['recall','accuracy','f1'],
    cv=StratifiedShuffleSplit(n_splits=20, test_size=0.2, random_state=42),
    return_train_score=True,
    return_estimator=True,
    n_jobs=-1,
)

In [None]:
# Recall and accuracy of the voting ensemble compared with its members.
fig, ax = plt.subplots(figsize=(12,8))
visualize_ensemble_performance(ax, feature_matrix, dataset_label, scores, ['recall','accuracy'])

In [None]:
# Bagging of 500 shallow trees, each trained on a bootstrap sample of 200
# rows, compared below with a single regularised decision tree. The bagging
# ensemble is seeded like the tree so that its learning curve is reproducible.
bag_clf = BaggingClassifier(DecisionTreeClassifier(max_depth=3), n_estimators=500, max_samples=200, bootstrap=True, n_jobs=-1, random_state=42)
dt_clf = DecisionTreeClassifier(min_samples_leaf=100, random_state=42, max_depth=15)

In [None]:
# One entry per classifier is appended to each of these lists.
train_sizes, train_means, test_means, test_stds, train_stds = ([] for _ in range(5))
for clf in [bag_clf, dt_clf]:
    train_size, train_scores, test_scores = learning_curve(
        clf,
        X=feature_matrix,
        y=dataset_label,
        train_sizes=np.linspace(0.1,1.0,10),
        cv=StratifiedShuffleSplit(n_splits=10, test_size=0.2, random_state=42),
        n_jobs=-1,
    )
    print('fatto {}'.format(clf))
    # Keep the training sizes plus the fold mean / std of both score sets.
    train_sizes.append(train_size)
    train_means.append(train_scores.mean(axis=1))
    train_stds.append(train_scores.std(axis=1))
    test_means.append(test_scores.mean(axis=1))
    test_stds.append(test_scores.std(axis=1))

In [None]:
# Two side-by-side panels: bagging ensemble (left) and single tree (right).
fig = plt.figure(figsize=(12, 8))
for i in range(2):
    sizes = train_sizes[i]
    ax = fig.add_subplot(1, 2, i + 1)
    # Training accuracy with a +/- 1 std band.
    ax.plot(sizes, train_means[i], color='blue', marker='o',
            markersize=5, label='Training accuracy')
    ax.fill_between(sizes,
                    train_means[i] + train_stds[i],
                    train_means[i] - train_stds[i],
                    alpha=0.15, color='blue')
    # Validation accuracy with a +/- 1 std band.
    ax.plot(sizes, test_means[i], color='green', linestyle='--',
            marker='s', markersize=5, label='Validation accuracy')
    ax.fill_between(sizes,
                    test_means[i] + test_stds[i],
                    test_means[i] - test_stds[i],
                    alpha=0.15, color='green')
    ax.grid()
    ax.set_ylim((0.8,1))
    ax.set_ylabel('Accuracy')
    ax.legend(loc='lower right')

## Random Forest

In [None]:
# New 80/20 split that preserves the label proportions in both parts. It is
# seeded (like the first split of the notebook) so that the scores computed
# on it are reproducible.
X_train, X_test, y_train, y_test = train_test_split(feature_matrix, dataset_label, test_size=0.2, stratify=dataset_label, random_state=42)
# Share of positive labels in the training and in the test part.
len(y_train[y_train == 1])/len(y_train), len(y_test[y_test == 1])/len(y_test)

In [None]:
# Random forest and extra-trees with the same size limits, compared through
# their 5-fold f1 scores on the training split. Both ensembles are seeded so
# that the comparison gives the same numbers on every run.
rnf_clf = RandomForestClassifier(n_estimators=250, max_leaf_nodes=64, n_jobs=-1, max_features=10, random_state=42)
et_clf = ExtraTreesClassifier(n_estimators=250, max_leaf_nodes=64, n_jobs=-1, max_features=10, random_state=42)
scores_rnf = cross_val_score(rnf_clf, X_train, y_train, cv=5, scoring='f1', n_jobs=-1)
scores_et = cross_val_score(et_clf, X_train, y_train, cv=5, scoring='f1',n_jobs=-1)

In [None]:
# Horizontal box plot of the f1 scores: random forest vs extra-trees.
fig, ax = plt.subplots(figsize=(12,4))
sns.boxplot(data=[scores_rnf, scores_et], orient='h', palette='vlag', ax=ax)
ax.set(yticklabels=['RF','ET'], xlabel="f1")

In [None]:
# AdaBoost over depth-6 trees, scored with 5-fold f1 on the training split.
# The explicit ``algorithm='SAMME.R'`` argument was removed: it is deprecated
# and then rejected by recent scikit-learn releases, while on the older ones
# it was already the default, so omitting it keeps the previous behaviour
# there. The ensemble is also seeded for reproducibility.
ada_clf = AdaBoostClassifier(
    DecisionTreeClassifier(max_depth=6),
    n_estimators=250,
    learning_rate=0.5,
    random_state=42
)
scores_ada = cross_val_score(ada_clf, X_train, y_train, cv=5, scoring='f1', n_jobs=-1)

In [None]:
# Horizontal box plot of the f1 scores: random forest, extra-trees and AdaBoost.
fig, ax = plt.subplots(figsize=(12,4))
sns.boxplot(data=[scores_rnf, scores_et, scores_ada], orient='h', palette='vlag', ax=ax)
ax.set(yticklabels=['RF','ET','ADA'])