## 10.2 Segmentation data

In [None]:
%config InlineBackend.figure_format = 'svg'
import pandas as pd
# Raise the row/column display limits to 500 and set the text width to 70.
for option_name, option_value in (('display.max_rows', 500),
                                  ('display.max_columns', 500),
                                  ('display.width', 70)):
    pd.set_option(option_name, option_value)

In [None]:
import pandas as pd
# Download the segmentation data and add a boolean flag derived from the
# `gender` column.
seg_df = pd.read_csv('http://bit.ly/PMR-ch5')
seg_df['is_female'] = seg_df['gender'].eq('female')
# Feature table: every column except the segment label and original gender.
seg_sub = seg_df.drop(columns=['Segment', 'gender'])
seg_sub.head()

## 11.1 Classification

In [None]:
import numpy as np

seg_labels = seg_df['Segment']

# Draw one uniform number per row (seeded for repeatability); rows with a
# draw <= 0.7 go to the training split, the rest to the test split.
np.random.seed(537)
rand_idx = np.random.rand(len(seg_labels))
train_idx, test_idx = rand_idx <= 0.7, rand_idx > 0.7

X_train, X_test = seg_sub.iloc[train_idx], seg_sub.iloc[test_idx]
y_train, y_test = seg_labels.iloc[train_idx], seg_labels.iloc[test_idx]

### 11.1.1 Naive Bayes

In [None]:
from sklearn import naive_bayes 

# Fit a Gaussian naive Bayes classifier on the training split.
nb = naive_bayes.GaussianNB()
nb.fit(X=X_train, y=y_train)

# Pair each class label with the fitted model's class prior.
[(label, prior) for label, prior in zip(nb.classes_, nb.class_prior_)]

In [None]:
# Predict a segment for every row and show the prediction next to the
# true segment for a random sample of five rows.
predictions = nb.predict(seg_sub)
seg_sub_pred = seg_sub.assign(prediction=predictions,
                              true_segment=seg_df['Segment'])
seg_sub_pred.sample(5)

In [None]:
# Score the fitted naive Bayes model on the held-out test split.
nb.score(X_test, y_test)

In [None]:
from sklearn import metrics

# Predict the held-out split, then summarise it with a weighted F1 score.
y_pred = nb.predict(X_test)

metrics.f1_score(y_test, y_pred, average='weighted')

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

def confusion_matrix(y_true, y_pred, model):
  """Draw an annotated confusion-matrix heatmap for a fitted classifier.

  The matrix is transposed before plotting, so the x axis shows the true
  label and the y axis shows the predicted label.

  Args:
    y_true: True class labels.
    y_pred: Predicted class labels, aligned with ``y_true``.
    model: Fitted classifier; its ``classes_`` attribute fixes the
      row/column order of the matrix and supplies the tick labels.
  """
  class_labels = model.classes_
  # Pin the label order to model.classes_. Without it the matrix only covers
  # labels that occur in y_true/y_pred, so the tick labels can be misaligned
  # (or the wrong length) when a class is missing from the evaluated data.
  conf_mat = metrics.confusion_matrix(y_true, y_pred, labels=class_labels)

  sns.heatmap(conf_mat.T,
              xticklabels=class_labels, yticklabels=class_labels,
              annot=True, fmt='d')
  plt.xlabel('true label')
  plt.ylabel('predicted label')

In [None]:
# Confusion-matrix heatmap for the naive Bayes predictions on the test split.
confusion_matrix(y_test, y_pred, nb)

In [None]:
def return_precision_recall(y_true, y_pred, model):
  """Tabulate per-class precision, recall and F1 for a set of predictions.

  Args:
    y_true: True class labels.
    y_pred: Predicted class labels, aligned with ``y_true``.
    model: Fitted classifier; ``model.classes_`` fixes the class order and
      labels the columns of the result.

  Returns:
    A DataFrame with rows 'precision', 'recall' and 'f1' and one column per
    entry of ``model.classes_``.
  """
  class_labels = model.classes_

  # Score against the y_true argument (not a notebook-level global) and pin
  # the label order so each score lines up with its class column even when
  # a class is absent from the evaluated data.
  precision = pd.Series(metrics.precision_score(y_true,
                                                y_pred,
                                                labels=class_labels,
                                                average=None),
                        index=class_labels)
  recall = pd.Series(metrics.recall_score(y_true,
                                          y_pred,
                                          labels=class_labels,
                                          average=None),
                     index=class_labels)

  # Harmonic mean of precision and recall; NaN where both are zero.
  f1 = 2 * (precision * recall) / (precision + recall)

  return pd.DataFrame([precision, recall, f1],
                      index=['precision', 'recall', 'f1'])

In [None]:
# Per-class precision / recall / F1 table for the naive Bayes predictions.
return_precision_recall(y_test, y_pred, nb)

In [None]:
from sklearn import clone, decomposition

def plot_decision_pca(model, X, y):
  """Plot a classifier's decision regions in a 2-D PCA projection.

  A PCA is fit on the first two columns of ``X``. A clone of ``model`` is
  refit on the inverse-transformed points and then used to label an evenly
  spaced grid spanning the projected data. The grid labels are drawn as
  filled contours and the projected points, coloured by ``y``, are drawn
  on the same axes.

  Args:
    model: Fitted classifier. It is cloned, so the object passed in is not
      refit; its ``classes_`` provide the label-to-integer mapping and the
      colorbar tick labels.
    X: DataFrame of features; only ``X.iloc[:, :2]`` is used.
    y: Iterable of true labels aligned with the rows of ``X``.

  Note:
    The colour range (``vmax=3``) and the four colorbar ticks are
    hard-coded for a four-class problem.
  """
  width, height = 500, 500

  # Transform the first two columns of X using a PCA
  p = decomposition.PCA(random_state=132, svd_solver='full')
  X_transformed = p.fit_transform(X.iloc[:,:2])

  # Pull the first two dimensions
  x0 = X_transformed[:, 0]
  x1 = X_transformed[:, 1]

  # Get evenly spaced values between the min and max values
  x0_g = np.linspace(x0.min(), x0.max(), width)
  x1_g = np.linspace(x1.min(), x1.max(), height)

  # Create a "grid" of those evenly spaced values from each vector.
  # meshgrid returns arrays of shape (len(x1_g), len(x0_g)),
  # i.e. (height, width).
  xx, yy = np.meshgrid(x0_g, x1_g)

  # Stack together all of the sampled values
  X_grid_transformed = np.vstack([xx.ravel(), yy.ravel()]).T

  # Do the inverse transform to get the non-PCA transformed values
  X_grid = p.inverse_transform(X_grid_transformed)

  # Fit a clone of the model using the inverse-transformed columns
  # from the first two PCA dimensions, then predict a label for every
  # sampled grid point.
  model_c = clone(model)
  model_c.fit(p.inverse_transform(np.vstack([x0, x1]).T), y)
  X_grid_labels = model_c.predict(X_grid)

  # Create a class mapper to map from class string to an integer
  class_mapper = {class_:i for i,class_ in enumerate(model.classes_)}

  plt.figure(figsize=(6,6))
  # Plot the true values
  a = plt.scatter(x0, x1,
                  c=[class_mapper[label] for label in y],
                  cmap=plt.cm.rainbow, edgecolor='k', vmin=0, vmax=3)
  # Plot the predicted regions. The labels were produced in xx.ravel()
  # order, so they must be reshaped to xx.shape == (height, width); using
  # (width, height) is only correct while the two happen to be equal.
  plt.contourf(xx, yy,
               np.reshape([class_mapper[label]
                           for label in X_grid_labels],
                          xx.shape),
              cmap=a.cmap, alpha=0.5, levels=3)
  cb = plt.colorbar(ticks=[0.5, 1.2, 2, 2.8])
  _ = cb.ax.set_yticklabels(model.classes_)
  plt.title('Decision boundaries with true values overlaid')
  plt.xlabel('First principal component')
  plt.ylabel('Second principal component')

In [None]:
# Decision regions for a naive Bayes clone, with the test points overlaid.
plot_decision_pca(nb, X_test, y_test)

In [None]:
# Three evenly spaced values from 0 to 10 (endpoints included).
np.linspace(0, 10, 3)

In [None]:
# Same range, sampled at fifteen evenly spaced values.
np.linspace(0, 10, 15)

In [None]:
# Two five-point axes used by the meshgrid walkthrough in the cells below.
x, y = np.linspace(0, 10, num=5), np.linspace(5, 25, num=5)

In [None]:
# Build the 2-D coordinate grids from the two 1-D axes.
xx, yy = np.meshgrid(x, y)

In [None]:
# Display the x-coordinate grid.
xx

In [None]:
# Display the y-coordinate grid.
yy

In [None]:
# Flatten the x-coordinate grid into a 1-D array.
xx.ravel()

In [None]:
# Stack the flattened grids and transpose: one (x, y) pair per row.
np.vstack([xx.ravel(), yy.ravel()]).T

In [None]:
!pip install python_marketing_research
from python_marketing_research_functions import chapter10

chapter10.check_clusters(seg_sub, nb.predict(seg_sub))

In [None]:
# Same summary grouped by the true segment labels, for comparison.
chapter10.check_clusters(seg_sub, seg_labels)

In [None]:
# Per-class predicted probabilities (one column per class) for a random
# sample of five rows, rounded to four decimals.
pd.DataFrame(nb.predict_proba(seg_sub),
             columns=nb.classes_).sample(5).round(4)

### 11.1.2 Random forest classification

In [None]:
from sklearn import ensemble

# Seed NumPy's global RNG before building the forest. No random_state is
# passed to the classifier, so this seed is presumably what makes the fit
# repeatable — TODO confirm.
np.random.seed(23432)
rf = ensemble.RandomForestClassifier(n_estimators=50)

# Fit a 50-tree forest on the segment training split.
rf.fit(X_train, y_train)

In [None]:
# Score the fitted random forest on the held-out test split.
rf.score(X_test, y_test)

In [None]:
# Predict the held-out split with the forest and report the weighted F1.
y_pred = rf.predict(X_test)

metrics.f1_score(y_true=y_test, y_pred=y_pred, average='weighted')

In [None]:
# Confusion-matrix heatmap for the random forest predictions.
confusion_matrix(y_test, y_pred, rf)

In [None]:
# Per-class precision / recall / F1 table for the random forest predictions.
return_precision_recall(y_test, y_pred, rf)

In [None]:
import graphviz
from sklearn import tree
from IPython.display import Image


# Export the first tree of the forest as Graphviz DOT text, labelled with
# the training column names and the forest's class names.
tree_0 = rf.estimators_[0]
dot_data = tree.export_graphviz(tree_0, out_file=None,
                                feature_names=X_train.columns,
                                class_names=rf.classes_)
tree_graph = graphviz.Source(dot_data, format='png')
# Render to 'tmp.png'.
# NOTE(review): view=True also asks graphviz to open the result in an
# external viewer — confirm this is wanted when running in a notebook.
tree_graph.render('tmp', view=True)
# Display the rendered PNG inline at 10000 x 10000.
Image('tmp.png', width=10000, height=10000)

In [None]:
import graphviz
from sklearn import tree
from IPython.display import Image


# Same export/render steps as the previous cell; only the inline display
# size differs (1000 x 1000 here).
tree_0 = rf.estimators_[0]
dot_data = tree.export_graphviz(tree_0, out_file=None,
                                feature_names=X_train.columns,
                                class_names=rf.classes_)
tree_graph = graphviz.Source(dot_data, format='png')
# NOTE(review): view=True also asks graphviz to open the result in an
# external viewer — confirm this is wanted when running in a notebook.
tree_graph.render('tmp', view=True)
Image('tmp.png', width=1000, height=1000)

In [None]:
from google.colab import files

# Re-render the same DOT text as a PDF ('rf_tree.pdf') and hand it to the
# Colab file-download helper. Relies on `dot_data` from an earlier cell.
tree_graph = graphviz.Source(dot_data, format='pdf')
tree_graph.render('rf_tree', view=True)
files.download('rf_tree.pdf')

In [None]:
# Decision regions for a random forest clone, with the test points overlaid.
plot_decision_pca(rf, X_test, y_test)

In [None]:
def pairwise_decision_boundary(model, X_train, y_train,
                               X_test, y_test,
                               first_column, second_column,
                               jitter=False):
    """Plot decision regions of a classifier refit on two chosen columns.

    A clone of ``model`` is fit on ``X_train[[first_column, second_column]]``
    and used to label a 1000 x 1000 grid spanning the training range of the
    two columns. The grid labels are drawn as filled contours and the test
    points, coloured by ``y_test``, are drawn on the same axes.

    Args:
        model: Fitted classifier. It is cloned, so the object passed in is
            not refit; its ``classes_`` provide the label-to-integer
            mapping and the colorbar tick labels.
        X_train: Training feature DataFrame.
        y_train: Training labels aligned with ``X_train``.
        X_test: Test feature DataFrame (scatter positions).
        y_test: Test labels aligned with ``X_test`` (scatter colours).
        first_column: Column name plotted on the x axis.
        second_column: Column name plotted on the y axis.
        jitter: If True, add uniform noise in [-0.05, 0.05) to the plotted
            test points (drawn from NumPy's global RNG).

    Note:
        The colour range (``vmax=3``) and the four colorbar ticks are
        hard-coded for a four-class problem.
    """
    width, height = 1000, 1000
    # Create a class mapper to map from class string to an integer
    class_mapper = {c: i for i, c in enumerate(model.classes_)}

    x0 = X_train[first_column]
    x1 = X_train[second_column]
    # Get evenly spaced values between the min and max values
    x0_g = np.linspace(x0.min(), x0.max(), width)
    x1_g = np.linspace(x1.min(), x1.max(), height)

    # Create a "grid" of those evenly spaced values from each vector.
    # meshgrid returns arrays of shape (len(x1_g), len(x0_g)),
    # i.e. (height, width).
    xx, yy = np.meshgrid(x0_g, x1_g)
    # Stack together all of the sampled values
    X_grid = np.vstack([xx.ravel(), yy.ravel()]).T

    # Refit a clone on just the two columns and label every grid point
    model_c = clone(model)
    model_c.fit(X_train.loc[:, [first_column, second_column]], y_train)
    X_grid_labels = model_c.predict(X_grid)

    # Optional jitter for the plotted test points
    j_x0, j_x1 = 0, 0
    if jitter:
        j_x0 = (np.random.random(X_test.shape[0]) - 0.5) / 10.
        j_x1 = (np.random.random(X_test.shape[0]) - 0.5) / 10.
    # Plot the true values
    a = plt.scatter(X_test[first_column] + j_x0,
                    X_test[second_column] + j_x1,
                    c=[class_mapper[l] for l in y_test],
                    cmap=plt.cm.rainbow,
                    edgecolor='k', vmin=0, vmax=3)
    # Plot the predicted regions. The labels were produced in xx.ravel()
    # order, so they must be reshaped to xx.shape == (height, width); using
    # (width, height) is only correct while the two happen to be equal.
    plt.contourf(xx, yy,
                 np.reshape([class_mapper[l] for l in X_grid_labels],
                            xx.shape),
                 cmap=a.cmap, alpha=0.5, levels=3)
    plt.title('Decision boundaries with true values overlaid')
    plt.xlabel(first_column)
    plt.ylabel(second_column)
    cb = plt.colorbar(ticks=[0.5, 1.2, 2, 2.8])
    cb.ax.set_yticklabels(model.classes_)

In [None]:
# Random forest decision regions over age vs. income.
pairwise_decision_boundary(rf, X_train, y_train, X_test, y_test,
                           'age', 'income')

In [None]:
# Random forest decision regions over age vs. kids, with jittered points.
pairwise_decision_boundary(rf, X_train, y_train, X_test, y_test,
                           'age', 'kids', jitter=True)

In [None]:
# Random forest decision regions over subscribe vs. kids, with jittered
# points.
pairwise_decision_boundary(rf, X_train, y_train, X_test, y_test,
                           'subscribe', 'kids', jitter=True)

In [None]:
# Per-class predicted probabilities from the forest for a random sample of
# five test rows.
pd.DataFrame(rf.predict_proba(X_test), columns=rf.classes_).sample(5)

In [None]:
# Cluster summary of the features grouped by the random forest predictions.
chapter10.check_clusters(seg_sub, rf.predict(seg_sub))

In [None]:
# Same summary grouped by the true segment labels, for comparison.
chapter10.check_clusters(seg_sub, seg_labels)

### 11.1.3 Random forest variable importance

In [None]:
# Forest feature importances labelled by feature column, largest first.
pd.Series(rf.feature_importances_,
          index=seg_sub.columns).sort_values(ascending=False)

## 11.2 Prediction: identifying potential customers

In [None]:
# The target is now `subscribe`; the remaining columns are the features.
subscribe_label = seg_sub['subscribe']
seg_sub_nosub = seg_sub.drop(columns='subscribe')

# Draw one uniform number per row (seeded for repeatability); rows with a
# draw <= 0.65 go to the training split, the rest to the test split.
# Note: this rebinds X_train / X_test / y_train / y_test from the
# segment-classification section.
np.random.seed(7885)
rand_idx = np.random.rand(len(subscribe_label))
train_idx, test_idx = rand_idx <= 0.65, rand_idx > 0.65

X_train, X_test = (seg_sub_nosub.iloc[train_idx],
                   seg_sub_nosub.iloc[test_idx])
y_train, y_test = (subscribe_label.iloc[train_idx],
                   subscribe_label.iloc[test_idx])

In [None]:
# Cluster plot of the non-subscribe features, labelled by subscription.
chapter10.cluster_plot(seg_sub_nosub, subscribe_label)

In [None]:
# 100-tree forest for the subscribe target, using the
# 'balanced_subsample' class weighting.
rf_sub = ensemble.RandomForestClassifier(
    n_estimators=100,
    random_state=86,
    class_weight='balanced_subsample',
)
rf_sub.fit(X_train, y_train)

y_pred = rf_sub.predict(X_test)

In [None]:
# Score the subscribe forest on the held-out test split.
rf_sub.score(X_test, y_test)

In [None]:
# Micro-averaged F1 for the subscribe predictions.
# NOTE(review): for a single-label problem micro-averaged F1 should equal
# plain accuracy, so this likely repeats the score above — confirm intent.
metrics.f1_score(y_test, y_pred, average='micro')

In [None]:
# Confusion-matrix heatmap for the subscribe forest.
confusion_matrix(y_test, y_pred, rf_sub)

In [None]:
from sklearn import model_selection

# Grid-search random forest hyperparameters with 5-fold cross-validation,
# ranking candidates by the 'f1_weighted' scorer.
rf_sub_cv = ensemble.RandomForestClassifier(
    random_state=34, class_weight='balanced_subsample')
parameters = dict(n_estimators=[10, 100, 500],
                  max_depth=[5, 10, 30],
                  min_samples_split=[2, 5],
                  min_samples_leaf=[1, 2, 5])
clf = model_selection.GridSearchCV(estimator=rf_sub_cv,
                                   param_grid=parameters,
                                   scoring='f1_weighted',
                                   cv=5)
clf.fit(X_train, y_train)

In [None]:
# Best hyperparameter combination found by the weighted-F1 grid search.
clf.best_params_

In [None]:
# Predict the test split with the best estimator from the grid search and
# plot its confusion matrix.
y_pred_be = clf.best_estimator_.predict(X_test)

confusion_matrix(y_test, y_pred_be, clf.best_estimator_)

In [None]:
# Repeat the grid search over the same hyperparameter grid, this time
# ranking candidates by the 'recall' scorer. Rebinds rf_sub_cv, parameters
# and clf.
rf_sub_cv = ensemble.RandomForestClassifier(
    random_state=34, class_weight='balanced_subsample')
parameters = dict(n_estimators=[10, 100, 500],
                  max_depth=[5, 10, 30],
                  min_samples_split=[2, 5],
                  min_samples_leaf=[1, 2, 5])
clf = model_selection.GridSearchCV(estimator=rf_sub_cv,
                                   param_grid=parameters,
                                   scoring='recall',
                                   cv=5)
clf.fit(X_train, y_train)

In [None]:
# Best hyperparameter combination found by the recall grid search.
clf.best_params_

In [None]:
# Predict the test split with the best estimator from the recall grid
# search and plot its confusion matrix.
y_pred_be = clf.best_estimator_.predict(X_test)

confusion_matrix(y_test, y_pred_be, clf.best_estimator_)

In [None]:
# Small, shallow forest with an explicit class weighting that counts the
# True class 50 times as heavily as the False class.
rf_sub = ensemble.RandomForestClassifier(
    n_estimators=10,
    random_state=86,
    max_depth=5,
    min_samples_leaf=2,
    min_samples_split=2,
    class_weight={False: 1, True: 50},
)
rf_sub.fit(X_train, y_train)

y_pred = rf_sub.predict(X_test)

In [None]:
# Confusion-matrix heatmap for the explicitly class-weighted forest.
confusion_matrix(y_test, y_pred, rf_sub)