## 8.1 Handling Highly Correlated Variables

### An Initial Linear Model of Online Spend

In [0]:
import pandas as pd
# Read the customer data for this section from the bit.ly short link.
cust_df = pd.read_csv('http://bit.ly/PMR-ch8pt1')
# Sanity checks on the loaded frame: first rows, then a summary of every
# column (include='all' covers non-numeric columns as well).
cust_df.head() # Not shown
cust_df.describe(include='all') # Not shown

In [0]:
import statsmodels.formula.api as smf
# Baseline model: online spend regressed on every candidate predictor,
# using only customers with positive online spend and the columns from
# 'age' onward.
m1_formula = ('online_spend ~ age + credit_score + email'
              '+ distance_to_store + online_visits'
              '+ online_trans + store_trans + store_spend '
              '+ sat_service + sat_selection')
online_spenders = cust_df.loc[cust_df.online_spend > 0, 'age':]
spend_m1 = smf.ols(m1_formula, data=online_spenders).fit()
spend_m1.summary()

In [0]:
import seaborn as sns
import matplotlib.pyplot as plt
sns.set_context('paper')

# Pairwise plot of every column from 'age' onward. Missing values are
# replaced by -1 before plotting.
pair_data = cust_df.loc[:, 'age':].fillna(-1)
scatter_kws = dict(linewidths=1, edgecolor="w", s=5, alpha=0.5)

g = sns.PairGrid(pair_data, height=1.1)
g.map_upper(plt.scatter, **scatter_kws)  # upper triangle: scatter plots
g.map_diag(plt.hist)                     # diagonal: histograms
g.map_lower(sns.kdeplot)                 # lower triangle: density contours

In [0]:
import scipy.stats as ss
import sklearn.preprocessing as pp

def autotransform(x):
  '''Box-Cox transform x (after adding 1) and standardize the result.

  The input is offset by one before `ss.boxcox` is applied; the fitted
  lambda is discarded and the transformed values are returned after being
  passed through `pp.scale`.
  '''
  shifted = 1 + x
  transformed, _fitted_lambda = ss.boxcox(shifted)
  standardized = pp.scale(transformed)
  return standardized

In [0]:
# Preview the raw columns again before building the transformed copy.
cust_df.head()

In [0]:
# Keep only rows with no missing values and positive online spend, and only
# the columns from 'age' onward. `.copy()` yields an independent frame, so
# the raw data stay untouched and this cell can be re-run safely.
idx_complete = (cust_df.isna().sum(axis=1) == 0)
cust_df_bc = cust_df.loc[(idx_complete) &
                         (cust_df.online_spend > 0), 'age':].copy()

# Every column except 'email' is transformed.
col_idx = cust_df_bc.columns != 'email'
num_cols = cust_df_bc.columns[col_idx]

# Assign by column label so each column is replaced outright. Writing
# through `.iloc[:, mask] = ...` sets values in place, which forces the
# float results into any integer-typed columns and triggers pandas'
# incompatible-dtype upcasting warning.
cust_df_bc[num_cols] = cust_df_bc[num_cols].apply(autotransform)

In [0]:
# Summarize the transformed frame; columns that went through autotransform
# were passed through pp.scale, so they should be centred with unit spread.
cust_df_bc.describe(include='all')

In [0]:
# Same pairwise plot as for the raw data, now on the transformed frame.
scatter_style = {'linewidths': 1, 'edgecolor': "w", 's': 5, 'alpha': 0.5}
g = sns.PairGrid(cust_df_bc, height=1.1)
g.map_upper(plt.scatter, **scatter_style)
g.map_diag(plt.hist)
g.map_lower(sns.kdeplot) # Not shown

In [0]:
# Refit the full specification on the transformed, complete-case data.
m2_formula = ('online_spend ~ age + credit_score + email'
              '+ distance_to_store + online_visits'
              '+ online_trans + store_trans + store_spend '
              '+ sat_service + sat_selection')
spend_m2 = smf.ols(m2_formula, data=cust_df_bc).fit()
spend_m2.summary()

In [0]:
from statsmodels.stats import anova as sms_anova

# Restricted model: online spend explained by online transactions alone.
spend_m3 = smf.ols('online_spend ~ online_trans',
                    data=cust_df_bc).fit()

# Nested-model F test. anova_lm reports rows in argument order and takes
# its scale from the last model, so the smaller model goes first and the
# full model last. With the order reversed, df_diff is negative and the
# p-value comes out as NaN.
sms_anova.anova_lm(spend_m3, spend_m2)

### Remediating Collinearity

In [0]:
from statsmodels.stats.outliers_influence \
  import variance_inflation_factor

# VIF for column 1 of the fitted model's design matrix (`exog`); the second
# argument is a positional column index, not a variable name.
variance_inflation_factor(spend_m2.model.exog, 1)

In [0]:
# Print the VIF of every design-matrix column next to its parameter name.
m2_design = spend_m2.model.exog
for col, name in enumerate(spend_m2.params.index):
  vif = variance_inflation_factor(m2_design, col)
  print('VIF: {:.3f}, Parameter: {}'.format(vif, name))

In [0]:
# Same specification as spend_m2 but without the online_trans and
# store_trans terms.
m4_formula = ('online_spend ~ age + credit_score + email'
              '+ distance_to_store + online_visits'
              '+ store_spend + sat_service + sat_selection')
spend_m4 = smf.ols(m4_formula, data=cust_df_bc).fit()
spend_m4.summary()

In [0]:
# VIFs for the reduced model, one line per parameter.
m4_design = spend_m4.model.exog
for position, label in enumerate(spend_m4.params.index):
  print('VIF: {:.3f}, Parameter: {}'.format(
      variance_inflation_factor(m4_design, position), label))

In [0]:
from sklearn import decomposition

# Combined online variable: first principal component of the two online
# measures.
online_inputs = cust_df_bc[['online_visits','online_trans']]
online_pca = decomposition.PCA().fit_transform(online_inputs)
cust_df_bc['online'] = online_pca[:,0]

# Combined store variable: first principal component of the two store
# measures.
store_inputs = cust_df_bc[['store_spend', 'store_trans']]
store_pca = decomposition.PCA().fit_transform(store_inputs)
cust_df_bc['store'] = store_pca[:,0]

In [0]:
from sklearn import decomposition

# NOTE(review): this cell performs the same PCA calls on the same columns as
# the preceding cell and assigns the same `online` / `store` columns; only
# the line-continuation style differs.

# Create a combined online variable using PCA
# (first component of online_visits and online_trans)
online_pca = decomposition.PCA().\
  fit_transform(cust_df_bc[['online_visits','online_trans']])
cust_df_bc['online'] = online_pca[:,0]

# Create a combined store variable using PCA
# (first component of store_spend and store_trans)
store_pca = decomposition.PCA().\
  fit_transform(cust_df_bc[['store_spend',
                            'store_trans']])
cust_df_bc['store'] = store_pca[:,0]

In [0]:
# Model using the PCA-derived `online` and `store` columns in place of the
# separate online_visits/online_trans and store_trans/store_spend terms.
m5_formula = ('online_spend ~ age + credit_score + email'
              '+ distance_to_store + online + store'
              '+ sat_service + sat_selection')
spend_m5 = smf.ols(m5_formula, data=cust_df_bc).fit()
spend_m5.summary()

In [0]:
# VIFs for the PCA-based model, one line per parameter.
m5_design = spend_m5.model.exog
vif_line = 'VIF: {:.3f}, Parameter: {}'
for k, term in enumerate(spend_m5.params.index):
  print(vif_line.format(variance_inflation_factor(m5_design, k), term))

## 8.2 Linear Models for Binary Outcomes: Logistic Regression

### 8.2.1 Basics of the Logistic Regression Model

In [0]:
import numpy as np
# Logistic function evaluated by hand at 0: e^0 / (e^0 + 1) = 0.5.
np.exp(0) / ( np.exp(0) + 1 )

In [0]:
from scipy.special import expit
# Same computation using scipy's logistic function.
expit(0)

In [0]:
expit(-np.inf) # log-odds of -infinity map to probability 0

In [0]:
expit(2) # log-odds of 2 = about an 88% chance of the outcome

In [0]:
expit(-0.2) # slightly negative log-odds = probability a little below 0.5

In [0]:
np.log(0.88/(1-0.88)) # log-odds of p = 0.88 computed by hand (about 2)

In [0]:
from scipy.special import logit
logit(0.88) # log(p / (1 - p)) via scipy; equivalent to the hand computation

### 8.2.2 Data for Logistic Regression of Season Passes

In [0]:
pass_df = pd.read_csv('http://bit.ly/PMR-ch8pt2') # TODO REFORMAT THIS TO MATCH

# Convert Pass and Promo to ordered categoricals with an explicit level order.
pass_levels = pd.api.types.CategoricalDtype(
    categories=['YesPass','NoPass'], ordered=True)
promo_levels = pd.api.types.CategoricalDtype(
    categories=['NoBundle','Bundle'], ordered=True)
pass_df['Pass'] = pass_df['Pass'].astype(pass_levels)
pass_df['Promo'] = pass_df['Promo'].astype(promo_levels)
pass_df.head()

In [0]:
# Summary of the loaded season-pass data.
pass_df.describe()

### 8.2.3 Sales Table Data

In [0]:
# Count rows per (Pass, Promo, Channel), then pivot Channel (level 2) into
# columns and transpose so that channels become the rows.
(pass_df
 .groupby(['Pass', 'Promo', 'Channel'])['Pass']
 .count()
 .unstack(level=2)
 .T)

In [0]:
# Level lists used to rebuild the sales table. Their order matters: the
# nested loops that expand the counts iterate over these lists in order.
channels = ['Mail', 'Park', 'Email']
passes = ['NoPass','YesPass']
promos = ['NoBundle', 'Bundle']

In [0]:
# One count per (channel, pass, promo) cell, listed in the order the nested
# loops visit them: channel outermost, then pass, then promo innermost.
pass_counts = [278, 449, 359, 242, 49, 223, 284, 639, 485, 83, 27, 38]

In [0]:
# Expand the count table into one [Channel, Promo, Pass] row per customer.
# Cells are enumerated with channel outermost, then pass, then promo, which
# is the order in which pass_counts is laid out.
cells = [(c, b, p) for c in channels for p in passes for b in promos]

# Fail loudly if the counts and the level lists disagree, rather than
# silently ignoring surplus counts or failing midway through the expansion.
assert len(cells) == len(pass_counts), \
    'pass_counts must have one entry per Channel x Pass x Promo cell'

pass_array = [np.repeat([[c, b, p]], n, axis=0)
              for (c, b, p), n in zip(cells, pass_counts)]

In [0]:
# Stack the per-cell row blocks into one array and wrap it in a DataFrame.
pass_rows = np.concatenate(pass_array)
pass_df = pd.DataFrame(pass_rows, columns=['Channel', 'Promo', 'Pass'])
pass_df.head()

In [0]:
# Rebuild the Channel-by-(Pass, Promo) count table from the expanded rows.
cell_counts = pass_df.groupby(['Pass', 'Promo', 'Channel'])['Pass'].count()
cell_counts.unstack(level=2).T

In [0]:
# Pass-by-Promo counts, aggregated over channels (Promo pivoted to columns).
(pass_df
 .groupby(['Pass', 'Promo'])['Pass']
 .count()
 .unstack(level=1))

In [0]:
# Turn Pass and Promo into ordered categoricals with explicit level order.
ordered_levels = {
    'Pass': ['YesPass','NoPass'],
    'Promo': ['NoBundle','Bundle'],
}
for column, levels in ordered_levels.items():
  pass_df[column] = pass_df[column].astype(
      pd.api.types.CategoricalDtype(categories=levels, ordered=True))

### 8.2.4 Fitting a Logistic Regression Model

In [0]:
import statsmodels.api as sm
# Binomial GLM of Pass on Promo alone.
binomial_family = sm.families.Binomial()
pass_m1 = smf.glm('Pass ~ Promo',
                  data=pass_df,
                  family=binomial_family).fit()
pass_m1.summary()

In [0]:
# Odds for a log-odds value of 0.3888: the ratio of the outcome probability
# to the probability of the alternative, p / (1 - p).
expit(0.3888) / (1-expit(0.3888))

In [0]:
np.exp(0.3888) # identical: exponentiating the log-odds gives the odds directly

In [0]:
# Odds implied by the two log-odds values: -0.1922 on its own, and
# 0.3888 - 0.1922 combined.
bundle_odds = np.exp(0.3888 - 0.1922)
no_bundle_odds = np.exp(-0.1922)
print('Odds of pass:no pass, bundle: {:.3f} : 1'.format(bundle_odds))
print('Odds of pass:no pass, without bundle: {:.3f} : 1'.format(no_bundle_odds))

In [0]:
# Convert the combined log-odds (0.3888 - 0.1922) to a probability:
# odds / (1 + odds).
np.exp(0.3888 - 0.1922)/(1 + np.exp(0.3888 - 0.1922))

In [0]:
# Turn each odds value into a probability via odds / (1 + odds).
odds_with_bundle = np.exp(0.3888 - 0.1922)
odds_without_bundle = np.exp(-0.1922)
prob_with_bundle = odds_with_bundle / (1 + odds_with_bundle)
prob_without_bundle = odds_without_bundle / (1 + odds_without_bundle)

print('Probability of pass, bundle: {:.3f}'.format(prob_with_bundle))
print('Probability of pass, no bundle: {:.3f}'.format(prob_without_bundle))
# The last figure is the ratio of the two probabilities (bundle / no bundle),
# not an additive difference.
print('Bundle increased probability by {:.3f}'
  .format(prob_with_bundle / prob_without_bundle))

In [0]:
# Exponentiate the fitted coefficients of pass_m1 to read them on the odds
# scale rather than the log-odds scale.
np.exp(pass_m1.params)

In [0]:
# Confidence intervals for the pass_m1 coefficients, exponentiated onto the
# odds scale.
np.exp(pass_m1.conf_int())

### 8.2.5 Reconsidering the Model

In [0]:
# Cross-tabulate pass purchase against channel: one row per Pass level, one
# column per Channel.
pass_df.groupby(['Pass']).Channel.value_counts().unstack()

In [0]:
# Stacked horizontal bars of the Pass-by-Channel counts.
channel_counts = pass_df.groupby(['Pass']).Channel.value_counts().unstack()
channel_counts.plot(kind='barh', stacked=True, figsize=(10,6))

In [0]:
# One panel per channel: within each channel, the share of each Pass level
# by Promo, drawn as stacked bars.
f = plt.figure(figsize=(15,8))
axs = f.subplots(1,3)
for panel, (channel_name, channel_rows) in zip(axs, pass_df.groupby("Channel")):
  pass_shares = (channel_rows
                 .groupby("Promo")
                 .Pass.value_counts(normalize=True)
                 .unstack())
  pass_shares.plot(kind='bar', ax=panel, stacked=True)
  panel.set_title(channel_name)

In [0]:
# Same per-channel panels, ordered by the `channels` list, with extra
# headroom on the y axis (upper limit 1.3).
plt.figure(figsize=(15,8))
for slot, channel_name in enumerate(channels, start=1):
  ax = plt.subplot(1, 3, slot)
  in_channel = pass_df.loc[pass_df.Channel == channel_name]
  pass_shares = (in_channel
                 .groupby('Promo')
                 .Pass.value_counts(normalize=True)
                 .unstack())
  pass_shares.plot(kind='bar', ax=ax, stacked=True)
  plt.title(channel_name)
  plt.ylim((0,1.3))

In [0]:
# Binomial GLM of Pass on Promo with Channel added as a main effect.
m2_family = sm.families.Binomial()
pass_m2 = smf.glm('Pass ~ Promo + Channel', data=pass_df,
                  family=m2_family).fit()
pass_m2.summary()

In [0]:
# pass_m2 coefficients exponentiated onto the odds scale.
np.exp(pass_m2.params)

In [0]:
# Confidence intervals for pass_m2, exponentiated onto the odds scale.
np.exp(pass_m2.conf_int())

In [0]:
# Binomial GLM with both main effects plus the Promo-by-Channel interaction.
m3_formula = 'Pass ~ Promo + Channel + Promo:Channel'
pass_m3 = smf.glm(m3_formula, data=pass_df,
                  family=sm.families.Binomial()).fit()
pass_m3.summary()

In [0]:
# Confidence intervals for pass_m3 (including the interaction terms),
# exponentiated onto the odds scale.
np.exp(pass_m3.conf_int())