In [None]:
# Render inline figures as SVG.
%config InlineBackend.figure_format = 'svg'
import pandas as pd
# Raise pandas' display limits to 500 rows / 500 columns and set the
# display width to 70 characters.
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 70)

## 5.1 Simulating consumer segment data

In [None]:
import pandas as pd
# Read the chapter's data set from a shortened URL (requires network access).
# NOTE: section 5.1.2 further down rebinds `segment_data` to a freshly
# simulated frame, which replaces the one loaded here.
segment_data = pd.read_csv('http://bit.ly/PMR-ch5')
segment_data.head()

In [None]:
# Summary statistics of the current `segment_data` frame.
segment_data.describe()

### 5.1.1 Segment data definition

In [None]:
# Distribution used to simulate each variable. Insertion order matters:
# it fixes the column order used by the positional per-segment lists.
segment_variables_distribution = {
    'age': 'normal',
    'gender': 'binomial',
    'income': 'normal',
    'kids': 'poisson',
    'own_home': 'binomial',
    'subscribe': 'binomial',
}
# Variable names, in the same order as the mapping above.
segment_variables = list(segment_variables_distribution)

segment_variables_distribution['age']

In [None]:
# Display the full variable -> distribution mapping.
segment_variables_distribution

In [None]:
# Per-segment "mean" parameter of every variable, listed positionally in
# segment_variables order: age, gender, income, kids, own_home, subscribe.
# The generation code uses the value as the normal mean, the Poisson rate,
# or the success probability, depending on the variable's distribution.
segment_means = dict(
    suburb_mix=[40, 0.5, 55000, 2, 0.5, 0.1],
    urban_hip=[24, 0.7, 21000, 1, 0.2, 0.2],
    travelers=[58, 0.5, 64000, 0, 0.7, 0.05],
    moving_up=[36, 0.3, 52000, 2, 0.3, 0.2],
)

In [None]:
# Standard deviations for each segment, positionally aligned with the means
# (age, gender, income, kids, own_home, subscribe).
# Only age and income carry a value; None marks variables for which a
# standard deviation is not applicable.
segment_stddev = {
    segment: [age_sd, None, income_sd, None, None, None]
    for segment, (age_sd, income_sd) in [
        ('suburb_mix', (5, 12000)),
        ('urban_hip', (2, 5000)),
        ('travelers', (8, 21000)),
        ('moving_up', (4, 10000)),
    ]
}

In [None]:
segment_names = ['suburb_mix', 'urban_hip', 'travelers', 'moving_up']
# Number of simulated rows per segment.
segment_sizes = dict(zip(segment_names, [100, 50, 80, 70]))

# Collect everything known about a segment in one nested dict:
#   segment_statistics[segment]['size']              -> row count
#   segment_statistics[segment][variable]['mean']    -> mean parameter
#   segment_statistics[segment][variable]['stddev']  -> std. dev. or None
segment_statistics = {}
for name in segment_names:
  per_variable = {
      variable: {
          'mean': segment_means[name][i],
          'stddev': segment_stddev[name][i],
      }
      for i, variable in enumerate(segment_variables)
  }
  segment_statistics[name] = {'size': segment_sizes[name], **per_variable}

In [None]:
# Inspect the assembled statistics of a single segment.
segment_statistics['moving_up']

### 5.1.2 Final segment data generation

In [None]:
import numpy as np
import pandas as pd


def _draw_segment_variable(distribution, stats, size):
  """Draw `size` random values for one variable of one segment.

  Args:
    distribution: 'normal', 'poisson' or 'binomial'.
    stats: dict with a 'mean' entry and, for 'normal', a 'stddev' entry.
      'mean' is used as the normal mean, the Poisson rate, or the success
      probability of a single-trial binomial, respectively.
    size: number of values to draw.

  Returns:
    A NumPy array of length `size`, drawn from NumPy's global generator.

  Raises:
    ValueError: if `distribution` is not one of the supported names.
  """
  if distribution == 'normal':
    return np.random.normal(
        loc=stats['mean'], scale=stats['stddev'], size=size
    )
  if distribution == 'poisson':
    return np.random.poisson(lam=stats['mean'], size=size)
  if distribution == 'binomial':
    return np.random.binomial(n=1, p=stats['mean'], size=size)
  # The original error branch formatted an undefined name and raised
  # StopIteration; report the offending value with a proper ValueError.
  raise ValueError('Bad segment data type: {0}'.format(distribution))


# Seed NumPy's global generator so the simulated data is reproducible.
np.random.seed(seed=2554)
segment_constructor = {}

# Iterate over segments to create data for each. Variables are drawn in
# segment_variables order, so the sequence of random draws is fixed.
for name in segment_names:
  segment_size = segment_statistics[name]['size']
  segment_data_subset = {}
  print('segment: {0}'.format(name))
  # Within each segment, iterate over the variables and generate data
  for variable in segment_variables:
    print('\tvariable: {0}'.format(variable))
    segment_data_subset[variable] = _draw_segment_variable(
        segment_variables_distribution[variable],
        segment_statistics[name][variable],
        segment_size
    )
  # Label every row of this segment with the segment name.
  segment_data_subset['Segment'] = np.repeat(name, repeats=segment_size)
  segment_constructor[name] = pd.DataFrame(segment_data_subset)
# Stack the per-segment frames. NOTE: each frame keeps its own row labels,
# so index values repeat across segments in the combined frame.
segment_data = pd.concat(segment_constructor.values())

In [None]:
# Preview the first rows of the simulated frame.
segment_data.head()

In [None]:
# Worked example of the 'normal' branch: ten age draws for one segment.
name, variable = 'suburb_mix', 'age'
demo_stats = segment_statistics[name][variable]
print(demo_stats['mean'])
print(demo_stats['stddev'])
np.random.normal(loc=demo_stats['mean'], scale=demo_stats['stddev'], size=10)

In [None]:
# Worked example of the 'poisson' branch: ten draws of the kids count.
# `name` is not set here; it keeps the value bound by an earlier cell.
variable = 'kids'
demo_stats = segment_statistics[name][variable]
print(demo_stats['mean'])
print(demo_stats['stddev'])
np.random.poisson(lam=demo_stats['mean'], size=10)

In [None]:
# Worked example of the 'binomial' branch: ten single-trial (0/1) draws.
# `name` is not set here; it keeps the value bound by an earlier cell.
variable = 'gender'
demo_stats = segment_statistics[name][variable]
print(demo_stats['mean'])
print(demo_stats['stddev'])
np.random.binomial(n=1, p=demo_stats['mean'], size=10)

In [None]:
# Build a length-10 array that repeats the current value of `name`.
np.repeat(name, repeats=10)

In [None]:
# Recode the simulated 0/1 draws into readable values.
#
# The mapping also sends each label to itself, so re-running this cell
# leaves an already-recoded column unchanged. A plain truthiness test would
# turn every non-empty string, including 'female', into 'male' on a re-run.
# Values outside the mapping become NaN instead of being silently labelled.
gender_labels = {1: 'male', 0: 'female', 'male': 'male', 'female': 'female'}
segment_data['gender'] = segment_data['gender'].map(gender_labels)

# astype(bool) turns 0/1 into False/True and leaves boolean columns as-is.
for flag_column in ['own_home', 'subscribe']:
  segment_data[flag_column] = segment_data[flag_column].astype(bool)

In [None]:
# include='all' asks describe() to summarise every column, not only the
# numeric ones.
segment_data.describe(include='all')

In [None]:
# Preview the first rows of the current `segment_data` frame.
segment_data.head()

In [None]:
# Name of the CSV export of the simulated data.
segment_csv_filename = 'segment_dataframe_Python_intro_Ch5.csv'

# Give pandas the path instead of a handle from open(..., 'w'): pandas then
# opens the file itself with the newline handling its CSV writer expects.
segment_data.to_csv(segment_csv_filename)

# google.colab only exists on Colab. Elsewhere the file has already been
# written to the working directory, so skip the browser download instead of
# failing the whole run on the import.
try:
  from google.colab import files
except ImportError:
  print('google.colab is not available; {0} was written locally.'.format(
      segment_csv_filename))
else:
  files.download(segment_csv_filename)

## 5.2 Finding descriptives by group

In [None]:
# Mean income of the 'moving_up' segment: select rows and column in a
# single .loc call.
segment_data.loc[segment_data['Segment'] == 'moving_up', 'income'].mean()

In [None]:
# Mean income of 'moving_up' rows whose subscribe value equals False.
is_moving_up = segment_data['Segment'] == 'moving_up'
is_not_subscribed = segment_data['subscribe'] == False
segment_data.loc[is_moving_up & is_not_subscribed, 'income'].mean()

In [None]:
# Mean income for each segment.
segment_data.groupby('Segment')['income'].agg('mean')

In [None]:
# Mean income for each (segment, subscribe) combination.
segment_data.groupby(by=['Segment', 'subscribe'])['income'].mean()

In [None]:
# Same means as a table: one row per segment, one column per subscribe value.
income_by_segment_subscribe = segment_data.groupby(
    ['Segment', 'subscribe']
)['income'].mean()
income_by_segment_subscribe.unstack()

In [None]:
# NOTE(review): nothing in this cell draws random numbers, so the seed has
# no visible effect here; it does reset NumPy's global generator state.
np.random.seed(4532)
# Mean income per segment, indexed by segment name.
segment_income = segment_data.groupby('Segment')['income'].mean()
# Attach each row's segment mean, matching the 'Segment' column against the
# index of `segment_income`. Both sides have an 'income' column, so the
# joined one gets the suffix and appears as 'income_segment'.
segment_data = segment_data.join(segment_income,
                                 on='Segment',
                                 rsuffix='_segment')
segment_data.head(5)

In [None]:
# Remove the helper column again, rebinding the name instead of mutating
# the frame in place.
segment_data = segment_data.drop(columns='income_segment')
segment_data.head(5)

### 5.2.1 Descriptives for two-way groups

In [None]:
# Mean income for each (segment, own_home) combination.
segment_home_keys = ['Segment', 'own_home']
segment_data.groupby(segment_home_keys)['income'].mean()

In [None]:
# Mean income for each (segment, own_home, subscribe) combination.
segment_data.groupby(
    by=['Segment', 'own_home', 'subscribe']
)['income'].agg('mean')

In [None]:
# Three-way means with the innermost key (subscribe) moved to the columns.
income_three_way = segment_data.groupby(
    ['Segment', 'own_home', 'subscribe']
)['income'].mean()
income_three_way.unstack()

In [None]:
# Number of non-null subscribe values per (segment, own_home) cell, shown
# with own_home as the columns.
rows_by_segment_home = segment_data.groupby(
    ['Segment', 'own_home']
)['subscribe'].count()
rows_by_segment_home.unstack()

In [None]:
# Number of non-null subscribe values per (kids, segment) cell; the
# 'Segment' level (position 1 of the group keys) becomes the columns.
segment_data.groupby(
    ['kids', 'Segment']
)['subscribe'].count().unstack(level='Segment')

In [None]:
# Frequency table: kids count (rows) by segment (columns).
pd.crosstab(index=segment_data['kids'], columns=segment_data['Segment'])

In [None]:
# Total number of kids in each segment.
segment_data.groupby('Segment')['kids'].agg('sum')

### 5.2.2 Visualization by group: frequencies and proportions

In [None]:
import matplotlib.pyplot as plt

# Group once by segment; the next plotting cell reuses this object.
segments_groupby_segments = segment_data.groupby(['Segment'])
# Rows: segments, columns: subscribe values, cells: row counts.
subscribe_counts = segments_groupby_segments['subscribe'].value_counts().unstack()
subscribe_counts.plot.barh(figsize=(8, 8))
plt.xlabel('counts')

In [None]:
# Same chart with counts normalised within each segment.
subscribe_shares = segments_groupby_segments['subscribe'].value_counts(
    normalize=True
).unstack()
subscribe_shares.plot.barh(figsize=(8, 8))
plt.xlabel('proportion of segment')

In [None]:
# Group by subscribe status instead: each bar group shows how that status
# splits across the segments (shares normalised within the status).
segment_shares = segment_data.groupby(['subscribe'])['Segment'].value_counts(
    normalize=True
).unstack()
segment_shares.plot.barh(figsize=(8, 8))
plt.xlabel('proportion of subscribers')

In [None]:
import seaborn as sns
# Horizontal bars of `subscribe` per segment; ci=None turns the
# confidence-interval bars off.
# NOTE(review): newer seaborn releases replace `ci` with `errorbar`;
# confirm against the installed version before changing it.
sns.barplot(data=segment_data, x='subscribe', y='Segment',
            orient='h', ci=None)

In [None]:
# One facet per segment (laid out as columns); sns.barplot is called in
# each facet with that facet's `subscribe` values.
g = sns.FacetGrid(data=segment_data, col='Segment')
g.map(sns.barplot, 'subscribe', ci=None, orient='v')

In [None]:
# Facet grid with own_home on the rows and segment on the columns;
# sns.barplot is called in each facet with that facet's `subscribe` values.
g = sns.FacetGrid(data=segment_data, row='own_home', col='Segment')
g.map(sns.barplot, 'subscribe', ci=None, orient='v')

In [None]:
# Vertical bar chart of mean income per segment.
segment_data.groupby(['Segment'])['income'].mean().plot(kind='bar')

In [None]:
# Mean income per segment with 95% confidence-interval bars, in grey.
sns.barplot(data=segment_data, x='Segment', y='income',
            estimator=np.mean, ci=95, color='.6')

In [None]:
# Mean income per segment, with one bar per own_home value.
income_by_segment_home = segment_data.groupby(
    ['Segment', 'own_home']
)['income'].mean().unstack()
income_by_segment_home.plot(kind='bar')

In [None]:
# Mean income per segment, split by own_home, with 95% confidence-interval
# bars.
sns.barplot(data=segment_data, x='Segment', y='income', hue='own_home',
            estimator=np.mean, ci=95)

In [None]:
# Collect each segment's incomes into a Python list: a Series indexed by
# segment name whose values are lists.
x = segment_data.groupby('Segment')['income'].apply(list)
# One box per segment, labelled with the segment names; binding the return
# value to `_` keeps it from being echoed as the cell output.
_ = plt.boxplot(x=x.values, labels=x.index)

In [None]:
# Vertical box plot of income for each segment, in grey.
sns.boxplot(data=segment_data, x='Segment', y='income',
            orient='v', color='0.7')

In [None]:
# Horizontal box plot of income for each segment, in grey.
sns.boxplot(data=segment_data, x='income', y='Segment',
            orient='h', color='0.7')

In [None]:
# Horizontal box plot of income per segment, split by own_home.
sns.boxplot(data=segment_data, x='income', y='Segment', hue='own_home',
            orient='h', color='0.7')