In [None]:
import os
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime

#plt.xkcd()

# Import Data

In [None]:
def _prefix_columns(frame, prefix):
    """Return the column names of `frame`, prefixing those containing 'char' or 'date'.

    frame:  DataFrame whose columns are inspected (not modified).
    prefix: string prepended to every matching column name.
    """
    return [prefix + col if 'char' in col or 'date' in col else col
            for col in frame.columns]


df_ppl = pd.read_csv('../input/people.csv')
df_act_train = pd.read_csv('../input/act_train.csv')
df_act_test = pd.read_csv('../input/act_test.csv')

# People and activity files both carry 'char_*' and 'date' columns; prefix them
# so they stay distinguishable after the merge below.
df_ppl.columns = _prefix_columns(df_ppl, 'ppl_')
df_act_train.columns = _prefix_columns(df_act_train, 'act_')
# Rename the test frame from its own header instead of reusing the train names
# by position, so a differing column order cannot silently mislabel columns.
df_act_test.columns = _prefix_columns(df_act_test, 'act_')

df = pd.merge(df_act_train, df_ppl, on='people_id')
df_validate = pd.merge(df_act_test, df_ppl, on='people_id')

df.to_csv('merged.csv')
df_validate.to_csv('merged_test.csv')

# The merged frames are all that is needed from here on.
del df_ppl, df_act_train, df_act_test

print('Memory usage of training DataFrame: ' + str(sum(df.memory_usage())))

In [None]:
# List every column of the merged training frame.
print('Columns:', df.columns)

# Exploratory Data Analysis

First, let's look at outcomes.

In [None]:
# Bar chart of how many rows fall under each outcome value.
sns.countplot(data=df, x='outcome')
plt.suptitle('Customer Value - Binary Outcomes', fontsize=20)
plt.show()

That's a very high positive outcome rate. I wonder what a positive outcome is defined as? Surely it can't be lead conversions. 

Let's take a broad look at the characteristics.

In [None]:
# (message, distinct-value count) per column, largest count first.
row_counts = sorted(
    (
        (name + ': ' + str(n_unique) + ' unique values.', n_unique)
        for name, n_unique in ((name, df[name].nunique()) for name in df.columns)
    ),
    key=lambda pair: pair[1],
    reverse=True,
)
for message, _ in row_counts:
    print(message)

We can see how many unique values each column has. It looks like there are quite a few actions per person, but the variety of activity characteristics is low. Most activity categories are binary, but there is one outlier: Activity Characteristic 10. My guess is that this is the primary activity category, such as the web page visited or some specific action on the website as a whole, and that the other categories are modifiers. This would make more sense if it were numbered 1, and there isn't really any data to support this theory. 

In [None]:
def null_percentage(column):
    """Print how many values of `column` are null, as a whole percentage and a count.

    column: pandas Series; its `.name` is used in the printed message.

    The percentage is floored to an integer. An empty column reports 0%
    instead of raising ZeroDivisionError.
    """
    col_name = column.name
    nans = int(column.isnull().sum())
    total = column.size
    # Integer arithmetic avoids float round-off (e.g. 0.29 * 100 -> 28.99...),
    # which would under-report an exact percentage by one.
    perc = nans * 100 // total if total else 0
    print('%d%% of values or %d missing from %s column.' % (perc, nans, col_name))

def check_null(df, columns):
    """Report the null share of each listed column of `df` via null_percentage.

    df:      DataFrame to inspect.
    columns: iterable of column labels present in `df`.
    """
    for series in (df[label] for label in columns):
        null_percentage(series)


check_null(df, df.columns)

Okay, this is more interesting. People characteristics have no missing values, but activities have a ton of NaN values. Most activities have characteristic 10, but the others are rare. The weird part is that the other characteristics are all equal in volume. This might mean that they are a subcategory of one or more char_10 values, or that combined they form a category independent of char_10. I added the number of char_10 values to the number of values in the other columns and got the total number of rows, so my best guess so far is that together they make up a complex characteristic that is used when char_10 doesn't apply. 

In [None]:
# For each of act_char_9 .. act_char_1, count the rows where it and act_char_10
# are both present, and add the counts up.
overlap_count = sum(
    df.loc[df.act_char_10.notnull() & df['act_char_%d' % k].notnull()].shape[0]
    for k in range(9, 0, -1)
)
print('%d rows have overlap between char_10 and any other characteristic features.' % overlap_count)

In [None]:
# Rows in which act_char_1 .. act_char_9 are all non-null at the same time.
other_chars = ['act_char_%d' % k for k in range(1, 10)]
overlap = df.loc[df[other_chars].notnull().all(axis=1)]
print('%d rows have overlap between ALL characteristic columns besides char_10.' % len(overlap))
del overlap

Testing this hypothesis, we can see that there is zero overlap between char_10 and the others and that all of the other characteristics overlap completely. 

Now let's look at the date values. Convert them to datetime, then add a "day of the week" feature as well as year, month, and day features. 

In [None]:
# Parse the '%Y-%m-%d' activity date strings in one vectorized call rather than
# a per-row strptime.
df.act_date = pd.to_datetime(df.act_date, format='%Y-%m-%d')
#df.ppl_date = df.ppl_date.apply(lambda x: datetime.strptime(x, '%Y-%m-%d'))
#df['date_diff'] = df.act_date.sub(df.ppl_date, axis=0)
#df.date_diff = df.date_diff.apply(lambda x: int(x.days))

# weekday() numbers the days Monday=0 .. Sunday=6. The numeric prefix keeps the
# labels in calendar order when they are sorted as strings.
# NOTE(review): this map previously labelled 0 as Sunday, shifting every label
# by one day; re-check any day-specific commentary written against old plots.
weekday_map = {0:'1 Monday', 1:'2 Tuesday', 2:'3 Wednesday', 3:'4 Thursday', 4:'5 Friday', 5:'6 Saturday',
              6:'7 Sunday'}
#df['ppl_weekday'] = df.ppl_date.apply(lambda x: x.weekday())
#df.ppl_weekday = df.ppl_weekday.replace(weekday_map)
df['act_weekday'] = df.act_date.dt.weekday.map(weekday_map)

In [None]:
# Split the activity date into act_year / act_month / act_day columns.
for part in ('year', 'month', 'day'):
    df['act_' + part] = df.act_date.apply(lambda stamp, part=part: getattr(stamp, part))

In [None]:
# Share of outcome == 1 per weekday label, as a percentage.
tab = pd.crosstab(df.act_weekday, df.outcome)
tab['ratio'] = tab[1] / (tab[0] + tab[1]) * 100

bar = sns.barplot(x=tab.index.tolist(), y=tab.ratio.tolist())
bar.set(xlabel="Day of Week", ylabel="Percentage")
plt.xticks(rotation=45)
plt.show()

# Spread between the best and worst weekday.
print('Range: ' + str(max(tab.ratio.tolist()) - min(tab.ratio.tolist())))

In [None]:
# Row count per weekday label; sort_index() puts the counts in label order so
# they pair up with the labels taken from weekday_map.
bar = sns.barplot(
    x=[*weekday_map.values()],
    y=df.act_weekday.value_counts().sort_index().tolist(),
)
plt.xticks(rotation=45)
plt.show()

In [None]:
def crosstab_heatmap(*args, title='', size=(6.4, 4.8), ant=True, color='Blues'):
    """Cross-tabulate the given series and show the table as a seaborn heatmap.

    *args: forwarded unchanged to pd.crosstab.
    title: passed to plt.figure as its first argument and used as the plot title.
    size:  figure size handed to plt.figure.
    ant:   whether each cell is annotated with its value.
    color: colormap name handed to sns.heatmap.
    """
    counts = pd.crosstab(*args)
    plt.figure(title, figsize=size)
    plt.title(title)
    heat = sns.heatmap(counts, cmap=color, annot=ant, fmt='g')
    # Re-apply the current y tick labels, tilted by 45 degrees.
    _, tick_labels = plt.yticks()
    heat.set_yticklabels(tick_labels, rotation=45)
    plt.show()


crosstab_heatmap(df.act_weekday, df.outcome)

In [None]:
# Activity volume by year (rows) and month (columns).
crosstab_heatmap(df['act_year'], df['act_month'], size=(20, 6), title='chart')

In [None]:
# Activity volume by month (rows) and day of month (columns), without cell annotations.
crosstab_heatmap(df['act_month'], df['act_day'], size=(15, 8), ant=False, color='YlGnBu', title='chart')

The hotspots in September and October are interesting. They don't line up with overlapping months. I was expecting hot spots in August, as that month has overlap between two years. This doesn't line up with any holidays that I know of, and given that I don't know the actual year I can't say if they correspond to a marketing event. 

Both activities and positive outcomes are more likely as the week progresses, petering off after Friday. The success ratio is lower on Sunday and Monday, but the correlation between success for a given day of the week and the volume 

In [None]:
def plot_ecdf(data, label):
    """Show the empirical cumulative distribution function of `data`.

    data:  values to sort and plot along the x axis.
    label: x-axis label.
    """
    ordered = np.sort(data)
    n = len(ordered)
    # The k-th smallest value gets cumulative share k / n.
    cumulative = np.arange(1, n + 1) / n
    plt.plot(ordered, cumulative, linestyle='none', marker='.')
    plt.xlabel(label)
    plt.ylabel('ECDF')
    plt.margins(0.02)
    plt.show()


In [None]:
def percentages(data, top=10):
    """Plot stacked bars of second-column shares for the most frequent first-column values.

    data: two-column DataFrame; column 0 is the grouping column and column 1
          the value whose per-group share is plotted. The two bar colours
          assume column 1 has two distinct values — TODO confirm for new inputs.
    top:  number of most frequent column-0 values to keep.
    """
    categories = data.iloc[:, 0]
    top_values = categories.value_counts().index[:top]
    # Build the row filter from the frame that was passed in, not from the
    # notebook-global `df`, so the function also works on unrelated frames.
    subset = data.loc[categories.isin(top_values)]
    # Normalise each row of the crosstab so its values sum to 1.
    tab = pd.crosstab(subset.iloc[:, 0], subset.iloc[:, 1]).apply(lambda r: r / r.sum(), axis=1)
    tab.plot(kind='bar', stacked=True, color=['red', 'blue'], grid=False, figsize=(30, 8), legend=None)
    plt.show()

Plot the volume of both people and activity dates where they line up. 

In [None]:
# Number of distinct group_1 values.
df.group_1.nunique()

In [None]:
# Outcome shares for the 10 most frequent groups.
percentages(df.loc[:, ['group_1', 'outcome']], top=10)

In [None]:
# Row counts of the 10 most frequent groups.
df.group_1.value_counts().iloc[:10]

It looks like group_1s have 100% correlation with outcome. Looks suspicious, though. Maybe in their CRM system, their group assignment changes AFTER the outcome. 

In [None]:
# Same view, widened to the 200 most frequent groups.
percentages(df.loc[:, ['group_1', 'outcome']], top=200)

So it seems like most groups have 100% correlation with outcome. 

In [None]:
# Distinct groups in the training and test frames, and how many training
# groups also occur in the test frame.
df_groups = df.group_1.value_counts().index
df_test_groups = df_validate.group_1.value_counts().index
overlap = sum(1 for group in df_groups if group in df_test_groups)
print('Trainign set groups: %d' % len(df_groups))
print('Test set groups: ' + str(len(df_test_groups)))
print('Overlap: ' + str(overlap))

Not all of the groups in the final submission dataset appear in the training data. 

In [None]:
# Repeat the overlap count using only the 1000 most frequent groups on each side.
df_groups = df.group_1.value_counts().index[:1000]
df_test_groups = df_validate.group_1.value_counts().iloc[:1000].index
overlap = sum(1 for group in df_groups if group in df_test_groups)
print('Trainign set groups: %d' % len(df_groups))
print('Test set groups: %d' % len(df_test_groups))
print('Overlap: ' + str(overlap))

I thought maybe there would be more overlap between the top 1000 groups in the training and submission datasets, but it's only around 40%. 

In [None]:
# Five most frequent groups in each frame, printed one after the other.
print('Training top five:')
print(df.group_1.value_counts().head(5))
print()
print('Test top five:')
print(df_validate.group_1.value_counts().head(5))

The most popular group is the same in both datasets, and it is the most popular by a wide margin. As far as feature engineering goes, I'm thinking group_1 might need to be collapsed into just two categories: group 17304 and not group 17304. 

In [None]:
# Positional slice of the merged frame's columns, taken to be the activity
# characteristic columns.
activities = df.columns[5:15].tolist()

# Alphabetically sorted column names, sliced by position to select the people columns.
people = sorted(df.columns)[21:59]

def perc_tab(data, i, j, top=10):
    """Draw stacked share bars for a two-column frame onto one panel of the global `axes` grid.

    data: two-column DataFrame; column 0 is the grouping column and column 1
          the value whose per-group share is plotted.
    i, j: indices into the notebook-global `axes` array (`axes[i, j]`), which
          must have been created before this function is called.
    top:  number of most frequent column-0 values to keep.
    """
    categories = data.iloc[:, 0]
    top_values = categories.value_counts().index[:top]
    # Build the row filter from the frame that was passed in, not from the
    # notebook-global `df`, so the function also works on unrelated frames.
    subset = data.loc[categories.isin(top_values)]
    # Normalise each row of the crosstab so its values sum to 1.
    tab = pd.crosstab(subset.iloc[:, 0], subset.iloc[:, 1]).apply(lambda r: r / r.sum(), axis=1)
    tab.plot(kind='bar', stacked=True, color=['red', 'blue'], grid=False, ax=axes[i, j], legend=None)


Let's take a look at activity_category. 

In [None]:
# Outcome counts per activity category.
crosstab_heatmap(df['activity_category'], df['outcome'])

The activity category is probably higher level than activity characteristics. Different activity categories have noticeably different rates of positive outcomes. 

In [None]:
fig, axes = plt.subplots(nrows=2, ncols=5, figsize=(25, 12))
i = j = 0
for c, col in enumerate(activities):
    # Fill the grid five panels per row: j is the row and i the column of panel c.
    j, i = divmod(c, 5)
    perc_tab(df[[col, 'outcome']], j, i, top=10)
plt.show()

Outcomes seem to be evenly distributed across all act_char subtypes, except for type 7 under characteristic 5. act_char_6 doesn't have subtypes. 

In [None]:
fig, axes = plt.subplots(nrows=8, ncols=5, figsize=(25, 45))
i = j = 0
for c, col in enumerate(people):
    # Fill the grid five panels per row: j is the row and i the column of panel c.
    j, i = divmod(c, 5)
    perc_tab(df[[col, 'outcome']], j, i, top=10)
plt.show()

Same story with people characteristics. Interestingly, true is always more likely to have a positive outcome in true/false categories. 

Let's dig deeper into people_id. I'm most interested in seeing whether outcome is an attribute that sticks to a person or can vary between activities. 

In [None]:
# Activity counts of the 15 busiest people, then the ids of the 500 busiest.
print(df.people_id.value_counts().iloc[:15])
power_users = df.people_id.value_counts().index[:500]

In [None]:
# Keep only the rows that belong to the selected power users.
df_power = df.loc[df.people_id.isin(list(power_users))]
df_power.head()

Lots of actions from a small number of power users. 

In [None]:
# Outcome shares for the 13 busiest power users.
percentages(df_power.loc[:, ['people_id', 'outcome']], top=13)

In [None]:
# Outcome shares for the 150 busiest people in the full training frame.
percentages(df.loc[:, ['people_id', 'outcome']], top=150)

In [None]:
# The power-user subset is no longer needed.
del df_power

In [None]:
# First 20 activity rows of a single person.
one_person = df[df.people_id.eq('ppl_337688')]
one_person.head(20)

Some users always have the same outcome, while other users have mixed outcomes. I would guess that the top dozen or so power users, or at least the top 4 with tens of thousands of hits and no positive outcomes, are site administrators, web developers, and web crawlers. 

Characteristics can differ for the same person. 