In [None]:
%matplotlib inline
import re
import os
import numpy as np
import pandas as pd
from requests import get
from bs4 import BeautifulSoup
import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator

# Import Data

In [None]:
# Load every CSV found under the data directory into dfList.
# os.walk yields directories and file names in arbitrary (filesystem-dependent)
# order, while later cells pick frames out of dfList by position, so the
# traversal is sorted to make that order reproducible across machines.
dfList = []
for root, dirnames, filenames in os.walk('../data/dunnhumby - The Complete Journey CSV/'):
    dirnames.sort()  # in-place sort fixes the order in which os.walk descends
    for filename in sorted(filenames):
        if '.csv' in filename:
            print(filename)
            dfList.append(pd.read_csv(os.path.join(root, filename)))

In [None]:
# Give each loaded table a descriptive name. The position of a name in the
# tuple is the index of the frame it receives from dfList.
(
    campaign_desc_df,     # dfList[0]
    campaign_table_df,    # dfList[1]
    causal_data_df,       # dfList[2]
    coupon_df,            # dfList[3]
    coupon_redempt_df,    # dfList[4]
    hh_demographic_df,    # dfList[5]
    product_df,           # dfList[6]
    transaction_data_df,  # dfList[7]
) = (dfList[position] for position in range(8))

# Observing Data

In [None]:
# Peek at the first two rows of campaign_desc_df.
campaign_desc_df.head(n=2)

In [None]:
# Peek at the first two rows of campaign_table_df.
campaign_table_df.head(n=2)

In [None]:
# Peek at the first two rows of causal_data_df.
causal_data_df.head(n=2)

In [None]:
# Peek at the first two rows of coupon_df.
coupon_df.head(n=2)

In [None]:
# Peek at the first two rows of coupon_redempt_df.
coupon_redempt_df.head(n=2)

In [None]:
# Peek at the first two rows of hh_demographic_df.
hh_demographic_df.head(n=2)

This seems to be interesting data about households

## Product Data

In [None]:
# Peek at the first ten rows of product_df.
product_df.head(n=10)

In [None]:
# Bar chart of how many products (non-null PRODUCT_ID values) each department lists.
# The Axes returned by the plot call is kept so the figure can be given a
# title and a y-axis label; the trailing ';' suppresses the text repr.
ax = (
    product_df
    .groupby('DEPARTMENT')['PRODUCT_ID']
    .count()
    .plot.bar(rot=90, figsize=(9, 5))
)
ax.set_title('Number of Products per Department')
ax.set_xlabel('Department')
ax.set_ylabel('Number of products');

Let us look at what kind of grocery items we can find

In [None]:
# Product counts per commodity within the GROCERY department, largest first.
(
    product_df
    .loc[product_df['DEPARTMENT'] == 'GROCERY']
    .groupby('COMMODITY_DESC')['PRODUCT_ID']
    .count()
    .sort_values(ascending=False)
)

Let us look at what kind of 'Drug GM' products we have

In [None]:
# Product counts per commodity within the DRUG GM department, largest first.
(
    product_df
    .loc[product_df['DEPARTMENT'] == 'DRUG GM']
    .groupby('COMMODITY_DESC')['PRODUCT_ID']
    .count()
    .sort_values(ascending=False)
)

## Transaction Data

In [None]:
# Peek at the first five rows of transaction_data_df.
transaction_data_df.head(n=5)

# Question 1: How is purchasing rate related to income?

Comparing raw spending across households is confounded by how many transactions each household made and over what period of time. To make households comparable, we estimate each household's purchasing rate as the slope of a straight-line fit of its cumulative sales value against the day on which each purchase occurred.

In [None]:
# turn the days of purchase into a list
# For every household, gather the DAY values of its purchases into a list.
# Sales are first summed per (household_key, DAY, STORE_ID), so a household
# gets one list entry for each store it bought from on a given day.
days_purchased_each_house = (
    transaction_data_df
    .groupby(['household_key', 'DAY', 'STORE_ID'])['SALES_VALUE']
    .sum()
    .reset_index()
    .set_index('household_key')
    .groupby('household_key')['DAY']
    .apply(list)
)
days_purchased_each_house.head()


In [None]:
#turn the transaction values into a list
# For every household, gather its summed SALES_VALUE amounts into a list.
# There is one amount per (household_key, DAY, STORE_ID) combination.
transaction_val_each_house = (
    transaction_data_df
    .groupby(['household_key', 'DAY', 'STORE_ID'])['SALES_VALUE']
    .sum()
    .reset_index()
    .set_index('household_key')
    .groupby('household_key')['SALES_VALUE']
    .apply(list)
)
transaction_val_each_house.head()

In [None]:
#merge
# Put the per-household amount lists and day lists side by side,
# matching rows on the shared household_key.
transaction_freq_df = pd.merge(
    transaction_val_each_house,
    days_purchased_each_house,
    on='household_key',
)


In [None]:
# Peek at the first five rows of transaction_freq_df.
transaction_freq_df.head(n=5)
    

In [None]:
def compute_transaction_rate(df):
    """Estimate a purchasing rate as the slope of cumulative sales over days.

    Parameters
    ----------
    df : mapping-like (e.g. a DataFrame row passed by ``apply(axis=1)``)
        Must provide ``df['DAY']`` and ``df['SALES_VALUE']`` as equal-length
        sequences: the day of each purchase and the amount spent on it.

    Returns
    -------
    float
        Slope of the degree-1 least-squares fit (``np.polyfit``) of the
        cumulative sum of ``SALES_VALUE`` against ``DAY``, i.e. sales value
        per day. ``np.nan`` is returned when there are fewer than two
        distinct days, because no slope is defined for such data.
    """
    x = np.asarray(df['DAY'], dtype=float)
    # Running total of the amounts, in the order they are listed.
    y = np.cumsum(np.asarray(df['SALES_VALUE'], dtype=float))
    # With zero or one distinct day the linear fit is rank-deficient (or
    # polyfit raises on empty input); report "undefined" instead.
    if np.unique(x).size < 2:
        return np.nan
    return np.polyfit(x, y, 1)[0]

In [None]:
# Compute one purchasing rate per household (row) and plot its distribution.
purchase_rate = transaction_freq_df.apply(compute_transaction_rate, axis=1)
ax = purchase_rate.hist(bins=100)
# In a histogram the x-axis carries the measured quantity and the y-axis the
# number of households falling into each bin.
ax.set_title('Distribution of Household Purchasing Rates')
ax.set_xlabel('Purchasing rate ($/day)')
ax.set_ylabel('Number of households');


In [None]:
# Attach each household's purchasing rate to the demographic table.
# The Series is named so that the joined column is called 'purchase_rate'.
purchase_rate = purchase_rate.rename('purchase_rate')
# Drop any 'purchase_rate' column left over from an earlier run of this cell;
# otherwise re-executing it fails because the joined column names overlap.
hh_demographic_df = (
    hh_demographic_df
    .drop(columns='purchase_rate', errors='ignore')
    .join(purchase_rate, on='household_key')
)

In [None]:
# Bar chart of the mean purchasing rate for each HOUSEHOLD_SIZE_DESC category.
ax = (
    hh_demographic_df
    .groupby('HOUSEHOLD_SIZE_DESC')['purchase_rate']
    .mean()
    .plot.bar()
)
ax.set_title('Purchasing Rate vs. Household Size')
# The bars are household-size categories and their heights are group means.
ax.set_xlabel('Household size')
ax.set_ylabel('Mean purchasing rate ($/day)');

In [None]:
household_size = hh_demographic_df['HOUSEHOLD_SIZE_DESC'].replace('5+','6')