In [1]:
%matplotlib inline
import re
import os
import numpy as np
import pandas as pd
import seaborn as sns
from requests import get
from bs4 import BeautifulSoup
import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator

# Import Data

In [93]:
# Load every dunnhumby CSV into dfList in a reproducible order.
# os.walk yields directory entries in arbitrary (filesystem-dependent) order, but
# the tables are later picked out of dfList by position, so both the
# sub-directory list and the file list are sorted to pin that order down.
dfList = []
for r, d, f in os.walk('../data/dunnhumby - The Complete Journey CSV/'):
    d.sort()  # in-place sort controls the order in which os.walk descends
    for file in sorted(f):
        # Match the actual extension; a substring test would also accept
        # names such as 'table.csv.bak'.
        if file.endswith('.csv'):
            print(file)
            dfList.append(pd.read_csv(os.path.join(r, file)))

campaign_desc.csv
campaign_table.csv
causal_data.csv
coupon.csv
coupon_redempt.csv
hh_demographic.csv
product.csv
transaction_data.csv


In [94]:
# Give each loaded table a descriptive name. dfList is positional: slot i must
# correspond to the i-th file name printed by the loading cell.
(
    campaign_desc_df,      # slot 0
    campaign_table_df,     # slot 1
    causal_data_df,        # slot 2
    coupon_df,             # slot 3
    coupon_redempt_df,     # slot 4
    hh_demographic_df,     # slot 5
    product_df,            # slot 6
    transaction_data_df,   # slot 7
) = [dfList[slot] for slot in range(8)]

# Observing Data

In [95]:
# Preview the first two rows of the campaign description table
# (columns shown: DESCRIPTION, CAMPAIGN, START_DAY, END_DAY).
campaign_desc_df.head(2)

Unnamed: 0,DESCRIPTION,CAMPAIGN,START_DAY,END_DAY
0,TypeB,24,659,719
1,TypeC,15,547,708


In [96]:
# Preview the first two rows of the campaign table
# (columns shown: DESCRIPTION, household_key, CAMPAIGN).
campaign_table_df.head(2)

Unnamed: 0,DESCRIPTION,household_key,CAMPAIGN
0,TypeA,17,26
1,TypeA,27,26


In [97]:
# Preview the first two rows of the causal data
# (columns shown: PRODUCT_ID, STORE_ID, WEEK_NO, display, mailer).
causal_data_df.head(2)

Unnamed: 0,PRODUCT_ID,STORE_ID,WEEK_NO,display,mailer
0,26190,286,70,0,A
1,26190,288,70,0,A


In [98]:
# Preview the first two rows of the coupon table
# (columns shown: COUPON_UPC, PRODUCT_ID, CAMPAIGN).
coupon_df.head(2)

Unnamed: 0,COUPON_UPC,PRODUCT_ID,CAMPAIGN
0,10000089061,27160,4
1,10000089064,27754,9


In [99]:
# Preview the first two rows of the coupon redemption table
# (columns shown: household_key, DAY, COUPON_UPC, CAMPAIGN).
coupon_redempt_df.head(2)

Unnamed: 0,household_key,DAY,COUPON_UPC,CAMPAIGN
0,1,421,10000085364,8
1,1,421,51700010076,8


In [100]:
# Preview the first two rows of the household demographics table; every row
# carries a household_key alongside the descriptive columns.
hh_demographic_df.head(2)

Unnamed: 0,AGE_DESC,MARITAL_STATUS_CODE,INCOME_DESC,HOMEOWNER_DESC,HH_COMP_DESC,HOUSEHOLD_SIZE_DESC,KID_CATEGORY_DESC,household_key
0,65+,A,35-49K,Homeowner,2 Adults No Kids,2,None/Unknown,1
1,45-54,A,50-74K,Homeowner,2 Adults No Kids,2,None/Unknown,7


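This looks like interesting household-level data: each row describes one household (age band, marital status, income band, home ownership, household composition and size, kids) and carries a `household_key`, the same column name that appears in the campaign and coupon redemption tables.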

## Product Data

In [101]:
# Show the first ten products to get a feel for the catalogue columns
# (manufacturer, department, brand, commodity, sub-commodity, size).
product_df.head(10)

Unnamed: 0,PRODUCT_ID,MANUFACTURER,DEPARTMENT,BRAND,COMMODITY_DESC,SUB_COMMODITY_DESC,CURR_SIZE_OF_PRODUCT
0,25671,2,GROCERY,National,FRZN ICE,ICE - CRUSHED/CUBED,22 LB
1,26081,2,MISC. TRANS.,National,NO COMMODITY DESCRIPTION,NO SUBCOMMODITY DESCRIPTION,
2,26093,69,PASTRY,Private,BREAD,BREAD:ITALIAN/FRENCH,
3,26190,69,GROCERY,Private,FRUIT - SHELF STABLE,APPLE SAUCE,50 OZ
4,26355,69,GROCERY,Private,COOKIES/CONES,SPECIALTY COOKIES,14 OZ
5,26426,69,GROCERY,Private,SPICES & EXTRACTS,SPICES & SEASONINGS,2.5 OZ
6,26540,69,GROCERY,Private,COOKIES/CONES,TRAY PACK/CHOC CHIP COOKIES,16 OZ
7,26601,69,DRUG GM,Private,VITAMINS,VITAMIN - MINERALS,300CT(1)
8,26636,69,PASTRY,Private,BREAKFAST SWEETS,SW GDS: SW ROLLS/DAN,
9,26691,16,GROCERY,Private,PNT BTR/JELLY/JAMS,HONEY,12 OZ


In [103]:
# Number of products per commodity within the GROCERY department, largest first.
(
    product_df
    .loc[product_df['DEPARTMENT'] == 'GROCERY']
    .groupby('COMMODITY_DESC')['PRODUCT_ID']
    .count()
    .sort_values(ascending=False)
)

COMMODITY_DESC
SOFT DRINKS               1704
BAG SNACKS                1523
HISPANIC                  1460
FRZN MEAT/MEAT DINNERS    1268
BAKED BREAD/BUNS/ROLLS    1169
                          ... 
BIRD SEED                   29
FROZEN CHICKEN              27
COUPON/MISC ITEMS           17
FRZN SEAFOOD                11
BOTTLE DEPOSITS             10
Name: PRODUCT_ID, Length: 94, dtype: int64

Let us look at what kinds of products the 'DRUG GM' department contains.

In [104]:
# Number of products per commodity within the DRUG GM department, largest first.
(
    product_df
    .loc[product_df['DEPARTMENT'] == 'DRUG GM']
    .groupby('COMMODITY_DESC')['PRODUCT_ID']
    .count()
    .sort_values(ascending=False)
)

COMMODITY_DESC
GREETING CARDS/WRAP/PARTY SPLY    2785
CANDY - PACKAGED                  2473
HAIR CARE PRODUCTS                1744
STATIONERY & SCHOOL SUPPLIES      1261
MAGAZINE                          1224
                                  ... 
COUPON/MISC ITEMS                    5
PROPANE                              3
MISCELLANEOUS HBC                    3
FRAGRANCES                           1
FD WRAPS/BAGS/TRSH BG                1
Name: PRODUCT_ID, Length: 91, dtype: int64

# Downloaded food nutrients data

In [92]:
# Load the five food/nutrient tables by file name rather than by position.
# os.walk returns files in arbitrary (filesystem-dependent) order, so indexing
# the result positionally can silently bind a table to the wrong name. The walk
# is also recursive, so any other CSV stored under '../data/' would otherwise be
# read and shift the positions.
food_csv_names = [
    'branded_food.csv',
    'nutrient.csv',
    'food.csv',
    'food_nutrient.csv',
    'food_category.csv',
]

food_frames = {}
for r, d, f in os.walk('../data/'):
    d.sort()  # deterministic descent order
    for file in sorted(f):
        # Read only the tables needed here; keep the first match if a name repeats.
        if file in food_csv_names and file not in food_frames:
            print(file)
            food_frames[file] = pd.read_csv(os.path.join(r, file))

missing = [name for name in food_csv_names if name not in food_frames]
if missing:
    raise FileNotFoundError(
        "Missing food CSV file(s) under '../data/': {}".format(missing)
    )

# Kept for compatibility: same slot order the original positional indexing assumed.
dfList = [food_frames[name] for name in food_csv_names]

branded_food_df = food_frames['branded_food.csv']
nutrient_df = food_frames['nutrient.csv']
food_df = food_frames['food.csv']
food_nutrients_df = food_frames['food_nutrient.csv']
food_category_df = food_frames['food_category.csv']

branded_food.csv
nutrient.csv
food.csv
food_nutrient.csv
food_category.csv


In [93]:
# Drop unnecessary columns and rename to be more understandable.
# Nothing is modified in place, and columns that are already gone are tolerated,
# so re-running this cell does not raise KeyError on the second execution.
# The raw frames still referenced from dfList keep their original columns.
food_nutrients_small_df = food_nutrients_df.drop(
    columns=[
        "data_points",
        "min",
        "max",
        "median",
        "footnote",
        "min_year_acquired",
        "derivation_id",
    ]
)
nutrient_small_df = nutrient_df.drop(columns=["nutrient_nbr", "rank"])
food_category_df = (
    food_category_df
    .drop(columns=["code"], errors="ignore")
    # rename ignores labels that are absent, so it is a no-op on a re-run
    .rename(columns={'id': 'food_category_id', 'description': 'category'})
)
food_df = food_df.drop(columns=["publication_date"], errors="ignore")

In [94]:
# Attach each nutrient's name and unit to the measurements by looking up
# nutrient_id in the nutrient table (left join keeps every measurement row).
joined_df = food_nutrients_small_df.join(
    nutrient_small_df.set_index('id'),
    on='nutrient_id',
    how='left',
)
joined_df.head()

Unnamed: 0,id,fdc_id,nutrient_id,amount,name,unit_name
0,4178832,346049,1079,0.0,"Fiber, total dietary",G
1,4178833,346049,1087,0.0,"Calcium, Ca",MG
2,4178834,346049,1089,2.57,"Iron, Fe",MG
3,4178835,346049,1104,0.0,"Vitamin A, IU",IU
4,4178836,346049,1162,0.0,"Vitamin C, total ascorbic acid",MG


In [95]:
# Key every measurement by (fdc_id, name): product id -> nutrient name.
# drop=False leaves the two key columns in structured_df as ordinary columns too.
structured_df = joined_df.set_index(['fdc_id', 'name'], drop=False)
# Keep only the measured values now that the identifiers live in the index.
structured_small_df = structured_df.drop(columns=["id", "fdc_id", "nutrient_id", "name"])

In [96]:
# Preview the (fdc_id, name)-indexed nutrient amounts and their units.
structured_small_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,amount,unit_name
fdc_id,name,Unnamed: 2_level_1,Unnamed: 3_level_1
346049,"Fiber, total dietary",0.0,G
346049,"Calcium, Ca",0.0,MG
346049,"Iron, Fe",2.57,MG
346049,"Vitamin A, IU",0.0,IU
346049,"Vitamin C, total ascorbic acid",0.0,MG


In [98]:
# Preview the food table (publication_date has already been dropped above).
food_df.head()

Unnamed: 0,fdc_id,data_type,description,food_category_id
0,346049,branded_food,"LIBBYS Corned Beef With Onion, 12 OZ",
1,346050,branded_food,"LIBBYS Corned Beef With Chili, 12 OZ",
2,346464,branded_food,WOLF Chili Without Beans,
3,346466,branded_food,"WOLF Turkey Chili No Beans, 15 OZ",
4,346468,branded_food,"WOLF BRAND Chili With Beans, 24 oz., 24 OZ",


In [99]:
# Attach the category label to each food via food_category_id, then discard
# that numeric key since only the label is needed afterwards.
food_with_category_df = (
    food_df
    .join(
        food_category_df.set_index("food_category_id"),
        on="food_category_id",
        how="left",
    )
    .drop(columns=["food_category_id"])
)

In [100]:
# Preview the foods with their category label joined in
# (the category is empty for the rows displayed below).
food_with_category_df.head()

Unnamed: 0,fdc_id,data_type,description,category
0,346049,branded_food,"LIBBYS Corned Beef With Onion, 12 OZ",
1,346050,branded_food,"LIBBYS Corned Beef With Chili, 12 OZ",
2,346464,branded_food,WOLF Chili Without Beans,
3,346466,branded_food,"WOLF Turkey Chili No Beans, 15 OZ",
4,346468,branded_food,"WOLF BRAND Chili With Beans, 24 oz., 24 OZ",


In [101]:
# Add each food's data_type, description and category to its nutrient rows.
# structured_small_df no longer has an fdc_id column, so on='fdc_id' is matched
# against its index level of that name.
structured_name_df = structured_small_df.join(
    food_with_category_df.set_index('fdc_id'),
    on='fdc_id',
    how='left',
)

In [104]:
# Preview the nutrient rows together with the food's data_type, description and category.
structured_name_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,amount,unit_name,data_type,description,category
fdc_id,name,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
346049,"Fiber, total dietary",0.0,G,branded_food,"LIBBYS Corned Beef With Onion, 12 OZ",
346049,"Calcium, Ca",0.0,MG,branded_food,"LIBBYS Corned Beef With Onion, 12 OZ",
346049,"Iron, Fe",2.57,MG,branded_food,"LIBBYS Corned Beef With Onion, 12 OZ",
346049,"Vitamin A, IU",0.0,IU,branded_food,"LIBBYS Corned Beef With Onion, 12 OZ",
346049,"Vitamin C, total ascorbic acid",0.0,MG,branded_food,"LIBBYS Corned Beef With Onion, 12 OZ",
