In [None]:
import numpy as np
import pandas as pd 
import seaborn as sns 
import matplotlib.pyplot as plt 
from datetime import datetime 

In [None]:
# Relative path to the BreadBasket_DMS CSV that the notebook loads.
file = "../input/BreadBasket_DMS.csv"

In [None]:
# Load the CSV at `file` into the DataFrame used by every later cell.
df = pd.read_csv(filepath_or_buffer=file)

In [None]:
# Preview the first five rows of the loaded data.
df.head()

In [None]:
# Preview the last five rows of the loaded data.
df.tail()

In [None]:
# Size of the table as (number of rows, number of columns).
df.shape

In [None]:
# Print a summary of the columns: dtypes, non-null counts and memory usage.
df.info()

# Goals & Objectives  

What am I trying to accomplish with this data-set? We should have a goal to strive for instead of blindly analyzing. Let's see if we can decipher this. The website doesn't have much, but we can probably set up some goals of our own. 

1. Let's study overall item popularity throughout the year (aggregate number of items throughout the year)
2. We can study popularity of items based on morning, afternoon and evening. 
3. We can study the popularity of items by splitting dates into seasons. 
4. We can also study which combination of items were the most popular 


# 1. Let's study overall item popularity

This is just a simple count of the number of times each item was bought over the entire time span. 

In [None]:
# Count how often each item appears, most frequent first, and draw a bar
# chart of the ten most frequent items.
item_count = df['Item'].value_counts()
plt.figure(figsize=(10, 10))
item_count.head(10).plot.bar()
plt.show()

Let's make this into a pie-chart to better visualize the proportion of items bought 


In [None]:
# Pie chart of the ten most frequent items. Each percentage is that item's
# share of the top-ten total, not of all items in the data.
fig, ax = plt.subplots(figsize=(10, 10))
labels = item_count.index[:10].tolist()
ax.pie(item_count.head(10), labels=labels, autopct='%1.1f%%')
# Equal axis scaling keeps the pie circular.
ax.axis('equal')
plt.show()

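Looks like Coffee is the most popular item (35.5%), followed by Bread (21.6%) and Tea (9.3%). Note that these percentages are shares of the top 10 items only, not of all items sold.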

# Categorize items based on time of day

We need to define what is morning, afternoon and evening based on specific times of day. 

6-11:59 AM: morning
12-2:59 PM: afternoon
3-8 PM: evening (the code below also labels every other hour outside 6 AM-2:59 PM as evening)

If I can convert everything into a time format, I can easily just use IF logic to categorize. 

In [None]:
# Parse each 'Date' string (format YYYY-MM-DD) into a datetime.date object.
datestamp = [
    datetime.strptime(raw_date, '%Y-%m-%d').date()
    for raw_date in df['Date']
]

In [None]:
# Parse each 'Time' string (format HH:MM:SS) into a datetime.time object.
timestamp = [
    datetime.strptime(raw_time, '%H:%M:%S').time()
    for raw_time in df['Time']
]

In [None]:
def day(hour):
    """Return the part-of-day label for an hour on the 24-hour clock.

    Hours 6-11 map to 'Morning' and hours 12-14 to 'Afternoon'. Every other
    value, including the hours before 6, falls through to 'Evening'.
    """
    if 6 <= hour < 12:
        return 'Morning'
    if 12 <= hour < 15:
        return 'Afternoon'
    return 'Evening'

In [None]:
# Label every parsed time with its part of the day, based on the hour only.
time_of_day = [day(moment.hour) for moment in timestamp]

In [None]:
# Attach the part-of-day labels to the data as a 'Time of Day' column.
df['Time of Day'] = time_of_day

In [None]:
# Preview the first five rows, now including the 'Time of Day' column.
df.head()

In [None]:
# Count how many times each item appears within each part of the day.
count_by_day = df.groupby(['Time of Day', 'Item'])['Item'].count()
# Keep the five most frequent items for the evening and for the morning.
evening = count_by_day.loc['Evening'].sort_values(ascending=False).head(5)
morning = count_by_day.loc['Morning'].sort_values(ascending=False).head(5)

In [None]:
# Side-by-side bar charts of the top evening and top morning items.
f, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 6))
# Title each panel: the two charts share the same item labels, so without a
# title there is no way to tell which part of the day a panel shows.
evening.plot(kind='bar', ax=ax1, title='Evening')
morning.plot(kind='bar', ax=ax2, title='Morning')
plt.show()

# Let's study the top 5 popular items every season 

12 months, 4 seasons, every season = 3 months
Jan, Feb, March = Winter 
April, May, June = Spring
July, August, September = Summer
October, November, December = Fall

In [None]:
# Month number (1-12) of every parsed transaction date.
month_num = [parsed_date.month for parsed_date in datestamp]

In [None]:
def month_category(month):
    """Return the season label used in this notebook for a month number.

    Months 1-3 map to 'Winter', 4-6 to 'Spring' and 7-9 to 'Summer'. Any
    other value, including months 10-12, falls through to 'Fall'.
    """
    if 1 <= month <= 3:
        return 'Winter'
    if 4 <= month <= 6:
        return 'Spring'
    if 7 <= month <= 9:
        return 'Summer'
    return 'Fall'

In [None]:
# Translate every month number into its season label and store the labels
# as a 'Seasons' column.
seasons = list(map(month_category, month_num))
df['Seasons'] = seasons

In [None]:
# Preview the last five rows, now including the 'Seasons' column.
df.tail()

In [None]:
# Count how many times each item appears within each season.
popular_season = df.groupby(['Seasons', 'Item'])['Item'].count()

In [None]:
# Five most frequent items for each season that is looked up below.
# 'Summer' is not selected; the notes after the plots report no summer
# transactions in this data.
Fall = popular_season.loc['Fall'].sort_values(ascending=False).head(5)
Spring = popular_season.loc['Spring'].sort_values(ascending=False).head(5)
Winter = popular_season.loc['Winter'].sort_values(ascending=False).head(5)

In [None]:
# Side-by-side bar charts of the top five items for Fall, Spring and Winter.
f, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(14, 6))
# Title each panel: the charts share similar item labels, so without a title
# there is no way to tell which season a panel shows.
Fall.plot(kind='bar', ax=ax1, title='Fall')
Spring.plot(kind='bar', ax=ax2, title='Spring')
Winter.plot(kind='bar', ax=ax3, title='Winter')
plt.show()

It looks like coffee is always the most popular item, but it is not as popular in the spring and is very popular in the winter. There is not much traffic during spring and there are no transactions in the summer. Maybe it's closed during the summer months? Wow, this cafe does super well during the Fall and Winter months. 

Next Objectives:
- Let's make these results a little more visual 
- We also can study the combination of items which are popular. This can be done by studying the transaction number. 

# Let's study combination of items 

What am I trying to figure out with this combination analysis?
- which items are popular when paired together 

Simple idea: since coffee is the most popular item, let's start there. We find all the transactions that include coffee, and then study which other items appear most often in those transactions. 

In [None]:
def coffee_ext(group):
    """Return the rows of ``group`` whose 'Item' value contains 'Coffee'.

    Parameters
    ----------
    group : pandas.DataFrame
        Table with a string 'Item' column.

    Returns
    -------
    pandas.DataFrame
        The matching rows of ``group``; empty when nothing matches.
    """
    match = group['Item'].str.contains('Coffee')
    # Index the frame the mask was built from. Indexing the global `df` here
    # would ignore the argument and mix rows from a different table.
    return group.loc[match]

# Unique transaction numbers of every transaction with at least one item
# whose name contains 'Coffee'.
coffee = df.loc[df['Item'].str.contains('Coffee'), 'Transaction'].unique()

In [None]:
# Keep only the rows of df that belong to a transaction containing coffee.
# `coffee` (one row per coffee transaction) is the left table, so a left join
# drops every transaction without coffee. A right join would keep all rows of
# df and count items from transactions that never included coffee.
coffee = pd.DataFrame(coffee, columns=['Transaction'])
coffee_m = coffee.merge(df, on='Transaction', how='left')
# Drop the coffee rows themselves, then tally the items bought alongside coffee.
coffee_m = coffee_m[~coffee_m.Item.str.contains('Coffee')]['Item'].value_counts()

In [None]:
# Bar chart of the five items at the top of the coffee pairing tally.
plt.figure(figsize=(10, 6))
coffee_m.head(5).plot.bar()
plt.show()

It looks like bread is the item most commonly paired with coffee, followed, oddly, by tea. 