In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from plotly.subplots import make_subplots #Visualizations
import plotly.graph_objects as go #visualizations

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))

# Load the scanner transactions and preview the first rows
csv_path = '../input/retail-store-sales-transactions/scanner_data.csv'
df = pd.read_csv(csv_path)
df.head()
# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Clean it up, check it out for clarity

In [None]:
df.duplicated().sum() #counts rows that are exact duplicates of an earlier row

In [None]:
df.isna().sum() #number of missing values per column

### Nice, no dupes or missing values

### Lets drop 'Unnamed: 0' columns and go into checking data types

In [None]:
# Drop the leftover index column that came in with the CSV.
# Reassigning with errors='ignore' (instead of inplace=True) keeps the cell
# re-runnable: a second run is a no-op rather than a KeyError.
df = df.drop(columns='Unnamed: 0', errors='ignore')
df.head()

In [None]:
# Inspect the dtype pandas inferred for every column
df.dtypes

### Lets turn Date column into DateTime

In [None]:
# Parse the Date column into datetime64 and confirm the new dtype
parsed_dates = pd.to_datetime(df['Date'])
df['Date'] = parsed_dates
df.dtypes

### Nice

## The data we received states this is 1 year of data, let's check that

In [None]:
# Chronological view; the displayed head and tail show the first and last dates
df.sort_values('Date')

### Indeed there is 1 year of data, beginning on 2017-01-02 and ending 2018-01-01 <br> A full Calendar year

In [None]:
# Number of distinct calendar days that have at least one row
unique_days = df['Date'].nunique()
print(unique_days)

### Here we see that there are only 363 dates recorded <br>
We are missing 2 days, what are they?

In [None]:
# Every calendar day in the reported range, minus the days present in the data
expected_days = pd.date_range(start='2017-01-02', end='2018-01-01')
expected_days.difference(df['Date'])

### Those missing dates are 2017-03-28 & 2017-12-26

# Let us begin to gather Sales Data
## We will then order subplots over different periods to view Sales Amounts
## Sales by Quarter, Month, Week, and day
- Both Sales Amount and Individual Transactions

In [None]:
# Date-sorted copy of the transactions, extended with calendar period columns
sales_time = df.sort_values('Date').copy()
date_parts = sales_time['Date'].dt
# NOTE(review): the year is not part of any key below, so rows dated 2018-01-01
# land in Quarter 1 / Month 1 / ISO week 1 together with January 2017 - confirm
# that is intended.
sales_time['Quarter'] = date_parts.quarter
sales_time['Month'] = date_parts.month
sales_time['Week'] = date_parts.isocalendar().week
sales_time['Day of Week'] = date_parts.dayofweek  # Monday = 0

In [None]:
# Preview the new Quarter / Month / Week / Day of Week columns
sales_time.head()

# Set Data for Sales Amount Tables

In [None]:
# Total Sales_Amount per period; each result has two columns: the period key and Sales_Amount
sales_by_quarter = sales_time.groupby('Quarter')['Sales_Amount'].sum().reset_index()
sales_by_Month = sales_time.groupby('Month')['Sales_Amount'].sum().reset_index()
sales_by_week = sales_time.groupby('Week')['Sales_Amount'].sum().reset_index()
sales_by_weekday = sales_time.groupby('Day of Week')['Sales_Amount'].sum().reset_index()

## Using Plotly, we will create a subplot view of sales amount over these time frames

In [None]:
# 2x2 grid of bar charts: total sales amount per quarter, month, week and weekday
fig = make_subplots(rows=2, cols=2, subplot_titles=('Sales by Quarter', 'Sales by Month', 'Sales By Week', 'Sales By Day of Week (0=Monday)'))
fig.add_trace(go.Bar(x=sales_by_quarter['Quarter'], y=sales_by_quarter['Sales_Amount']), row=1, col=1)
fig.add_trace(go.Bar(x=sales_by_Month['Month'], y=sales_by_Month['Sales_Amount']), row=1, col=2)
fig.add_trace(go.Bar(x=sales_by_week['Week'], y=sales_by_week['Sales_Amount']), row=2, col=1)
# x and y must both come from sales_by_weekday so each bar is that weekday's total
fig.add_trace(go.Bar(x=sales_by_weekday['Day of Week'], y=sales_by_weekday['Sales_Amount']), row=2, col=2)
fig.update_layout(showlegend=False, title_text="Sales Amounts in Currency Over different Periods")

# With the above data we can visualize a quick idea of the spread of sales
## Some insights
1. Mondays are typically low earning days
2. Sales amount rose in the 4th quarter, as supported by rising Sales Amount through weeks

# Lets look into SKU'S 
1. How many Sku's are there?
2. What are the most popular items, by quantity?
3. What item gave the most revenue?

In [None]:
# One row per SKU with its total sales amount and total quantity sold
sku = df.groupby('SKU')[['Sales_Amount', 'Quantity']].sum().reset_index()
sku.head()

In [None]:
# Number of distinct SKUs (one row per SKU in `sku`)
sku.shape[0]

### Total Sku's = 5242, as we expected from data source

### Top 10 Best selling items this year by quantity

In [None]:
# Ten SKUs with the highest total quantity sold
sku.sort_values(by='Quantity', ascending=False).iloc[:10]

### Worst 10 selling items this year by quantity

In [None]:
# Ten SKUs with the lowest total quantity sold. Sorting on Quantity mirrors the
# "top 10 by quantity" cell; sorting on Sales_Amount would rank by revenue instead.
sku.sort_values(['Quantity']).head(10)

# Lets explore the data with respect to Transactions

In [None]:
# Transactions ordered by ID, then the number of distinct transaction IDs in the raw data
transactions = sales_time.sort_values(by='Transaction_ID')
df['Transaction_ID'].nunique()

### Total transactions for the year is 64,682, as expected

### We will create new dataframes similarly to Sales Amount data

In [None]:
# Distinct transaction count per period; each result has the period key and Transaction_ID columns
transactions_by_quarter = sales_time.groupby('Quarter')['Transaction_ID'].nunique().reset_index()
transactions_by_Month = sales_time.groupby('Month')['Transaction_ID'].nunique().reset_index()
transactions_by_week = sales_time.groupby('Week')['Transaction_ID'].nunique().reset_index()
transactions_by_weekday = sales_time.groupby('Day of Week')['Transaction_ID'].nunique().reset_index()

### Lets see this information graphed to see when the store is busiest with customers

In [None]:
# 2x2 grid of bar charts: distinct transactions per quarter, month, week and weekday
fig2 = make_subplots(rows=2, cols=2, subplot_titles=('Transactions by Quarter', 'Transactions by Month', 'Transactions By Week', 'Transactions By Day of Week (0=Monday)'))
fig2.add_trace(go.Bar(x=transactions_by_quarter['Quarter'], y=transactions_by_quarter['Transaction_ID']), row=1, col=1)
fig2.add_trace(go.Bar(x=transactions_by_Month['Month'], y=transactions_by_Month['Transaction_ID']), row=1, col=2)
fig2.add_trace(go.Bar(x=transactions_by_week['Week'], y=transactions_by_week['Transaction_ID']), row=2, col=1)
# x and y must both come from transactions_by_weekday so each bar is that weekday's count
fig2.add_trace(go.Bar(x=transactions_by_weekday['Day of Week'], y=transactions_by_weekday['Transaction_ID']), row=2, col=2)
fig2.update_layout(showlegend=False, title_text="Total Transactions Over different Periods")

## Transactions over time appear very similar to Sales Amount over time. 
### We can add these 2 sets of charts together to better understand the relationship, with transactions as line charts

In [None]:
# 2x2 grid where every panel overlays sales amount (bars, primary y-axis)
# with the distinct transaction count (line, secondary y-axis).
fig = make_subplots(
    rows=2, cols=2,
    subplot_titles=('Quarterly', 'Monthly', 'Weekly', 'Day of Week 0=Monday'),
    specs=[[{"secondary_y": True}, {"secondary_y": True}],
           [{"secondary_y": True}, {"secondary_y": True}]],
)

# (row, col, period column, sales frame, transactions frame) for each panel.
# Driving all four panels from one table keeps every trace paired with the
# frame that matches its own period.
panels = [
    (1, 1, 'Quarter', sales_by_quarter, transactions_by_quarter),
    (1, 2, 'Month', sales_by_Month, transactions_by_Month),
    (2, 1, 'Week', sales_by_week, transactions_by_week),
    (2, 2, 'Day of Week', sales_by_weekday, transactions_by_weekday),
]

for panel_row, panel_col, period, sales_frame, trans_frame in panels:
    fig.add_trace(
        go.Bar(x=sales_frame[period], y=sales_frame['Sales_Amount'], name='Sale Amount'),
        row=panel_row, col=panel_col, secondary_y=False,
    )
    # go.Scatter is used for the line trace because go.Line is deprecated in plotly
    fig.add_trace(
        go.Scatter(x=trans_frame[period], y=trans_frame['Transaction_ID'], name='Transaction Total'),
        row=panel_row, col=panel_col, secondary_y=True,
    )

# Last expression of the cell so the figure is rendered
fig

## We can see the relationship between Sales Amount and total transactions in that same period

# Lets breakdown some informative data on the actual SKU products
### First we will isolate SKUs and Quantity and turn it into a pivot table, broken down by week

In [None]:
# Total quantity sold for every (Week, SKU) pair, indexed by Week then SKU.
# The list form ['sum'] produces a two-level column header ('Quantity', 'sum').
grouped=sales_time.groupby(['Week', 'SKU']).agg({'Quantity':['sum']})
# Print both ends of the result to check the first and last weeks
print(grouped.head())
print(grouped.tail())

### That looks good

### Lets turn this into a pivot table to grab some statistics

In [None]:
# Reshape to one row per SKU and one column per week.
# Each (Week, SKU) pair occurs once in `grouped`, so pivot_table's default
# aggregation leaves the weekly sums unchanged. SKU/week combinations with no
# sales show up as NaN.
pivot=grouped.pivot_table(values='Quantity', index='SKU', columns='Week')
pivot.head()

In [None]:
# Check the dtype of every weekly column after the pivot
pivot.dtypes

### Lets replace those NAN values with zeros

In [None]:
# A missing SKU/week combination means nothing was sold that week, so store 0
pivot = pivot.fillna(0)
pivot.head()

In [None]:
# Move SKU out of the index into a regular column so it can be copied into `stats`.
# Guarded so that re-running the cell does not reset the index a second time
# (which would add a stray 'index' column).
if 'SKU' not in pivot.columns:
    pivot = pivot.reset_index()

# Row statistics must be computed over the weekly quantity columns only.
# If the SKU identifier stays in the frame then, depending on the pandas version,
# it is silently skipped, raises on string values, or is added into the totals.
weekly_qty = pivot.drop(columns='SKU')

# One row per SKU, sharing pivot's row index
stats = pd.DataFrame(index=pivot.index)
stats['SKU'] = pivot['SKU']  # SKU identifier
stats['total_sold'] = weekly_qty.sum(axis=1)  # total units sold across all weeks
stats['average'] = weekly_qty.mean(axis=1)  # mean units sold per week
stats['std_dev'] = weekly_qty.std(axis=1)  # standard deviation of weekly units sold
stats

### Let's add the SKU category back into the stats table

In [None]:
# Sanity check: the per-SKU totals should add up to the raw quantity total
stats2 = stats.copy()
raw_quantity_total = df['Quantity'].sum()
stats_quantity_total = stats2['total_sold'].sum()
print(raw_quantity_total)
print(stats_quantity_total)

In [None]:
# SKU -> SKU_Category lookup taken from the raw transactions (one row per transaction line)
skuCat = df[['SKU', 'SKU_Category']].copy()

In [None]:
# Keep one row per distinct (SKU, SKU_Category) pair, then order both frames by SKU
# so they can be compared row by row with our stats dataframe
skuCat = skuCat.drop_duplicates().sort_values('SKU')
stats2 = stats2.sort_values('SKU')

### Make sure everything matches



In [None]:
# Print both frames to compare their SKU order and row counts before merging
print(skuCat)
print(stats2)

### Merge this info using Full outer join back to stats dataframe


In [None]:
# Attach SKU_Category to the statistics; the outer join keeps SKUs found in either frame
stats = stats2.merge(skuCat, how='outer', on='SKU')
stats.head()

### Now we can see what categories these products belong to!

### With this data:
1. We have an overview of trends in sales amount, as well as quantity of products sold
2. We also have a statistics table where we can further investigate which items are popular and how often they sell!
