In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
from datetime import datetime
import matplotlib as mpl
import matplotlib.pyplot as plt

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found beneath the Kaggle input directory.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load every competition CSV into its own DataFrame. The dataset directory is
# named once so the input location can be changed in a single place.
DATA_DIR = '../input/store-sales-time-series-forecasting'

df_holiday_events = pd.read_csv(f'{DATA_DIR}/holidays_events.csv')
df_oil = pd.read_csv(f'{DATA_DIR}/oil.csv')
df_sample = pd.read_csv(f'{DATA_DIR}/sample_submission.csv')
df_stores = pd.read_csv(f'{DATA_DIR}/stores.csv')
df_train = pd.read_csv(f'{DATA_DIR}/train.csv')
df_transactions = pd.read_csv(f'{DATA_DIR}/transactions.csv')

In [None]:
# Line plot of the transaction counts against the 'date' column.
df_transactions.plot.line(x='date', y='transactions', figsize=(15, 5))

In [None]:
# Keep only the rows whose transaction count exceeds 8000.
df_trans_high_sales = df_transactions[df_transactions['transactions'] > 8000]

In [None]:
# Rows of df_transactions with more than 8000 transactions.
# Observation: the highest transaction volumes appear to be around Christmas (Navidad).
df_trans_high_sales

In [None]:
# Column-wise maxima of the high-volume rows. Each value is the maximum of its
# own column, so the values shown do not necessarily come from the same row.
df_trans_high_sales.max()

In [None]:
# Total transactions for each store, indexed by store_nbr.
# 'transactions' is selected explicitly: summing the whole frame would also try
# to aggregate the string 'date' column, which newer pandas versions concatenate
# instead of dropping, and that would push 'transactions' out of column 0.
df_store_totals = df_transactions.groupby(by="store_nbr")[["transactions"]].sum()
# info() must be called; the bare attribute only shows the bound-method repr.
df_store_totals.info()

In [None]:
# Two ways to read the total transactions of one store; both print the value for store 25.
# Positional lookup: row 24 (0-based), column 0. Presumably this is store 25
# because store numbers start at 1 with no gaps; verify against the data.
print(df_store_totals.iloc[24, 0])
# Label lookup: the index label 25 is the store number itself.
print(df_store_totals.loc[25, 'transactions'])

In [None]:
# Look up a store's row by its index label (the store number), here store 25.
df_store_totals.loc[25]

In [None]:
# Passing a list of index labels returns a DataFrame with just those stores.
df_store_totals.loc[[1,2,3,25]]

In [None]:
# Stores whose total transaction count is below 1,000,000.
df_store_totals.query('transactions < 1000000')

In [None]:
# Column-wise sum across all stores, i.e. the overall totals.
df_store_totals.sum()

In [None]:
# All transaction rows recorded for store 1.
df_store1 = df_transactions[df_transactions['store_nbr'] == 1]
df_store1

In [None]:
# Largest store number that appears in the transactions data.
df_transactions['store_nbr'].max()

# Notes on concatenation
To place dataframes "side by side":
* pd.concat([df1, df2], axis=1)

To place dataframes "one underneath the other":
* pd.concat([df1, df2], axis=0)

In [None]:
# Training rows with a strictly positive 'sales' value.
df_train.query('sales > 0')

In [None]:
# Show the store metadata table loaded from stores.csv.
df_stores

In [None]:
# Preview what reset_index() produces: the store_nbr index becomes a regular column.
# The result is not assigned here, so df_store_totals itself is unchanged.
df_store_totals.reset_index()

In [None]:
# here we can see that df_stores index is from 0 to 53,
# but df_store_totals index is 1 to 54
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() would add a stray "None" line after each report.
df_stores.info()
df_store_totals.info()

In [None]:
# df_stores has a 0-based row index, while df_store_totals is indexed by
# store_nbr. Resetting the index turns store_nbr back into a regular column and
# gives df_store_totals a 0-based index as well, so the rows of both frames line
# up when they are combined; otherwise the data would be off by one row.
# NOTE: this cell rebinds df_store_totals, so earlier cells that look stores up
# by store number (.at[25, ...], .loc[25]) give different results if re-run afterwards.
df_store_totals = df_store_totals.reset_index()

In [None]:
# Attach each store's total transactions to its metadata row.
# The frames are joined on 'store_nbr' instead of being concatenated side by
# side: concat with axis=1 aligns on the row index, which here is just the row
# position, so totals would be silently mis-assigned if the two frames were not
# in the same store order or did not contain exactly the same stores.
# Only 'store_nbr' and 'transactions' are taken from df_store_totals, so the
# result has a single store_nbr column. validate= raises if a store number is
# duplicated on either side.
df_stores_and_trans = df_stores.merge(
    df_store_totals[['store_nbr', 'transactions']],
    on='store_nbr',
    how='left',
    validate='one_to_one',
)

In [None]:
# Check the combined frame: Store 1 correctly has 2553963 transactions,
# and there is only one column for store_nbr.
df_stores_and_trans.head()

In [None]:
# What can we learn from 'cluster'? Count how many stores belong to each cluster.
df_stores_and_trans['cluster'].value_counts()

In [None]:
# Which clusters performed the best?
# Only 'transactions' is summed: aggregating the whole frame would also "sum"
# the string columns (concatenated in newer pandas versions) and produce a
# meaningless total of store numbers. Sorting puts the best cluster first.
(
    df_stores_and_trans
    .groupby('cluster')[['transactions']]
    .sum()
    .sort_values('transactions', ascending=False)
)

In [None]:
# Show the frame indexed by 'store_nbr', so that row label x refers to
# store_nbr x. The result is only displayed, not assigned, so
# df_stores_and_trans itself keeps its current index afterwards.
df_stores_and_trans.set_index('store_nbr')

In [None]:
# What do the different store "types" look like? Count the stores of each type.
df_stores_and_trans['type'].value_counts()

In [None]:
# Stores in cluster 14, the cluster with the highest transactions.
# Observation: all of them are type A.
df_stores_and_trans.query('cluster == 14')

In [None]:
# Split the combined frame into one DataFrame per store type (A through E).
store_type = df_stores_and_trans['type']
df_type_a, df_type_b, df_type_c, df_type_d, df_type_e = (
    df_stores_and_trans[store_type == label]
    for label in ('A', 'B', 'C', 'D', 'E')
)

In [None]:
# Inspect the stores of type A.
df_type_a

In [None]:
# Total transactions for each store type, in the order A through E.
df_type_a_sum, df_type_b_sum, df_type_c_sum, df_type_d_sum, df_type_e_sum = (
    type_frame['transactions'].sum()
    for type_frame in (df_type_a, df_type_b, df_type_c, df_type_d, df_type_e)
)

In [None]:
# Build a two-column frame: one row per store type with its total transactions.
_type_totals = (df_type_a_sum, df_type_b_sum, df_type_c_sum, df_type_d_sum, df_type_e_sum)
df_type_data = list(zip('ABCDE', _type_totals))
df_type_trans = pd.DataFrame(df_type_data, columns=['Store Type', 'Transactions'])

In [None]:
# Show the total transactions per store type.
df_type_trans

In [None]:
# Make 'Store Type' the index so that it labels the bars when the frame is plotted.
# NOTE: running this cell a second time fails, because 'Store Type' is then no longer a column.
df_type_trans = df_type_trans.set_index('Store Type')

In [None]:
# Bar chart of total transactions, one bar per row of df_type_trans.
df_type_trans.plot.bar(y='Transactions', figsize=(15, 5))

In [None]:
# Summary statistics of the per-store transaction totals for type A stores.
df_type_a['transactions'].describe()

In [None]:
# Summary statistics of the per-store transaction totals for type B stores.
df_type_b['transactions'].describe()

In [None]:
# Summary statistics of the per-store transaction totals for type C stores.
df_type_c['transactions'].describe()

In [None]:
# Summary statistics of the per-store transaction totals for type D stores.
df_type_d['transactions'].describe()

In [None]:
# Summary statistics of the per-store transaction totals for type E stores.
df_type_e['transactions'].describe()

In [None]:
# Number of stores in each cluster.
df_stores_and_trans['cluster'].value_counts()

In [None]:
# Split the combined frame into one DataFrame per store cluster (1 through 17).
cluster_id = df_stores_and_trans['cluster']
(
    df_cluster_1, df_cluster_2, df_cluster_3, df_cluster_4, df_cluster_5,
    df_cluster_6, df_cluster_7, df_cluster_8, df_cluster_9, df_cluster_10,
    df_cluster_11, df_cluster_12, df_cluster_13, df_cluster_14, df_cluster_15,
    df_cluster_16, df_cluster_17,
) = (df_stores_and_trans[cluster_id == number] for number in range(1, 18))


In [None]:
# Total transactions for each store cluster, in the order 1 through 17.
(
    df_cluster_1_sum, df_cluster_2_sum, df_cluster_3_sum, df_cluster_4_sum,
    df_cluster_5_sum, df_cluster_6_sum, df_cluster_7_sum, df_cluster_8_sum,
    df_cluster_9_sum, df_cluster_10_sum, df_cluster_11_sum, df_cluster_12_sum,
    df_cluster_13_sum, df_cluster_14_sum, df_cluster_15_sum, df_cluster_16_sum,
    df_cluster_17_sum,
) = (
    cluster_frame['transactions'].sum()
    for cluster_frame in (
        df_cluster_1, df_cluster_2, df_cluster_3, df_cluster_4, df_cluster_5,
        df_cluster_6, df_cluster_7, df_cluster_8, df_cluster_9, df_cluster_10,
        df_cluster_11, df_cluster_12, df_cluster_13, df_cluster_14, df_cluster_15,
        df_cluster_16, df_cluster_17,
    )
)

In [None]:
# Build a two-column frame: one row per store cluster with its total transactions.
# Cluster labels are stored as strings ('1' .. '17').
_cluster_totals = (
    df_cluster_1_sum, df_cluster_2_sum, df_cluster_3_sum, df_cluster_4_sum,
    df_cluster_5_sum, df_cluster_6_sum, df_cluster_7_sum, df_cluster_8_sum,
    df_cluster_9_sum, df_cluster_10_sum, df_cluster_11_sum, df_cluster_12_sum,
    df_cluster_13_sum, df_cluster_14_sum, df_cluster_15_sum, df_cluster_16_sum,
    df_cluster_17_sum,
)
df_cluster_data = [
    (str(number), total) for number, total in enumerate(_cluster_totals, start=1)
]
df_cluster_trans = pd.DataFrame(df_cluster_data, columns=['store_cluster', 'transactions'])

In [None]:
# Show the total transactions per store cluster.
df_cluster_trans

In [None]:
# Inspect the stores that belong to cluster 10.
df_cluster_10

In [None]:
# Summary statistics of the per-cluster transaction totals.
df_cluster_trans.describe()

In [None]:
# Bar chart of total transactions per store cluster.
# x='store_cluster' labels each bar with its cluster number; without it the bars
# are labelled with the 0-based row index (0-16), which is off by one from the
# cluster numbers (1-17).
df_cluster_trans.plot(kind='bar', x='store_cluster', y='transactions', figsize=(15,5))

# Dates of Transactions

One method:

mask = (df_transactions['date'] >= '2013-01-01') & (df_transactions['date'] <= '2013-12-31')

df_transactions.loc[mask]

Another way:

df_transactions = df_transactions.set_index('date')

df_transactions['2013-01-01':'2013-12-31']


In [None]:
# Index the transactions by 'date' so that rows can be selected by date range.
# NOTE: this rebinds df_transactions. Cells that read the 'date' column (for
# example the transactions line plot) no longer work if re-run after this cell,
# and running this cell twice fails because 'date' is then no longer a column.
df_transactions = df_transactions.set_index('date')

In [None]:
# Slice the date-indexed transactions into one frame per calendar year,
# from 1 January through 31 December of each year 2013-2017.
df_trans_2013, df_trans_2014, df_trans_2015, df_trans_2016, df_trans_2017 = (
    df_transactions[f'{year}-01-01':f'{year}-12-31']
    for year in range(2013, 2018)
)

In [None]:
# Total transactions for each year, in the order 2013 through 2017.
(
    df_trans_2013_sum, df_trans_2014_sum, df_trans_2015_sum,
    df_trans_2016_sum, df_trans_2017_sum,
) = (
    year_frame['transactions'].sum()
    for year_frame in (df_trans_2013, df_trans_2014, df_trans_2015, df_trans_2016, df_trans_2017)
)

In [None]:
# Build a two-column frame: one row per year with its total transactions.
# Year labels are stored as strings ('2013' .. '2017').
_yearly_totals = (
    df_trans_2013_sum, df_trans_2014_sum, df_trans_2015_sum,
    df_trans_2016_sum, df_trans_2017_sum,
)
df_year_data = [
    (str(year), total) for year, total in zip(range(2013, 2018), _yearly_totals)
]
df_trans_yearly = pd.DataFrame(df_year_data, columns=['year', 'transactions'])

In [None]:
# Show the total transactions per year.
df_trans_yearly

In [None]:
# Make 'year' the index so that it labels the bars when the frame is plotted.
# NOTE: running this cell a second time fails, because 'year' is then no longer a column.
df_trans_yearly = df_trans_yearly.set_index('year')

In [None]:
# Bar chart of total transactions, one bar per row of df_trans_yearly.
df_trans_yearly.plot.bar(y='transactions', figsize=(15, 5))

# Some Insights

* By analyzing the Store Type we see that although Type D appears to be most successful in terms of having the most total transactions, the reason for that is that there are more stores of Type D than any other Type.  Type A has half as many stores as Type D, but almost as many total transactions.  On closer inspection, Type A stores have a mean of 4.801719e+06 transactions, which is much higher than Type D at a mean of 2.568338e+06 transactions.

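* Total transactions for 2017 are significantly lower than in previous years. Before reading this as a real decline, check whether the data covers all of 2017: a partial year would produce the same picture.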


# To Do:
* Figure out how to work with the Dates of the Transactions
* Explore the relationships between Transactions, Store Types and Store Clusters
* Discover which Holidays and Events drive more Transactions
* Integrate the Oil data
* Build a ML model to predict transactions in the future