In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file under the Kaggle input directory,
# in os.walk order, and print one path per line.
input_paths = (
    os.path.join(folder, file_name)
    for folder, _subfolders, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_path in input_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
#install the version of seaborn this needs to work
!pip install seaborn==0.11.0

In [None]:
#get some packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
import datetime

#import pkg_resources
#pkg_resources.require("seaborn==0.11.0")
import seaborn as sns

In [None]:
# Read the sales CSV from the Kaggle input directory and preview the first rows
sales_csv_path = '/kaggle/input/retaildataset/sales data-set.csv'
df = pd.read_csv(sales_csv_path)
df.head()

In [None]:
# Create derived fields. The dataset only has sales, so returns are dummied up:
# each row gets a random Return_Rate drawn uniformly from [0, 1), and
# Weekly_Returns is that fraction of Weekly_Sales.
# A fixed seed keeps every downstream number identical across "Restart & Run All".
RETURN_RATE_SEED = 42
rng = np.random.default_rng(RETURN_RATE_SEED)

# Size the draw from the frame itself (not a hard-coded row count) and skip the
# insert when the column is already there, so re-running this cell does not raise.
if "Return_Rate" not in df.columns:
    df.insert(loc=2, column="Return_Rate", value=rng.random(len(df)))

df['Weekly_Returns'] = df['Weekly_Sales'] * df['Return_Rate']
df.head()

In [None]:
# Select the rows whose Return_Rate is at most 0.25, then list the dtype of each field.
# .copy() makes df_subset an independent frame: later cells assign columns on it,
# and doing that on a filtered slice of df triggers pandas' SettingWithCopyWarning.
df_subset = df.loc[df['Return_Rate'] <= .25].copy()
df_subset.info()

In [None]:
# Convert the Store column to strings.
# assign() returns a new frame instead of writing into the filtered slice,
# which avoids pandas' SettingWithCopyWarning.
df_subset = df_subset.assign(Store=df_subset['Store'].astype(str))
df_subset.info()

In [None]:
# Order the rows by Store, then Dept, then Date (all ascending) and show the first few
sort_keys = ['Store', 'Dept', 'Date']
df_subset.sort_values(by=sort_keys, ascending=True).head()


In [None]:
# Round Weekly_Returns to two decimal places.
# assign() returns a new frame instead of writing into the filtered slice,
# which avoids pandas' SettingWithCopyWarning.
df_subset = df_subset.assign(Weekly_Returns=df_subset['Weekly_Returns'].round(2))
df_subset.head()

In [None]:
# Copy just the sales and returns columns into a new table
sales_and_returns_columns = ['Weekly_Sales', 'Weekly_Returns']
df_sliced = df_subset.filter(items=sales_and_returns_columns)
df_sliced.head()

In [None]:
# Build a copy of the full frame without the IsHoliday and Dept columns
unwanted_columns = ['IsHoliday', 'Dept']
df_dropped = df.drop(unwanted_columns, axis='columns')
df_dropped.head()

In [None]:
# Frequency distribution: count the non-null Date values for every (Store, Dept)
# pair, then show the ten pairs with the highest counts
store_dept_groups = df_subset.groupby(['Store', 'Dept'])
week_counts = store_dept_groups.agg(Num_Weeks=('Date', 'count'))
df_freq = pd.DataFrame(week_counts.reset_index())

df_freq.sort_values(by='Num_Weeks', ascending=False).head(10)

In [None]:
#univariate
def univariate(pct_desc, pct, field):
    """Print one labelled quantile of ``field`` as ``"<pct_desc>: <value>"``.

    Parameters
    ----------
    pct_desc : str
        Label printed in front of the value (e.g. ``'Max'``).
    pct : float
        Quantile to compute, passed straight to ``field.quantile``.
    field : object with a ``quantile`` method
        The values to summarise; every call in this notebook passes a
        DataFrame column.

    Returns
    -------
    None
        The line is printed, not returned.
    """
    label = pct_desc + ": "
    # The quantile result is turned into text with its own astype(str)
    quantile_text = field.quantile(pct).astype(str)
    print(label + quantile_text)

# Quantiles of Weekly_Sales to report, listed from the maximum down to the minimum
weekly_sales_quantiles = [
    ('Max', 1),
    ('99th Percentile', .99),
    ('95th Percentile', .95),
    ('90th Percentile', .90),
    ('75th Percentile', .75),
    ('50th Percentile', .50),
    ('25th Percentile', .25),
    ('10th Percentile', .10),
    ('5th Percentile', .05),
    ('Min', .00),
]
for quantile_label, quantile_fraction in weekly_sales_quantiles:
    univariate(quantile_label, quantile_fraction, df_subset['Weekly_Sales'])

In [None]:
# Histogram of Weekly_Sales.
# The style is set BEFORE the figure is created so it applies to the new axes,
# and its return value (None) is no longer assigned over the Axes handle.
sns.set_style('darkgrid')
f, ax = plt.subplots(figsize=(18, 7))
# Draw on the axes created above rather than relying on the implicit current axes.
# NOTE(review): distplot is deprecated in favour of histplot/displot; it is kept
# here because the notebook pins seaborn 0.11.0, where it is still available.
ax = sns.distplot(df_subset['Weekly_Sales'], ax=ax)

In [None]:
# Box plot of a single variable: Weekly_Sales drawn along the y axis
sns.boxplot(data=df_subset, y="Weekly_Sales")

In [None]:
# Remove sales outliers (rows above the 99th percentile of Weekly_Sales) and
# scatter Weekly_Returns against Weekly_Sales for what is left.
# .copy() makes df_no_outliers an independent frame: a later cell adds a column
# to it, and doing that on a filtered slice triggers pandas' SettingWithCopyWarning.
sales_p99 = df_subset['Weekly_Sales'].quantile(.99)
df_no_outliers = df_subset.loc[df_subset['Weekly_Sales'] <= sales_p99].copy()
sns.scatterplot(data=df_no_outliers, x="Weekly_Sales", y="Weekly_Returns")

In [None]:
#combine histograms and scatterplot and display the R2
def r2(x, y):
    """Return the squared Pearson correlation coefficient of ``x`` and ``y``.

    Both arguments are handed unchanged to ``scipy.stats.pearsonr``; only the
    correlation coefficient is used and the p-value is discarded.
    """
    pearson_r, _p_value = stats.pearsonr(x, y)
    return pearson_r ** 2

# Joint plot of returns against sales with a regression fit (kind="reg").
# x and y are passed by keyword: seaborn 0.11 deprecates passing them
# positionally and later releases reject it.
sns.jointplot(x=df_no_outliers['Weekly_Sales'],
              y=df_no_outliers['Weekly_Returns'],
              kind="reg")

# r2() returns the SQUARED Pearson coefficient, so the label says R-squared
# (the previous text called this value "Pearson R").
r_squared = round(r2(df_no_outliers['Weekly_Sales'],
                     df_no_outliers['Weekly_Returns']), 2)
print("The Pearson R-squared is: " + str(r_squared))

In [None]:
# Derive the year (as a string) from the Date field.
# assign() returns a new frame instead of writing into the filtered slice,
# which avoids pandas' SettingWithCopyWarning.
# NOTE(review): Date is parsed with pandas' default format inference - confirm
# that this matches how the dates are written in the CSV.
df_no_outliers = df_no_outliers.assign(
    Year=pd.DatetimeIndex(df_no_outliers['Date']).year.astype(str)
)
df_no_outliers.head()

In [None]:
# Print the smallest and then the largest value found in the Year column
year_values = df_no_outliers['Year']
for pick_extreme in (min, max):
    print(pick_extreme(year_values))

In [None]:
# Yearly totals of sales and returns, plus log2 and millions-of-dollars versions of each

# Show floats 20 characters wide with thousands separators and two decimals,
# and never truncate the text inside a column
pd.set_option('display.float_format', '{:20,.2f}'.format)
pd.set_option('display.max_colwidth', None)

# Sum Weekly_Sales and Weekly_Returns within each Year
yearly_groups = df_no_outliers.groupby(['Year'])
yearly_totals = yearly_groups.agg(
    Total_Sales=('Weekly_Sales', 'sum'),
    Total_Returns=('Weekly_Returns', 'sum'),
)
df_agg = pd.DataFrame(yearly_totals.reset_index())

# Derived columns (Returns before Sales, which fixes the column order):
# the base-2 log of each total, truncated to an integer ...
for measure in ('Returns', 'Sales'):
    df_agg['LOG ' + measure] = np.log2(df_agg['Total_' + measure]).astype(int)
# ... and each total expressed in whole millions
for measure in ('Returns', 'Sales'):
    df_agg[measure + ' in M$'] = (df_agg['Total_' + measure] / 1000000).astype(int)

# Print the largest yearly totals (a guide for the xlim used when charting)
for total_column in ('Total_Returns', 'Total_Sales'):
    print(max(df_agg[total_column]))

df_agg


In [None]:
# Horizontal bar chart per year: total returns drawn over total sales (in millions)

sns.set_theme(style="whitegrid")

# Initialize the matplotlib figure
f, ax = plt.subplots(figsize=(6, 4))

# Plot the total sales (drawn explicitly on ax rather than the implicit current axes)
sns.set_color_codes("pastel")
sns.barplot(x="Sales in M$", y="Year", data=df_agg,
            label="Total Sales", color="b", ax=ax)

# Plot the total returns on the same axes so they overlay the sales bars
sns.set_color_codes("muted")
sns.barplot(x="Returns in M$", y="Year", data=df_agg,
            label="Total Returns", color="b", ax=ax)

# Size the x axis from the plotted values (plus 5% headroom) instead of a
# hard-coded limit, so the bars are never clipped when the totals change.
x_upper = max(df_agg["Sales in M$"].max(), df_agg["Returns in M$"].max()) * 1.05

# Add a legend and informative axis label
ax.legend(ncol=2, loc="lower right", frameon=True)
ax.set(xlim=(0, x_upper), ylabel="",
       xlabel="Returns vs. Sales (In Millions)")
sns.despine(left=True, bottom=True)