In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_paths = (
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
)
for input_path in input_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
# Load the Sample Superstore CSV from the attached Kaggle input dataset into a DataFrame.
data=pd.read_csv('../input/sample-supermarket-dataset/SampleSuperstore.csv')

# Introduction
* The Sample Superstore dataset contains order data covering the type of product sold, where it was sold, and how it was shipped.
* First, let's look at the data to understand its column types and the information it contains.
* Then we will make some visualizations to compare the columns, and use interactive widgets to look at different parameters. Using seaborn, we can visualize the distribution of the profits and losses incurred by different sub-categories and how they compare to each other. There are many different column and category combinations we can try in order to determine areas of loss and profit.

In [None]:
# Print a summary of the frame (columns, dtypes, non-null counts) before any cleaning.
data.info()

In [None]:
# lets check if there are any duplicates in the data, if there are, we'll remove them.
# drop_duplicates() keeps the first copy of every fully identical row. It compares
# row values, so unlike dropping by index label it stays correct even if the
# index labels are not unique.
data=data.drop_duplicates()
data

* Assuming that no two orders can be exactly the same, we drop the duplicates. It is entirely possible for two genuine orders to be identical, but we cannot tell them apart because order time and order_id are not provided. In our case, let's assume no two orders are the same and continue.

In [None]:
# Count how many rows fall under each country.
data.Country.value_counts()

In [None]:
# display only the top 15 cities based on how often they appear in the data
data.City.value_counts().head(15)

In [None]:
# nunique() counts the distinct city names directly (missing values are not counted).
print(f'The total number of unique city locations in the dataset is {data.City.nunique()}')

In [None]:
# display only the top 15 states based on how often they appear in the data
data.State.value_counts().head(15)

In [None]:
# nunique() counts the distinct state names directly (missing values are not counted).
print(f'The total number of unique state locations in the dataset is {data.State.nunique()}')

* We can use the crosstab function to group data by different categories and apply summary statistics to the grouped data. The variables used in the contingency table are the categorical variables, and we can use sum or mean (or standard deviation) to summarise a numerical column such as 'Quantity' or 'Profit' within each group.

In [None]:
from ipywidgets import widgets,interact
# Candidate grouping columns: every text (object-dtype) column in the dataset.
categorical_columns=list(data.describe(include='object').columns)

# One dropdown per crosstab argument: the two grouping columns, the numeric
# column to summarise, and the summary statistic to apply.
drop_down_var1=widgets.Dropdown(options=categorical_columns,value='City')
drop_down_var2=widgets.Dropdown(options=categorical_columns,value='Sub-Category')
drop_down_values=widgets.Dropdown(options=['Profit','Discount','Sales','Quantity'],value='Quantity')
drop_down_agg=widgets.Dropdown(options=['sum','mean','std'])


def crosstab(var1,var2,values,agg):
    """Cross-tabulate two columns of ``data`` and summarise a third.

    var1   -- column whose values become the table rows
    var2   -- column whose values become the table columns
    values -- numeric column that is aggregated inside each (row, column) cell
    agg    -- aggregation applied to ``values`` (e.g. 'sum', 'mean', 'std')

    Returns the resulting table, including row/column margins (margins=True).
    """
    table=pd.crosstab(
        index=data[var1],
        columns=data[var2],
        values=data[values],
        aggfunc=agg,
        margins=True,
    )
    return table

interact(crosstab,var1=drop_down_var1,var2=drop_down_var2,values=drop_down_values,agg=drop_down_agg)

* One use of the crosstab is the 'Sub-Category' and 'City' combination. We can determine which city ordered which product and what the mean or total profit from that city was. This will help us determine which cities to target to increase profits, or where to reduce losses by discontinuing the sale of a product in that city.
* We can also determine which city ordered what type of product. A NaN indicates that the city did not order that product.


# Visualizations

In [None]:
# lets look at the value_counts() of each column present and get an overall idea
# One (column, bar colour, Axes method) entry per subplot, in row-major order.
panels=[
    ('Ship Mode','#ea4335','bar'),
    ('Segment','#4285f4','bar'),
    ('Region','#fbbc05','bar'),
    ('Category','#34a853','bar'),
    ('Sub-Category','#5F68C3','barh'),   # many labels, so horizontal bars
    ('Quantity','#747474','bar'),
]

fig,ax=plt.subplots(2,3,figsize=(16,8))
for axis,(column,color,kind) in zip(ax.flat,panels):
    counts=data[column].value_counts()   # computed once per panel
    getattr(axis,kind)(counts.index,counts.values,color=color)
    # Plain-text title: wrapping it in $...$ would send it through mathtext,
    # which drops the space in "Ship Mode" and turns the hyphen in
    # "Sub-Category" into a minus sign.
    axis.set_title(column,fontweight='bold',fontstyle='italic')

# The ship-mode labels are long; tilt them slightly so they do not overlap.
plt.sca(ax[0][0])
plt.xticks(rotation=10)

plt.show()

If you are wondering why I used specific colors for individual plots... well, I was bored and wanted to try something different. Interestingly enough, the colors of the first 4 subplots from the left are official colors of Google, the 5th one is from TCS and the last one is one of the colors from the Microsoft logo.
* I got them from here https://www.schemecolor.com/tata-consultancy-services-tcs-logo-color.php, it has the color codes for the other companies too!
* From the count plots, we can see the distribution of the different categories and how they compare among themselves.
* Let's look at the different locations present in our dataset.

In [None]:
# lets compare each sub-categories Sales, Profits, Quantity and discount per order by plotting bar graphs
fig,ax=plt.subplots(2,2,figsize=(18,8),sharex=True)

# Group once and aggregate only the numeric columns that are plotted.
# Aggregating the whole frame would also try to average the text columns,
# which raises a TypeError on pandas >= 2.0.
by_sub_category=data.groupby(by='Sub-Category')
totals=by_sub_category[['Profit','Quantity','Sales']].sum()
mean_discount=by_sub_category['Discount'].mean()
labels=totals.index

sns.barplot(x=labels,y=totals.Profit,order=labels,ax=ax[0][0])
sns.barplot(x=labels,y=totals.Quantity,order=labels,ax=ax[1][0])
sns.barplot(x=labels,y=totals.Sales,order=labels,ax=ax[0][1])
sns.barplot(x=labels,y=mean_discount,order=labels,ax=ax[1][1])

# Discount is a per-row mean, not a total, so label it accordingly.
plt.sca(ax[1][1])
plt.ylabel('Average discount per order')
plt.xticks(rotation=60)
# The x axis is shared, so only the bottom row needs its tick labels rotated.
plt.sca(ax[1][0])
plt.xticks(rotation=60)
plt.show()

* Above we can compare the different sub-categories with each other. We can determine which sub-category is helping the superstore and which is causing a loss.
* Let's look at the categories of products sold by region. We group the dataset by Region and Category and aggregate in one of two ways: 'sum' gives total statistics for each region and category, while 'mean' gives 'per order' statistics. Using the visualizations below, we can compare the types of products sold by region and their profits or sales.
* Using the ipywidgets module, we can look at the visualizations one at a time and note down important takeaways or observations.
* The main comparison here is by the region where the sale occurred (or where the shipment is going), so we compare performance by region, whereas in the previous visualizations we compared the sub-categories.

In [None]:
# Dropdowns for the numeric column to plot and the aggregation to apply to it.
drop_down=widgets.Dropdown(options=['Sales','Discount','Quantity','Profit'],value='Profit',disabled=False)
drop_down1=widgets.Dropdown(options=['sum','mean'],value='sum',disabled=False)

def data_new(agg_parameter,variable):
    """Bar chart of one numeric column aggregated per Region and Category.

    agg_parameter -- aggregation passed to ``agg`` (the dropdown offers 'sum' and 'mean')
    variable      -- numeric column of ``data`` to aggregate and plot on the y axis

    Draws on the current matplotlib figure; returns nothing.
    """
    # Aggregate only the requested column: aggregating the whole frame with
    # 'mean' would also hit the text columns and raise a TypeError on pandas >= 2.0.
    # reset_index() turns the (Region, Category) index levels into ordinary columns.
    new_data=(
        data.groupby(by=['Region','Category'])[variable]
        .agg(agg_parameter)
        .reset_index()
    )

    graph=sns.barplot(data=new_data,y=variable,x='Region',hue='Category',palette='viridis')
    # Place the legend at the top-right corner of the axes.
    graph.legend(title='Category',bbox_to_anchor=(1,1.02))


interact(data_new,agg_parameter=drop_down1,variable=drop_down)
# sum and Discount gives cumulative discount of all the items ordered (Category like Furniture etc)
# mean gives 'per order' stat as we aggregated by 'mean'

* Let's look at the distribution of sales data across the sub-categories, using the interactive widget to select the category.

In [None]:
from ipywidgets import widgets,interact


def _sub_categories(category):
    """Return the distinct sub-categories that appear under ``category`` in ``data``."""
    in_category=data[data.Category==category]
    return list(in_category['Sub-Category'].unique())


Furniture=_sub_categories('Furniture')
Office_Supplies=_sub_categories('Office Supplies')
Technology=_sub_categories('Technology')

# The dropdown shows a category name; the value handed to the callback is that
# category's list of sub-categories.
drop_down=widgets.Dropdown(
    description='Sub Category',
    options=[
        ('furniture',Furniture),
        ('office supplies',Office_Supplies),
        ('technology',Technology),
    ],
)

# creating a dictionary of the sub-categories and their maximum sales values
max_sales={
    name:data[data['Sub-Category']==name].Sales.max()
    for name in data['Sub-Category'].unique()
}

def graph(sub_category):
    """Plot the Sales density of each given sub-category and mark its maximum.

    sub_category -- list of sub-category names (one category's worth, as
                    supplied by the dropdown)
    """
    fig,ax=plt.subplots(1,1,figsize=(16,8))
    selected=data[data['Sub-Category'].isin(sub_category)]
    sns.kdeplot(data=selected,x='Sales',hue='Sub-Category',ax=ax)

    # Stack the annotation labels vertically so they do not overlap; each
    # arrow points at that sub-category's maximum Sales value on the x axis.
    label_height=0.0002
    for name in sub_category:
        peak=max_sales[name]
        ax.annotate(f'max {name} Sales ({peak})',xy=(peak,0),xytext=(5000,label_height),arrowprops=dict(shrink=0.05))
        label_height+=0.0001

interact(graph,sub_category=drop_down)

* The distribution shows the sales on a per order basis, i.e. the maximum sales value for each sub-category shows the maximum sales out of all the orders.

In [None]:
# One dropdown entry per sub-category present in the data.
drop_down=widgets.Dropdown(options=list(data['Sub-Category'].unique()))

def graph(sub_category):
    """Histogram of profit per unit (Profit / Quantity) for one sub-category.

    sub_category -- name of the sub-category whose rows are plotted
    """
    rows=data[data['Sub-Category']==sub_category]
    unit_profit=rows.Profit/rows.Quantity
    sns.histplot(unit_profit)

interact(graph,sub_category=drop_down)

* Dividing the profit of each order by the quantity of the product ordered gives us the unit profit made from the product. Here we can see that the orders within a sub-category, for example chairs, don't all have the same unit profit, i.e. each chair order has a different profit per unit, suggesting that there are probably different types of chairs. Hence the superstore should try to push sales of the types within a sub-category (chairs in our example) that give high profits per unit.
* From this distribution, we can also see which sub-category is profitable.

* Now let's look at the distribution of profit and loss among the sub-categories.

In [None]:
#dividing the data into those orders that resulted in profits and those which resulted in a loss
# (rows with exactly zero profit belong to neither group)
profit=data[data.Profit>0]
loss=data[data.Profit<0]

def _share_pie(totals,title):
    """Draw one pie chart of per-sub-category totals with percentage labels.

    totals -- Series of non-negative totals indexed by sub-category name
    title  -- text shown above the pie
    """
    plt.pie(totals,radius=3.12,labels=totals.index,autopct='%1.2f%%')
    plt.title(title,fontdict=dict(fontsize=36),pad=100,loc='center')
    plt.show()

# Aggregate only the Profit column, once per pie; summing the whole frame
# would also concatenate every text column for no benefit.

# percentage share of total profit by each sub-category
_share_pie(profit.groupby('Sub-Category')['Profit'].sum(),'Profit pie')

# percentage share of total loss by each sub-category
# (loss totals are negative, so plot their absolute values)
_share_pie(np.abs(loss.groupby('Sub-Category')['Profit'].sum()),'Loss pie')

* From the pie charts, we can immediately see which sub-categories take the largest share of the total profit and loss. From the Profit pie, we can see that 'Binders' account for 15.54% of the total profits, with 'Copiers' having the second-best share at about 12.5%.
* The Loss pie shows us that binders not only account for the largest share of the profits, but also account for almost 25% of the total loss. 'Tables' also cause quite a bit of damage, as they account for 20% of the total loss.
* We can see that not all sub-categories are included in the Loss pie, as some only make profits and don't contribute to the loss. For example, 'Copiers' only make profits, i.e. no loss is incurred by the sale of copiers.