In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import os
# Print the full path of every file found under the Kaggle input directory.
for folder, _, names in os.walk('/kaggle/input'):
    for path in (os.path.join(folder, name) for name in names):
        print(path)

### Welcome to the Superstore Sales Dataset analysis

# Data importing, cleaning, preparation

#### Importing the data

In [None]:
# Load the Superstore training data and preview its first five rows.
train_csv_path = '../input/sales-forecasting/train.csv'
df = pd.read_csv(train_csv_path)
df.head()

# Reviewing the dataset using "describe"

In [None]:
# Summarize every column of the dataset. include='all' asks describe() to cover
# the non-numeric columns as well, so the output also reports how many unique
# values those columns hold -- interesting starting points for further exploration.

df.describe(include='all')

# Checking for NaN values and filling in the missing values

In [None]:
# Count the missing (NaN) entries in each column and print the tally.

missing_per_column = df.isna().sum()
print(missing_per_column)

In [None]:
# Some postal codes are missing and need to be filled in.
# First, look at which rows have NaN in the "Postal Code" column.

postal_code_missing = df['Postal Code'].isna()
df[postal_code_missing]

In [None]:
# Every row with a missing postal code belongs to one and the same city,
# so all the gaps can be filled with that city's postal code.
# Burlington, Vermont has postal code 05401; it is written here as the number
# 5401 because a leading zero cannot be kept in a numeric value.
BURLINGTON_VT_POSTAL_CODE = 5401
df['Postal Code'] = df['Postal Code'].fillna(BURLINGTON_VT_POSTAL_CODE)

In [None]:
# Re-count the NaN entries per column to confirm that nothing is missing any more.
remaining_missing = df.isna().sum()
print(remaining_missing)

# Analysis

# Sales by sub-category and plotting

In [None]:
# Total revenue per sub-category, largest first, rounded to 1 decimal place.
# "Sales" is selected *before* aggregating so that sum() only touches that one
# column instead of trying to add up every other column of df (including
# non-numeric ones such as Category or Region).
dfsub = (
    df.groupby("Sub-Category")[["Sales"]]
    .sum()
    .sort_values("Sales", ascending=False)
    .head(20)  # keep at most the 20 best-selling sub-categories
    .round(1)
    .reset_index()  # turn the Sub-Category index back into a regular column
)
dfsub

In [None]:
# Bar chart of the revenue per sub-category, with the exact amount written on each bar.

fig, ax = plt.subplots(figsize=(15, 7))  # figure width and height in inches
ax.set_title("Sub-categories that generated the highest revenue", fontsize=18)
ax.bar(dfsub["Sub-Category"], dfsub["Sales"], color='#227d3d', edgecolor='yellow', linewidth=1)
ax.set_xlabel("Sub-Category", fontsize=15)  # x axis lists the sub-categories
ax.set_ylabel("Sales", fontsize=15)  # y axis shows the revenue
ax.tick_params(axis='x', labelsize=12, labelrotation=90)
ax.tick_params(axis='y', labelsize=12)
# dfsub has a 0..n-1 index, which matches the x positions of the categorical bars.
for bar_pos, revenue in dfsub["Sales"].items():
    # Tall bars carry the label inside the bar, shorter bars carry it just above.
    y_offset = -120000 if revenue > 300000 else 15000
    ax.text(bar_pos, revenue + y_offset, '$' + str(revenue),
            fontsize=12, rotation=90, color='k', horizontalalignment='center')

# Pivot tables and plotting

In [None]:
# Keep only the columns that the pivot tables below work with.
plot_columns = ["Category", "Sub-Category", "Region", "Sales"]
dfplot = df[plot_columns]
dfplot

In [None]:
# Total Sales per Category (rows) in each of the 4 regions (columns).
dfplot.pivot_table(values='Sales', index='Category', columns='Region', aggfunc='sum')

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Let's plot some graphs for better understanding and visualization.
# countplot draws the number of rows per Category, split by Region (hue).
# Some ready-made palettes to use: "magma", "mako", "rocket", "crest", "viridis"
sns.countplot(data = df , x = 'Category' , hue = 'Region'  , palette = 'viridis')

In [None]:
# Number of sales records per Sub-Category (rows) in each of the 4 regions (columns).
# aggfunc='count' counts the Sales entries; it does not add up their amounts.
dfpiv = dfplot.pivot_table(values='Sales', index='Sub-Category', columns='Region', aggfunc='count')

In [None]:
# Preview the first rows of the Sub-Category x Region pivot (head() shows 5 rows by default).
dfpiv.head()

# Sales by state and plotting

In [None]:
# Total revenue per state, listed in descending order and rounded to 1 decimal place.
# "Sales" is selected *before* aggregating so that sum() only touches that one
# column instead of trying to add up every other column of df (including
# non-numeric ones such as Category or Region).

dfstate = (
    df.groupby("State")[["Sales"]]
    .sum()
    .sort_values("Sales", ascending=False)
    .head(20)  # keep only the 20 states with the highest revenue
    .round(1)
    .reset_index()  # turn the State index back into a regular column
)
dfstate

In [None]:
# Bar chart of the revenue per state, with the exact amount written on each bar.

fig, ax = plt.subplots(figsize=(15, 7))  # figure width and height in inches
ax.set_title("States which generated the highest revenue", fontsize=18)
ax.bar(dfstate["State"], dfstate["Sales"], color='#227d3d', edgecolor='yellow', linewidth=1)
ax.set_xlabel("States", fontsize=15)  # x axis lists the states
ax.set_ylabel("Revenue", fontsize=15)  # y axis shows the revenue
ax.tick_params(axis='x', labelsize=12, labelrotation=90)
ax.tick_params(axis='y', labelsize=12)
# dfstate has a 0..n-1 index, which matches the x positions of the categorical bars.
for bar_pos, revenue in dfstate["Sales"].items():
    # Tall bars carry the label inside the bar, shorter bars carry it just above.
    y_offset = -120000 if revenue > 400000 else 15000
    ax.text(bar_pos, revenue + y_offset, '$' + str(revenue),
            fontsize=12, rotation=90, color='k', horizontalalignment='center')

#### Sales by Category and plotting (Donut chart with hovering)

In [None]:
# Total revenue per category, listed in descending order and rounded to 1 decimal place.
# "Sales" is selected *before* aggregating so that sum() only touches that one
# column instead of trying to add up every other column of df.

dfcat = (
    df.groupby("Category")[["Sales"]]
    .sum()
    .sort_values("Sales", ascending=False)
    .head()  # head() without an argument keeps at most the first 5 rows
    .round(1)
    .reset_index()  # turn the Category index back into a regular column
)
dfcat

In [None]:
import plotly.express as px
# Donut chart (hole=.3) of each category's share of the total revenue.
# names='Category' is what labels the slices in the legend and on hover, so no
# extra hover_data entry is needed for it. `labels` is left out: px expects a
# dict there (column name -> display name), and no column is being renamed.
fig = px.pie(dfcat, values='Sales', names='Category', title='Sales by Category', hole=.3)
fig.show()

In [None]:
#Very cool chart below from https://www.kaggle.com/rohitsahoo/eda-superstore-dataset?scriptVersionId=42568767&cellId=36 


# Ten best-selling (Category, Sub-Category) pairs, as a flat table with the
# columns Category, Sub-Category and Sales.
# "Sales" is selected before aggregating so that sum() only touches that column.
Top_subcat = (
    df.groupby(['Category', 'Sub-Category'])[['Sales']]
    .sum()
    .sort_values("Sales", ascending=False)
    .head(10)
    .astype(int)  # whole dollars are enough for the chart labels
    # Group the rows by category; the stable sort keeps the descending-Sales
    # order inside each category.
    .sort_values("Category", kind="stable")
    .reset_index()  # move Category / Sub-Category from the index into columns
)
# Total Sales per category, computed over those ten sub-categories only.
Top_subcat_1 = Top_subcat.groupby('Category', as_index=False)['Sales'].sum()

def autopct_format(values):
    """Build a callback that labels pie wedges with dollar amounts.

    The returned function takes a wedge's share of the whole as a percentage
    and converts it back into the absolute amount that share stands for.

    Parameters
    ----------
    values : iterable of numbers
        The values the pie is drawn from. They are summed once, when the
        formatter is created, so one-shot iterables (e.g. generators) work too.

    Returns
    -------
    callable
        ``my_format(pct) -> str``: the amount, rounded to the nearest integer
        and rendered as ``' $<amount>'`` (note the leading space).
    """
    total = sum(values)  # identical for every wedge, so compute it only once

    def my_format(pct):
        val = int(round(pct * total / 100.0))
        return ' ${v:d}'.format(v=val)

    return my_format


plt.rcParams["figure.figsize"] = (15,10) # default figure width and height, in inches
fig, ax = plt.subplots()
ax.axis('equal') # equal aspect ratio, so the pie is drawn as a circle
width = 0.1 # how much smaller the inner ring's radius is than the outer one's
outer_colors = ['#FE840E','#009B77','#BC243C'] # colours of the outer (category) ring
inner_colors = [
    'Orangered', 'tomato', 'coral',
    "darkturquoise", "mediumturquoise", "paleturquoise",
    "lightpink", "pink", "hotpink", "deeppink",
] # colours of the inner (sub-category) ring

# Outer ring: total Sales per category.
pie = ax.pie(
    Top_subcat_1['Sales'],
    radius=1,
    labels=Top_subcat_1['Category'],
    colors=outer_colors,
    wedgeprops=dict(edgecolor='w'),
)
# Inner ring: Sales per sub-category, each wedge annotated with its dollar amount.
pie2 = ax.pie(
    Top_subcat['Sales'],
    radius=1-width,
    labels=Top_subcat['Sub-Category'],
    autopct=autopct_format(Top_subcat['Sales']),
    labeldistance=0.7,
    colors=inner_colors,
    wedgeprops=dict(edgecolor='w'),
    pctdistance=0.53,
    rotatelabels=True,
)
# With autopct set, ax.pie returns (wedges, labels, fractions); rotate the fractions.
for amount_text in pie2[2]:
    amount_text.set_rotation(315)
# A white disc over the centre turns the pie into a donut.
centre_circle = plt.Circle((0,0),0.6,fc='white')
ax.add_artist(centre_circle)

plt.show()