In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

In [2]:
# Read the Nike sales CSV (path is relative to the notebook's working directory),
# parsing "Invoice Date" as datetime, then print the column/dtype summary.
dataset = pd.read_csv(
    "../Dataset/nike-sales-dataset_v2.csv",
    parse_dates=["Invoice Date"],
)
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9360 entries, 0 to 9359
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   Invoice Date    9360 non-null   datetime64[ns]
 1   Product         9360 non-null   object        
 2   Region          9360 non-null   object        
 3   Retailer        9360 non-null   object        
 4   Sales Method    9360 non-null   object        
 5   State           9360 non-null   object        
 6   Price per Unit  9360 non-null   int64         
 7   Total Sales     9360 non-null   int64         
 8   Units Sold      9360 non-null   int64         
dtypes: datetime64[ns](1), int64(3), object(5)
memory usage: 658.3+ KB


In [3]:
# Show dataset: the bare last-line expression makes the notebook render the DataFrame.
dataset

Unnamed: 0,Invoice Date,Product,Region,Retailer,Sales Method,State,Price per Unit,Total Sales,Units Sold
0,2020-01-01,Men's Street Footwear,Northeast,Foot Locker,In-store,New York,50,6000,120
1,2020-02-01,Men's Athletic Footwear,Northeast,Foot Locker,In-store,New York,50,5000,100
2,2020-03-01,Women's Street Footwear,Northeast,Foot Locker,In-store,New York,40,4000,100
3,2020-04-01,Women's Athletic Footwear,Northeast,Foot Locker,In-store,New York,45,3825,85
4,2020-05-01,Men's Apparel,Northeast,Foot Locker,In-store,New York,60,5400,90
...,...,...,...,...,...,...,...,...,...
9355,2021-06-07,Women's Athletic Footwear,West,West Gear,Outlet,Idaho,38,60,16
9356,2021-05-04,Women's Street Footwear,West,West Gear,Outlet,Idaho,19,31,16
9357,2021-05-05,Women's Street Footwear,West,West Gear,Outlet,Idaho,18,33,18
9358,2021-04-06,Women's Street Footwear,West,West Gear,Outlet,Idaho,34,63,19


Step 1 - Get Unique Values

In [4]:
# Print the distinct product names in sorted order
print(f"Product : {np.sort(dataset['Product'].unique())!s}")

Product : ["Men's Apparel" "Men's Athletic Footwear" "Men's Street Footwear"
 "Women's Apparel" "Women's Athletic Footwear" "Women's Street Footwear"]


In [5]:
# NOTE(review): this cell contains only commented-out code (the unsorted variant of
# the product listing printed in the previous cell). Consider deleting the cell.
# dataset["Product"].unique()

In [6]:
# Print the distinct state names in sorted order
print(f"State : {np.sort(dataset['State'].unique())!s}")

State : ['Alabama' 'Arizona' 'Arkansas' 'California' 'Colorado' 'Connecticut'
 'Delaware' 'Florida' 'Georgia' 'Idaho' 'Illinois' 'Indiana' 'Iowa'
 'Kansas' 'Kentucky' 'Louisiana' 'Maine' 'Maryland' 'Massachusetts'
 'Michigan' 'Minnesota' 'Mississippi' 'Missouri' 'Montana' 'Nebraska'
 'Nevada' 'New Hampshire' 'New Jersey' 'New Mexico' 'New York'
 'North Carolina' 'North Dakota' 'Ohio' 'Oklahoma' 'Oregon' 'Pennsylvania'
 'Rhode Island' 'South Carolina' 'South Dakota' 'Tennessee' 'Texas' 'Utah'
 'Vermont' 'Virginia' 'Washington' 'West Virginia' 'Wisconsin' 'Wyoming']


In [7]:
# Print the distinct region names in sorted order
print(f"Region : {np.sort(dataset['Region'].unique())!s}")

Region : ['Midwest' 'Northeast' 'South' 'Southeast' 'West']


In [8]:
# Print the distinct retailer names in sorted order
print(f"Retailer : {np.sort(dataset['Retailer'].unique())!s}")

Retailer : ['Amazon' 'Foot Locker' "Kohl's" 'Sports Direct' 'Walmart' 'West Gear']


In [9]:
# Print the distinct sales-method labels in sorted order
print(f"Sales Method : {np.sort(dataset['Sales Method'].unique())!s}")

Sales Method : ['In-store' 'Online' 'Outlet']


In [10]:
# Total sales per product, largest total first, with a fresh integer index.
df_product = (
    dataset
    .groupby(by=["Product"])["Total Sales"]
    .sum()
    .sort_values(ascending=False)
    .reset_index()
)
df_product

Unnamed: 0,Product,Total Sales
0,Men's Street Footwear,1999192
1,Women's Apparel,1720630
2,Men's Athletic Footwear,1468116
3,Women's Street Footwear,1224756
4,Men's Apparel,1192682
5,Women's Athletic Footwear,1023899


In [11]:
def agg_total_sales(col, data=None):
    """Sum "Total Sales" for each distinct value of ``col``, smallest total first.

    Parameters
    ----------
    col : str
        Name of the column to group by (for example "Product" or "Region").
    data : pandas.DataFrame, optional
        Frame to aggregate; it must contain ``col`` and a "Total Sales" column.
        When omitted, the notebook-level ``dataset`` is used, so existing
        one-argument calls behave exactly as before.

    Returns
    -------
    pandas.DataFrame
        Two columns, ``col`` and "Total Sales", sorted ascending by the summed
        total and re-indexed from 0.

    Raises
    ------
    KeyError
        Raised by pandas when ``col`` or "Total Sales" is not a column of the frame.
    """
    # Fall back to the global frame only when no explicit frame was supplied, so
    # the helper can be reused (and tested) without relying on notebook state.
    source = dataset if data is None else data

    totals = source.groupby(by=[col])["Total Sales"].sum()
    return totals.sort_values(ascending=True).reset_index()

In [12]:
# Total sales per product via the shared helper (sorted ascending by total).
# Note: this rebinds df_product, replacing the descending-sorted frame that the
# earlier inline groupby cell assigned to the same name.
df_product = agg_total_sales("Product")
df_product

Unnamed: 0,Product,Total Sales
0,Women's Athletic Footwear,1023899
1,Men's Apparel,1192682
2,Women's Street Footwear,1224756
3,Men's Athletic Footwear,1468116
4,Women's Apparel,1720630
5,Men's Street Footwear,1999192


In [13]:
# Total sales per retailer via the shared helper (sorted ascending by total).
df_retailer = agg_total_sales("Retailer")
df_retailer

Unnamed: 0,Retailer,Total Sales
0,Amazon,643399
1,Walmart,677702
2,Kohl's,1021226
3,Sports Direct,1789845
4,Foot Locker,2067310
5,West Gear,2429793


In [14]:
# Total sales per region via the shared helper (sorted ascending by total).
df_region = agg_total_sales("Region")
df_region

Unnamed: 0,Region,Total Sales
0,Midwest,1358145
1,South,1446722
2,Southeast,1631786
3,Northeast,1863429
4,West,2329193


In [15]:
# Total sales per sales method via the shared helper (sorted ascending by total).
df_salesmethod = agg_total_sales("Sales Method")
df_salesmethod

Unnamed: 0,Sales Method,Total Sales
0,Online,2467614
1,Outlet,2595011
2,In-store,3566650


In [16]:
# Total sales per state via the shared helper (sorted ascending by total).
df_state = agg_total_sales("State")
df_state

Unnamed: 0,State,Total Sales
0,Nebraska,59301
1,Minnesota,73795
2,Iowa,74249
3,Wisconsin,77286
4,North Dakota,77361
5,Maryland,77591
6,Rhode Island,84483
7,South Dakota,84970
8,Indiana,88371
9,Maine,91974


In [17]:
# Horizontal bar chart of total sales per product.
# text_auto=".4s" labels each bar with its value in SI-prefix notation (4 significant digits).
fig = px.bar(df_product, x="Total Sales", y="Product", text_auto=".4s")
fig.update_traces(
    # Per-bar colours taken from the sequential "algae" palette list.
    marker_color=px.colors.sequential.algae
)
fig.update_layout(
    # Title and axis labels were previously blanked; set them so the figure
    # can be understood on its own when the notebook is skimmed.
    title="Total Sales by Product",
    xaxis_title="Total Sales",
    yaxis_title="Product"
)
fig.show()

In [18]:
# Horizontal bar chart of total sales per retailer.
# text_auto=".4s" labels each bar with its value in SI-prefix notation (4 significant digits).
fig = px.bar(df_retailer, x="Total Sales", y="Retailer", text_auto=".4s")
fig.update_traces(
    # Per-bar colours taken from the sequential "algae" palette list.
    marker_color=px.colors.sequential.algae
)
fig.update_layout(
    # Title and axis labels were previously blanked; set them so the figure
    # can be understood on its own when the notebook is skimmed.
    title="Total Sales by Retailer",
    xaxis_title="Total Sales",
    yaxis_title="Retailer"
)
fig.show()

In [19]:
# Horizontal bar chart of total sales per region.
# text_auto=".4s" labels each bar with its value in SI-prefix notation (4 significant digits).
fig = px.bar(df_region, x="Total Sales", y="Region", text_auto=".4s")
fig.update_traces(
    # Per-bar colours taken from the sequential "algae" palette list.
    marker_color=px.colors.sequential.algae
)
fig.update_layout(
    # Title and axis labels were previously blanked; set them so the figure
    # can be understood on its own when the notebook is skimmed.
    title="Total Sales by Region",
    xaxis_title="Total Sales",
    yaxis_title="Region"
)
fig.show()

In [20]:
# Horizontal bar chart of total sales per sales method.
# text_auto=".4s" labels each bar with its value in SI-prefix notation (4 significant digits).
fig = px.bar(df_salesmethod, x="Total Sales", y="Sales Method", text_auto=".4s")
fig.update_traces(
    # Per-bar colours taken from the sequential "algae" palette list.
    marker_color=px.colors.sequential.algae
)
fig.update_layout(
    # Title and axis labels were previously blanked; set them so the figure
    # can be understood on its own when the notebook is skimmed.
    title="Total Sales by Sales Method",
    xaxis_title="Total Sales",
    yaxis_title="Sales Method"
)
fig.show()