# Supermarket Data Analysis

In this project, we'll be performing some exploratory data analysis on the sample supermarket data.


# Objectives

* Performing EDA on the data to derive insights on how the profits can be increased.
* Discovering the weak areas of the sales department in order to improve sales.
* Discovering hidden trends within the data that will allow the sales department to cater to the region-specific needs of the buyers.


In [None]:
import pandas as pd

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
!pip install pywaffle --quiet
from pywaffle import Waffle

In [None]:
# Relative path to the sample superstore CSV file.
DATA_PATH = "../input/thesparkfoundation/SampleSuperstore.csv"

# Load the CSV into `df`, the DataFrame every following cell works on.
df = pd.read_csv(filepath_or_buffer=DATA_PATH)

In [None]:
# Show the loaded DataFrame as this cell's output.
df

In [None]:
# Report the (rows, columns) dimensions of the dataset.
df.shape

In [None]:
# Per-column flag: True when the column contains at least one missing value.
df.isna().any()

In [None]:
# Mean profit for each distinct 'Quantity' value, highest mean first.
profit_by_quantity = df[["Quantity", "Profit"]].groupby(["Quantity"], as_index=False).mean()
quant = profit_by_quantity.sort_values(by="Profit", ascending=False)

plt.figure(figsize=(10, 8))
sns.barplot(data=quant, x="Quantity", y="Profit")

In [None]:
# Mean profit for each 'Category' value, most profitable first.
profit_by_category = df[["Category", "Profit"]].groupby(["Category"], as_index=False).mean()
cat = profit_by_category.sort_values(by="Profit", ascending=False)

plt.figure(figsize=(10, 8))
sns.barplot(data=cat, x="Category", y="Profit")

In [None]:
# Mean profit for each 'Sub-Category' value, most profitable first.
profit_by_sub_category = (
    df[["Sub-Category", "Profit"]]
    .groupby(["Sub-Category"], as_index=False)
    .mean()
)
sub_cat = profit_by_sub_category.sort_values(by="Profit", ascending=False)

# Wider canvas than the other bar plots: there are many more bars to fit.
plt.figure(figsize=(20, 15))
sns.barplot(data=sub_cat, x="Sub-Category", y="Profit")

In [None]:
# Numeric columns whose distributions are examined with histograms.
numerical = ['Sales','Quantity','Discount','Profit']

In [None]:
# One 25-bin histogram per numeric column, arranged on a 2x2 grid.
numeric_columns = df[numerical]
numeric_columns.hist(bins=25, figsize=(20, 10), layout=(2, 2))

In [None]:

# Categorical columns to profile; the same list drives the box plots further down.
categorical = ['Ship Mode', 'Segment', 'State', 'Region', 'Category', 'Sub-Category']

# One count plot per categorical column, laid out on a 3x2 grid.
fig, ax = plt.subplots(3, 2, figsize=(30, 15))
plt.subplots_adjust(hspace=0.7)  # vertical room for the rotated tick labels

for variable, subplot in zip(categorical, ax.flatten()):
    # Name the column by keyword: seaborn >= 0.12 makes every argument after
    # `data` keyword-only, so a positional Series is no longer read as `x`.
    sns.countplot(x=variable, data=df, ax=subplot)
    for label in subplot.get_xticklabels():
        label.set_rotation(90)

In [None]:
# Full state name -> two-letter postal abbreviation (the codes plotly's
# 'USA-states' location mode expects).
# 'District of Columbia' maps to 'DC'; it was previously 'WA', which merged
# its rows into Washington state in every per-state aggregate.
state_code = {
    'Alabama': 'AL', 'Alaska': 'AK', 'Arizona': 'AZ', 'Arkansas': 'AR',
    'California': 'CA', 'Colorado': 'CO', 'Connecticut': 'CT', 'Delaware': 'DE',
    'Florida': 'FL', 'Georgia': 'GA', 'Hawaii': 'HI', 'Idaho': 'ID',
    'Illinois': 'IL', 'Indiana': 'IN', 'Iowa': 'IA', 'Kansas': 'KS',
    'Kentucky': 'KY', 'Louisiana': 'LA', 'Maine': 'ME', 'Maryland': 'MD',
    'Massachusetts': 'MA', 'Michigan': 'MI', 'Minnesota': 'MN', 'Mississippi': 'MS',
    'Missouri': 'MO', 'Montana': 'MT', 'Nebraska': 'NE', 'Nevada': 'NV',
    'New Hampshire': 'NH', 'New Jersey': 'NJ', 'New Mexico': 'NM', 'New York': 'NY',
    'North Carolina': 'NC', 'North Dakota': 'ND', 'Ohio': 'OH', 'Oklahoma': 'OK',
    'Oregon': 'OR', 'Pennsylvania': 'PA', 'Rhode Island': 'RI', 'South Carolina': 'SC',
    'South Dakota': 'SD', 'Tennessee': 'TN', 'Texas': 'TX', 'Utah': 'UT',
    'Vermont': 'VT', 'Virginia': 'VA', 'District of Columbia': 'DC', 'Washington': 'WA',
    'West Virginia': 'WV', 'Wisconsin': 'WI', 'Wyoming': 'WY',
}

# Direct indexing (not .get) so an unexpected state name fails loudly with a
# KeyError instead of silently producing a missing code.
df['state_code'] = df['State'].apply(lambda name: state_code[name])

In [None]:
# Total sales and total profit per state, indexed by state code.
state = df[['Sales', 'Profit', 'state_code']].groupby(['state_code']).sum()

# US map with each state shaded by its total sales.
fig = go.Figure(data=go.Choropleth(
    locations=state.index,
    z=state.Sales,
    locationmode='USA-states',
    colorscale='Reds',
    colorbar_title='Sales in USD',
))

fig.update_layout(
    title_text='Total State-Wise Sales',
    geo_scope='usa',
    height=800,
)

fig.show()

California has the highest sales, at roughly $450K of goods. New York has the second-highest sales, at roughly $300K. Texas and Washington follow with about $170K and $140K in sales, respectively.

In [None]:
# Correlate the numeric columns only: text columns such as 'State' cannot be
# correlated, and DataFrame.corr() raises on them in pandas >= 2.0.
# Computed once and reused for both the mask and the heatmap.
corr = df.select_dtypes(include='number').corr()

# Mask the upper triangle (diagonal included) so each pair is shown once.
matrix = np.triu(corr)
sns.heatmap(corr, annot=True, mask=matrix)

* There is a strong positive correlation between sales price and profit.
* There is a negative correlation between discount and sales price.

In [None]:
# Profit distribution for every level of each categorical column: six box
# plots stacked vertically, drawn without outlier points.
fig, ax = plt.subplots(nrows=6, ncols=1, figsize=(20, 50))
plt.subplots_adjust(hspace=0.4)

for var, subplot in zip(categorical, ax.flatten()):
    sns.boxplot(data=df, x=var, y='Profit', showfliers=False, ax=subplot)
    # Rotate the tick labels so long category names stay readable.
    plt.setp(subplot.get_xticklabels(), rotation=90)

In [None]:
# Profit earned per unit of sales for each state.
state['profit_to_sales'] = state['Profit'].div(state['Sales'])

# Attach the full state name by inverting the name -> code mapping.
state_name = {code: name for name, code in state_code.items()}
state['States'] = [state_name[code] for code in state.index]

# Order the states from the lowest ratio to the highest.
state = state.sort_values(by='profit_to_sales')

In [None]:
# Bar chart with one bar per state showing its profit-to-sales ratio,
# coloured by the state's total profit.
fig = px.bar(
    state,
    x='profit_to_sales',
    y='States',
    # The plotted value is profit / sales, so the title says "PROFIT", not "PRICE".
    title='PROFIT TO SALES RATIO',
    color='Profit',
    color_continuous_scale=px.colors.sequential.Viridis,
)
fig.update_layout(
    autosize=False,
    height=1000,
    # The x axis is numeric, so plotly's automatic ticks are used. The former
    # tickmode='array' with ticktext=<state names> supplied no tickvals and
    # does not apply to a numeric axis.
    xaxis=dict(title='Profit to Sales Ratio'),
    yaxis=dict(title='State'),
)
fig.show()

* Ohio has the worst profit-to-sales ratio, i.e. the largest losses incurred relative to its total sales.
* Delaware has the highest profit-to-sales ratio.
* New York, California and Washington don't have the highest profit-to-sales ratios. This means that the company can further improve its sales/profits in these states.

In [None]:
# Rows grouped by customer segment. Segments and categories are listed in
# descending order of how often each value occurs.
ship_segment = df.groupby('Segment')
segment_list = df.Segment.value_counts().index
cat_list = df.Category.value_counts().index

# Ship modes in plotting order, each with its bar colour.
ship_mode_colors = {
    'Standard Class': 'rgb(137,51,51)',
    'Second Class': 'rgb(234,84,84)',
    'First Class': 'rgb(250,127,78)',
    'Same Day': 'lightsalmon',
}

# One grouped bar chart per segment: deliveries per category, split by ship mode.
for segment in segment_list:
    seg_shipping = ship_segment.get_group(segment)

    # Rows = categories, columns = ship modes. Counts are looked up by
    # ship-mode label rather than by rank within value_counts(), so a bar can
    # never be mislabelled when the popularity order differs in some group.
    # Combinations that never occur count as 0 instead of raising.
    counts = (
        seg_shipping.groupby(['Category', 'Ship Mode'])
        .size()
        .unstack(fill_value=0)
        .reindex(index=cat_list, columns=list(ship_mode_colors), fill_value=0)
    )

    fig = go.Figure()
    for mode, color in ship_mode_colors.items():
        fig.add_trace(go.Bar(x=cat_list, y=counts[mode], name=mode, marker_color=color))

    fig.update_layout(
        barmode='group',
        width=800,
        title=segment.upper(),
        yaxis=dict(title='Number of Deliveries'),
    )
    fig.show()

* Standard class shipping is the most used shipping method across all consumer segments and product categories.
* Across all three consumer segments, office supplies are the most bought products, and the technology items are the least bought products.

In [None]:
# Sales amount per unit sold on each row.
df["Cost"] = df['Sales'].div(df['Quantity'])

# Profit per unit sold.
# NOTE(review): this overwrites the original 'Profit' column in place, so
# re-running this cell divides by 'Quantity' a second time.
df['Profit'] = df['Profit'].div(df['Quantity'])

# Mean per-unit cost, discount and per-unit profit for every
# ship mode / segment / category / sub-category combination.
group_keys = ['Ship Mode', 'Segment', 'Category', 'Sub-Category']
data_group_one = (
    df[group_keys + ['Cost', 'Discount', 'Profit']]
    .groupby(group_keys, as_index=False)
    .mean()
)

# Consumer-segment rows only, split into one table per ship mode.
is_consumer = data_group_one['Segment'] == 'Consumer'
ship_mode = data_group_one['Ship Mode']

# First Class & Consumer
data_group_1 = data_group_one[(ship_mode == 'First Class') & is_consumer]

# Same Day & Consumer
data_group_2 = data_group_one[(ship_mode == 'Same Day') & is_consumer]

# Second Class & Consumer
data_group_3 = data_group_one[(ship_mode == 'Second Class') & is_consumer]

# Standard Class & Consumer
data_group_4 = data_group_one[(ship_mode == 'Standard Class') & is_consumer]


In [None]:
# Display the First Class / Consumer table.
data_group_1

In [None]:
# Display the Same Day / Consumer table.
data_group_2

In [None]:
# Display the Second Class / Consumer table.
data_group_3

In [None]:
# Display the Standard Class / Consumer table.
data_group_4

# Loss-incurring items by group

* data_group_1: Bookcases, Tables, Machines
* data_group_2: Tables
* data_group_3: Bookcases, Tables, Supplies
* data_group_4: Bookcases, Tables, Supplies, Machines


* Bookcases: According to the pattern shown in the data, there is a profit when the cost per item is around 100 and the discount is around 0.1%, and when the cost per item is around 200 and the discount is around 0.2%. So the discount on these items should increase by 0.1% per 100 increase in the cost of the item.

* Tables: Wherever the discount on tables is around 0.2% there is a loss, so the discount here should be around 0.1%.
 
* Machines: The problem is with the Consumer segment: they buy less of this item, so the cost should be increased and the discount should be negligible.
 
* Supplies: Here the problem is in the ship mode: the discount should be around 0.01 in First Class, while in Standard Class there should be no discount.