# Exploratory Data Analysis: Retail Superstore

Problem: As a business manager, try to find the weak areas where you can work to make more profit. What business problems can you derive by exploring the data?

Exploratory Data Analysis is a process of examining or understanding the data and extracting insights or main characteristics of the data. EDA is generally classified into two methods, i.e. graphical analysis and non-graphical analysis.
EDA is essential because it is good practice to first understand the problem statement and the relationships between the data features.

Dataset: https://bit.ly/3i4rbWl

In [None]:
import pandas as pd                                         #importing the libraries
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn import datasets
import sklearn
from pandas import DataFrame
from pandas.plotting import scatter_matrix
from pandas_profiling import ProfileReport
from scipy.cluster.hierarchy import linkage, dendrogram, cut_tree
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the Superstore dataset into a DataFrame.
# A raw string keeps the Windows backslashes literal instead of relying on
# "\p" and "\S" being unrecognised (and deprecated) escape sequences.
stores = pd.read_csv(r"E:\python\Spark_foundations\SampleSuperstore.csv")

Data preparation and preprocessing

In [None]:
# Preview the first five rows of the dataset.
stores.head(n=5)

In [None]:
# Preview the last five rows of the dataset.
stores.tail(n=5)

In [None]:
# Number of distinct values in each column.
stores.nunique(axis=0)

In [None]:
# Column labels of the dataset (the labels themselves, not their count).
stores.columns

In [None]:
# Distinct values of the "Ship Mode" column.
stores["Ship Mode"].unique()

In [None]:
# Distinct values of the "Segment" column.
stores["Segment"].unique()

In [None]:
# Distinct values of the "Country" column.
stores["Country"].unique()

In [None]:
# Distinct values of the "City" column.
stores["City"].unique()

In [None]:
# Distinct values of the "State" column.
stores["State"].unique()

In [None]:
# Distinct values of the "Postal Code" column.
stores["Postal Code"].unique()

In [None]:
# Distinct values of the "Region" column.
stores["Region"].unique()

In [None]:
# Distinct values of the "Sub-Category" column.
stores["Sub-Category"].unique()

In [None]:
# Distinct values of the "Category" column.
stores["Category"].unique()

In [None]:
# Distinct values of the "Sales" column.
stores["Sales"].unique()

In [None]:
# Arithmetic mean of the "Sales" column.
stores["Sales"].mean()

In [None]:
# Most frequent value(s) of the "Sales" column.
stores["Sales"].mode()

In [None]:
# Distinct values of the "Quantity" column.
stores["Quantity"].unique()

In [None]:
# Arithmetic mean of the "Quantity" column.
stores["Quantity"].mean()

In [None]:
# Most frequent value(s) of the "Quantity" column.
stores["Quantity"].mode()

In [None]:
# Distinct values of the "Profit" column.
stores["Profit"].unique()

In [None]:
# Arithmetic mean of the "Profit" column.
stores["Profit"].mean()

In [None]:
# Distinct values of the "Discount" column.
stores["Discount"].unique()

In [None]:
# Arithmetic mean of the "Discount" column.
stores["Discount"].mean()

In [None]:
# Frequency of each distinct row (combination of all column values).
stores.value_counts()

In [None]:
# Boolean mask marking missing values, cell by cell.
stores.isna()

In [None]:
# Number of missing values in each column.
stores.isna().sum()

In [None]:
# Print a concise summary of the DataFrame (columns, non-null counts, dtypes).
stores.info()

In [None]:
# Summary statistics with the quartiles spelled out explicitly.
stores.describe(percentiles=[0.25, 0.5, 0.75])

In [None]:
# (number of rows, number of columns) of the dataset.
stores.shape

In [None]:
# Report the number of columns (second entry of the shape tuple).
print("Columns: ", stores.shape[1])

In [None]:
# Report the number of rows (first entry of the shape tuple).
print("Rows: ", stores.shape[0])

In [None]:
# Total Sales for each distinct Profit value. Selecting the column before
# aggregating avoids summing every other column of the frame.
stores.groupby('Profit')['Sales'].sum()

In [None]:
# Total Sales for each distinct Discount value. Selecting the column before
# aggregating avoids summing every other column of the frame.
stores.groupby('Discount')['Sales'].sum()

In [None]:
# Total Sales for each distinct Quantity value. Selecting the column before
# aggregating avoids summing every other column of the frame.
stores.groupby('Quantity')['Sales'].sum()

In [None]:
# Total Sales per Ship Mode. Selecting the column before aggregating avoids
# summing every other column of the frame.
stores.groupby('Ship Mode')['Sales'].sum()

In [None]:
# Total Sales per product Category. Selecting the column before aggregating
# avoids summing every other column of the frame.
stores.groupby('Category')['Sales'].sum()

In [None]:
# Total Sales per Postal Code. Selecting the column before aggregating avoids
# summing every other column of the frame.
stores.groupby('Postal Code')['Sales'].sum()

In [None]:
# Total Sales per Region. Selecting the column before aggregating avoids
# summing every other column of the frame.
stores.groupby('Region')['Sales'].sum()

In [None]:
# Total Sales per City. Selecting the column before aggregating avoids
# summing every other column of the frame.
stores.groupby('City')['Sales'].sum()

In [None]:
# Total Sales per State. Selecting the column before aggregating avoids
# summing every other column of the frame.
stores.groupby('State')['Sales'].sum()

Visualization of variables


In [None]:
# Bar chart of total Sales per Region. Only the Sales column is aggregated
# instead of summing the whole frame and discarding the rest.
stores.groupby('Region')['Sales'].sum().plot.bar()
plt.xticks(rotation=0)  # few, short labels: horizontal text stays readable
plt.tight_layout()
plt.show()

In [None]:
# Bar chart of total Sales per State. Only the Sales column is aggregated
# instead of summing the whole frame and discarding the rest.
# A wider figure and vertical tick labels keep the many state names from
# overlapping (horizontal labels collide on this axis).
stores.groupby('State')['Sales'].sum().plot.bar(figsize=(14, 5))
plt.xticks(rotation=90)
plt.tight_layout()
plt.show()

In [None]:
# Bar chart of total Sales per Category. Only the Sales column is aggregated
# instead of summing the whole frame and discarding the rest.
stores.groupby('Category')['Sales'].sum().plot.bar()
plt.xticks(rotation=0)  # few labels: horizontal text stays readable
plt.tight_layout()
plt.show()

In [None]:
# Bar chart of total Sales per Segment. Only the Sales column is aggregated
# instead of summing the whole frame and discarding the rest.
stores.groupby('Segment')['Sales'].sum().plot.bar()
plt.xticks(rotation=0)  # few labels: horizontal text stays readable
plt.tight_layout()
plt.show()

In [None]:
# Bar chart of total Sales per City (one bar per distinct city). Only the
# Sales column is aggregated instead of summing the whole frame.
# A wider figure and vertical tick labels reduce label overlap; with a large
# number of cities the axis will still be dense.
stores.groupby('City')['Sales'].sum().plot.bar(figsize=(20, 6))
plt.xticks(rotation=90)
plt.tight_layout()
plt.show()

In [None]:
# Bar chart of total Sales per Sub-Category. Only the Sales column is
# aggregated instead of summing the whole frame and discarding the rest.
# Vertical tick labels keep the sub-category names from overlapping.
stores.groupby('Sub-Category')['Sales'].sum().plot.bar()
plt.xticks(rotation=90)
plt.tight_layout()
plt.show()

In [None]:
# Distribution of Sales: 30-bin density histogram with a KDE overlay.
# sns.distplot is deprecated; histplot(kde=True, stat="density") is its
# documented replacement.
sns.histplot(stores["Sales"], bins=30, kde=True, stat="density")

In [None]:
# Distribution of Discount: 30-bin density histogram with a KDE overlay.
# sns.distplot is deprecated; histplot(kde=True, stat="density") is its
# documented replacement.
sns.histplot(stores["Discount"], bins=30, kde=True, stat="density")

In [None]:
# Distribution of Profit: 30-bin density histogram with a KDE overlay.
# sns.distplot is deprecated; histplot(kde=True, stat="density") is its
# documented replacement.
sns.histplot(stores["Profit"], bins=30, kde=True, stat="density")

In [None]:
# Distribution of Quantity: 30-bin density histogram with a KDE overlay.
# sns.distplot is deprecated; histplot(kde=True, stat="density") is its
# documented replacement.
sns.histplot(stores["Quantity"], bins=30, kde=True, stat="density")

In [None]:
# Histograms of the DataFrame's columns, using the default of 10 bins.
stores.hist(bins=10)
plt.show()

In [None]:
# Matrix of pairwise scatter plots.
# The size is passed to scatter_matrix itself: calling plt.figure(figsize=...)
# after plotting only opens a second, empty figure and leaves the matrix at
# the default size.
scatter_matrix(stores, figsize=(8, 10))
plt.show()

In [None]:
# Box plots, one subplot per column, each with its own x and y axis.
# The flags are real booleans: the strings 'True'/'False' are both truthy, so
# sharex='False' / sharey='False' did not disable axis sharing.
stores.plot(kind='box', subplots=True, sharex=False, sharey=False)
plt.show()

In [None]:
# Grid of pairwise relationships between the variables of the dataset.
sns.pairplot(data=stores)

Profile report using pandas-profiling: this gives a detailed, all-in-one picture of the EDA.

In [None]:
# Build the profiling report for the whole dataset in explorative mode.
# The bare name on the last line makes the notebook render the report.
profile = ProfileReport(stores, explorative=True, title="Sale Superstore")
profile

Top 15 Cities which generated Highest Sales:

In [None]:
# Top 15 cities by total Sales, rounded to 2 decimals, with City as a column.
# Only the Sales column is aggregated; summing the whole frame first would
# also aggregate columns that are discarded immediately afterwards.
city = (
    stores.groupby("City")[["Sales"]]
    .sum()
    .sort_values("Sales", ascending=False)
    .head(15)
    .round(2)
    .reset_index()
)
city

In [None]:
# Bar chart of the 15 cities with the highest total Sales.
# The figure is created first so that figsize applies to this chart; creating
# it after the drawing calls opens a new empty figure instead.
plt.figure(figsize=(10, 6))
plt.title("Top 15 Cities which generated Highest Sales", fontsize=16)
plt.bar(city["City"], city["Sales"], color='Orange', edgecolor='brown', linewidth=2)
plt.xlabel("Cities", fontsize=10)
plt.ylabel("Sales", fontsize=10)
plt.xticks(fontsize=10, rotation=90)  # vertical labels so city names do not overlap
plt.yticks(fontsize=10)
plt.show()

Segment vs sales:

In [None]:
# Total Sales per Segment, with Segment as a regular column.
# Only the Sales column is aggregated instead of summing the whole frame.
segment = stores.groupby("Segment")[["Sales"]].sum().reset_index()
segment

Total revenue generated segment-wise:

In [None]:
# Overall revenue: sum of the per-segment Sales, truncated to a whole number
# and formatted as a dollar string.
revenue_whole = int(segment["Sales"].sum())
total_revenue = f"${revenue_whole}"
total_revenue  # this is our total revenue

Total revenue generated across all segments: $2297200

Sales based on segment:

In [None]:
# The function which helps us to annotate the values in our donut or pie chart
def make_autopct(values):
    """Build an ``autopct`` callback that labels pie wedges with dollar amounts.

    Args:
        values: Iterable of the numeric wedge sizes given to the pie chart.

    Returns:
        A function that takes a wedge's percentage of the total and returns
        the matching absolute amount, rounded to an integer, as ``"$<int>"``.
    """
    # Sum once here rather than on every wedge the callback is invoked for.
    total = sum(values)

    def my_autopct(pct):
        # Convert the percentage back to the absolute value it represents.
        val = int(round(pct*total/100.0))
        return f"${val:d}"
    return my_autopct

In [None]:
# Creating a donut chart.
# Donut chart of Sales by Segment: an outer pie with dollar labels, plus a
# smaller white pie drawn on top to form the hole.
center = [5]  # single wedge for the inner disc
plt.pie(segment['Sales'], labels=segment['Segment'], colors=['Pink', 'Blue', 'Green'],
        radius=1.5, explode=[0.05, 0.05, 0.05], autopct=make_autopct(segment["Sales"]))
# colors is given as a one-element list (a sequence of colours) rather than a
# bare string.
plt.pie(center, radius=1.25, colors=["white"])
# Write the overall revenue in the middle of the hole.
label = plt.annotate('Total Revenue \n' + total_revenue, color='red', xy=(0, 0),
                     fontsize=12, ha="center")
plt.tight_layout()
plt.title("The sales based on the Segment", pad=60, size=25, color="green")
plt.show()

The Consumer segment generated the highest revenue: $1161401

Sales based on shipmode:

In [None]:
# Total Sales per Ship Mode, with Ship Mode as a regular column.
# Only the Sales column is aggregated instead of summing the whole frame.
shipmode = stores.groupby("Ship Mode")[["Sales"]].sum().reset_index()

shipmode

In [None]:
#Donut Chart
# Donut chart of Sales by Ship Mode: an outer pie with dollar labels, plus a
# smaller white pie drawn on top to form the hole.
center = [6]  # single wedge for the inner disc
plt.figure(figsize=(6, 6))
plt.pie(shipmode['Sales'], labels=shipmode['Ship Mode'], colors=['Red', 'Blue', 'Pink', "Green"],
        radius=2, autopct=make_autopct(shipmode["Sales"]))
# colors is given as a one-element list (a sequence of colours) rather than a
# bare string.
plt.pie(center, radius=1.7, colors=["white"])
plt.title("The sales based on the Ship Mode", pad=80, size=25, color="darkblue")
plt.tight_layout()
plt.show()

Standard Class generated the highest sales: $1358216

Sales based on sub-category:

In [None]:
# Top 5 sub-categories by total Sales, rounded to 2 decimals, with
# Sub-Category as a regular column.
# Only the Sales column is aggregated instead of summing the whole frame.
subcat = (
    stores.groupby("Sub-Category")[["Sales"]]
    .sum()
    .sort_values("Sales", ascending=False)
    .head()
    .round(2)
    .reset_index()
)
subcat

The Phones sub-category generated the highest sales value: $330007.05

Sales based on State:

In [None]:
# State names paired with their two-letter codes; the two parallel lists used
# by the following cells are derived from these pairs in the same order.
_STATE_PAIRS = [
    ('Alabama', 'AL'), ('Arizona', 'AZ'), ('Arkansas', 'AR'), ('California', 'CA'),
    ('Colorado', 'CO'), ('Connecticut', 'CT'), ('Delaware', 'DE'), ('Florida', 'FL'),
    ('Georgia', 'GA'), ('Hawaii', 'HI'), ('Idaho', 'ID'), ('Illinois', 'IL'),
    ('Indiana', 'IN'), ('Iowa', 'IA'), ('Kansas', 'KS'), ('Kentucky', 'KY'),
    ('Louisiana', 'LA'), ('Maine', 'ME'), ('Maryland', 'MD'), ('Massachusetts', 'MA'),
    ('Michigan', 'MI'), ('Minnesota', 'MN'), ('Mississippi', 'MS'), ('Missouri', 'MO'),
    ('Montana', 'MT'), ('Nebraska', 'NE'), ('Nevada', 'NV'), ('New Hampshire', 'NH'),
    ('New Jersey', 'NJ'), ('New Mexico', 'NM'), ('New York', 'NY'), ('North Carolina', 'NC'),
    ('North Dakota', 'ND'), ('Ohio', 'OH'), ('Oklahoma', 'OK'), ('Oregon', 'OR'),
    ('Pennsylvania', 'PA'), ('Rhode Island', 'RI'), ('South Carolina', 'SC'), ('South Dakota', 'SD'),
    ('Tennessee', 'TN'), ('Texas', 'TX'), ('Utah', 'UT'), ('Vermont', 'VT'),
    ('Virginia', 'VA'), ('Washington', 'WA'), ('West Virginia', 'WV'), ('Wisconsin', 'WI'),
    ('Wyoming', 'WY'),
]
state = [name for name, _ in _STATE_PAIRS]
state_code = [code for _, code in _STATE_PAIRS]

In [None]:
# Two-column lookup table: the codes start out as the index, then become the
# first column when the index is reset.
state = pd.DataFrame(state, index=state_code).reset_index()
state.columns = ['State Code', 'State']
state.head()

In [None]:
# Per-state totals of the numeric columns, largest Sales first, with State as
# a regular column and the summed Postal Code removed.
# numeric_only=True keeps text columns out of the sum.
sales = (
    stores.groupby("State")
    .sum(numeric_only=True)
    .sort_values("Sales", ascending=False)
    .reset_index()
    .drop(columns="Postal Code")
)
sales.head(5)

In [None]:
# Re-order the per-state totals alphabetically by State. Resetting the index
# without dropping it keeps the previous index as an "index" column.
sales = sales.sort_values('State').reset_index()
sales.head()

California generated the maximum profit among the states: $76381,
with a summed discount of 145.6 (a sum of per-order discount rates, not a dollar amount).

Correlation matrix among the variables:

In [None]:
# Pairwise correlation between the numeric columns.
# Non-numeric columns are excluded explicitly: DataFrame.corr no longer drops
# them silently in pandas >= 2.0 and raises on string data instead.
corelation = stores.select_dtypes(include="number").corr()

In [None]:
# Annotated heatmap of the correlation matrix, drawn on the axes created
# here and labelled with the matrix's column names on both axes.
fig, ax = plt.subplots(figsize=(10, 5))
sns.heatmap(
    corelation,
    annot=True,
    xticklabels=corelation.columns,
    yticklabels=corelation.columns,
    ax=ax,
)

By plotting a correlation matrix, we get an overview of how the features are related to one another.