
## Task - 3: Exploratory Data Analysis - Retail Store

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")
%matplotlib inline

In [None]:
# Read the Superstore CSV into a DataFrame and preview its first rows.
DATA_PATH = "../input/tsf-datasets/SampleSuperstore.csv"
data = pd.read_csv(DATA_PATH)
data.head()

In [None]:
# (rows, columns) of the dataset as loaded, before any cleaning
data.shape

In [None]:
# Per column: True if the column contains at least one missing value
data.isna().any()

In [None]:
# Number of distinct values in each column
data.nunique()

In [None]:
# Data type pandas inferred for each column
data.dtypes

In [None]:
# Count rows that are exact duplicates of an earlier row
duplicate_mask = data.duplicated()
duplicate_mask.sum()

In [None]:
# Remove the duplicated rows, keeping the first occurrence of each
data = data.drop_duplicates()

In [None]:
# (rows, columns) after duplicate removal, for comparison with the shape shown earlier
data.shape

In [None]:
# Pairwise correlation between the numeric columns.
# Non-numeric columns are excluded explicitly: since pandas 2.0, DataFrame.corr()
# no longer drops them silently and raises on text columns instead.
data.select_dtypes(include='number').corr()

In [None]:
# Annotated heatmap of the correlation matrix of the numeric columns.
# select_dtypes keeps this working on pandas >= 2.0, where corr() raises on
# non-numeric columns.
sns.heatmap(data.select_dtypes(include='number').corr(), annot=True)

The heatmap above visualizes the pairwise correlations between the numeric columns in a single plot.


In [None]:
# Overall Sales and Profit generated by the Superstore
total_sales = data['Sales'].sum()
total_profit = data['Profit'].sum()

print('Sales:' , total_sales)
print('Profit:' , total_profit)

#### Overall profit is positive (though it is only a fraction of total sales), and there are some areas where profit could be increased.

In [None]:
# matplotlib 3.6 renamed the bundled seaborn style sheets to 'seaborn-v0_8' and
# later dropped the old name, so use whichever name this installation provides.
seaborn_style = 'seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn'
plt.style.use(seaborn_style)

# Sales vs. Profit for every row, coloured by the Discount applied.
data.plot(kind = 'scatter', figsize = (10,5) , x = 'Sales', y='Profit', c = 'Discount' , s = 20 , fontsize = 16)
plt.ylabel('TOTAL PROFITS', fontsize = 16)
plt.title('DEPENDENCY OF SALES AND PROFIT ON DISCOUNT' , fontsize = 16)
plt.show()

The scatter plot above suggests that the lower the discount, the higher the profit. Discount affects profit up to a certain extent; beyond that point, profit shows no clear relation to discount.

In [None]:
# One count plot per categorical/discrete column, laid out on a 2x3 grid
# (row-major order: first three columns on the top row, last three below).
count_columns = ['Ship Mode', 'Segment', 'Quantity', 'Category', 'Region', 'Discount']
fig, axs = plt.subplots(nrows=2, ncols=3, figsize=(20, 10))

for ax, column in zip(axs.flat, count_columns):
    # Name the column via x= and pass the frame via data=: current seaborn
    # releases no longer treat a positionally passed Series as the x variable.
    sns.countplot(x=column, data=data, ax=ax)
    ax.set_title(column)

plt.tight_layout()

In [None]:
# Number of rows per product sub-category.
plt.figure(figsize=(20,10))
# x=/data= keywords: a positionally passed Series is not taken as x by current seaborn.
sns.countplot(x='Sub-Category', data=data)
plt.title('Sub-Category')
plt.show()

In [None]:
# Number of rows per state; tick labels are rotated so the state names stay readable.
plt.figure(figsize=(20,10))
# x=/data= keywords: a positionally passed Series is not taken as x by current seaborn.
sns.countplot(x='State', data=data)
plt.xticks(rotation=90)
plt.title('State')
plt.show()

### Distribution of the data using the plot

In [None]:
# Distribution of each numeric measure, one panel per column on a 2x2 grid.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases; it is kept
# here so the rendered output stays the same.
panels = [('Sales', 'red'), ('Profit', 'green'), ('Quantity', 'orange'), ('Discount', 'blue')]
fig, axs = plt.subplots(ncols=2, nrows=2, figsize=(20, 15))
for ax, (column, colour) in zip(axs.ravel(), panels):
    sns.distplot(data[column], color=colour, ax=ax)
    ax.set_title(column)
plt.show()

### Deal Analysis

In [None]:
# Number of rows (deals) recorded for each state
state = data['State'].value_counts()

In [None]:
# Bar chart of the number of deals per state.
ax = state.plot(kind='bar', figsize=(20, 10), color="maroon")
ax.set_ylabel(' Number of deals')
ax.set_xlabel('States')
ax.set_title('State Wise Dealings')
plt.show()

In [None]:
# Number of rows (deals) per city, limited to the first 50 entries of value_counts().
city = data['City'].value_counts()
city = city.head(50)
city.plot(kind='bar',figsize=(20,10),color="blue")
plt.ylabel(' Number of deals')
# This chart shows cities, so label it as such (it was previously labelled
# 'States' / 'State Wise Dealings', copied from the state chart).
plt.xlabel('Cities')

plt.title('City Wise Dealings (Top 50)')
plt.show()

### Customer Analysis

In [None]:
# Total Sales and Profit in each customer segment.
# Several columns are selected from a groupby with a list ([[...]]); the bare
# tuple form ['Sales','Profit'] was removed in pandas 2.0.
data.groupby('Segment')[['Sales', 'Profit']].sum().plot.bar()
plt.title('SALES AND PROFIT IN EACH SEGMENT')
plt.legend()
plt.show()

##### The graph shows that the Consumer segment generates the highest sales and also contributes the highest profit to the Superstore, whereas the Home Office segment purchases the least and adds the least profit to the business.

##### Now we will check Ship Mode Segment wise

In [None]:
# Count plot of rows per segment, split by ship mode

sns.countplot(data=data, x='Segment', hue='Ship Mode')
plt.show()

In each segment, most transactions were shipped via Standard Class.

In [None]:
# Number of rows for each shipping mode
data['Ship Mode'].value_counts()

In [None]:
# Mean Sales, Discount and Profit for each shipping mode
shipmode_metrics = ['Sales', 'Discount', 'Profit']
shipmode = data.groupby(['Ship Mode'])[shipmode_metrics].mean()

In [None]:
# One pie per metric column, each showing the share of every shipping mode.
shipmode_pie_options = dict(
    subplots=True,
    figsize=(18, 20),
    autopct='%1.1f%%',
    labels=shipmode.index,
)
shipmode.plot.pie(**shipmode_pie_options)

Average profit and average discount are highest for First Class shipments.
Average sales are highest for Same Day shipments.

#### Product Analysis

In [None]:
# Total Sales and Profit per product category.
# A list ([[...]]) selects the columns; tuple-style ['Sales','Profit'] selection
# on a groupby was removed in pandas 2.0.
data.groupby('Category')[['Sales', 'Profit']].sum().plot.bar()
plt.title('PROFIT AND SALES CATEGORY WISE')
plt.legend(loc = 1)
plt.show()

In [None]:
# Mean Sales, Discount and Profit for each product category
category_metrics = ['Sales', 'Discount', 'Profit']
category = data.groupby(['Category'])[category_metrics].mean()
category

In [None]:
# One pie per metric column, each showing the share of every category.
category_pie_options = dict(
    subplots=True,
    figsize=(18, 20),
    autopct='%1.1f%%',
    labels=category.index,
)
category.plot.pie(**category_pie_options)

In [None]:
# Mean Sales, Discount and Profit for each product sub-category
sub_category_metrics = ['Sales', 'Discount', 'Profit']
sub_category = data.groupby(['Sub-Category'])[sub_category_metrics].mean()
sub_category.head()

In [None]:
# Share of mean sales contributed by each sub-category.
fig, ax = plt.subplots(figsize=(15, 15))
ax.pie(sub_category['Sales'], labels=sub_category.index, autopct='%1.1f%%')
ax.set_title('Sub-Category Wise Sales Analysis', fontsize=20)
ax.legend()
plt.xticks(rotation=90)
plt.show()

In [None]:
# Share of mean discount attributed to each sub-category.
fig, ax = plt.subplots(figsize=(15, 15))
ax.pie(sub_category['Discount'], labels=sub_category.index, autopct='%1.1f%%')
ax.set_title('Sub-Category Wise Discount Analysis', fontsize=20)
ax.legend()
plt.xticks(rotation=90)
plt.show()

In [None]:
# Mean sales and mean profit per sub-category, ordered from least to most profitable.
# DataFrame.plot takes its legend entries from the column names, so the columns are
# renamed to the intended legend text instead of passing a `label=` list.
(
    sub_category
    .sort_values('Profit')[['Sales', 'Profit']]
    .rename(columns={'Sales': 'Avg Sales Price($)', 'Profit': 'Profit($)'})
    .plot(kind='bar', figsize=(10, 5))
)

In [None]:
# Total Sales and Profit for each sub-category within the Furniture category.
# Columns are selected with a list ([[...]]); tuple-style selection on a groupby
# was removed in pandas 2.0.
data[data['Category'] == 'Furniture'].groupby('Sub-Category')[['Sales', 'Profit']].sum().plot.bar()
plt.title('SALES AND PROFIT FURNITURE CATEGORY WISE ')
plt.legend(loc = 1)
plt.show()

In [None]:
# Mean discount per Furniture sub-category, as a probable reason for the loss

furniture_rows = data[data['Category'] == 'Furniture']
furniture_discount = furniture_rows.groupby('Sub-Category')['Discount'].mean()
furniture_discount.plot.bar()
plt.title('DISCOUNT GIVEN IN FURNITURE CATEGORY')
plt.legend(loc = 0)
plt.show()


From the correlation heatmap shown earlier, we conclude that there is a negative correlation between Profit and Discount,
whereas there is a positive correlation between Profit and Sales.

In [None]:
# Total sales per sub-category, highest first

sales_by_sub_category = data.groupby('Sub-Category')['Sales'].sum()
sales_by_sub_category.sort_values(ascending=False).plot.bar()
plt.show()

From this we conclude that Phones, Chairs, Storage, Tables and Binders are the top-selling sub-categories, in that order,
whereas Fasteners, Labels and Envelopes sold the least.

In [None]:
# Total profit per sub-category, highest first

profit_by_sub_category = data.groupby('Sub-Category')['Profit'].sum()
profit_by_sub_category.sort_values(ascending = False).plot.bar(color = 'brown')
plt.show()

Here we see that Copiers, Phones and Accessories are the top profit-generating products for the store, whereas the store is incurring losses on Tables, Bookcases and Supplies.

### REGIONAL ANALYSIS

In [None]:
# Share of transactions (rows) made in each region

region_counts = data['Region'].value_counts()
region_counts.plot.pie(autopct="%.1f%%")
plt.show()

In [None]:
# Mean Sales, Discount and Profit for each region
region_metrics = ['Sales', 'Discount', 'Profit']
region = data.groupby(['Region'])[region_metrics].mean()
region

In [None]:
# One pie per metric column, each showing the share of every region.
region_pie_options = dict(
    subplots=True,
    figsize=(18, 20),
    autopct='%1.1f%%',
    labels=region.index,
)
region.plot.pie(**region_pie_options)

In [None]:
# Total Sales and Profit per region.
# Columns are selected with a list ([[...]]); tuple-style selection on a groupby
# was removed in pandas 2.0.
data.groupby('Region')[['Sales', 'Profit']].sum().plot.bar()
plt.title('SALES AND PROFITS IN EACH REGION')
plt.legend()
# plt.show must be called; the bare name `plt.show` only references the function.
plt.show()

### OBSERVATIONS:

1) MAXIMUM SALES,PROFITS, TRANSACTIONS were made in WEST REGION

2) MAXIMUM SALES AND PROFIT in CONSUMER SEGMENT

3) MAXIMUM TRANSACTIONS were shipped in STANDARD CLASS irrespective of SEGMENT

4) LEAST PROFIT is earned in the FURNITURE CATEGORY despite a good amount of Sales

5) POSITIVE CORRELATION: Profit and Sales 

6) NEGATIVE CORRELATION: Profit and Discount