In [None]:
"""
Content for Retail Sales Analysis
    0. Clear Memory
    1. Import
    2. Read data
    3. Function
    4. Explore data
    5. Visualization
    6. Data Analysis
    
Resource on Kaggle: https://www.kaggle.com/jr2ngb/superstore-data
"""

# 0 Clear memory
# `%reset -f` is an IPython magic that clears the interactive namespace; the -f flag
# skips the confirmation prompt so the cell can run unattended.
%reset -f

In [None]:
# 1.1 Call data manipulation libraries
import pandas as pd
import numpy as np
import os
# 1.2 Plotting libraries used for the charts in section 5
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
# 2.0 Show the working directory and its contents, then widen the column display
print(os.getcwd())
# Only the last expression of a cell is rendered automatically, so the directory
# listing has to be printed explicitly to be visible.
print(os.listdir())
# Allow up to 300 columns to be shown when a frame is displayed.
pd.options.display.max_columns = 300
# 2.1 Read the Superstore orders CSV (a single file) into the notebook-level `data` frame.
# The file is decoded as ISO-8859-1.
data = pd.read_csv("../input/superstore_dataset2011-2015.csv", encoding = "ISO-8859-1")

In [None]:
#3 Functions
# 3.1 Preview the first rows of any column
def print_rows(name_column, n=5, df=None):
    """Return the first ``n`` values of one column.

    Parameters
    ----------
    name_column : label of the column to preview.
    n : number of leading rows to return (default 5, the original behaviour).
    df : frame to read from; when omitted, the notebook-level ``data`` frame is used.

    Returns
    -------
    The leading ``n`` entries of ``df[name_column]``.
    """
    # `data` is only looked up when no frame is passed, so the helper also works
    # on any other DataFrame without touching the global.
    frame = data if df is None else df
    return frame[name_column].head(n)
# 3.2 Summary statistics for any column
def describe_column(name_column, df=None):
    """Return ``describe()`` for one column.

    Parameters
    ----------
    name_column : label of the column to summarise.
    df : frame to read from; when omitted, the notebook-level ``data`` frame is used.

    Returns
    -------
    Whatever ``df[name_column].describe()`` produces for that column.
    """
    # Fall back to the global frame only when the caller did not supply one.
    frame = data if df is None else df
    return frame[name_column].describe()

In [None]:
# 4 Explore data
# 4.1 Shape: (number of rows, number of columns) of the loaded frame
data.shape

In [None]:
# 4.2 Columns: the column labels of the frame
data.columns 

In [None]:
# 4.3 Missing values: isnull() flags each missing cell, sum() counts them per column
data.isnull().sum()

In [None]:
# 4.4 First rows of the frame (head() returns 5 rows by default)
data.head()

In [None]:
# 4.5 Last rows of the frame (tail() returns 5 rows by default)
data.tail()

In [None]:
# 4.6 Information: column names, non-null counts and dtypes
# info is a method; without the call parentheses the cell only shows the
# bound-method repr instead of the summary.
data.info()

In [None]:
# 4.7 Order ID: preview the first five values, then summarise the column
print_rows('Order ID')

In [None]:
# describe_column() returns data[<column>].describe()
describe_column('Order ID')

In [None]:
# 4.8 Order Date: preview the values as loaded from the CSV
print_rows('Order Date')

In [None]:
# Convert the column with pd.to_datetime and store it back on `data`, so later
# cells see the converted values.
# NOTE(review): no format/dayfirst argument is passed, so the day/month order is
# inferred by pandas -- confirm it matches the raw strings previewed above.
data['Order Date'] = pd.to_datetime(data['Order Date'])
describe_column('Order Date')

In [None]:
# 4.9 Ship Date: preview the values as loaded from the CSV
print_rows('Ship Date')

In [None]:
# Same conversion as for Order Date; the same day/month caveat applies.
data['Ship Date'] = pd.to_datetime(data['Ship Date'])
describe_column('Ship Date')

In [None]:
# 4.10 Ship Mode: preview, summary and distinct values
print_rows('Ship Mode')

In [None]:
describe_column('Ship Mode')

In [None]:
# unique() lists every distinct value found in the column
data['Ship Mode'].unique()

In [None]:
# 4.11 Customer ID: preview and summary
print_rows('Customer ID')

In [None]:
describe_column('Customer ID')

In [None]:
# 4.12 Customer Name: preview, summary and distinct values
print_rows('Customer Name')

In [None]:
describe_column('Customer Name')

In [None]:
# NOTE(review): this column presumably has many distinct values, so the array
# printed here may be very long -- consider whether the summary above is enough.
data['Customer Name'].unique()

In [None]:
# 4.13 Segment: preview, summary and distinct values
print_rows('Segment')

In [None]:
describe_column('Segment')

In [None]:
data['Segment'].unique()

In [None]:
# 4.14 City: preview, summary and distinct values
print_rows('City')

In [None]:
describe_column('City')

In [None]:
# NOTE(review): presumably a long list of distinct cities -- the output may be large.
data['City'].unique()

In [None]:
# 4.15 State: preview, summary and distinct values
print_rows('State')

In [None]:
describe_column('State')

In [None]:
data['State'].unique()

In [None]:
# 4.16 Country: preview, summary and distinct values
print_rows('Country')

In [None]:
describe_column('Country')

In [None]:
data['Country'].unique()

In [None]:
# 4.17 Postal Code: preview and summary
print_rows('Postal Code')

In [None]:
describe_column('Postal Code')

In [None]:
# 4.18 Market: preview, summary and distinct values
print_rows('Market')

In [None]:
describe_column('Market')

In [None]:
data['Market'].unique()

In [None]:
# 4.19 Region: preview, summary and distinct values
print_rows('Region')

In [None]:
describe_column('Region')

In [None]:
data['Region'].unique()

In [None]:
# 4.20 Product ID: preview, summary and distinct values
print_rows('Product ID')

In [None]:
describe_column('Product ID')

In [None]:
# NOTE(review): presumably a long list of distinct IDs -- the output may be large.
data['Product ID'].unique()

In [None]:
# 4.21 Category: preview, summary and distinct values
print_rows('Category')

In [None]:
describe_column('Category')

In [None]:
data['Category'].unique()

In [None]:
# 4.22 Sub-Category: preview, summary and distinct values
print_rows('Sub-Category')

In [None]:
describe_column('Sub-Category')

In [None]:
data['Sub-Category'].unique()

In [None]:
# 4.23 Product Name: preview, summary and distinct values
print_rows('Product Name')

In [None]:
describe_column('Product Name')

In [None]:
# NOTE(review): presumably a long list of distinct names -- the output may be large.
data['Product Name'].unique()

In [None]:
# 4.24 Sales: preview the first five values, then summarise the column
print_rows('Sales')

In [None]:
# describe_column() returns data[<column>].describe()
describe_column('Sales')

In [None]:
# 4.25 Quantity: preview the first five values, then summarise the column
print_rows('Quantity')

In [None]:
describe_column('Quantity')

In [None]:
# 4.26 Discount: preview and summary in one cell
# Only the last expression of a cell is rendered automatically, so the preview
# is shown explicitly with display(); otherwise its result is silently dropped.
display(print_rows('Discount'))
describe_column('Discount')

In [None]:
# 4.27 Profit: preview the first five values, then summarise the column
print_rows('Profit')

In [None]:
describe_column('Profit')

In [None]:
# 4.28 Shipping Cost: preview the first five values, then summarise the column
print_rows('Shipping Cost')

In [None]:
describe_column('Shipping Cost')

In [None]:
# 4.29 Order Priority: preview, summary and distinct values
print_rows('Order Priority')

In [None]:
describe_column('Order Priority')

In [None]:
# unique() lists every distinct value found in the column
data['Order Priority'].unique()

In [None]:
# 5 Data Visualization
# 5.1 Market: number of rows per market, drawn as a bar chart
fig, ax = plt.subplots(figsize=(16, 8))
market_counts = data['Market'].value_counts()
market_counts.plot.bar(ax=ax)
ax.set_title('Market Wise Sales')
ax.set_ylabel('Count')
ax.set_xlabel('Market Region')
plt.show()
# APAC tops all the Markets

In [None]:
# 5.2 Top 20
# 5.2.1 Top 20 Countries in sales (ranked by number of rows per country)
rows_per_country = data.groupby('Country')['Row ID'].count()
top20countries = rows_per_country.sort_values(ascending=False).head(20)
fig, ax = plt.subplots(figsize=(16, 8))
top20countries.plot(kind='bar', color='green', ax=ax)
ax.set_title('Top 20 Countries in Sales')
ax.set_ylabel('Count')
ax.set_xlabel('Countries')
plt.show()
# United States as a Country tops all the Countries in Sales

In [None]:
# 5.2.2 Top 20 States in Sales (ranked by number of rows per state)
rows_per_state = data.groupby('State')['Row ID'].count()
top20states = rows_per_state.sort_values(ascending=False).head(20)
fig, ax = plt.subplots(figsize=(16, 8))
top20states.plot(kind='bar', color='blue', ax=ax)
ax.set_title('Top 20 States in Sales')
ax.set_ylabel('Count')
ax.set_xlabel('States')
plt.show()
# California as a State tops all the States in Sales

In [None]:
# 5.2.3 Top 20 Cities in Sales (ranked by number of rows per city)
rows_per_city = data.groupby('City')['Row ID'].count()
top20city = rows_per_city.sort_values(ascending=False).head(20)
fig, ax = plt.subplots(figsize=(16, 8))
top20city.plot(kind='bar', color='red', ax=ax)
ax.set_title('Top 20 Cities in Sales')
ax.set_ylabel('Count')
ax.set_xlabel('Cities')
plt.show()
# New York City as a City tops all the Cities in Sales

In [None]:
# 5.2.4 Top 20 Products by Product ID in Sales (ranked by number of rows per ID)
rows_per_product_id = data.groupby('Product ID')['Row ID'].count()
top20pid = rows_per_product_id.sort_values(ascending=False).head(20)
fig, ax = plt.subplots(figsize=(16, 8))
top20pid.plot(kind='bar', color='Yellow', ax=ax)
ax.set_title('Top 20 Products by Product IDs in Sales')
ax.set_ylabel('Count')
ax.set_xlabel('Product IDs')
plt.show()
# OFF-AR-10003651 Product tops all the Products in Sales

In [None]:
# 5.2.5 Top 20 Products in Sales (ranked by number of rows per product name)
rows_per_product_name = data.groupby('Product Name')['Row ID'].count()
top20pname = rows_per_product_name.sort_values(ascending=False).head(20)
fig, ax = plt.subplots(figsize=(16, 8))
top20pname.plot(kind='bar', color='Orange', ax=ax)
ax.set_title('Top 20 Products in Sales')
ax.set_ylabel('Count')
ax.set_xlabel('Products')
plt.show()
# Staples tops all the Products in Sales

In [None]:
# 5.2.6 Top 20 Profitable Customers
# Who are the top-20 most profitable customers?
# Profit is summed per customer first: ranking raw rows would rank single order
# lines, and the same customer could then occupy several of the 20 slots.
top20profit = (
    data.groupby('Customer Name', as_index=False)['Profit'].sum()
        .sort_values('Profit', ascending=False)
)
top20 = top20profit.head(20)
fig, ax = plt.subplots(figsize=(16, 8))
sns.barplot(x="Customer Name", y="Profit", data=top20, ax=ax)
ax.set_title('Top 20 Most Profitable Customers')
ax.set_ylabel('Total Profit')
# Customer names are long; rotate them so they do not overlap.
ax.tick_params(axis='x', rotation=90)
plt.show()
# NOTE(review): the earlier remark that Tamara Chand tops the list came from
# ranking individual order lines -- confirm it against the aggregated chart.

In [None]:
# 5.3 Segment: number of rows per customer segment, drawn as a bar chart
fig, ax = plt.subplots(figsize=(16, 8))
segment_counts = data['Segment'].value_counts()
segment_counts.plot.bar(ax=ax)
ax.set_title('Segment Wise Sales')
ax.set_ylabel('Count')
ax.set_xlabel('Segments')
plt.show()
# NOTE(review): the remark previously here ("APAC tops all the Markets") belonged
# to the Market chart; record the leading segment from the plot above instead.

In [None]:
# 5.4 Ship Mode: number of rows per shipping mode
fig, ax = plt.subplots(figsize=(16, 8))
data['Ship Mode'].value_counts().plot.bar(ax=ax)
ax.set_title('Ship Mode Wise Sales')
# value_counts() yields row counts, not sales amounts, so the axis is a count.
ax.set_ylabel('Count')
ax.set_xlabel('Ship Modes')
plt.show()
# Standard Class tops all the Transport Methods

In [None]:
# 5.5 Region: number of rows per region
fig, ax = plt.subplots(figsize=(16, 8))
data['Region'].value_counts().plot.bar(ax=ax)
ax.set_title('Region Wise Sales')
# value_counts() yields row counts, not sales amounts, so the axis is a count.
ax.set_ylabel('Count')
ax.set_xlabel('Regions')
plt.show()
# Central Region tops all the Sales in Regions

In [None]:
# 5.6 Category: number of rows per product category
fig, ax = plt.subplots(figsize=(16, 8))
data['Category'].value_counts().plot.bar(ax=ax)
ax.set_title('Category Wise Sales')
# value_counts() yields row counts, not sales amounts, so the axis is a count.
ax.set_ylabel('Count')
ax.set_xlabel('Categories')
plt.show()
# Office Supplies tops all the Sales in Categories

In [None]:
# 5.7 Sub-Category: number of rows per product sub-category
fig, ax = plt.subplots(figsize=(16, 8))
data['Sub-Category'].value_counts().plot.bar(ax=ax)
ax.set_title('Sub-Category Wise Sales')
# value_counts() yields row counts, not sales amounts, so the axis is a count.
ax.set_ylabel('Count')
ax.set_xlabel('Sub Categories')
plt.show()
# Binders tops all the Sales in Sub Categories

In [None]:
# 5.8 Order Priority: number of rows per order priority
fig, ax = plt.subplots(figsize=(16, 8))
data['Order Priority'].value_counts().plot.bar(ax=ax)
ax.set_title('Order Priority Wise Sales')
# value_counts() yields row counts, not sales amounts, so the axis is a count.
ax.set_ylabel('Count')
ax.set_xlabel('Order Priorities')
plt.show()
# Medium tops all the Sales in Order Priorities

In [None]:
# 5.9 Relationship of Order Priority and Profitability
# One box per order priority showing the spread of Profit.
plt.figure(figsize=(16,8))
# x/y are passed by keyword: recent seaborn releases accept only `data` positionally.
sns.boxplot(x="Order Priority", y="Profit", data=data)
plt.title('Relationship of Order Priority and Profitability')
plt.show()
# Profits slightly higher when Order priority is Medium

In [None]:
# 5.10 Distribution of Customers Market wise.
#6. What is the distribution of customers Market wise?
plt.figure(figsize=(16,8))
# x is passed by keyword: recent seaborn releases accept only `data` positionally.
# NOTE(review): countplot on `data` counts rows per market, not distinct customers --
# confirm that this is the intended measure for "customers".
sns.countplot(x="Market", data=data)
plt.title('Distribution of Customers Market wise')
plt.show()
# Market has 7 levels. APAC has the largest # of customers followed by LATAM, and US in that order
# Canada has the least # of customers

In [None]:
# 5.11 Distribution of customers Market wise and Region wise
fig, ax = plt.subplots(figsize=(16, 8))
# One row per (Market, Region, Customer Name) combination, so the countplot below
# counts distinct customer names per market/region rather than order rows.
cmr = pd.DataFrame({'Count' : data.groupby(["Market","Region","Customer Name"]).size()}).reset_index()
# x is passed by keyword: recent seaborn releases accept only `data` positionally.
sns.countplot(x="Market", hue="Region", data=cmr, ax=ax)
ax.set_title('Distribution of Customers by Market and Region')
plt.show()
# For APAC, the largest # of customers are based out of Oceania, followed by Southeast Asia
# For US, the largest # of customers are based out of Western Region followed by East

In [None]:
# 5.12 Distribution of Customers by Country & State - top 20
# Count rows per (Country, State) pair and keep the 20 largest pairs.
pair_counts = data.groupby(["Country","State"]).size().reset_index(name='Count')
CusCountry = pair_counts.sort_values('Count', ascending=False).head(20)
fig, ax = plt.subplots(figsize=(16, 8))
# Bars are grouped by country (sorted alphabetically) and coloured by state.
by_country = CusCountry.sort_values('Country')
sns.barplot(x="Country", y="Count", hue="State", data=by_country, ax=ax)
plt.show()
## US has the largest number of customers - California being the largest, followed by New York, Washington, Illinois & Ohio
## UK has the next largest population of customers - England

In [None]:
# 5.13 Sales by product Category and Sub-Category (total Quantity per pair)
fig, ax = plt.subplots(figsize=(16, 8))
# Use the built-in groupby sum: passing np.sum to aggregate() is deprecated in
# recent pandas releases and emits a FutureWarning.
sale_category = (
    data.groupby(["Category","Sub-Category"])['Quantity'].sum()
        .reset_index()
        .sort_values('Quantity', ascending=False)
)
sns.barplot(x="Category", hue="Sub-Category", y="Quantity", data=sale_category, ax=ax)
ax.set_title('Quantity Sold by Category and Sub-Category')
plt.show()
# Binders in Office Supplies tops the list.

In [None]:
# 5.14 Customer Segment
# 2. What is the distribution of our customer segment
# catplot is a figure-level function that creates its own figure, so a preceding
# plt.figure() call would only leave an empty extra figure in the output.
# One count panel is drawn per Market, with Segment on the x axis.
sns.catplot(x="Segment", col="Market", data=data, kind="count")
plt.show()
# Market wise segments are shown.

In [None]:
# 5.15 Order Priority and Profit
#5. Relationship of Order Priority and Profit
plt.figure(figsize=(16,8))
# x is passed by keyword: recent seaborn releases accept only `data` positionally.
sns.barplot(x="Order Priority", y="Profit", data=data)
plt.title('Order Priority and Profit')
plt.show()
# Low Order of Priority has High Profit.

In [None]:
# 6 Data Analysis (tabular results, no charts)
# 6.1 Top 20 Long Standing Customers
#3. Who are our top-20 oldest customers
# Re-applying the conversion is harmless and keeps this cell runnable on its own.
data['Order Date'] = pd.to_datetime(data['Order Date'])
# "Oldest" means earliest first order: take each customer's first order date and
# sort ascending. Sorting raw rows descending would list the most recent orders
# instead, possibly repeating the same customer.
top20Cust = (
    data.groupby('Customer Name', as_index=False)['Order Date'].min()
        .sort_values('Order Date', ascending=True)
        .head(20)
)
top20Cust
# No Chart.

In [None]:
# 6.2 Customers visited only once
#4. Which customers have visited this store just once
# A visit is one order, so count distinct Order IDs per customer. Counting rows
# would treat every line of a multi-item order as a separate visit.
Visit = (
    data.groupby('Customer ID')['Order ID'].nunique()
        .to_frame('visit_count')
)
Visit.loc[Visit.visit_count == 1]
# No Chart.
# NOTE(review): the figure noted earlier (7 customers visited only once) was
# obtained by counting order lines -- re-check it against the table above.