In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# Walk the Kaggle input directory tree and print the full path of every file in it.
for folder, _, names_in_folder in os.walk('/kaggle/input'):
    for name in names_in_folder:
        full_path = os.path.join(folder, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as ps
import numpy as ny
import seaborn as sn
import matplotlib.pyplot as plt
from pandas_profiling import ProfileReport

# Load the raw e-commerce records from the CSV in the Kaggle input directory.
# The path literal contains two consecutive spaces after "US"; keep it byte-for-byte.
ecomm_csv_path = '/kaggle/input/us-ecommerce-record-2020/US  E-commerce records 2020.csv'
ecomm_records_dataset = ps.read_csv(ecomm_csv_path)

# Preview the first ten rows as the cell output.
ecomm_records_dataset.head(10)


In [None]:
# Primary checks: print the frame summary (columns, dtypes, non-null counts).
# `info` is a method; without the call parentheses the cell only displays the
# bound-method repr instead of the summary, so it must actually be called.
ecomm_records_dataset.info()

In [None]:
# Descriptive statistics for the columns that `describe()` covers by default.
numeric_summary = ecomm_records_dataset.describe()
numeric_summary

In [None]:
# Build a profiling report over the raw records and embed it in the notebook output.
profile_title = "E-commerce records 2020 Report"
pandas_profile_key = ProfileReport(ecomm_records_dataset, title=profile_title)
pandas_profile_key.to_notebook_iframe()

In [None]:
# Besides the profiling report, count the missing values per column explicitly
# to see whether anything needs to be dropped or filled in.
missing_per_column = ecomm_records_dataset.isna().sum()
missing_per_column

**A look at the first few rows suggests that the ID labels are of little use for categorizing the data. Similarly, order date, postal code, region (city may already be granular enough, just saying!) and product name seem less relevant. Hence dropping them. (Note: the next cell ends up keeping order date and region, and additionally drops country.)**

In [None]:
# Columns removed before the analysis: identifiers, country, postal code and product name.
# The order date is kept on purpose in case time-based views are needed later.
columns_to_drop = [
    'Row ID',
    'Order ID',
    'Customer ID',
    'Country',
    'Postal Code',
    'Product ID',
    'Product Name',
]
clean_ecomm_records = ecomm_records_dataset.drop(columns=columns_to_drop)

# Preview the first ten rows of the trimmed frame.
clean_ecomm_records.head(10)

**EDA - Viz - 1. Let's start with sales vs city / region / state**

In [None]:
# Horizontal bar chart of Sales per State. No estimator is passed, so seaborn's
# default aggregation applies (the mean, per the seaborn barplot docs).
state_fig, state_ax = plt.subplots(figsize=(20, 15))
sn.barplot(data=clean_ecomm_records, x='Sales', y='State', ax=state_ax)

In [None]:
# A pairplot variant was tried here but looked overwhelming, so it is parked in
# the string literal below; being a bare string, it is never executed.
"""
sales_pairplot_df = clean_ecomm_records[['City','State','Region','Sales']]
plt.figure(figsize=(50,45))
sn.pairplot(y_vars='Sales',x_vars=['City'],data=sales_pairplot_df,height=30)
"""

# Horizontal bar chart of Sales per Region.
region_fig, region_ax = plt.subplots(figsize=(20, 10))
sn.barplot(data=clean_ecomm_records, x='Sales', y='Region', ax=region_ax)

In [None]:
# Vertical bar chart of Sales per product Category.
category_fig, category_ax = plt.subplots(figsize=(20, 10))
sn.barplot(
    data=clean_ecomm_records,
    x='Category',
    y='Sales',
    palette="Blues",
    ax=category_ax,
)

In [None]:
# Total Sales per Sub-Category, drawn as a vertical bar chart.
subcat_vs_sales = clean_ecomm_records.groupby('Sub-Category')['Sales'].sum()

subcat_fig, subcat_ax = plt.subplots(figsize=(18, 15))
barplot2 = sn.barplot(
    x=subcat_vs_sales.index,
    y=subcat_vs_sales.values,
    palette="Oranges",
    ax=subcat_ax,
)
barplot2.set(xlabel="Sub-categories", ylabel="Sales")

In [None]:
# Pair plot of the cleaned records, coloured by Region - a bit overwhelming ?
# pairplot is a figure-level function: it creates its own figure, so a preceding
# plt.figure(figsize=...) does not resize the grid and only leaves a stray empty
# figure in the cell output. To change the size, pass `height` / `aspect` to
# pairplot instead.
sn.pairplot(clean_ecomm_records, hue="Region")

**EDA - Viz - 2. Let's see profits**

In [None]:
# Total Profit per Region, drawn as a vertical bar chart.
region_vs_profit = clean_ecomm_records.groupby('Region')['Profit'].sum()

region_profit_fig, region_profit_ax = plt.subplots(figsize=(18, 15))
barplot3 = sn.barplot(
    x=region_vs_profit.index,
    y=region_vs_profit.values,
    palette="mako_r",
    ax=region_profit_ax,
)
barplot3.set(xlabel="Region", ylabel="Profit")

# Author's observation: West and East lead on profits !

In [None]:
# Plotting Discount against Profit: total Profit for each discount level.
# The grouping gets its own name so that the per-Region totals held in
# `region_vs_profit` are not silently overwritten by per-Discount data.
discount_vs_profit = clean_ecomm_records.groupby('Discount')['Profit'].sum()
plt.figure(figsize=(18,15))
barplot4 = sn.barplot(x=discount_vs_profit.index,y=discount_vs_profit.values,palette = "mako")
barplot4.set(xlabel="Discount", ylabel = "Profit")

# More discount means more loss ? Interesting.

**EDA - Viz - 3. Let's see Losses**

In [None]:
# Which states make for losses ?
# Keep only the rows with Profit <= 0 and turn Profit into a positive loss
# magnitude. `.assign` returns a new frame, so nothing is written into a slice
# of `clean_ecomm_records` (the original chained assignment raised
# SettingWithCopyWarning).
non_profitable_rows = clean_ecomm_records['Profit'] <= 0
losses_df = clean_ecomm_records.loc[non_profitable_rows].assign(
    Profit=lambda frame: frame['Profit'].abs()
)
states_vs_loss = losses_df.groupby('State')['Profit'].sum()

# Each state's share of the summed losses, as a pie chart.
plt.figure(figsize=(38,21))
pie_chart1 = states_vs_loss.plot.pie(autopct="%.1f%%")

# Loss making states (read all numbers as a % of the total loss)

In [None]:
# Which sub-categories make for losses ?
# Sums the (already positive) loss magnitudes in `losses_df` per Sub-Category.
subcats_vs_loss = losses_df.groupby('Sub-Category')['Profit'].sum()

# Each sub-category's share of the summed losses, as a pie chart.
# Stored as pie_chart2 so that it is not the same name the Segment chart uses.
plt.figure(figsize=(38,21))
pie_chart2 = subcats_vs_loss.plot.pie(autopct="%.1f%%")

# Loss making sub-categories (read all numbers as a % of the total loss)

In [None]:
# Loss magnitude summed per customer Segment (uses the loss rows in `losses_df`).
segment_vs_loss = losses_df.groupby('Segment')['Profit'].sum()

# Each segment's share of the summed losses, as a pie chart.
segment_fig, segment_ax = plt.subplots(figsize=(38, 21))
pie_chart3 = segment_vs_loss.plot.pie(autopct="%.1f%%", ax=segment_ax)

# Loss making segments (read all numbers as a % of the total loss)

In [None]:
# Ship-mode preferences among states, part 1: First Class shipments per State.

# One indicator column per distinct 'Ship Mode' value.
shipmode_dummies = ps.get_dummies(clean_ecomm_records['Ship Mode'])

# Put State side by side with the indicators so they can be summed per State.
state_column = clean_ecomm_records['State']
shipmode_ecom_records = ps.concat([state_column, shipmode_dummies], axis='columns')

# Number of 'First Class' rows per State.
state_vs_fc = shipmode_ecom_records.groupby('State')['First Class'].sum()

fc_fig, fc_ax = plt.subplots(figsize=(28, 19))
barplot6 = sn.barplot(
    y=state_vs_fc.index,
    x=state_vs_fc.values,
    palette="mako_r",
    ax=fc_ax,
)
barplot6.set(ylabel="State", xlabel="FC")


In [None]:
# Ship-mode preferences among states, part 2: Same Day shipments per State.
# Reuses the State + ship-mode indicator frame `shipmode_ecom_records`.
state_vs_sd = shipmode_ecom_records.groupby('State')['Same Day'].sum()

sd_fig, sd_ax = plt.subplots(figsize=(28, 19))
barplot7 = sn.barplot(
    y=state_vs_sd.index,
    x=state_vs_sd.values,
    palette="seismic_r",
    ax=sd_ax,
)
barplot7.set(ylabel="State", xlabel="SD")


**More insights could be built accordingly... All the best !**