In [None]:
# importing required libraries
import pandas as pd
import numpy as np
import statistics as st
from datetime import datetime, timedelta
import matplotlib.pyplot as plt

## Performing Preliminary steps 

In [None]:
# Load the training CSV from the Kaggle input directory into a DataFrame
sales = pd.read_csv(filepath_or_buffer='../input/sales-forecasting/train.csv')

In [None]:
# Check the column names and view the first few records.
# Only the last expression of a cell is rendered, so the column list is
# printed explicitly; a bare `sales.columns` above `head()` shows nothing.
print(sales.columns.tolist())
sales.head()

In [None]:
# Drop the 'Row ID' column.
# Reassigning with errors='ignore' keeps the cell safe to re-run: an in-place
# drop raises KeyError on a second execution because the column is already gone.
sales = sales.drop(columns='Row ID', errors='ignore')
sales.columns

In [None]:
# Convert the multi-word column headers to snake_case (single-word headers
# such as 'Sales' or 'City' are left untouched).
sales.rename(
    columns={
        'Order ID': 'order_id',
        'Order Date': 'order_date',
        'Ship Date': 'ship_date',
        'Ship Mode': 'ship_mode',
        'Customer ID': 'customer_id',
        'Customer Name': 'customer_name',
        'Postal Code': 'postal_code',
        'Product ID': 'product_id',
        'Sub-Category': 'sub_category',
        'Product Name': 'product_name',
    },
    inplace=True,
)

In [None]:
# Count missing values per column.
# Author's note from the original run: 11 nulls in postal_code, ignored for now.
sales.isnull().sum()

## Analyzing Data

In [None]:
# Total sales per city, showing the ten highest-grossing cities.
# The aggregation is named by string ('sum'): passing the Python builtin
# `sum` to agg() is deprecated in recent pandas and emits a FutureWarning.
sales_city = sales.groupby('City').agg({'Sales': 'sum'})
sales_city.sort_values(by='Sales', ascending=False).head(10)

In [None]:
# Total sales per region.
# 'sum' is passed as a string because handing the builtin `sum` to agg()
# is deprecated in recent pandas (FutureWarning).
sales_region = sales.groupby('Region').agg({'Sales': 'sum'})
sales_region

In [None]:
# Number of non-null 'Sales' rows per region, kept as a one-column DataFrame.
# NOTE(review): this counts rows, not distinct order_id values — confirm that
# one row corresponds to one order before reading this as an order count.
count_region = sales.groupby('Region')[['Sales']].count()
count_region

In [None]:
# Bar chart of the number of orders per region.
# Author's observation: East and West contribute the most orders.
count_region.plot.bar(figsize=(12, 5), title='Region-wise Orders', rot=0)

In [None]:
# Row counts broken down by region, then by state within each region
reg_st = sales.groupby(['Region', 'State'])[['Sales']].count()
reg_st.head(20)

In [None]:
# Top two sub-categories by total sales within each category.
# 'sum' is given as a string; the builtin `sum` inside agg() is deprecated
# in recent pandas and triggers a FutureWarning.
cat_per = sales.groupby(['Category', 'sub_category']).agg({'Sales': 'sum'})
(
    cat_per
    .sort_values(by='Sales', ascending=False)  # best sellers first
    .groupby('Category')
    .head(2)                                   # keep the top two rows per category
    .sort_index()                              # restore Category / sub_category order
)

## Performing further analysis using difference between order date and shipment date

In [None]:
# Parse the order and ship dates into datetimes so they can be subtracted.
# dayfirst=True tells pandas to read ambiguous dates as day/month/year.
for date_col in ('order_date', 'ship_date'):
    sales[date_col] = pd.to_datetime(sales[date_col], dayfirst=True)

# Show the resulting dtypes as the cell output; a bare `sales.dtypes` placed
# before the assignments is never rendered.
sales[['order_date', 'ship_date']].dtypes

In [None]:
# Whole days elapsed between placing the order and shipping it
sales['shipment_in_days'] = (sales['ship_date'] - sales['order_date']).dt.days
sales.head()

In [None]:
# Derive the calendar year and the month name of each order date
order_dt = sales['order_date'].dt
sales['year'] = order_dt.year
sales['month'] = order_dt.month_name()
sales.head()

In [None]:
# Total sales per calendar month (all years pooled).
# 'sum' is passed as a string because the builtin `sum` in agg() is deprecated.
monthly_sales = sales.groupby('month').agg({'Sales': 'sum'})

# groupby sorts month names alphabetically (April, August, ...). Reorder the
# rows into calendar order so tables and charts read January -> December.
# The names are generated with the same month_name() call used to build the
# 'month' column; months absent from the data are skipped, not added as NaN.
calendar_order = pd.date_range('2000-01-01', periods=12, freq='MS').month_name()
monthly_sales = monthly_sales.reindex(
    [name for name in calendar_order if name in monthly_sales.index]
)

In [None]:
# Bar chart of total sales per month.
# Author's observation: September, November and December have the highest sales.
monthly_sales.plot.bar(title='Monthly Sales', figsize=(14, 5), rot=0)

In [None]:
# Mean and median number of days between order and shipment.
# Both statistics are returned in one Series so both are displayed; with two
# separate bare expressions only the last one (the median) is shown.
# Values reported by the original run: mean ~3.96, median 4.
sales['shipment_in_days'].agg(['mean', 'median'])

In [None]:
# Visualise sales over time.
# A line plot joins points in row order, so the rows are sorted by date first;
# the frame is otherwise drawn in file order and the line can double back.
sales.sort_values(by='order_date').plot(
    x='order_date',
    y='Sales',
    figsize=(20, 6),
    title='Sales over time',
)

In [None]:
# Split the orders into delayed (more than 5 days to ship) and timely (the rest).
# The mask is built once and negated for the timely subset, so the two subsets
# always partition `sales` exactly as `> 5` / `~(> 5)` did.
is_late = sales['shipment_in_days'] > 5
delayed_orders = sales.loc[is_late]
timely_orders = sales.loc[~is_late]

# Preview both subsets in a single output; a bare `.head()` in the middle of a
# cell is never rendered.
pd.concat({'delayed': delayed_orders.head(), 'timely': timely_orders.head()})

In [None]:
# Boolean flag: True when the order took more than 5 days to ship.
# The vectorised comparison yields the same booleans as a per-row
# `True if s > 5 else False` loop, without iterating in Python.
sales['is_delay'] = sales['shipment_in_days'] > 5
sales.head()

In [None]:
# Bar chart comparing how many orders were timely (False) vs delayed (True)
delay_counts = sales['is_delay'].value_counts()
delay_counts.plot.bar(figsize=(14, 5), title='Timely vs Delayed', rot=0)

## Analyzing the above two datasets separately to get the root cause behind the delayed shipment

In [None]:
# Delayed orders: rows per order date, then the mean of those daily counts.
# The original run reported ~3.1.

avg_ord = delayed_orders.groupby('order_date')[['Sales']].count()
st.mean(avg_ord['Sales'])

In [None]:
# Timely orders: rows per order date, then the mean of those daily counts.
# The original run reported ~6.8.

avg_ord_time = timely_orders.groupby('order_date')[['Sales']].count()
st.mean(avg_ord_time['Sales'])

### On average there are more timely orders per day (about 6.8) than delayed orders per day (about 3.1)

In [None]:
# Total sales per order date for delayed and for timely orders.
# 'sum' is passed as a string because the builtin `sum` in agg() is deprecated.
# NOTE(review): these are per-day totals, so their mean is average daily
# revenue rather than the value of a single order, despite the variable names.
avg_ticket_size_delayed = delayed_orders.groupby('order_date').agg({'Sales': 'sum'})
avg_ticket_size_timely = timely_orders.groupby('order_date').agg({'Sales': 'sum'})

# Return both means together so both are displayed; with two bare expressions
# only the last one is rendered. The original run reported ~690 (delayed) and
# ~1571 (timely).
pd.Series(
    {
        'delayed': avg_ticket_size_delayed['Sales'].mean(),
        'timely': avg_ticket_size_timely['Sales'].mean(),
    },
    name='mean_daily_sales',
)

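### Average daily sales from orders shipped on time (about 1571) are more than twice those from delayed orders (about 690). These figures are per-day totals, so the gap may largely reflect the higher daily count of timely orders rather than a larger value per individual order.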

## Customer Level Analysis 

In [None]:
# Earliest and latest purchase record per customer: sort by order date in each
# direction and keep the first entry of every customer group.
# NOTE(review): GroupBy.first() skips nulls column by column, so a row with a
# missing value may be completed from a later row — only order_date is used
# downstream; confirm before relying on the other columns.
oldest_first = sales.sort_values(by='order_date')
newest_first = sales.sort_values(by='order_date', ascending=False)

first_purchase = oldest_first.groupby('customer_id').first()
last_purchase = newest_first.groupby('customer_id').first()

In [None]:
# Preview the first five customers' earliest purchase records
first_purchase.head(5)

In [None]:
# Preview the first five customers' latest purchase records
last_purchase.head(5)

In [None]:
# Join each customer's first and last purchase rows on customer_id.
# Overlapping columns receive pandas' default _x (first) / _y (last) suffixes.
customer = first_purchase.merge(last_purchase, how='inner', on=['customer_id'])
customer.columns

In [None]:
# Keep only the two order-date columns and turn the customer_id index back
# into an ordinary column.
customer = customer[['order_date_x', 'order_date_y']].reset_index()
customer.head()

In [None]:
# Give the suffixed date columns meaningful names
customer = customer.rename(
    columns={
        'order_date_x': 'first_purchase',
        'order_date_y': 'last_purchase',
    }
)
customer.head(10)

In [None]:
# Retention per customer: time between first and last purchase, expressed in
# years (days / 365) and rounded to two decimals.
days_active = (customer['last_purchase'] - customer['first_purchase']).dt.days
customer['retention'] = (days_active / 365).round(2)
customer.head(10)


In [None]:
# Number of sales rows recorded for each customer.
number_of_orders = sales.groupby('customer_id').agg({'Sales': 'count'})

# Attach the counts by looking up each row's customer_id. Assigning the Series
# directly aligns on index labels (customer_id vs. the 0..n-1 index of
# `customer`) and produces NaN; wrapping it in list() avoids that but only
# works while both frames happen to share the same row order. map() matches on
# the key itself, so it stays correct regardless of ordering.
customer['no_of_orders'] = customer['customer_id'].map(number_of_orders['Sales'])
customer.head(20)

In [None]:
# Average sales value per row for each customer.
# 'mean' is passed as a string: handing np.mean to agg() is deprecated in
# recent pandas and emits a FutureWarning.
avg_rev = sales.groupby('customer_id').agg({'Sales': 'mean'})

# Look the value up by customer_id instead of assigning list(...) positionally,
# so the result does not depend on both frames sharing the same row order.
customer['average_revenue'] = customer['customer_id'].map(avg_rev['Sales'])
customer.head(20)