In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Load the training CSV into the DataFrame used by every later cell.
df = pd.read_csv(filepath_or_buffer='../input/customer-analytics/Train.csv')

In [None]:
# Preview the first five rows to see which columns are available.
df.head(n=5)

In [None]:
# Number of missing values in each column (`isna` is the same check as `isnull`).
df.isna().sum()

In [None]:
# Summary statistics, transposed so that each described column becomes a row.
df.describe().transpose()

# Basic Visualisation of the dataset

In [None]:
# Show the first five rows again as a column reference for the plots below.
df.head(n=5)

In [None]:
# Side-by-side count plots for the three categorical columns.
# Each entry is (column to count, x-axis label shown to the reader).
panels = [
    ('Warehouse_block', 'Warehouse Block'),
    ('Mode_of_Shipment', 'Shipment'),
    ('Product_importance', 'Product Importance'),
]

plt.figure(figsize=(15, 8))
for position, (column, label) in enumerate(panels, start=1):
    # One row, three columns; `position` selects the panel being drawn.
    plt.subplot(1, 3, position)
    sns.countplot(data=df, x=column)
    plt.xlabel(label, fontsize=13)

plt.show()



## Inference:
- Warehouse block F holds the most items.
- Most deliveries are made by ship.
- There are far more low- and medium-importance products than high-importance products.

In [None]:
# How is the cost of the product distributed?
#
# `sns.displot` is a figure-level function: it always creates its own figure,
# so a preceding `plt.figure(figsize=...)` only leaves an empty figure behind
# and does not size the plot. The intended 15x8 in size is therefore requested
# through `height` (in) and `aspect` (width / height) instead.
cost_grid = sns.displot(
    data=df,
    x='Cost_of_the_Product',
    kind='kde',
    height=8,
    aspect=15 / 8,
)
# Label through the returned FacetGrid rather than the pyplot "current axes".
cost_grid.set_xlabels("Cost of the product", fontsize=13)
plt.show()

In [None]:
# Visualising the weight distribution of the delivered goods.
#
# `sns.displot` is a figure-level function: it always creates its own figure,
# so a preceding `plt.figure(figsize=...)` only leaves an empty figure behind
# and does not size the plot. The intended 15x8 in size is therefore requested
# through `height` (in) and `aspect` (width / height) instead.
weight_grid = sns.displot(
    data=df,
    x='Weight_in_gms',
    kind='kde',
    height=8,
    aspect=15 / 8,
)
# Label through the returned FacetGrid rather than the pyplot "current axes".
weight_grid.set_xlabels("Weight", fontsize=13)
plt.show()


## Finding how the variables are correlated to each other.

In [None]:
# Pairwise correlation matrix of the numeric columns.
# The numeric (and boolean) columns are selected explicitly: since pandas 2.0
# `DataFrame.corr()` no longer drops non-numeric columns silently and raises
# if the frame contains string columns, as the categorical columns plotted
# earlier appear to be. Selecting first gives the same matrix on old and new
# pandas versions.
corr = df.select_dtypes(include=['number', 'bool']).corr()

In [None]:
# Annotated heatmap of the correlation matrix computed in the previous cell.
plt.figure(figsize=(15, 8))
sns.heatmap(data=corr, cmap='coolwarm', annot=True)
plt.title('Correlation between the variables', fontsize=15)
# Finish with plt.show(), like the other plotting cells, so the cell renders
# the figure without also echoing the returned Axes object.
plt.show()

## Inference:
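#### A positive correlation can be seen between the following pairs of variables:
- Reached on time & discount offered: 0.4
- Customer care calls & cost of the product: 0.32
- Prior purchases & customer care calls: 0.18
- Cost of the product & prior purchases: 0.12
- Reached on time & customer rating: 0.013
- Customer rating & customer care calls: 0.012

Note: `Reached.on.Time_Y.N` is 1 when the product did **not** arrive on time, so a positive correlation with it means the variable increases with late deliveries. The last two coefficients are so close to zero that they indicate practically no linear relationship.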

# Bi-Variate Analysis

In [None]:
# Density of the offered discount, drawn separately for each value of the
# on-time delivery flag.
sns.displot(
    data=df,
    kind='kde',
    x='Discount_offered',
    hue='Reached.on.Time_Y.N',
)
plt.title('Does giving discounts impact delivery time?', fontsize=15)
plt.xlabel('Discount', fontsize=13)
plt.show()

## Inference:
#### 1 indicates that the product did not arrive on time, and 0 that it did.
- In both scenarios, a discount in the range of 0 to 10 is normally given.
- It can be observed that, most of the time, when deliveries are delayed, discounts of more than $20 are given.

In [None]:
# Mean product cost for each number of customer-care calls.
plt.figure(figsize=(12, 6))
# `ci=None` is the documented way to turn the error bars off. `ci=False` is
# interpreted as a 0 % confidence interval, which still runs the bootstrap
# for every bar. On seaborn >= 0.12 the preferred spelling is `errorbar=None`
# (`ci` is deprecated there but `ci=None` is still accepted).
sns.barplot(data=df, x='Customer_care_calls', y='Cost_of_the_Product', ci=None)
plt.xlabel('Customer Care Calls', fontsize=12)
plt.ylabel('Cost of the product', fontsize=12)
plt.title('How does the Cost of the product influence the Customer care calls?', fontsize=20)
plt.show()

## Inference:
- It can be observed that as the cost of the product rises, so does the number of customer care calls.
- The earlier plot suggested that larger discounts were given when deliveries were late; this could be a reason why more customer care calls are made for higher-priced products. A plot of discount against customer care calls can help explain the correlation.

In [None]:
# Mean discount offered for each number of customer-care calls.
plt.figure(figsize=(12, 6))
# `ci=None` is the documented way to turn the error bars off. `ci=False` is
# interpreted as a 0 % confidence interval, which still runs the bootstrap
# for every bar. On seaborn >= 0.12 the preferred spelling is `errorbar=None`
# (`ci` is deprecated there but `ci=None` is still accepted).
sns.barplot(data=df, x='Customer_care_calls', y='Discount_offered', ci=None)
plt.xlabel('Customer Care Calls', fontsize=13)
plt.ylabel('Discount offered', fontsize=13)
plt.title('How do the customer calls impact the discount offered?', fontsize=20)
# Finish with plt.show(), like the other plotting cells, so the cell renders
# the figure without also echoing the returned Text object.
plt.show()

## Inference:
- It can be observed that as the number of customer care calls increases, the discount decreases.
- The highest discounts are given to customers with the lowest number of customer care calls, which relates to delayed delivery of the product.
- When many calls are made, we can see that a smaller discount is offered. The reason for this could be that:
  1. The cost of the product is high.
  2. The product must be delivered over a longer time frame.


In [None]:
# First five rows, transposed so that every column name is visible as a row.
df.head(n=5).transpose()

In [None]:
# Count of records per number of customer-care calls, one panel per customer
# rating (wrapped after three panels) and split by the on-time delivery flag.
sns.catplot(
    data=df,
    kind='count',
    x='Customer_care_calls',
    hue='Reached.on.Time_Y.N',
    col='Customer_rating',
    col_wrap=3,
)

## Inference:
- Almost every plot looks the same, so no relevant insight can be drawn.

In [None]:
# Number of records per product-importance level, split by the on-time
# delivery flag, drawn on an explicitly created Axes.
fig, ax = plt.subplots(figsize=(15, 8))
sns.countplot(
    data=df,
    x='Product_importance',
    hue='Reached.on.Time_Y.N',
    ax=ax,
)
ax.set_xlabel('Product Importance', fontsize=13)
plt.show()

## Inference:
- It can be observed that, regardless of product importance, products are delivered late.
- Late deliveries prompt customers to leave bad reviews, and high discounts then need to be given.

In [None]:
# Record counts per warehouse block, one panel per mode of shipment, split by
# the on-time delivery flag.
#
# `sns.catplot` is a figure-level function: it creates its own figure, so a
# preceding `plt.figure(figsize=...)` would only leave an empty figure behind.
warehouse_grid = sns.catplot(
    data=df,
    x='Warehouse_block',
    col='Mode_of_Shipment',
    hue='Reached.on.Time_Y.N',
    kind='count',
)
# `plt.xlabel` would relabel only the last panel (the pyplot "current axes");
# setting the label through the grid applies it to every panel.
warehouse_grid.set_xlabels('Warehouse Block', fontsize=13)
plt.show()

## Inference:
- Since warehouse block F holds the largest number of products, it also accounts for the most delayed deliveries.
- Warehouse processing would need quality assurance checks to function better.