## RETAIL ANALYTICS -  EDA

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go

In [None]:
# Read the raw enquiry export into a DataFrame, decoding the file as Latin-1.
data = pd.read_csv(
    '../input/retail-analytics/RETAIL_ANALYTICS.csv',
    encoding='latin1',
)

In [None]:
# Replace the header row with explicit column labels (40 names, positional).
# The backslash in 'Order-(Recd\\Lost)' is escaped so the literal no longer relies
# on the invalid escape sequence "\L"; the resulting label text is unchanged.
data.columns = [
    'Month', 'Enquiry_Date', 'Enquiry_Id', 'Allocation_Status', 'Status',
    'State', 'City', 'Pincode', 'Type_of_Project', 'No_of_Windows',
    'Source', 'Zone', 'Dealer_Name', 'First_Action-Call_made',
    'Date_DD/MM/YY', 'First-Action-Call-Status',
    'Date of Appointment (DD/MM/YY)', 'Second Action-Customer Meeting',
    'Date-DD/MM/YY.1', 'Second-Action-Call-Status',
    'Third-Action-Quote-Given', 'Date-DD/MM/YY.2', 'Q-Val.(Rs.Lac)',
    'Quote-QTY', 'Order-(Recd\\Lost)', 'Date-DD/MM/YY.3',
    'Order-Val.(Rs.Lac)', 'Order_QTY',
    'Quote_ID(as_per_match_to_CCC_Records)',
    'Remarks-Brand-and-value-if-lost-to-UPVC',
    'Second-Action-Call-Status.1', 'Remarks', 'Price_Per_Sft', 'Benefits',
    'Aesthetics', 'Reduce_Street_Noise', 'Low_Maintenance', 'Monsoon_Proof',
    'Better_Lighting', 'Reduce_AC_Energy_Cost',
]


In [None]:
# Textual overview of the dataset: sample rows, shape, column labels,
# summary statistics, dtype/null report and correlations.
separator = "-------------------------------------------------------------------------------------------------"

print('\nData\n')
print(data.head())
print(separator)
print("\nData shape\n")
print(data.shape)
print(separator)
print("\nColumns in data\n")
print(data.columns)
print(separator)
print("\nDescribing the data\n")
print(data.describe())
print(separator)
print("\nInformation about the data\n")
# info() writes its report to stdout itself, so it is called rather than
# printed; printing the attribute without parentheses only shows the method object.
data.info()
print(separator)
print("\nCorrelation in data\n")
# Correlation is only defined for numeric columns; selecting them explicitly
# avoids the error recent pandas raises when text columns are present.
print(data.select_dtypes(include='number').corr())

### Listing down the Numerical columns in the Data

In [None]:
# Show every numeric column; 'number' is the documented selector for all
# numeric dtypes, so no integer/float width is silently left out.
data.select_dtypes(include='number')

### Listing down the Categorical columns in the Data

In [None]:
# Show the object-dtype columns, which this notebook treats as categorical.
data.select_dtypes(include=['object'])

## Plot 1 - Month wise total enquiries

In [None]:
# Bar chart of the number of enquiries recorded for each Month value.
month_counts = data['Month'].value_counts()
fig = go.Figure(
    go.Bar(x=month_counts.index, y=month_counts.values)
)
fig.update_layout(
    title='Month',
    xaxis_title='Month',
    yaxis_title='Count',
)
fig.show()

## Observations:
#### 1. The maximum number of enquiries occurred in July 2016
#### 2. The minimum number of enquiries occurred in October 2016

## Plot 2 - Allocation Status 

In [None]:
# Bar chart of how many enquiries fall under each Allocation_Status value.
allocation_counts = data['Allocation_Status'].value_counts()
fig = go.Figure(
    go.Bar(x=allocation_counts.index, y=allocation_counts.values)
)
fig.update_layout(
    title='Allocation Status',
    xaxis_title='Allocation Status',
    yaxis_title='Count',
)
fig.show()

## Observations:
### 1. Dealers have handled more enquiries than direct business.

### Plot 3: Enquiry Status

In [None]:
# Bar chart of the number of enquiries per Status value.
status_counts = data['Status'].value_counts()
fig = go.Figure(
    go.Bar(x=status_counts.index, y=status_counts.values)
)
fig.update_layout(
    title='Enquiry Status',
    xaxis_title='Enquiry Status',
    yaxis_title='Count',
)
fig.show()

## Inference:
### Only a few of the enquiries have materialised.
#### *Note:*
#### *The dataset is imbalanced. For machine learning approach, we need to balance the dataset to avoid bias.*

### Plot 4: Cities dealt by PVC

In [None]:
# Bar chart of the number of enquiries per City value.
city_counts = data['City'].value_counts()
fig = go.Figure(go.Bar(x=city_counts.index, y=city_counts.values))
fig.update_layout(
    title='City',
    xaxis_title='City',
    yaxis_title='Count',
)
# Render explicitly, like every other plot cell, instead of relying on the
# notebook echoing the value returned by update_layout.
fig.show()

## Observation:
### 1. The PVC manufacturer has dealers in Chennai, Kanchipuram, Vellore, Krishnagiri, Villupuram, Pondicherry and Cuddalore.
### 2. Among these, Chennai received the maximum number of enquiries.

### Plot 5: Type of Projects

In [None]:
# Pie chart of the share of enquiries per Type_of_Project value.
project_counts = data['Type_of_Project'].value_counts()
fig = go.Figure(
    go.Pie(labels=project_counts.index, values=project_counts.values)
)
fig.update_layout(title='Type of Project')
fig.show()


## Observation:
### 1. The PVC manufacturer handles two kinds of projects: Renovation and New Construction.
### 2. The maximum number of enquiries are for Renovation.

### Plot 6: Source of Enquiry

In [None]:
# Pie chart of the share of enquiries per Source value.
source_counts = data['Source'].value_counts()
fig = go.Figure(
    go.Pie(labels=source_counts.index, values=source_counts.values)
)
fig.update_layout(title='Sources of Enquiries')
fig.show()

## Observation:
### 1. Internet,Friends,Just dial,Current user are four sources through which the manufacturer got the enquiries.
### 2. More enquiries are through Internet.
### 3. Enquiries through Just dial are very low in number.

### Plot 7: Dealer wise count of enquiries

In [None]:
# Bar chart of the number of enquiries per Dealer_Name value.
dealer_counts = data['Dealer_Name'].value_counts()
fig = go.Figure(
    go.Bar(x=dealer_counts.index, y=dealer_counts.values)
)
fig.update_layout(
    title='Dealer Name',
    xaxis_title="Dealer's Name",
    yaxis_title='Count',
)
fig.show()

## Observation:
### 1. As per the plot, the PVC manufacturer has 9 dealers. Here Pearlson and pearlson are the same dealer; the names differ only in letter case.
### 2. PVC manufacturer has got maximum enquiries through direct contact

### Changing the dealer name 'pearlson'

In [None]:
# Inspect the type of the Dealer_Name values selected for the 'pearlson' rows.
pearlson_rows = data['Dealer_Name'] == 'pearlson'
type(data.loc[pearlson_rows, 'Dealer_Name'])

In [None]:
def dealer_name(x):
    """Return 'Pearlson' for the lower-case spelling 'pearlson'; any other value is returned as is."""
    return 'Pearlson' if x == 'pearlson' else x
# Rewrite the dealer column through dealer_name so 'pearlson' becomes 'Pearlson'.
data['Dealer_Name'] = data['Dealer_Name'].apply(dealer_name)

In [None]:
# Same assignment as the one made right after dealer_name is defined; running it
# again changes nothing because dealer_name returns every other value unchanged.
data['Dealer_Name'] = data['Dealer_Name'].apply(dealer_name)

In [None]:
# Frequency table of dealer names, displayed to check the result of the clean-up.
data['Dealer_Name'].value_counts()

### Same dealer-wise plot after editing 'pearlson'

In [None]:
# Bar chart of the number of enquiries per Dealer_Name value.
dealer_counts = data['Dealer_Name'].value_counts()
fig = go.Figure(
    go.Bar(x=dealer_counts.index, y=dealer_counts.values)
)
fig.update_layout(
    title='Dealer Name',
    xaxis_title="Dealer's Name",
    yaxis_title='Count',
)
fig.show()

### Plot 9: Dealer | Zonal wise count of enquiries

In [None]:
# Per-dealer enquiry counts for the first Zone column of the dealer-by-zone table.
dealer_by_zone = pd.crosstab(data['Dealer_Name'], data['Zone'])
dealer_by_zone.iloc[:, 0]

In [None]:
# Grouped bar chart: one trace per Zone, one bar group per dealer.
# The cross-tabulation is built once and reused for every trace.
dealer_by_zone = pd.crosstab(data['Dealer_Name'], data['Zone'])
fig = go.Figure()
for zone in dealer_by_zone.columns:
    fig.add_trace(
        go.Bar(x=dealer_by_zone.index, y=dealer_by_zone[zone], name=zone)
    )
fig.show()

Observation:
All the dealers are handling enquiries, and dealer Sunbird is dealing with the Chennai and Kanchipuram zones.

### Plot 10: First action call status

In [None]:
# Bar chart of the number of enquiries per First-Action-Call-Status value.
first_call_counts = data['First-Action-Call-Status'].value_counts()
fig = go.Figure(
    go.Bar(x=first_call_counts.index, y=first_call_counts)
)
fig.update_layout(
    title='First Action Call Status',
    xaxis_title='Status of the 1st call with customer',
    yaxis_title='Count of enquiries',
)
fig.show()

## Observation:
### 1. After the first enquiry call few of the enquiries are going for second discussion and others are not willing to go further  due to reasons like budget constraint,feasibility of the design, time frame constraint and some of them are just awareness calls.

### Plot 11: Second action call status

In [None]:
# data['Second-Action-Call-Status'].value_counts().plot(kind='bar')
# plt.title('Status after the Second call')
# plt.ylabel('Count of enquiries')
# plt.xlabel('Second discussion status')

In [None]:
# Remove the row limit on displayed output so the frequency tables are shown in full.
pd.set_option('display.max_rows', None)

In [None]:
# Frequency table of the first-call outcomes.
data['First-Action-Call-Status'].value_counts()

In [None]:
# Frequency table of the second-call outcomes.
data['Second-Action-Call-Status'].value_counts()

In [None]:
# Bar chart of the number of enquiries per Second-Action-Call-Status value.
second_call_counts = data['Second-Action-Call-Status'].value_counts()
fig = go.Figure(
    go.Bar(x=second_call_counts.index, y=second_call_counts.values)
)
fig.update_layout(
    title='Second Action call Status',
    xaxis_title='Status of the 2nd call with customer',
    yaxis_title='Count of enquiries',
)
fig.show()

## Observation:
### 1. After the second discussion with the customers, a quote has to be given to the interested ones (189 out of 191). One customer requested a further discussion.
### 2. After the first call, 192 customers had an appointment fixed, but the current plot/data has only 191 records. This is because the Second action call status is missing for the enquiry ID "EC01071653533".

### Plot 12: Third discussion status

In [None]:
def third_action_quote(x):
    """Return 'YES' for the lower-case value 'yes'; any other value is returned as is."""
    return 'YES' if x == 'yes' else x


In [None]:
# Rewrite the quote-given column through third_action_quote so 'yes' becomes 'YES'.
data['Third-Action-Quote-Given'] = data['Third-Action-Quote-Given'].apply(third_action_quote)

In [None]:
# Grouped bar chart: one trace per enquiry Status, one bar group per
# Third-Action-Quote-Given value.
# The cross-tabulation is built once and reused for every trace.
quote_by_status = pd.crosstab(data['Third-Action-Quote-Given'], data['Status'])
fig = go.Figure()
for status in quote_by_status.columns:
    fig.add_trace(
        go.Bar(x=quote_by_status.index, y=quote_by_status[status], name=status)
    )
fig.show()

## Observation:
### 1. Out of the 189 customers who went through the third discussion, 53 converted to business.

### Plot 13: Reasons for non-conversion

In [None]:
# Bar chart with one trace per 'Order-(Recd\Lost)' value.
# The cross-tabulation is built once and reused for every trace, and the
# backslash in the column label is escaped ("\L" is not a valid escape sequence).
# NOTE(review): the row key handed to crosstab is the scalar count of 'Lost'
# enquiries, so the table presumably has a single row labelled with that count
# and is computed over all rows, not only the lost ones; confirm this is intended.
lost_total = data[data['Status'] == 'Lost'].Status.count()
order_outcomes = pd.crosstab(lost_total, data['Order-(Recd\\Lost)'])
fig = go.Figure()
for outcome in order_outcomes.columns:
    fig.add_trace(
        go.Bar(x=order_outcomes.index, y=order_outcomes[outcome], name=outcome)
    )
fig.update_layout(
    title="Reasons for Non Conversion rate",
    xaxis_title="Non Conversion rate Segments",
    yaxis_title=" Counts",
)
fig.show()

In [None]:
# Frequency table of the order outcome column; the backslash is escaped so the
# label no longer depends on the invalid escape sequence "\L".
data['Order-(Recd\\Lost)'].value_counts()

In [None]:
# Number of enquiries whose Status is 'Lost'.
is_lost = data['Status'] == 'Lost'
data.loc[is_lost, 'Status'].count()

## Observation:
### Conversion rate is not achieved after the third discussion because of the following reasons
- Delivery time not possible
- Aluminium
- Wooden
- Product issue

In [None]:
# Frequency table of the free-text Remarks column.
data['Remarks'].value_counts()

## Observation:
### 1. Most enquiries were not converted because of price issues (103 out of 171), budget constraints (26 out of 171) and design feasibility (23 out of 171)


### Handling null values

In [None]:
# Count of missing values in every column.
data.isnull().sum()

In [None]:
#data['Order_QTY'].fillna(0, inplace=True)

In [None]:
#data.isna().sum()

In [None]:
#pd.set_option('max_colwidth',200)
#data['Order_QTY'].sum()

In [None]:
# Disabled plotting code kept as a bare string literal: evaluating this cell
# only produces the string, none of the statements inside it run.
"""
data['Order_QTY'].value_counts().plot(kind='bar')
plt.title('Ordered quantity by lost enquiries')
plt.xlabel('Quantity ordered')
plt.ylabel('Count of enquiries')
"""


In [None]:
#data['Order_QTY'].value_counts()

In [None]:
#data_new = data[data['City'] == 'CHENNAI'] 

In [None]:
# Work on a copy so that later in-place edits do not modify `data`.
data_new = data.copy()

In [None]:
# (rows, columns) of the working copy.
data_new.shape

In [None]:
# Frequency table of allocation statuses in the working copy.
data_new['Allocation_Status'].value_counts()

In [None]:
# (rows, columns) of the working copy.
data_new.shape

In [None]:
# Frequency table of enquiry statuses in the working copy.
data_new['Status'].value_counts()

In [None]:
# Display the working copy.
data_new

In [None]:
# Separate copy used for the missing-value analysis below.
data1 = data_new.copy()

In [None]:
# (rows, columns) of data1.
data1.shape

### Percentage of missing values in a column

In [None]:

# Print, for every column with at least one missing value, the percentage of
# rows in which that column is missing (rounded to two decimals).
row_total = data1.shape[0]
null_counts = data1.isnull().sum()
for col, n_missing in null_counts.items():
    if not n_missing:
        continue
    print("Missing percentage in ", col, " is ", round(n_missing * 100 / row_total, 2))

### Plot 14: Percentage of missing values in a column

In [None]:
# Bar chart of missing-value counts, restricted to the columns in which more
# than half of the rows are missing.
null_counts = data1.isnull().sum()
NA_col = null_counts[null_counts.values > (0.5 * len(data1))]
plt.figure(figsize=(20, 4))
NA_col.plot(kind='bar')
plt.title('List of Columns & NA counts where NA values are more than 50%')
plt.show()

In [None]:
# Observation:
# 1. The plot above shows the columns in which more than 50% of the values are missing.

### Various types of imputation

In [None]:
# Remove the listed columns from data_new in place.
columns_to_drop = [
    'Month', 'Enquiry_Id', 'Enquiry_Date', 'No_of_Windows', 'State',
    'Pincode', 'City',
    'First_Action-Call_made', 'Date_DD/MM/YY',
    'Date of Appointment (DD/MM/YY)',
    'Date-DD/MM/YY.2', 'Quote_ID(as_per_match_to_CCC_Records)',
    'Date-DD/MM/YY.3',
    'Second-Action-Call-Status.1', 'Aesthetics', 'Reduce_Street_Noise',
    'Low_Maintenance',
    'Remarks-Brand-and-value-if-lost-to-UPVC',
    'Monsoon_Proof', 'Better_Lighting', 'Reduce_AC_Energy_Cost',
]
data_new.drop(columns=columns_to_drop, inplace=True)

### Dropping redundant columns/attributes

In [None]:
# Summary statistics of the columns that remain in data_new.
data_new.describe()

### Plot: 15 Outlier Analysis for "Q-Val.(Rs.Lac)"

In [None]:
%matplotlib inline
sns.boxplot(data_new["Q-Val.(Rs.Lac)"])
plt.show

### Plot: 16 Outlier Analysis for "Order-Val.(Rs.Lac)"

In [None]:
%matplotlib inline
sns.boxplot(data_new["Order-Val.(Rs.Lac)"])
plt.show

### Plot: 17 Outlier Analysis for Price per square feet

In [None]:
%matplotlib inline
sns.boxplot(data_new["Price_Per_Sft"])
plt.show

## DEALER ANALYSIS

In [None]:
# Start again from a fresh copy of `data`; the previous data_new had columns
# dropped in place.
data_new = data.copy()

In [None]:
# (rows, columns) of the fresh copy.
data_new.shape

In [None]:
# Number of enquiries per dealer.
data_new['Dealer_Name'].value_counts()

In [None]:
# Status breakdown of the enquiries handled by the dealer 'Sunbird'.
is_sunbird = data_new['Dealer_Name'] == 'Sunbird'
data_new.loc[is_sunbird, 'Status'].value_counts()

In [None]:
# Dealer-by-status table of enquiry counts.
pd.crosstab(index=data['Dealer_Name'], columns=data['Status'])

In [None]:
# Grouped bar chart: one trace per enquiry Status, one bar group per dealer.
# The cross-tabulation is built once and reused for every trace.
dealer_by_status = pd.crosstab(data['Dealer_Name'], data['Status'])
fig = go.Figure()
for status in dealer_by_status.columns:
    fig.add_trace(
        go.Bar(x=dealer_by_status.index, y=dealer_by_status[status], name=status)
    )
fig.update_layout(
    title="Dealer enquiries Status",
    xaxis_title="Dealer",
    yaxis_title="Count of enquiries materialised and immaterialised",
)
fig.show()

## Observation:
### 1. Direct has most number of enquiries.
### 2. Dealer ITP has lost all of them.
### 3. Dealer sunbird is doing a good job in business conversion.

## Inference:
### 1. Company must analyse the gaps present between the direct and the dealers approach.
### 2. ITP, Rajtechnis and Winfratech dealers must be closely reviewed and action has to be taken.
### 3. Further study has to be done in picking the important variables and loss prediction has to be done through machine learning models.