# Analyse the factors affecting the Bigmart Sales

## Import packages

In [None]:
#Import python libraries
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt

## Importing the data

In [None]:
# Load the Bigmart sales data and preview the first rows
data = pd.read_csv("../input/bigmart-sales/bigmart_sales.csv")
data.head()

## Basic Statistics

In [None]:
# Per-column overview: dtype and non-null count
data.info()

<font color='blue'>**OBSERVATION :** There is no missing data</font>

In [None]:
# (number of rows, number of columns)
data.shape

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns
data.describe()

<font color='blue'>**OBSERVATION :** <br>
Minimum Item_Weight is 4.55 and Maximum is 21.35 <br>
Outlet_Establishment_Year ranges from 1985 to 2009 <br>
Minimum Item_Outlet_Sales is 34 and Maximum is 13086<br>...<br></font>

In [None]:
#data[data['Item_MRP']>260]
#data[data['Item_Visibility']<0.02]
#data[data['Outlet_Establishment_Year']==1985]['Outlet_Type'].value_counts()
#data['Item_MRP'].sort_values(ascending=False).head()
#data.Outlet_Establishment_Year.value_counts()

In [None]:
# Summary (count, unique, top, freq) restricted to the object-typed columns
categorical_cols = data.select_dtypes(include=["object"])
categorical_cols.describe(include="all")

<font color='blue'>**OBSERVATION :** <br>
Item_Fat_Content has 2 categories, "Low Fat" items are repeated 3955 times in the data <br>
...<br></font>

## Exploratory Data Analysis (EDA)

Item_Weight  | 	Item_Visibility	 |  Item_MRP  |    Outlet_Establishment_Year	|   Item_Outlet_Sales
sns.pairplot(data=data[['Item_Weight','Item_Visibility','Item_Outlet_Sales']])
sns.pairplot(data=data[['Item_MRP','Outlet_Establishment_Year','Item_Outlet_Sales']])

**Compare the features 'Item_Weight','Item_Visibility','Item_Outlet_Sales' using pairplot**

In [None]:
# Pairwise plots for item weight, item visibility and outlet sales
weight_visibility_sales = ["Item_Weight", "Item_Visibility", "Item_Outlet_Sales"]
sns.pairplot(data[weight_visibility_sales])

<font color='blue'>**OBSERVATION :** <br>
    Item_Weight shows unusual peak around 13.  This needs further investigation<br>
    Item_Visibility has a decreasing trend, which means there are more items with low visibility and fewer items with high visibility<br>
    Overall Outlet sales of items with less visibility is more and vice versa<br>
    There is no correlation between Item weight and Item visibility<br>
</font>

In [None]:
# Fine-grained histogram of Item_Weight to investigate the unusual spike
fig, ax = plt.subplots(figsize=(15, 8))
sns.histplot(data=data, x="Item_Weight", bins=300, kde=True, ax=ax)

<font color='blue'>**OBSERVATION :** There is a peak around 12.8.  This needs further investigation</font>

In [None]:
# Zoom into the histogram: keep only rows with 12.7 < Item_Weight < 13.1
item_weight = data["Item_Weight"]
around_peak = data[item_weight.gt(12.7) & item_weight.lt(13.1)]
plt.figure(figsize=(15, 8))
sns.histplot(data=around_peak, x="Item_Weight", kde=True)

<font color='blue'>**OBSERVATION :** The peak is around 12.86.  This needs further investigation</font>

In [None]:
# Show the rows whose Item_Weight lies strictly between 12.8 and 12.9
item_weight = data["Item_Weight"]
data.loc[item_weight.gt(12.8) & item_weight.lt(12.9)]

<font color='blue'>**OBSERVATION :** There are 1498 entries in this range.  These items belong to different categories, yet their weights are nearly identical.  This suggests there is an error in the data</font>

**Compare the features 'Item_MRP','Outlet_Establishment_Year','Item_Outlet_Sales' using pairplot**

In [None]:
# Pairwise plots for item MRP, outlet establishment year and outlet sales
mrp_year_sales = ["Item_MRP", "Outlet_Establishment_Year", "Item_Outlet_Sales"]
sns.pairplot(data[mrp_year_sales])

<font color='blue'>**OBSERVATION :** <br>
    Item_Outlet_Sales is directly proportional to Item_MRP
</font>

**Find correlation (corr) features in the 'data'**

In [None]:
# Pairwise correlation of the numeric features.
# Select the numeric columns first: since pandas 2.0, DataFrame.corr() no longer
# drops text columns silently and raises on them instead.
data.select_dtypes(include="number").corr()

**Find heatmap of correlation (corr) features in the 'data'**

In [None]:
# Annotated heatmap of the correlations between the numeric features.
# plt.subplots returns a (figure, axes) pair, so unpack both and draw on that axes.
fig, ax = plt.subplots(figsize=(12, 5))
# Select the numeric columns first: DataFrame.corr() raises on text columns in pandas >= 2.0.
numeric_corr = data.select_dtypes(include="number").corr()
ax = sns.heatmap(numeric_corr, annot=True, ax=ax)

<font color='blue'>**OBSERVATION :**<br>
    Item_MRP has a strong positive correlation with Item_Outlet_Sales
</font>

**Find correlation (corr) features in the 'data'against 'Item_Outlet_Sales'**

In [None]:
# Correlation of every numeric feature with Item_Outlet_Sales, strongest first.
# Select the numeric columns first: DataFrame.corr() raises on text columns in pandas >= 2.0.
numeric_corr = data.select_dtypes(include="number").corr()
numeric_corr["Item_Outlet_Sales"].sort_values(ascending=False)

<font color='blue'>**OBSERVATION :**<br>
    Item_MRP has a strong positive correlation with Item_Outlet_Sales
</font>

Item_Fat_Content  |	Item_Type  |  Outlet_Identifier  |  Outlet_Type  |  Outlet_Size  |  Outlet_Location_Type | Outlet_Establishment_Year
sns.violinplot(x='Item_Fat_Content', y='Item_Outlet_Sales', data=data)
sns.violinplot(x='Item_Type', y='Item_Outlet_Sales', data=data)
sns.violinplot(x='Outlet_Identifier', y='Item_Outlet_Sales', data=data)
sns.violinplot(x='Outlet_Type', y='Item_Outlet_Sales', data=data)
sns.violinplot(x='Outlet_Size', y='Item_Outlet_Sales', data=data)
sns.violinplot(x='Outlet_Location_Type', y='Item_Outlet_Sales', data=data)
sns.violinplot(x='Outlet_Establishment_Year', y='Item_Outlet_Sales', data=data)

**Visualize the 'data' using violinplot - 'Item_Fat_Content' vs 'Item_Outlet_Sales'**

In [None]:
# Distribution of Item_Outlet_Sales for each Item_Fat_Content category
plt.figure(figsize=(15, 8))
sns.violinplot(data=data, x="Item_Fat_Content", y="Item_Outlet_Sales")

<font color='blue'>**OBSERVATION :**<br>
    For both Low Fat and Regular items, most Item_Outlet_Sales values are around 1000, and the number of items decreases as the sales value increases
</font>

**Visualize the 'data' using violinplot - 'Item_Type' vs 'Item_Outlet_Sales'**

In [None]:
# Distribution of Item_Outlet_Sales for each Item_Type
plt.figure(figsize=(15, 8))
sns.violinplot(data=data, x="Item_Type", y="Item_Outlet_Sales")

<font color='blue'>**OBSERVATION :**<br>
    For most item types, Item_Outlet_Sales values are concentrated around 1000, and the number of items decreases as the sales value increases
</font>

**Visualize the 'data' using violinplot - 'Outlet_Identifier' vs 'Item_Outlet_Sales'**

In [None]:
# Distribution of Item_Outlet_Sales for each outlet
plt.figure(figsize=(15, 8))
sns.violinplot(data=data, x="Outlet_Identifier", y="Item_Outlet_Sales")

<font color='blue'>**OBSERVATION :**<br>
    Outlet OUT019 only has items with Item_Outlet_Sales less than 2000
</font>

**Visualize the 'data' using violinplot - 'Outlet_Type' vs 'Item_Outlet_Sales'**

In [None]:
# Distribution of Item_Outlet_Sales for each Outlet_Type
plt.figure(figsize=(15, 8))
sns.violinplot(data=data, x="Outlet_Type", y="Item_Outlet_Sales")

<font color='blue'>**OBSERVATION :**<br>
    Grocery Store outlets only have items with Item_Outlet_Sales less than 2000
</font>

**Visualize the 'data' using violinplot - 'Outlet_Size' vs 'Item_Outlet_Sales'**

In [None]:
# Distribution of Item_Outlet_Sales for each Outlet_Size
plt.figure(figsize=(15, 8))
sns.violinplot(data=data, x="Outlet_Size", y="Item_Outlet_Sales")

<font color='blue'>**OBSERVATION :**</font>

**Visualize the 'data' using violinplot - 'Outlet_Location_Type' vs 'Item_Outlet_Sales'**

In [None]:
# Distribution of Item_Outlet_Sales for each Outlet_Location_Type
plt.figure(figsize=(15, 8))
sns.violinplot(data=data, x="Outlet_Location_Type", y="Item_Outlet_Sales")

**Visualize the 'data' using violinplot - 'Outlet_Establishment_Year' vs 'Item_Outlet_Sales'**

In [None]:
# Distribution of Item_Outlet_Sales for each Outlet_Establishment_Year
plt.figure(figsize=(15, 8))
sns.violinplot(data=data, x="Outlet_Establishment_Year", y="Item_Outlet_Sales")

<font color='blue'>**OBSERVATION :** </font>

Item_Fat_Content	Item_Type	Outlet_Identifier	Outlet_Type	Outlet_Size	Outlet_Location_Type
data.groupby(by="Item_Fat_Content")['Item_Outlet_Sales'].sum().sort_values(ascending=False).plot.bar()
data.groupby(by="Item_Type")['Item_Outlet_Sales'].sum().sort_values(ascending=False).plot.bar()
data.groupby(by="Outlet_Identifier")['Item_Outlet_Sales'].sum().sort_values(ascending=False).plot.bar()
data.groupby(by="Outlet_Type")['Item_Outlet_Sales'].sum().sort_values(ascending=False).plot.bar()
data.groupby(by="Outlet_Size")['Item_Outlet_Sales'].sum().sort_values(ascending=False).plot.bar()
data.groupby(by="Outlet_Location_Type")['Item_Outlet_Sales'].sum().sort_values(ascending=False).plot.bar()

**Find the total 'Item_Outlet_Sales' by 'Item_Fat_Content'**

In [None]:
# Total Item_Outlet_Sales per Item_Fat_Content, largest first, as a bar chart
sales_by_fat = data.groupby("Item_Fat_Content")["Item_Outlet_Sales"].sum()
sales_by_fat.sort_values(ascending=False).plot.bar()

<font color='blue'>**OBSERVATION :** Sales of Low Fat items are higher than those of Regular items</font>

**Find the total 'Item_Outlet_Sales' by 'Item_Type'**

In [None]:
# Total Item_Outlet_Sales per Item_Type, largest first, as a bar chart
sales_by_item_type = data.groupby("Item_Type")["Item_Outlet_Sales"].sum()
sales_by_item_type.sort_values(ascending=False).plot.bar()

<font color='blue'>**OBSERVATION :** Sales of Fruits and Vegetables and Snack Foods is the highest</font>

**Find the total 'Item_Outlet_Sales' by 'Outlet_Identifier'**

In [None]:
# Total Item_Outlet_Sales per outlet, largest first, as a bar chart
sales_by_outlet = data.groupby("Outlet_Identifier")["Item_Outlet_Sales"].sum()
sales_by_outlet.sort_values(ascending=False).plot.bar()

<font color='blue'>**OBSERVATION :** Sales of OUT027 is highest and OUT019 is the lowest</font>

**Find the total 'Item_Outlet_Sales' by 'Outlet_Type'**

In [None]:
# Total Item_Outlet_Sales per Outlet_Type, largest first, as a bar chart
sales_by_outlet_type = data.groupby("Outlet_Type")["Item_Outlet_Sales"].sum()
sales_by_outlet_type.sort_values(ascending=False).plot.bar()

<font color='blue'>**OBSERVATION :** Sales of SuperMarket Type1 is highest and Grocery Store is least </font>

**Find the total 'Item_Outlet_Sales' by 'Outlet_Size'**

In [None]:
# Total Item_Outlet_Sales per Outlet_Size, largest first, as a bar chart
sales_by_outlet_size = data.groupby("Outlet_Size")["Item_Outlet_Sales"].sum()
sales_by_outlet_size.sort_values(ascending=False).plot.bar()

<font color='blue'>**OBSERVATION :** Sales in the Medium size outlets is highest and High size outlets is least </font>

**Find the total 'Item_Outlet_Sales' by 'Outlet_Location_Type'**

In [None]:
# Total Item_Outlet_Sales per Outlet_Location_Type, largest first, as a bar chart
sales_by_location = data.groupby("Outlet_Location_Type")["Item_Outlet_Sales"].sum()
sales_by_location.sort_values(ascending=False).plot.bar()

<font color='blue'>**OBSERVATION :** Sales in Tier3 cities is highest and Tier2 cities is lowest</font>

**Find the total 'Item_Outlet_Sales' by 'Outlet_Establishment_Year'**

In [None]:
# Total Item_Outlet_Sales per Outlet_Establishment_Year, largest first, as a bar chart
sales_by_year = data.groupby("Outlet_Establishment_Year")["Item_Outlet_Sales"].sum()
sales_by_year.sort_values(ascending=False).plot.bar()

<font color='blue'>**OBSERVATION :** The outlets established in 1985 have highest sales</font>

# BELOW SECTION IS OPTIONAL 

Outlet_Establishment_Year | Item_Fat_Content  |	  Item_Type	 |   Outlet_Identifier  |	Outlet_Type	 |    Outlet_Size	|    Outlet_Location_Type

pd.crosstab(data.Outlet_Establishment_Year, data.Item_Fat_Content, margins=True)
pd.crosstab(data.Outlet_Establishment_Year, data.Item_Type, margins=True)
pd.crosstab(data.Outlet_Establishment_Year, data.Outlet_Identifier, margins=True)
pd.crosstab(data.Outlet_Establishment_Year, data.Outlet_Type, margins=True)
pd.crosstab(data.Outlet_Establishment_Year, data.Outlet_Size, margins=True)
pd.crosstab(data.Outlet_Establishment_Year, data.Outlet_Location_Type, margins=True)

pd.crosstab(data.Item_Fat_Content, data.Item_Type, margins=True)
pd.crosstab(data.Item_Fat_Content, data.Outlet_Identifier, margins=True)
pd.crosstab(data.Item_Fat_Content, data.Outlet_Type, margins=True)
pd.crosstab(data.Item_Fat_Content, data.Outlet_Size, margins=True)
pd.crosstab(data.Item_Fat_Content, data.Outlet_Location_Type, margins=True)

pd.crosstab(data.Item_Type, data.Outlet_Identifier, margins=True)
pd.crosstab(data.Item_Type, data.Outlet_Type, margins=True)
pd.crosstab(data.Item_Type, data.Outlet_Size, margins=True)
pd.crosstab(data.Item_Type, data.Outlet_Location_Type, margins=True)

pd.crosstab(data.Outlet_Identifier, data.Outlet_Type, margins=True)
pd.crosstab(data.Outlet_Identifier, data.Outlet_Size, margins=True)
pd.crosstab(data.Outlet_Identifier, data.Outlet_Location_Type, margins=True)

pd.crosstab(data.Outlet_Type, data.Outlet_Size, margins=True)
pd.crosstab(data.Outlet_Type, data.Outlet_Location_Type, margins=True)

pd.crosstab(data.Outlet_Size, data.Outlet_Location_Type, margins=True)

In [None]:
# Row counts per (establishment year, fat content), with 'All' totals
pd.crosstab(
    index=data["Outlet_Establishment_Year"],
    columns=data["Item_Fat_Content"],
    margins=True,
)

<font color='blue'>**OBSERVATION :** The outlets that were established in 1985 have more sales as compared to others</font>

In [None]:
# Row counts per (establishment year, item type), with 'All' totals
pd.crosstab(
    index=data["Outlet_Establishment_Year"],
    columns=data["Item_Type"],
    margins=True,
)

<font color='blue'>**OBSERVATION :** Most of the items sold are "Fruits and Vegetables" and "Snack Foods"</font>

In [None]:
# Row counts per (establishment year, outlet), with 'All' totals
pd.crosstab(
    index=data["Outlet_Establishment_Year"],
    columns=data["Outlet_Identifier"],
    margins=True,
)

<font color='blue'>**OBSERVATION :** Outlet OUT019 sells lesser number of items</font>

In [None]:
# Row counts per (establishment year, outlet type), with 'All' totals
pd.crosstab(
    index=data["Outlet_Establishment_Year"],
    columns=data["Outlet_Type"],
    margins=True,
)

<font color='blue'>**OBSERVATION :** <br>
    "Supermarket Type1" sells most of the items<br>
    "Grocery Store" and "Supermarket Type3" were established in 1985 <br>
    "Supermarket Type2" was established in 2009<br>
</font>

In [None]:
# Row counts per (establishment year, outlet size), with 'All' totals
pd.crosstab(
    index=data["Outlet_Establishment_Year"],
    columns=data["Outlet_Size"],
    margins=True,
)

<font color='blue'>**OBSERVATION :** Medium size outlets sells more items as compared to others</font>

In [None]:
# Row counts per (establishment year, location type), with 'All' totals
pd.crosstab(
    index=data["Outlet_Establishment_Year"],
    columns=data["Outlet_Location_Type"],
    margins=True,
)

<font color='blue'>**OBSERVATION :** More items are sold in Tier3 locations and fewer items are sold in Tier2 locations</font>

In [None]:
# Row counts per (fat content, item type), with 'All' totals
pd.crosstab(
    index=data["Item_Fat_Content"],
    columns=data["Item_Type"],
    margins=True,
)

<font color='blue'>**OBSERVATION :** <br>
    More "Low Fat" items are sold as compared to Regular<br>
    Most of the "Low fat" items sold are "Household" items
</font>

In [None]:
# Row counts per (fat content, outlet), with 'All' totals
pd.crosstab(
    index=data["Item_Fat_Content"],
    columns=data["Outlet_Identifier"],
    margins=True,
)

<font color='blue'>**OBSERVATION :**</font>

In [None]:
# Row counts per (fat content, outlet type), with 'All' totals
pd.crosstab(
    index=data["Item_Fat_Content"],
    columns=data["Outlet_Type"],
    margins=True,
)

<font color='blue'>**OBSERVATION :** The ratio of Low Fat versus Regular (2:1) is constant for all the Outlet types</font>

In [None]:
# Row counts per (fat content, outlet size), with 'All' totals
pd.crosstab(
    index=data["Item_Fat_Content"],
    columns=data["Outlet_Size"],
    margins=True,
)

<font color='blue'>**OBSERVATION :** </font>

In [None]:
# Row counts per (fat content, location type), with 'All' totals
pd.crosstab(
    index=data["Item_Fat_Content"],
    columns=data["Outlet_Location_Type"],
    margins=True,
)

<font color='blue'>**OBSERVATION :** </font>

In [None]:
# Row counts per (item type, outlet), with 'All' totals
pd.crosstab(
    index=data["Item_Type"],
    columns=data["Outlet_Identifier"],
    margins=True,
)

<font color='blue'>**OBSERVATION :** </font>

In [None]:
# Row counts per (item type, outlet type), with 'All' totals
pd.crosstab(
    index=data["Item_Type"],
    columns=data["Outlet_Type"],
    margins=True,
)

<font color='blue'>**OBSERVATION :** </font>

In [None]:
# Row counts per (item type, outlet size), with 'All' totals
pd.crosstab(
    index=data["Item_Type"],
    columns=data["Outlet_Size"],
    margins=True,
)

<font color='blue'>**OBSERVATION :** </font>

In [None]:
# Row counts per (item type, location type), with 'All' totals
pd.crosstab(
    index=data["Item_Type"],
    columns=data["Outlet_Location_Type"],
    margins=True,
)

<font color='blue'>**OBSERVATION :** </font>

In [None]:
# Row counts per (outlet, outlet type), with 'All' totals
pd.crosstab(
    index=data["Outlet_Identifier"],
    columns=data["Outlet_Type"],
    margins=True,
)

<font color='blue'>**OBSERVATION :** Outlet OUT019 are all Grocery stores</font>

In [None]:
# Row counts per (outlet, outlet size), with 'All' totals
pd.crosstab(
    index=data["Outlet_Identifier"],
    columns=data["Outlet_Size"],
    margins=True,
)

<font color='blue'>**OBSERVATION :** Outlet OUT013 are outlets of Higher sizes</font>

In [None]:
# Row counts per (outlet, location type), with 'All' totals
pd.crosstab(
    index=data["Outlet_Identifier"],
    columns=data["Outlet_Location_Type"],
    margins=True,
)

<font color='blue'>**OBSERVATION :** Outlets OUT035 only exist in Tier2 locations</font>

In [None]:
# Row counts per (outlet type, outlet size), with 'All' totals
pd.crosstab(
    index=data["Outlet_Type"],
    columns=data["Outlet_Size"],
    margins=True,
)

<font color='blue'>**OBSERVATION :** <br>
    Supermarket Type1 : They are of all sizes<br>
    Supermarket Type2 and Supermarket Type3 : They are only of Medium size<br>
    Grocery Store : They are only of Small sizes<br>
</font>

In [None]:
# Row counts per (outlet type, location type), with 'All' totals
pd.crosstab(
    index=data["Outlet_Type"],
    columns=data["Outlet_Location_Type"],
    margins=True,
)

<font color='blue'>**OBSERVATION :** </font>

In [None]:
# Row counts per (outlet size, location type), with 'All' totals
pd.crosstab(
    index=data["Outlet_Size"],
    columns=data["Outlet_Location_Type"],
    margins=True,
)