In [None]:
import math
import re

import matplotlib.dates as mdates
import matplotlib.pyplot as plt
import missingno as msno
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats
from scipy.stats import norm, skew  # for some statistics

%matplotlib inline
# Show every row and column, and never truncate cell text, when displaying frames.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
# `None` means "no limit". The legacy `-1` sentinel was deprecated in pandas 1.0
# and is rejected by current releases.
pd.set_option('display.max_colwidth', None)

# Global matplotlib defaults: bigger legend/axis/tick text and a wide default canvas.
# Optional tweaks, left disabled:
#   plt.style.use("dark_background")
#   plt.rcParams["image.cmap"] = "Set1"
#   plt.rcParams['axes.prop_cycle'] = plt.cycler(color=plt.cm.Set1.colors)
params = {
    'legend.fontsize': '15',
    **dict.fromkeys(
        ('axes.labelsize', 'axes.titlesize', 'xtick.labelsize', 'ytick.labelsize'),
        'x-large',
    ),
}
plt.rcParams.update(params)
plt.rcParams['figure.figsize'] = (15, 8)

In [None]:
# Read the cleaned transaction file, parsing DATE as datetimes, and lower-case
# every column label so the rest of the notebook can use one naming style.
df = (
    pd.read_csv('../input/quantium-cleaned-dataset/df.csv', parse_dates=['DATE'])
    .rename(columns=str.lower)
)
print(df.shape)
df.sample(5)

In [None]:
# Summary statistics for every column, numeric and non-numeric alike (include='all').
df.describe(include='all')

In [None]:
# Column groupings: identifier/label-like columns vs. measured values.
# All names are lower case because the column labels are lower-cased right after
# loading; the earlier mixed-case entries (e.g. 'PROD_NBR') matched no column.
cat_cols = ['store_nbr', 'lylty_card_nbr', 'prod_nbr', 'prod_name', 'prod_qty',
            'lifestage', 'premium_customer']
num_cols = ['txn_id', 'tot_sales']

In [None]:
# Pairwise correlations between the numeric columns only. Selecting them
# explicitly keeps this cell working on pandas >= 2.0, where DataFrame.corr()
# no longer drops text columns silently and raises instead.
corrmat = df.select_dtypes(include='number').corr()
# Hide the upper triangle (and the diagonal): it mirrors the lower triangle.
mask = np.triu(np.ones_like(corrmat, dtype=bool))
fig, ax = plt.subplots(figsize=(22, 8))
sns.heatmap(corrmat, mask=mask, cmap='coolwarm', ax=ax)  # no useful correlation
plt.show()

There is no correlation between variables.

We'll drop the 2 outliers which we detected earlier.

In [None]:
# Drop every row recorded against loyalty card 226000 (the rows flagged as
# outliers earlier in the analysis).
index_of_outliers = df.index[df['lylty_card_nbr'] == 226000]
df = df.drop(index=index_of_outliers)
print('Outliers Removed!')

## Now let's check whether the data contains only chips or other products too. If there are products other than chips, we'll drop them.

In [None]:
# Number of distinct product names before any cleaning.
df['prod_name'].nunique()

There are 114 different kinds of product in the dataframe.

In [None]:
# List the distinct raw product names to spot spelling variants and non-chip items.
df['prod_name'].unique()

1. In one of the product Grain Waves company is written as GrnWves (GrnWves Plus Btroot & Chilli Jam 180g).
2. Natural chip company is written as NCC, Natural Chip Co, Natural ChipCo
3. 'RRD' company is also written as 'Red Rock Deli'
4. Smiths company is written as Smith also.
5. Sunbites Whlegrn is also written as Snbts Whlgrn.
6. Woolworths is also written as WW.

In [None]:
# Brand / company names observed in the product names (reference list).
prod_company = ['Burger Rings', 'CCs', 'Cheetos', 'Cheezels', 'Cobs Popd', 'Doritos',
                'French Fries', 'Grain Waves', 'Infuzions', 'Kettle', 'Natural Chip',
                'Pringles', 'RRD', 'Smiths', 'Sunbites Whlegrn', 'Thins', 'Tostitos',
                'Twisties', 'Tyrrells', 'Woolworths']

# Exact raw product names (internal spacing included) of the non-chip items.
# They are compared with `==` against `prod_name`, so the spacing must match the data.
other_than_chips = ['Doritos Salsa       Medium 300g', 'Doritos Salsa Mild  300g',
                    'Old El Paso Salsa   Dip Chnky Tom Ht300g',
                    'Old El Paso Salsa   Dip Tomato Med 300g',
                    'Old El Paso Salsa   Dip Tomato Mild 300g',
                    'Woolworths Medium   Salsa 300g',
                    'Woolworths Mild     Salsa 300g']

In [None]:
# Drop every transaction whose product is one of the non-chip items listed in
# `other_than_chips`.

print(f'Data shape before dropping: {df.shape}\n')

# One vectorised membership test replaces the per-product filter-and-copy loop;
# the rows removed are the same.
other_than_chips_index = df.index[df['prod_name'].isin(other_than_chips)]
df = df.drop(index=other_than_chips_index)
for a in other_than_chips:
    print(f'{a}  dropped!!')

print(f'\nData shape after dropping: {df.shape}')
no_unique = df['prod_name'].nunique()
print(f'\nNumber of products in the dataframe: {no_unique}')

We have dropped all the samples which are non-chips transactions in the dataframe.

In [None]:
# Alphabetical list of the remaining product names, to eyeball naming variants.
sorted(df['prod_name'].unique())

In [None]:
def find_weight(text):
    """Extract the pack weight (grams) from a product name, as a string.

    Numbers directly followed by ``g``/``G`` (optionally after whitespace) are
    treated as the weight, so other digits in the name no longer leak into the
    result. If the name has no such token, every digit run is returned, which
    is what this function did before.

    Parameters
    ----------
    text : str
        Raw product name, e.g. ``'Kettle 135g Swt Pot Sea Salt'``.

    Returns
    -------
    str
        The matched numbers joined by single spaces; ``''`` when the name
        contains no digits at all.
    """
    weights = re.findall(r'([0-9]+)\s*[gG]\b', text)
    if not weights:
        # No "<number>g" token: fall back to the original digits-only extraction.
        weights = re.findall(r'[0-9]+', text)
    return " ".join(weights)
# Derive the pack weight of every row from its product name.
df['prod_wt'] = df['prod_name'].map(find_weight)
df.head()

Added a new column `prod_wt` by extracting the weight from `prod_name`.

In [None]:
# How many distinct pack weights there are, and how often each one occurs.
print(df['prod_wt'].nunique())
weight_counts = df['prod_wt'].value_counts()
ax = weight_counts.plot(kind='bar', rot=0)
ax.set_title('Product Weight Distribution')
ax.set_xlabel('Product Weight')
ax.set_ylabel('Frequency')
plt.show()

The `prod_name` column contains repeated spaces, embedded weights, inconsistent capitalisation, and irregular company names. We'll clean it with the function below.

In [None]:
def string_cleaning(dataframe):
    """Clean the ``prod_name`` column and make brand spellings consistent.

    Steps, in order:
      1. remove weight tokens such as ``175g`` / ``170G``;
      2. strip the ends and collapse runs of whitespace to one space;
      3. lower-case the name;
      4. rewrite known brand variants to one canonical spelling. Multi-word
         brands are joined with underscores so the brand is the first
         whitespace-separated word.

    The regex steps pass ``regex=True`` explicitly and the literal rewrites pass
    ``regex=False``. Relying on the default breaks on pandas >= 2.0, where
    ``Series.str.replace`` treats patterns as literal text and the weight and
    whitespace clean-up would silently do nothing.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame with a string column ``prod_name``. The column is overwritten in
        place on this frame.

    Returns
    -------
    pandas.DataFrame
        The same ``dataframe`` object, with ``prod_name`` cleaned.
    """
    names = (
        dataframe['prod_name']
        .str.replace(r'[0-9]+[gG]', '', regex=True)   # drop the pack weight
        .str.strip()
        .str.replace(r'\s{2,}', ' ', regex=True)      # collapse repeated whitespace
        .str.lower()
    )

    # (variant, canonical) pairs, applied in order as plain substring rewrites.
    # 'natural chip compny' comes before 'natural chip co' because the shorter
    # text is a prefix of the longer one.
    brand_rewrites = [
        ('dorito ', 'doritos '),
        ('french fries', 'french_fries'),
        ('grnwves', 'grain_waves'),
        ('grain waves', 'grain_waves'),
        ('infzns', 'infuzions'),
        ('natural chip compny', 'natural_chipco'),
        ('natural chip co', 'natural_chipco'),
        ('natural chipco', 'natural_chipco'),
        ('ncc', 'natural_chipco'),
        ('red rock deli', 'red_rock_deli'),
        ('rrd', 'red_rock_deli'),
        ('smith ', 'smiths '),
        ('snbts', 'sunbites'),
        ('woolworths', 'ww'),
    ]
    for variant, canonical in brand_rewrites:
        names = names.str.replace(variant, canonical, regex=False)

    dataframe['prod_name'] = names
    return dataframe

# Normalise the product names. The function overwrites `prod_name` on the frame
# it is given and returns that same frame.
df = string_cleaning(df)
df.head()

In [None]:
# Number of distinct product names after cleaning.
df['prod_name'].nunique()

In [None]:
# Alphabetical list of the cleaned product names, to verify the brand spellings.
sorted(df['prod_name'].unique())

The PROD_NAME column is cleaned now. The first word in every sample is name of the company. So we'll create another column which consists the name of the company.

In [None]:
def extract_company_name(text):
    """Return the first whitespace-separated word of ``text`` (the brand token).

    Raises ``IndexError`` when ``text`` is empty or only whitespace.
    """
    return text.split(maxsplit=1)[0]

# Brand of each row = first word of its cleaned product name.
df['prod_comp'] = df['prod_name'].map(extract_company_name)
df.sample(5)

We will also create another column of price of each chips packet, which can be extracted from TOT_SALES and PROD_QTY.

In [None]:
# new column PROD_PRICE: price of one packet = transaction total / packets bought,
# rounded to 2 decimals.

df['prod_price'] = df['tot_sales'].div(df['prod_qty']).round(2)

Metrics for Data Analysis:


1.   On which days are chips sold in the largest quantities?
2.   Is there a particular season when chips are sold in the largest quantities?

1.   Which shops contribute for more sales of chips?
2.   which customer usually buys more chips (Customer to look out for)? 

1.   Which chip is most loved by the customers?
2.   How many packs does the customers often buy? (maybe digout why)

1.   How much money does the shopkeeper make from each customer?
2.   What is the average total sales?

1.   When does total sale seem to increase?
2.   Does the customers buy expensive chips? if yes how much?

1.   What is the driving force for total sales?
2.   Which type of customers seem to buy more?

1.   Does people love to buy larger packs or they are happy with smalller packs?
2.   Does the consumers have some favorite brand that they prefer to buy always?

1.   Does price of the product play a role in selling trends?
2.   Does many different kinds of products from the same brand play a role in purchase behaviour?

















In [None]:
# Total revenue per calendar day over the whole period.
daily_sales = df.groupby('date')['tot_sales'].sum()
# x_compat=True makes pandas hand real matplotlib dates to the axis, so the date
# locator/formatter below apply.
ax = daily_sales.plot(figsize=(18, 11), x_compat=True, rot=0)
# Derive the month labels from the data (a tick every second month) instead of
# forcing a hard-coded list of seven names onto whatever ticks matplotlib chose,
# which mislabels the axis or fails when the tick count is not seven.
ax.xaxis.set_major_locator(mdates.MonthLocator(interval=2))
ax.xaxis.set_major_formatter(mdates.DateFormatter('%B'))
ax.set_title('Total sales through time')
ax.set_xlabel('Date')
ax.set_ylabel('Total Sales')
plt.show()

We can see from the graph that the 'Total Sales' is high during the month of December and low in the month of August and May. The total sales are high during the month of December maybe because it's the month of Christmas! We can say that we can expect huge sales in the month of december.

In [None]:
# shops selling the most of chips

# Revenue per store, highest first; chart the ten best-selling stores.
tot_sales_in_each_store = (
    df.groupby('store_nbr')['tot_sales']
    .sum()
    .sort_values(ascending=False)
)
ax = tot_sales_in_each_store.head(10).plot(kind='bar', rot=0)
ax.set_title('Top shops w.r.t Total Sales')
ax.set_xlabel('Store Number')
ax.set_ylabel('Total Sales')
plt.show()

Shop no. 226, 88, 165 and 237 are the shops selling the most chips.

In [None]:
# Total spend per loyalty card, highest first; chart the ten biggest spenders.
card_numbers_of_high_purchase_customers = (
    df.groupby(['lylty_card_nbr'])['tot_sales']
    .sum()
    .sort_values(ascending=False)
)
ax = card_numbers_of_high_purchase_customers.head(10).plot(kind='bar', rot=0)
ax.set_title('Customers who purchased a lot of chips')
ax.set_xlabel('Customer Loyality Card Number')
ax.set_ylabel('Total Sales')
plt.show()

In [None]:
# Total revenue per customer lifestage, highest first.
sales_by_lifestage = df.groupby('lifestage')['tot_sales'].sum().sort_values(ascending=False)
ax = sales_by_lifestage.plot(kind='bar', rot=45)
# Title and axis labels added so the figure stands alone, like the other plots.
ax.set_title('Total sales by customer lifestage')
ax.set_xlabel('Lifestage')
ax.set_ylabel('Total Sales')
plt.show()

From the plot above it is clear that older consumers are the ones to look out for: they are the driving force behind total sales.

In [None]:
# Revenue per product, highest first; chart the ten best sellers.
prod_name_tot_sales = (
    df.groupby('prod_name')['tot_sales']
    .sum()
    .sort_values(ascending=False)
)
ax = prod_name_tot_sales.head(10).plot(kind='bar', rot=45)
ax.set_title('Product sold the most')
ax.set_xlabel('Product Name')
ax.set_ylabel('Total Sales')
plt.show()

Doritos corn chip supreme, Smiths crinkle chip original big bag, Smiths crinkle chips salt and vinegar, and Kettle mozzarella basil and pesto are the top-selling chips in the dataset.

In [None]:
# Mean transaction value per loyalty card; chart the ten highest.
mean_sales_per_card = (
    df.groupby('lylty_card_nbr')['tot_sales']
    .mean()
    .sort_values(ascending=False)
)
ax = mean_sales_per_card.head(10).plot(kind='bar', rot=0)
# Title and axis labels added so the figure stands alone, like the other plots.
ax.set_title('Customers with the highest mean transaction value')
ax.set_xlabel('Customer Loyalty Card Number')
ax.set_ylabel('Mean Sales per Transaction')
plt.show()

In [None]:
# Distribution of total spend per loyalty card (describe() does not depend on row order).
df.groupby('lylty_card_nbr')['tot_sales'].sum().describe()

From the above plots and data, we can say that the shops earn around 25-27 on average from each customer in a year.

In [None]:
# Total revenue at each unit price; the x-axis is flipped so prices run high to low.
sales_by_price = df.groupby('prod_price')['tot_sales'].sum()
ax = sales_by_price.plot(kind='bar', figsize=(18, 8), rot=60)
ax.invert_xaxis()
ax.set_title('Affect of Product Price on Sales')
ax.set_xlabel('Product Price in $')
ax.set_ylabel('Total Sale')
plt.show()

We can see from the plot that the customers are not interested in buying chips which are cheap. People usually prefer chips which are moderate or expensive in the store.

In [None]:
# Total revenue per customer segment (the `premium_customer` column), highest first.
sales_by_segment = (
    df.groupby('premium_customer')['tot_sales']
    .sum()
    .sort_values(ascending=False)
)
ax = sales_by_segment.plot(kind='bar', rot=0)
# Title and axis labels added so the figure stands alone, like the other plots.
ax.set_title('Total sales by customer segment')
ax.set_xlabel('Customer Segment')
ax.set_ylabel('Total Sales')
plt.show()

Mainstream customers tend to buy more than any other customer segment.

In [None]:
# Total revenue per pack weight, highest first.
sales_by_weight = df.groupby('prod_wt')['tot_sales'].sum().sort_values(ascending=False)
ax = sales_by_weight.plot(kind='bar', rot=0)
ax.set_title('Product Weight influence on sales')
ax.set_xlabel('Product Weight')
ax.set_ylabel('Total Sales')
plt.show()

From the above plot we can say that there is no clear trend in product weight, but consumers tend to buy most in the 150-180g range.

In [None]:
# Copy of the data with title-cased brand names, used only for nicer tick labels.
check = df.assign(prod_comp=df['prod_comp'].str.title())
sales_by_brand = check.groupby('prod_comp')['tot_sales'].sum().sort_values(ascending=False)
ax = sales_by_brand.plot(kind='bar', rot=45)
ax.set_title('Influence of Company Brand on Total Sales')
ax.set_xlabel('Brand')
ax.set_ylabel('Total Sales')
plt.show()

Kettle, Doritos and Smiths seem to sell more than any other brands. This can also be because these brands have many flavors of chips in the market.

Let's look at the most selling brand a little deeper and dig out their success mantra!

In [None]:
# All rows whose brand is 'kettle' (brand names in `prod_comp` are lower case
# because they come from the lower-cased `prod_name`).
kettle_df = df.groupby('prod_comp').get_group('kettle')
kettle_df.head()

In [None]:
# Share (in %) of Kettle rows at each pack weight.
kettle_weight_share = kettle_df['prod_wt'].value_counts(normalize=True).mul(100)
ax = kettle_weight_share.plot(kind='bar', rot=0)
ax.set_title('Product Weight distribution of Kettle chips')
ax.set_xlabel('Weight in grams')
ax.set_ylabel('Percentage of prod weight')
plt.show()

From the above figure we can see that 150g and 175g packs account for around 90% of Kettle's transactions. As we saw earlier, customers like to buy products weighing 150-180g.

In [None]:
# Number of Kettle rows at each unit price, most frequent first.
kettle_price_counts = kettle_df['prod_price'].value_counts().sort_values(ascending=False)
ax = kettle_price_counts.plot(kind='bar', rot=0)
ax.set_title('Product Price distribution of Kettle chips')
ax.set_xlabel('Price')
ax.set_ylabel('Frequency')
plt.show()

Here too Kettle has found the sweet spot: most of its products are priced in the range of 4.0 to 5.5. These are the reasons Kettle chips sell the most.

Therefore we can conclude that:

1.   Total sales is at peak in the month of December. Most of the transactions are done in this month.
2.   Doritos corn chip supreme, Smiths crinkle chip original big bag, Smiths crinkle chips salt and vinegar, and Kettle mozzarella basil and pesto are the chips driving total sales in the dataset.

1.   We can say that the shops earn around 25-27 on average from each customer in a year.
2.   Most of the customers in the dataset doesn't buy chips which are cheap. Therefore the shopkeeper can remove the cheap products in the shop.

1.   The shop must focus on mainstream consumers rather than premium consumers since they just occupy small portion of the sales.
2.   Older consumers are driving force for the shops around 60% of sales are from these consumers. So the shopkeeper must focus on these consumers.

1.   Customers tend to buy chip packets of weight 150-180g
2.   The most loved brands are Kettle, Doritos and Smiths.

1.   If the shopkeeper wants to sell other products, he must have variety of those brands to make a change in the market.

## Thank You











In [None]:
cd /content/drive/My Drive/Quantium Internship/Data

In [None]:
# Write the cleaned, feature-enriched frame to df1.csv in the current working
# directory, without the index column.
df.to_csv('df1.csv', index = False)