                              QUANTIUM TASK - 1
                Data Preparation and Customer Analytics
                                Code in Python

In [None]:
# Import the required libraries and make a few changes for styling.

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import missingno as msn
import seaborn as sns

# Show complete DataFrames: lift pandas' row and column display limits.
for option_name in ('display.max_rows', 'display.max_columns'):
    pd.set_option(option_name, None)

# Plot styling: dark theme, Set1 colour map and a wide default figure size.
plt.style.use('dark_background')
plt.rcParams.update({
    "image.cmap": "Set1",
    'figure.figsize': (18, 7),
})

In [None]:
# Read both CSV files; they live in the same input directory.
input_dir = '../input/quantium-data-analytics-virtual-experience-program'

purchase = pd.read_csv(f'{input_dir}/PurchaseBehaviour.csv')
trans = pd.read_csv(f'{input_dir}/Transactions.csv')

In [None]:
# Dimensions of the purchase dataframe, followed by five sampled rows.
# The fixed random_state makes the sample identical on every run.
print(purchase.shape)
purchase.sample(5, random_state=42)

In [None]:
# Summary statistics for every column of the purchase DataFrame
# (include = 'all' covers the non-numeric columns as well as the numeric ones).

purchase.describe(include = 'all')

In [None]:
# Column dtypes and non-null counts of the purchase DataFrame.
purchase.info()

In [None]:
# Cast the two descriptive columns from object dtype to category dtype.

purchase = purchase.astype({
    'LIFESTAGE': 'category',
    'PREMIUM_CUSTOMER': 'category',
})

In [None]:
# Describe the purchase data with the loyalty-card number treated as a
# categorical column. assign() works on a copy, so `purchase` is untouched
# and no temporary variable is left behind.
display(
    purchase
    .assign(LYLTY_CARD_NBR=purchase['LYLTY_CARD_NBR'].astype('category'))
    .describe(include='all')
)

In [None]:
# Number of distinct loyalty-card numbers (i.e. customers) in the purchase dataframe

purchase['LYLTY_CARD_NBR'].nunique()

Therefore there are 72637 customers in the dataframe

In [None]:
# Bar chart: percentage of customers in each LIFESTAGE of the purchase dataframe

lifestage_share_pct = purchase['LIFESTAGE'].value_counts(normalize=True).mul(100)
lifestage_share_pct.plot.bar()
plt.xticks(rotation=0)
plt.show()

Most customers in the dataframe are in the Retirees lifestage, followed by Older singles/couples. Older customers hold more loyalty cards than customers in the other lifestages (maybe their children gave them the card for shopping, since they might not have a regular income).

In [None]:
# Bar chart: percentage of customers in each PREMIUM_CUSTOMER group of the
# purchase dataframe. normalize = True turns the counts into fractions, so
# the * 100 yields percentages rather than counts scaled by 100.

(purchase['PREMIUM_CUSTOMER'].value_counts(normalize = True) * 100).plot(kind = 'bar')
plt.xticks(rotation = 0)
plt.show()

Most loyalty-card customers are Mainstream, followed by Budget and then Premium (Mainstream > Budget > Premium).

In [None]:
# Dimensions of the transaction dataframe, followed by five sampled rows.
# The fixed random_state makes the sample identical on every run.
print(trans.shape)
trans.sample(5, random_state=42)

In [None]:
# Summary statistics for every column of the transaction dataframe
# (include= 'all' covers the non-numeric columns as well as the numeric ones).

trans.describe(include= 'all')

In [None]:
# Column dtypes and non-null counts of the transaction dataframe.
trans.info()

In [None]:
# Code to convert DATE column to datetime dtype

import datetime

# import pandas as pd

# def convert_excel_time(excel_time):
#     '''
#     converts excel float format to pandas datetime object
#     round to '1min' with 
#     .dt.round('1min') to correct floating point conversion innaccuracy
#     '''
    
#     return pd.to_datetime('1899-12-30') + pd.to_timedelta(excel_time,'D')

def xldate_to_datetime(xldate):
    """Convert an Excel-style serial date into a ``datetime.datetime``.

    The serial number is interpreted as a number of days (fractions allowed)
    counted from 1899-12-30.

    Parameters
    ----------
    xldate : int or float
        Serial date. Python numbers and NumPy scalars are both accepted.

    Returns
    -------
    datetime.datetime
        Naive datetime lying ``xldate`` days after 1899-12-30.
    """
    excel_epoch = datetime.datetime(1899, 12, 30)
    # float() lets NumPy integer scalars through; datetime.timedelta raises a
    # TypeError when such a scalar is passed to it directly.
    return excel_epoch + datetime.timedelta(days=float(xldate))

# Convert only while DATE still holds the raw numeric serial dates, so that
# re-running this cell does not pass already-converted timestamps back into
# xldate_to_datetime (which would fail).
if pd.api.types.is_numeric_dtype(trans['DATE']):
    trans['DATE'] = trans['DATE'].apply(xldate_to_datetime)
trans.head()

In [None]:
# Dtype and summary statistics of the DATE column in the trans dataframe

date_column = trans['DATE']
print(date_column.dtypes)
date_column.describe()

In [None]:
# The ten dates with the largest share (in %) of all transactions

daily_share_pct = trans['DATE'].value_counts(normalize=True, ascending=False).mul(100)
daily_share_pct.head(10).plot.bar(figsize=(22, 8))
plt.xticks(rotation=0)
plt.show()

We can clearly see that most of the transactions are around Christmas!

In [None]:
# Count the distinct dates on which at least one transaction was recorded.
n_days = trans['DATE'].nunique()
print(f'number of days in the data: {n_days}')
del n_days

The dataframe contains 364 distinct dates, so we have one year of data with a single day missing (maybe a public holiday or a strike).

In [None]:
# Calendar days between 2018-07-01 and 2019-06-30 that have no transaction.
expected_days = pd.date_range(start='2018-07-01', end='2019-06-30')
missing_dates = expected_days.difference(trans['DATE'])
print(f'This date is not present in DATE feature: {missing_dates}')

It is Christmas Day! The mall was closed on Christmas.

In [None]:
# Count the distinct store numbers that occur in the transactions

store_column = trans['STORE_NBR']
no_of_stores = store_column.nunique()
print(f'The dataframe contains {no_of_stores} shops/stores.')

In [None]:
# Report every store number that is absent from the contiguous range running
# from the smallest to the largest store number in the data. A set keeps each
# membership test constant-time instead of scanning a list per number.
store_numbers = set(trans['STORE_NBR'].unique())
for num in range(min(store_numbers), max(store_numbers) + 1):
    if num not in store_numbers:
        print(f'this store number is not present in data: {num}')

In [None]:
# Share of all transactions (in %) handled by each of the ten busiest stores.
# normalize = True turns the counts into fractions, so the * 100 yields
# percentages rather than counts scaled by 100.
(trans['STORE_NBR'].value_counts(normalize = True) * 100).head(10).plot(kind = 'bar')
plt.xticks(rotation = 0)
plt.show()

Stores 226, 88 and 93 have a lot of transactions. We should look into this later to find out why these stores have such a high transaction rate (maybe they are on the ground floor, or child-friendly, etc.).

In [None]:
# Sorted list of the distinct loyalty-card numbers found in the transaction dataframe.
a = sorted(trans['LYLTY_CARD_NBR'].unique())

In [None]:
# Sorted list of the distinct loyalty-card numbers found in the purchase dataframe.
b = sorted(purchase['LYLTY_CARD_NBR'].unique())

In [None]:
# The two sorted lists are equal only when both dataframes hold exactly the
# same set of loyalty-card numbers.
comparison_message = (
    'Both the dataframes contain same user card numbers'
    if a == b
    else 'There are some users present only in one dataframe'
)
print(comparison_message)

In [None]:
# Number of distinct transaction IDs.
trans['TXN_ID'].nunique()

In [None]:
# The ten transaction IDs that appear on the most rows.
txn_id_counts = trans['TXN_ID'].value_counts()
txn_id_counts.head(10).plot.bar()
plt.xticks(rotation=0)
plt.show()

In [None]:
# Number of distinct product numbers.
print(trans['PROD_NBR'].nunique())

There are 114 different products in the dataframe. 

In [None]:
# Histogram of the product numbers across all transaction rows.
product_numbers = trans['PROD_NBR']
product_numbers.plot.hist()
plt.show()

In [None]:
# The ten product numbers that appear on the most transaction rows.
product_number_counts = trans['PROD_NBR'].value_counts()
product_number_counts.head(10).plot.bar()
plt.xticks(rotation=0)
plt.show()

Products 102, 108 and 33 are the most sold. They are the favourite products in the customer segment.

In [None]:
# Number of distinct product names.
trans['PROD_NAME'].nunique()

In [None]:
# Top 20 products by number of transaction rows. value_counts() counts rows
# per product name; it does not add up PROD_QTY.

most_sold_products = trans['PROD_NAME'].value_counts().head(20)
most_sold_products

In [None]:
# Name the first two entries of most_sold_products, i.e. the two products with the highest counts.
print(f'The most sold products are {most_sold_products.index.values[0]}, {most_sold_products.index.values[1]}')

In [None]:
# Convert the dtype of the PROD_NAME column from object to category.

trans['PROD_NAME'] = trans['PROD_NAME'].astype('category')

In [None]:
# How many distinct purchase quantities exist, and which values they are.
quantities = trans['PROD_QTY']
print(quantities.nunique())
print(quantities.unique())

In [None]:
# Number of transactions per purchase quantity, plus each quantity's share in %.
# The columns are named explicitly: the label pandas gives to a value_counts()
# result is not the same in every pandas release, so looking the counts up by
# an implicit 'PROD_QTY' column name can fail.
qty_counts = trans['PROD_QTY'].value_counts()
check = pd.DataFrame({
    'PROD_QTY': qty_counts,
    # Same values as value_counts(normalize = True) * 100, sharing qty_counts' index.
    'percentage': qty_counts / qty_counts.sum() * 100,
})
display(check.head(6))
check['PROD_QTY'].plot(kind = 'bar')
del check, qty_counts

The customers love to buy 2 packets at once. 

In [None]:
# Rows where 200 units were bought in a single transaction (possible outliers).
is_bulk_purchase = trans['PROD_QTY'].eq(200)
trans.loc[is_bulk_purchase]

In [None]:
# Every transaction of loyalty card 226000, the customer behind the outlier rows.
is_outlier_customer = trans['LYLTY_CARD_NBR'].eq(226000)
trans.loc[is_outlier_customer]

The customer who bought the outlier quantities made only those outlier purchases (they have no other transaction in the dataset). They might be a retailer buying in bulk.

In [None]:
# Number of distinct TOT_SALES values.
trans['TOT_SALES'].nunique()

In [None]:
# Ten most frequent TOT_SALES values, then the smallest and largest value.
sales = trans['TOT_SALES']
display(sales.value_counts().head(10))
print(min(sales), max(sales))

Usually a customer buys just 2 or 3 packs of chips from the store.

In [None]:
# Horizontal bar chart: each product name's share (in %) of all transaction rows.
product_share_pct = trans['PROD_NAME'].value_counts(normalize=True, ascending=False) * 100
product_share_pct.plot.barh(figsize=(22, 28))

In [None]:
# Attach the customer attributes to every transaction via the loyalty-card number.
# An inner join (the pd.merge default) keeps only card numbers present in both frames.
df = trans.merge(purchase, how='inner', on='LYLTY_CARD_NBR')
display(df.describe(include='all'))

In [None]:
# Column dtypes and non-null counts of the merged dataframe.
df.info()


In [None]:
# Number of transaction rows per LIFESTAGE in the merged dataframe.
lifestage_counts = df['LIFESTAGE'].value_counts()
lifestage_counts.plot.bar()
plt.xticks(rotation=0)
plt.show()

In [None]:
# Number of transaction rows per PREMIUM_CUSTOMER group in the merged dataframe.
premium_counts = df['PREMIUM_CUSTOMER'].value_counts()
premium_counts.plot.bar()
plt.xticks(rotation=0)
plt.show()

In [None]:
# Change the working directory so that the CSV written below lands in this folder.
# NOTE(review): a bare `cd` only works through IPython automagic, and the path looks
# like a Google Drive mount - confirm it exists in the environment where this runs.
cd /content/drive/My Drive/Quantium Internship/Data

In [None]:
# Write the merged dataframe to df.csv in the current working directory, without the index column.
df.to_csv('df.csv', index = False)

In this notebook, I explored both dataframes, changed a few data types, and then merged the two into a single dataframe called df on the loyalty card number (LYLTY_CARD_NBR).