In [None]:
%matplotlib inline

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)
#pd.set_option('display.max_rows', None)

from glob import glob

import re
import gc

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder

import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display

from tqdm import tqdm, tqdm_notebook
tqdm.pandas()

import plotly.express as px #Plotly Express

from plotly.offline import iplot

#to link plotly to pandas
import cufflinks as cf
cf.go_offline()
cf.set_config_file(offline = False, world_readable = True)

import plotly.io as pio
pio.templates.default = 'plotly_white'

import itertools
import collections
from collections import Counter

from nltk.corpus import stopwords

import re
from wordcloud import WordCloud

plt.rcParams["figure.figsize"] = (12, 8)
plt.rcParams['axes.titlesize'] = 16
# matplotlib 3.6 renamed the bundled seaborn style sheets to 'seaborn-v0_8-*'
# and later releases dropped the old names, so use whichever name this
# installation actually ships instead of failing on newer versions.
_grid_style = 'seaborn-whitegrid'
if _grid_style not in plt.style.available:
    _grid_style = 'seaborn-v0_8-whitegrid'
plt.style.use(_grid_style)
sns.set_palette('Set3')

import os
# List the files available in the dataset directory.
print(os.listdir('../input/online-retail/'))

import warnings
# Silence all warnings for the rest of the notebook.
warnings.simplefilter('ignore')

In [None]:
# Load the raw transactions and show the frame's shape plus its first rows.
df = pd.read_csv('../input/online-retail/Online_Retail.csv')
print(df.shape)
df.head()

In [None]:
# Summary statistics, transposed so that each described column becomes a row.
df.describe().T

In [None]:
# Concise summary of the frame (columns, dtypes, non-null counts).
df.info()

- Let's check the missing values

In [None]:
# One row per column of `df`: absolute number of NaNs and their share in percent.
missing = (
    df.isna()
      .sum()
      .rename_axis('features')
      .to_frame('total_missing')
      .assign(percent = lambda frame: (frame['total_missing'] / len(df)) * 100)
)

# Bar chart of the absolute counts, followed by the transposed table.
missing['total_missing'].iplot(
    kind = 'bar',
    title = 'Missing Values Plot in Dataset',
    xTitle = 'Features',
    yTitle = 'Count',
)
missing.T

- 25% of CustomerID is missing

Create a feature for total purchase price

In [None]:
# Line-level purchase value: quantity times unit price (negative when Quantity is negative).
df['TotalAmount'] = df['Quantity'] * df['UnitPrice']

In [None]:
# Distribution of the line-level totals in 10 bins; the trailing ';' hides the Axes repr.
# NOTE(review): sns.distplot is deprecated since seaborn 0.11 - consider
# sns.histplot / sns.displot once the installed seaborn version is confirmed.
sns.distplot(df['TotalAmount'], bins = 10);

In [None]:
# Count of distinct StockCode values (NaN is not counted by nunique).
print(f"Number of unique StockCode: {df['StockCode'].nunique()}")

In [None]:
# The 50 most frequent stock codes, re-sorted ascending so the most frequent
# code ends up at the top of the horizontal bar chart.
(
    df['StockCode']
    .value_counts()
    .head(50)
    .sort_values(ascending = True)
    .iplot(
        kind = 'bar',
        orientation = 'h',
        yTitle = 'Stock Code',
        title = 'Countplot of StockCode',
    )
)

StockCode 85123A appears most often in the data (it has the highest number of transaction rows)

In [None]:
# Description of the first row carrying this stock code. Use positional
# .iloc[0]: a plain [0] is a *label* lookup and only works when the row
# labelled 0 happens to be one of the matching rows.
df.loc[df['StockCode'] == '85123A', 'Description'].iloc[0]

__Quantity__

In [None]:
# Distribution of Quantity in 10 bins; the trailing ';' hides the Axes repr.
# NOTE(review): sns.distplot is deprecated since seaborn 0.11 - consider
# sns.histplot / sns.displot once the installed seaborn version is confirmed.
sns.distplot(df['Quantity'], bins = 10);

In [None]:
# Count of distinct CustomerID values (rows with a missing CustomerID are not counted).
print(f"Number of unique CustomerID: {df['CustomerID'].nunique()}")

In [None]:
# Count of distinct Country values.
print(f"Number of unique Country: {df['Country'].nunique()}")

In [None]:
# The 15 countries with the most rows, re-sorted ascending so the largest
# bar ends up at the top of the horizontal chart.
(
    df['Country']
    .value_counts()
    .head(15)
    .sort_values(ascending = True)
    .iplot(
        kind = 'bar',
        orientation = 'h',
        yTitle = 'Country',
        title = 'Countplot of Country',
    )
)

The United Kingdom accounts for the largest number of transactions

In [None]:
# Count of distinct InvoiceNo values.
print(f"Number of unique Invoice numbers: {df['InvoiceNo'].nunique()}")

In [None]:
# Total amount per invoice (sum of the line-level TotalAmount values).
df.groupby('InvoiceNo')['TotalAmount'].sum()

__Some invoice totals are negative__

In [None]:
# Box plot of the per-invoice totals; the trailing ';' hides the Axes repr.
df.groupby('InvoiceNo')['TotalAmount'].sum().plot(kind = 'box');

In [None]:
# Rows whose Quantity is negative.
df[df['Quantity'] < 0]

- These could be wrong entries that can be removed from the dataset

In [None]:
def plot_wordcloud(data, col, text = None):
    """Draw a word cloud of the words found in ``data[col]``.

    Each value is converted to a string, lower-cased and split on whitespace;
    English stopwords are dropped and the remaining words are weighted by
    frequency.

    Parameters
    ----------
    data : pandas.DataFrame or mapping of column name to values
        Container holding the text column.
    col : str
        Name of the column whose values are tokenised.
    text : str, optional
        Title shown above the figure.

    Returns
    -------
    None
        The figure is rendered with ``plt.show()``.
    """
    # A set makes the per-word stopword check constant time.
    stop = set(stopwords.words('english'))
    # Drop missing values first: str(NaN) would otherwise add a bogus "nan"
    # token to the cloud for every empty cell.
    values = pd.Series(data[col]).dropna()
    word_freq = Counter(
        word
        for each in values
        for word in str(each).lower().split()
        if word not in stop
    )

    wordcloud = WordCloud(width = 900,
                          height = 500,
                          max_words = 200,
                          max_font_size = 100,
                          relative_scaling = 0.5,
                          background_color = "rgba(255, 255, 255, 0)", 
                          mode = "RGBA",
                          normalize_plurals = True).generate_from_frequencies(word_freq)
    plt.figure(figsize = (16, 12))
    plt.imshow(wordcloud, interpolation = 'bilinear')
    plt.title(text, fontsize = 16)
    plt.axis("off")
    plt.show()

In [None]:
# Word cloud over the Description column of the full frame.
plot_wordcloud(df, 'Description', 'WordCloud of Product Description')

In [None]:
# Compare the number of distinct descriptions with the number of distinct
# stock codes; the second label previously repeated "product descriptions".
print(f"Number of Unique product descriptions: {df['Description'].nunique()}")
print(f"Number of Unique StockCodes: {df['StockCode'].nunique()}")

- There is a mismatch between the number of unique StockCodes and the number of unique Descriptions - 4070 unique StockCodes

In [None]:
# Number of distinct Description values per StockCode. NaN is counted as a
# value (dropna=False) so codes that mix one real description with missing
# ones are flagged too. The previous `count() > 0` counted non-null rows and
# therefore selected nearly every code, contradicting the message below.
stockcode_des = df.groupby('StockCode')['Description'].nunique(dropna = False)
stockcode_des = stockcode_des[stockcode_des > 1]
print(f"There are {len(stockcode_des)} StockCode with more than 1 Description")

In [None]:
# Peek at the first 7 flagged stock codes: number of missing descriptions
# and the distinct description values recorded for each code.
for sc in stockcode_des.index[:7]:
    descriptions = df.loc[df['StockCode'] == sc, 'Description']
    print(sc, '-', descriptions.isna().sum())
    print(descriptions.unique())

- It looks like there is mostly 'nan' in place of the Description for a particular StockCode
- From above we see that we have to impute NaNs in Description based on StockCode

In [None]:
# Give every flagged StockCode a single Description.
#   * Missing values are filled with the code's most frequent description.
#   * All rows of the code are then set to the first description in row
#     order (after filling), which is what the earlier two-step
#     fillna/replace produced.
# Writes go through df.loc[mask, col]; the previous chained form
# `df['Description'].loc[...] = ...` assigns into a temporary object and is
# not guaranteed to update `df` (it never does under pandas copy-on-write).
for sc in tqdm(stockcode_des.index, desc = 'Harmonising descriptions'):
    rows = df['StockCode'] == sc                 # one scan per code instead of four
    descriptions = df.loc[rows, 'Description']
    if descriptions.isna().all():
        continue                                 # nothing to impute from
    if descriptions.isna().any():
        descriptions = descriptions.fillna(descriptions.value_counts().index[0])
    elif descriptions.nunique() == 1:
        continue                                 # already consistent
    df.loc[rows, 'Description'] = descriptions.iloc[0]

In [None]:
#Check
# Re-inspect the first 7 flagged stock codes: missing-description count and
# the distinct description values now present for each code.
for sc in itertools.islice(stockcode_des.index, 7):
    per_code = df.loc[df['StockCode'] == sc, 'Description']
    print(sc, '-', per_code.isna().sum())
    print(per_code.unique())

In [None]:
#Convert InvoiceDate to datetime format
df['InvoiceDate'] = pd.to_datetime(df['InvoiceDate'], infer_datetime_format = True)

# Derive the calendar features from the parsed timestamps, in the same
# column order as before (dayofweek: Monday - 0).
for feature, part in (('InvDoW', 'dayofweek'),
                      ('InvMonth', 'month'),
                      ('InvYear', 'year'),
                      ('InvHour', 'hour'),
                      ('InvDay', 'day')):
    df[feature] = getattr(df['InvoiceDate'].dt, part)

In [None]:
# Revenue per product description, keeping the 30 largest totals.
top_selling = (
    df.groupby('Description')['TotalAmount']
      .sum()
      .sort_values(ascending = False)
      .head(30)
)

top_selling.iplot(
    kind = 'bar',
    orientation = 'h',
    yTitle = 'Product Description',
    title = '30 Top Selling Products in terms of Sales',
)

In [None]:
# The 10 descriptions with the lowest summed TotalAmount. The bars are
# vertical, so the descriptions sit on the x axis and the amounts on the
# y axis; the y axis was previously mislabelled 'Product Description'.
(
    df.groupby('Description')['TotalAmount']
      .sum()
      .sort_values(ascending = False)
      .tail(10)
      .iplot(
          kind = 'bar',
          orientation = 'v',
          xTitle = 'Product Description',
          yTitle = 'Sales',
          title = 'Negative Sales - Charges/Discounts/Samples',
      )
)

In [None]:
# Summed TotalAmount per invoice month.
(
    df['TotalAmount']
    .groupby(df['InvMonth'])
    .sum()
    .iplot(
        kind = 'bar',
        yTitle = 'Sales',
        title = 'Total Sales by Invoice Month',
    )
)

- Sales have crossed 1M in the last 4 months of the year, with November being the top month

In [None]:
# Summed TotalAmount per invoice day of week (0 = Monday).
(
    df['TotalAmount']
    .groupby(df['InvDoW'])
    .sum()
    .iplot(
        kind = 'bar',
        yTitle = 'Sales',
        title = 'Total Sales by Invoice Day of Week',
    )
)

- Saturday seems to be an off day for the stores

In [None]:
# Summed TotalAmount per invoice hour.
(
    df['TotalAmount']
    .groupby(df['InvHour'])
    .sum()
    .iplot(
        kind = 'bar',
        yTitle = 'Sales',
        title = 'Total Sales by Hour of the Day',
    )
)

- The stores are open between 6 am and 9 pm

In [None]:
# Quantity sold per invoice month for the five descriptions with the largest
# summed TotalAmount (first five entries of `top_selling`); month/product
# combinations without rows are filled with 0.
top_month = pd.pivot_table(data = df[df['Description'].isin(top_selling[:5].index)], index = ['InvMonth'], 
               columns = 'Description', values = 'Quantity', aggfunc = 'sum', fill_value = 0)

# NOTE(review): layout1 is built here but not passed to the iplot call below,
# so the 600x1100 size is not applied in this cell - confirm whether it
# should be.
layout1 = cf.Layout(
                    height = 600,
                    width = 1100
                    )
top_month.iplot(kind = 'bar', yTitle = 'Quantity Sold', 
               title = 'Total Quantity Sold by Month of top 5 products', 
               )

- 'DOTCOM POSTAGE' has high UnitPrice 

In [None]:
# Mean UnitPrice over all rows of each of the two products.
dotcom_prices = df.loc[df['Description'] == 'DOTCOM POSTAGE', 'UnitPrice']
jumbo_bag_prices = df.loc[df['Description'] == 'JUMBO BAG RED RETROSPOT', 'UnitPrice']

print(f"Avg. UnitPrice of DOTCOM POSTAGE: {dotcom_prices.mean()}")
print(f"Avg. UnitPrice of JUMBO BAG RED: {jumbo_bag_prices.mean()}")

In [None]:
# Show the first two rows of the frame, including the columns added above.
df.head(2)

# WIP..