In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

import seaborn as sns
import plotly.express as px
%matplotlib inline

# Make Plotly work in your Jupyter Notebook
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot 
init_notebook_mode(connected = True)

import cufflinks as cf

# Use Plotly locally
cf.go_offline()

# Import the libraries

In [None]:
# Load the raw Online Retail transactions from the Kaggle input directory.
DATA_PATH = "/kaggle/input/onlineretail/OnlineRetail.csv"
df = pd.read_csv(DATA_PATH, encoding="unicode_escape")
df.head()

## Information about the column type

In [None]:
# Print each column's dtype and non-null count to spot missing values and wrongly typed columns.
df.info()

CustomerID and Description have missing values. InvoiceDate is stored as an object (string) column instead of a datetime.

### Checking total null values

In [None]:
# Number of missing values per column.
df.isna().sum()

#### Drop column

In [None]:
# CustomerID is not used anywhere later in this notebook, so drop it.
# errors="ignore" keeps the cell re-runnable: without it a second execution
# raises KeyError because the column is already gone.
df = df.drop(columns=["CustomerID"], errors="ignore")

### Analysing Description Column

In [None]:
# Replace missing product descriptions with an explicit placeholder label.
df = df.assign(Description=df["Description"].fillna("No Description"))

### Checking null values once again

In [None]:
# Re-check missing values per column after the clean-up steps above.
df.isna().sum()

## Changing Date column type

In [None]:
# Convert InvoiceDate from strings to datetime64 so the .dt accessor can be used below.
invoice_dates = pd.to_datetime(df["InvoiceDate"])
df = df.assign(InvoiceDate=invoice_dates)

## Statistical description of data set

In [None]:
# Summary statistics of the numeric columns, one row per column.
df.describe().transpose()

Here we see that Quantity and UnitPrice contain negative values. We will deal with them later, since these values should ideally be positive.

## Splitting the InvoiceDate column into separate year, month, day and time columns

In [None]:
# Derive calendar parts and an "HH:MM" time-of-day string from the invoice timestamp.
invoice_ts = df["InvoiceDate"]
df = df.assign(
    Year=invoice_ts.dt.year,
    Month=invoice_ts.dt.month,
    Day=invoice_ts.dt.day,
    Time=invoice_ts.dt.strftime("%H:%M"),
)
df.head()

## Segregate day column as weekend or weekday

In [None]:
# Name of the weekday (e.g. "Monday") on which each invoice line was created.
df = df.assign(DayName=df["InvoiceDate"].dt.day_name())

In [None]:
# Number of invoice lines recorded on each weekday.
df["DayName"].value_counts()

In [None]:
# Weekday as a number (Monday=0 ... Sunday=6); 5 and 6 are Saturday and Sunday.
df["DayNum"] = df["InvoiceDate"].dt.weekday
df["IsWeekend"] = df["DayNum"].ge(5).astype(int)
df.head()

## Segregate time column as office time or free time

In [None]:
# Office hours as zero-padded "HH:MM" strings. Because df["Time"] uses the same
# zero-padded format, plain string comparison orders the values chronologically.
start_time = '09:00'
end_time = '17:00'

# Half-open interval [09:00, 17:00): an order placed at exactly 09:00 counts as
# office time, one placed at exactly 17:00 does not.
df["OfficeTime"] = ((df["Time"] >= start_time) & (df["Time"] < end_time)).astype(int)

## Checking negative value

### Quantity

Assuming that quantity should not be negative, we will simply remove the "-" sign from the quantity values (i.e. keep their absolute value).

In [None]:
# Drop the sign of negative quantities. abs() works on the numbers directly
# instead of round-tripping every value through a string.
df["Quantity"] = df["Quantity"].abs()

### Unit Price

In [None]:
# Drop the sign of negative unit prices with abs().
# Removing "-" from the string form is unsafe for floats: a small value is
# rendered in scientific notation (e.g. "1e-05"), and stripping the "-" of the
# exponent would turn it into "1e05".
df["UnitPrice"] = df["UnitPrice"].abs()

## Creating Sale Column

In [None]:
# Revenue of each invoice line: price per unit times number of units.
df["Sale"] = df["UnitPrice"].mul(df["Quantity"])

# EDA

## Popular Countries 

In [None]:
# Histogram of invoice lines per country, bars ordered from most to least frequent.
fig = px.histogram(df, x = "Country")
fig.update_layout(
    plot_bgcolor = "#EEEEEE",
    paper_bgcolor = "#f6f5f5",
    xaxis = dict(
        title_text = "<b> Country </b>",
        # `title_font` replaces the deprecated `titlefont`, which newer Plotly releases reject.
        title_font = dict(size = 12),
        categoryorder = 'total descending'
    ),
    yaxis = dict(
        title_text = "<b>Total orders placed</b>",
        title_font = dict(size = 12)
    ),
    title_text = "<b> Popular country to order online products </b>",
    title_font_size = 16,
    title_font_color = "black",
    title_pad_t = 5,
    title_pad_l = 20,
)

# Draw solid black axis lines on both axes.
fig.update_yaxes(showticklabels = True, showline = True, linewidth = 2, linecolor = "black")
fig.update_xaxes(showticklabels = True, showline = True, linewidth = 2, linecolor = "black")

fig.show()


The United Kingdom places far more orders than any other country. Let us create two dataframes: one for the UK and one for all other countries.

In [None]:
# Rows for the United Kingdom only.
# .copy() makes df_uk an independent frame: a later cell assigns a new "Time"
# column into it, which on a plain slice of df triggers SettingWithCopyWarning.
df_uk = df.loc[df["Country"] == "United Kingdom"].copy()
df_uk.shape

In [None]:
# Rows for every country except the United Kingdom.
is_not_uk = df["Country"].ne("United Kingdom")
df_no_uk = df.loc[is_not_uk]
df_no_uk.shape

In [None]:
# The two country subsets and the labels used for them in the summary table.
datasets = [df_uk, df_no_uk]
names = ['df_uk', 'df_no_uk']

# For each subset, the names of the columns that still contain nulls.
null_columns_per_frame = [
    [col for col, null in frame.isnull().sum().items() if null > 0]
    for frame in datasets
]

# One row per subset: size, total null cells, and which columns hold nulls.
data_info = pd.DataFrame({
    'dataset': names,
    'n_rows': [frame.shape[0] for frame in datasets],
    'n_cols': [frame.shape[1] for frame in datasets],
    'null_amount': [frame.isnull().sum().sum() for frame in datasets],
    'qty_null_columns': [len(cols) for cols in null_columns_per_frame],
    'null_columns': [', '.join(cols) for cols in null_columns_per_frame],
})

data_info.style.background_gradient()

In [None]:
# Share of all rows that belong to the United Kingdom, in percent.
uk_rows, all_rows = df_uk.shape[0], df.shape[0]
total_percent_uk = (uk_rows / all_rows) * 100
print("UK percentage of total data", total_percent_uk)

From the above we can see that the UK accounts for more than 90% of the total data.

# Non UK Countries

## Check total sale in week in Non-UK

In [None]:
# Total sale per country, split by the office-time and weekend flags.
sale_keys = ["Country", "OfficeTime", "IsWeekend"]
df_sale = df_no_uk.groupby(sale_keys, as_index=False)["Sale"].sum()

In [None]:
# OfficeTime is stored as 0/1. Map it to "No"/"Yes" so the keys of
# color_discrete_map actually match the values in the colour column;
# with the raw 0/1 values the custom colours are never applied.
office_labels = {0: "No", 1: "Yes"}
df_sale_plot = df_sale.assign(OfficeTime = df_sale["OfficeTime"].map(office_labels))

# Summed sale per country, coloured by office time, one facet per IsWeekend value.
fig = px.histogram(df_sale_plot, x = "Country", y = "Sale", color = "OfficeTime", facet_col = "IsWeekend",
                   color_discrete_map = {'No': '#000080',
                                         'Yes': '#FF9933'
                   },
                   )
fig.update_layout(
    plot_bgcolor = "#EEEEEE",
    paper_bgcolor = "#f6f5f5",

    yaxis = dict(
        # The y axis shows the summed Sale column, not a count of orders.
        title_text = "<b>Total sale</b>",
        # `title_font` replaces the deprecated `titlefont`, which newer Plotly releases reject.
        title_font = dict(size = 12)
    ),
    title_text = "<b> Orders placed </b>",
    title_font_size = 16,
    title_font_color = "black",
    title_pad_t = 5,
    title_pad_l = 20,
)

# Draw solid black axis lines on every facet and label each x axis.
fig.update_yaxes(showticklabels = True, showline = True, linewidth = 2, linecolor = "black")
fig.update_xaxes(showticklabels = True, showline = True, linewidth = 2, linecolor = "black",
                 title_text = "<b>Country</b>", title_font = dict(size = 12))

fig.show()





From the above graph it seems that most products are ordered during office hours and on weekdays.

## Popular items according to description Non-UK

In [None]:
# Per product description: number of invoice lines and total quantity ordered.
by_description = df_no_uk.groupby(["Description"])
df_des = by_description.agg({"Description": "count", "Quantity": "sum"})

In [None]:
# The aggregated "Description" column holds the line count; give it a clearer name.
df_des = df_des.rename(columns={"Description": "Count"})

In [None]:
# Items ordered by how many invoice lines mention them, most frequent first.
df_des_c = df_des.sort_values(by="Count", ascending=False)
df_des_c

Let us exclude the "POSTAGE" and plot other items.

In [None]:
# Exclude "POSTAGE" by label rather than by position, so the right row is
# dropped even if it is not the first one; then keep the 15 most frequent items.
top_items = df_des_c.drop(index = "POSTAGE", errors = "ignore").head(15)

fig = px.bar(top_items, x = top_items.index, y = "Count")
fig.update_layout(
    plot_bgcolor = "#EEEEEE",
    paper_bgcolor = "#f6f5f5",
    autosize = False,
    width = 1000,
    height = 400,
    xaxis = dict(
        title_text = "<b> Item Name </b>",
        # `title_font` replaces the deprecated `titlefont`, which newer Plotly releases reject.
        title_font = dict(size = 12),
        categoryorder = 'total descending'
    ),
    yaxis = dict(
        title_text = "<b> Item Count </b>",
        title_font = dict(size = 12)
    ),
    title_text = "<b> Count of each item </b>",
    title_font_size = 16,
    title_font_color = "black",
    title_pad_t = 5,
    title_pad_l = 20,
)

# Draw solid black axis lines on both axes.
fig.update_yaxes(showticklabels = True, showline = True, linewidth = 2, linecolor = "black")
fig.update_xaxes(showticklabels = True, showline = True, linewidth = 2, linecolor = "black")

fig.show()

## Which Quantity is maximum according to description

In [None]:
# Items ordered by total quantity purchased, largest first.
df_des_q = df_des.sort_values(by="Quantity", ascending=False)
df_des_q

In [None]:
# The 15 items with the largest total quantity.
top_quantity = df_des_q.head(15)

fig = px.bar(top_quantity, x = top_quantity.index, y = "Quantity")
fig.update_layout(
    plot_bgcolor = "#EEEEEE",
    paper_bgcolor = "#f6f5f5",
    autosize = False,
    width = 1000,
    height = 400,
    xaxis = dict(
        title_text = "<b> Item Name </b>",
        # `title_font` replaces the deprecated `titlefont`, which newer Plotly releases reject.
        title_font = dict(size = 12),
        categoryorder = 'total descending'
    ),
    yaxis = dict(
        title_text = "<b> Quantity Count </b>",
        title_font = dict(size = 12)
    ),
    title_text = "<b> Most Popular item according to quantity </b>",
    title_font_size = 16,
    title_font_color = "black",
    title_pad_t = 5,
    title_pad_l = 20,
)

# Draw solid black axis lines on both axes.
fig.update_yaxes(showticklabels = True, showline = True, linewidth = 2, linecolor = "black")
fig.update_xaxes(showticklabels = True, showline = True, linewidth = 2, linecolor = "black")

fig.show()

## In which month maximum shopping is done

In [None]:
# Per month and country: number of invoice lines (StockCode count) and total sale.
monthly_aggregations = {"StockCode": "count", "Sale": "sum"}
df_month = df_no_uk.groupby(["Month", "Country"], as_index=False).agg(monthly_aggregations)
df_month

In [None]:
# Stacked bars (one segment per country row) of StockCode counts per month.
# A single-colour sequence is enough: Plotly cycles it, so there is no need
# to repeat the colour once per row.
fig = px.bar(df_month, x = "Month", y = "StockCode", color_discrete_sequence = ["#EE4B2B"])
fig.update_layout(
    plot_bgcolor = "#EEEEEE",
    paper_bgcolor = "#f6f5f5",
    autosize = False,
    width = 1000,
    height = 400,
    xaxis = dict(
        title_text = "<b> Month Number </b>",
        # `title_font` replaces the deprecated `titlefont`, which newer Plotly releases reject.
        title_font = dict(size = 12),
        categoryorder = 'total descending'
    ),
    yaxis = dict(
        title_text = "<b> StockCode Count </b>",
        title_font = dict(size = 12)
    ),
    title_text = "<b> Total count of StockCode for each month </b>",
    title_font_size = 16,
    title_font_color = "black",
    title_pad_t = 5,
    title_pad_l = 20,
)

# Draw solid black axis lines on both axes.
fig.update_yaxes(showticklabels = True, showline = True, linewidth = 2, linecolor = "black")
fig.update_xaxes(showticklabels = True, showline = True, linewidth = 2, linecolor = "black")

fig.show()

## Which month has highest sale

In [None]:
# Stacked bars (one segment per country row) of total sale per month.
# A single-colour sequence is enough: Plotly cycles it, so there is no need
# to repeat the colour once per row.
fig = px.bar(df_month, x = "Month", y = "Sale", color_discrete_sequence = ["#0047ab"])
fig.update_layout(
    plot_bgcolor = "#EEEEEE",
    paper_bgcolor = "#f6f5f5",
    autosize = False,
    width = 1000,
    height = 400,
    xaxis = dict(
        title_text = "<b> Month Number </b>",
        # `title_font` replaces the deprecated `titlefont`, which newer Plotly releases reject.
        title_font = dict(size = 12),
        categoryorder = 'total descending'
    ),
    yaxis = dict(
        title_text = "<b> Total Sale </b>",
        title_font = dict(size = 12)
    ),
    title_text = "<b> Month according to their total sale </b>",
    title_font_size = 16,
    title_font_color = "black",
    title_pad_t = 5,
    title_pad_l = 20,
)

# Draw solid black axis lines on both axes.
fig.update_yaxes(showticklabels = True, showline = True, linewidth = 2, linecolor = "black")
fig.update_xaxes(showticklabels = True, showline = True, linewidth = 2, linecolor = "black")

fig.show()

In October the sale is maximum and also the stock code count is highest.

## Which day is most popular in which country

In [None]:
# Number of invoice lines per (country, weekday) pair, as a flat table.
df_con = (
    df_no_uk.groupby(["Country", "DayName"])
    .agg({"DayName": "count"})
    .rename(columns={"DayName": "DayNameCount"})
    .reset_index()
)
df_con

In [None]:
import plotly.graph_objects as go

# One bar segment per (country, weekday) row, stacked by weekday.
# The hover text is built per row so each segment shows its country and count.
fig = go.Figure()
fig.add_trace(go.Bar(x = df_con["DayName"], y = df_con["DayNameCount"],
                     marker = dict(color = "#ab0047"),
                     name = "",
                     hovertemplate =
                         df_con["Country"] +
                         '<br>' +
                         '%{y}',
                     ))

fig.update_layout(
    plot_bgcolor = "#ECECEC",
    title = "<b>Shopping according to days in different countries</b>",
    title_font_size = 16,
    title_font_color = "black",
    title_pad_t = 5,
    title_pad_l = 20,
    xaxis = dict(
        title_text = "<b> Days </b>",
        # `title_font` replaces the deprecated `titlefont`, which newer Plotly releases reject.
        title_font = dict(size = 12),
        categoryorder = 'total descending'
    ),
    yaxis = dict(
        title_text = "<b> Day Count </b>",
        title_font = dict(size = 12),
    ),
    hoverlabel = dict(
        bgcolor = "white",
        font_size = 16,
        font_family = "Rockwell"
    )
)

# Draw solid black axis lines on both axes.
fig.update_yaxes(showticklabels = True, showline = True, linewidth = 2, linecolor = "black")
fig.update_xaxes(showticklabels = True, showline = True, linewidth = 2, linecolor = "black")

fig.show()

## Which country has how much sale

In [None]:
# Total sale and total quantity per non-UK country.
country_totals = {"Sale": "sum", "Quantity": "sum"}
df_con = df_no_uk.groupby("Country", as_index=False).agg(country_totals)
df_con

In [None]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots

x = df_con["Country"]
y1 = df_con["Sale"]
y2 = df_con["Quantity"]

# Sale and Quantity have different scales, so plot them against two y axes
# (Sale on the left axis, Quantity on the right axis).
fig = make_subplots(specs = [[{"secondary_y": True}]])
fig.add_trace(go.Scatter(x = x, y = y1, mode = "lines+markers", name = "Sale",
                         line = dict(color = '#EC2781', width = 2)), secondary_y = False,)
fig.add_trace(go.Scatter(x = x, y = y2, mode = "lines+markers", name = "Quantity",
                         line = dict(color = '#3C8DD6', width = 2)), secondary_y = True,)

fig.update_layout(
    plot_bgcolor = "#ECECEC",
    title = "<b>Total quantity purchased and total sale for each country</b>",
    title_font_size = 16,
    title_font_color = "black",
    title_pad_t = 5,
    title_pad_l = 20,
    xaxis = dict(
        title_text = "<b> Countries </b>",
        # `title_font` replaces the deprecated `titlefont`, which newer Plotly releases reject.
        title_font = dict(size = 12)
    ),

    hoverlabel = dict(
        bgcolor = "white",
        font_size = 16,
        font_family = "Rockwell"
    )
)

# Draw solid black axis lines on all axes.
fig.update_yaxes(showticklabels = True, showline = True, linewidth = 2, linecolor = "black")
fig.update_xaxes(showticklabels = True, showline = True, linewidth = 2, linecolor = "black")

# Set y-axes titles
fig.update_yaxes(title_text = "<b>Sale</b>", secondary_y = False)
fig.update_yaxes(title_text = "<b>Quantity</b>", secondary_y = True)

fig.show()


## Is the product with less unit price ordered more

In [None]:
# Total quantity ordered at each distinct unit price.
df_p = df_no_uk.groupby("UnitPrice", as_index=False)["Quantity"].sum()
df_p

From the above table we see that the most-ordered products have a unit price of zero. This is not realistic, so we can conclude that either:<br>
1. The entry is wrong.
2. Some offer is applicable to those products.

In [None]:
# Total quantity against unit price on a logarithmic price axis.
# Passing the frame and column names (instead of bare Series) makes the hover
# labels show "UnitPrice"/"Quantity" rather than generic "x"/"y".
fig = px.scatter(df_p, x = "UnitPrice", y = "Quantity", log_x = True)

fig.update_layout(
    plot_bgcolor = "#ECECEC",
    title = "<b>Total Quantity of products purchased according to their unit price</b>",
    title_font_size = 16,
    title_font_color = "black",
    title_pad_t = 5,
    title_pad_l = 20,
    xaxis = dict(
        title_text = "<b> Unit Price </b>",
        # `title_font` replaces the deprecated `titlefont`, which newer Plotly releases reject.
        title_font = dict(size = 12)
    ),
    yaxis = dict(
        title_text = "<b> Total Quantity </b>",
        title_font = dict(size = 12)
    ),

    hoverlabel = dict(
        bgcolor = "white",
        font_size = 16,
        font_family = "Rockwell"
    )
)

# Draw solid black axis lines on both axes.
fig.update_yaxes(showticklabels = True, showline = True, linewidth = 2, linecolor = "black")
fig.update_xaxes(showticklabels = True, showline = True, linewidth = 2, linecolor = "black")

fig.show()

## Which country order cheap products more

In [None]:
# For each (country, unit price) pair, count how many invoice lines have that price.
price_line_counts = df_no_uk.groupby(["Country", "UnitPrice"]).agg({"UnitPrice": "count"})
df_con1 = price_line_counts.rename(columns={"UnitPrice": "TotalUnit"}).reset_index()


In [None]:
# One bar segment per (country, unit price) row: height and colour are the
# unit price (log scale); the number of lines at that price is in the hover.
fig = px.bar(df_con1, x = "Country", y = "UnitPrice", color = "UnitPrice",
             hover_data = ["TotalUnit"], log_y = True, color_continuous_scale = px.colors.sequential.Plasma)

fig.update_layout(
    plot_bgcolor = "#ECECEC",
    title = "<b>Items purchased by countries according to their unit price</b>",
    title_font_size = 16,
    title_font_color = "black",
    title_pad_t = 5,
    title_pad_l = 20,
    xaxis = dict(
        title_text = "<b> Countries </b>",
        # `title_font` replaces the deprecated `titlefont`, which newer Plotly releases reject.
        title_font = dict(size = 12)
    ),
    yaxis = dict(
        # The y axis plots UnitPrice; the per-price line count is only in the hover data.
        title_text = "<b> Unit Price </b>",
        title_font = dict(size = 12)
    ),

    hoverlabel = dict(
        bgcolor = "white",
        font_size = 16,
        font_family = "Rockwell"
    )
)

# Draw solid black axis lines on both axes.
fig.update_yaxes(showticklabels = True, showline = True, linewidth = 2, linecolor = "black")
fig.update_xaxes(showticklabels = True, showline = True, linewidth = 2, linecolor = "black")

fig.show()

# UK

## Popular time, day and sale relation UK

In [None]:
# Parse the "HH:MM" strings into timestamps so Plotly can draw Time on a continuous axis.
# assign() returns a new frame instead of writing a column into what may be a
# slice of df, which avoids pandas' SettingWithCopyWarning.
# The explicit format makes parsing unambiguous; the date part is a fixed
# placeholder (1900-01-01) and only the time of day is meaningful.
df_uk = df_uk.assign(Time = pd.to_datetime(df_uk["Time"], format = "%H:%M"))

In [None]:
# Quantity against time of day, one facet row (and one colour) per weekday.
fig = px.scatter(df_uk, x = "Time", y = "Quantity", facet_row = "DayName", color = "DayName",
                 color_discrete_map = {
                     "Sunday": "#f032e6",
                     "Monday": "#e6194B",
                     "Tuesday": "#f58231",
                     "Wednesday": "#ffe119",
                     # Key must match day_name() output exactly; a misspelt key is silently ignored.
                     "Thursday": "#bfef45",
                     "Friday": "#3cb44b",
                 })

# Blank the per-facet annotation labels; the legend already identifies each day.
fig.for_each_annotation(lambda annotation: annotation.update(text = ""))

# Hide the y-axis title of every facet row.
fig.update_yaxes(title_text = "")

fig.update_layout(title = "<b>Distribution of sale each day according to time and quantity</b>",
                  plot_bgcolor = "#ECECEC")

fig.show()

In [None]:
# UK totals of quantity and sale per month, weekday and office-time flag.
uk_month_keys = ["Month", "DayName", "OfficeTime"]
df_mon = df_uk.groupby(uk_month_keys, as_index=False).agg({"Quantity": "sum", "Sale": "sum"})
df_mon

## Total Sales in UK in 2011

In [None]:
# Monthly total sale for UK rows dated in 2011.
uk_2011 = df_uk.loc[df_uk["Year"].eq(2011)]
df_sale = uk_2011.groupby("Month", as_index=False)["Sale"].sum()
df_sale

In [None]:
# Line chart of the monthly UK sale totals for 2011.
fig = go.Figure()
fig.add_trace(go.Scatter(x = df_sale['Month'], y = df_sale["Sale"],
                         mode = 'lines',
                         line = dict(color = 'firebrick', width = 2)
                         ))

fig.update_layout(
    plot_bgcolor = "#ECECEC",
    title = "<b>Sales in 2011</b>",
    # The x axis plots the Month column, so label it accordingly.
    xaxis_title = "Month",
    yaxis_title = "Total Sale")

fig.show()

## Total sales according to product description

In [None]:
# Total quantity per product description in the UK, largest first, renumbered from 0.
uk_quantity_by_item = df_uk.groupby("Description").agg({"Quantity": "sum"})
df_uk_des = uk_quantity_by_item.sort_values(by="Quantity", ascending=False).reset_index()
df_uk_des.head(10)

In [None]:
# Bar chart of the ten UK items with the largest total quantity.
# A single-colour sequence is enough; Plotly cycles it as needed.
fig = px.bar(df_uk_des[:10], x = "Description", y = "Quantity",
             color_discrete_sequence = ["#ff6b00"]
             )
fig.update_layout(
    plot_bgcolor = "#ECECEC",
    # The bold markup needs an opening <b>; a leading </b> leaves the title unformatted.
    title = "<b>Top 10 most ordered items in Uk</b>",
    yaxis_title = "Total Items",
    xaxis_title = "Item Description")

fig.show()

## Sale in 2011 according to DayName and OfficeTime

In [None]:
# UK sale totals for 2011 per weekday and office-time flag.
uk_rows_2011 = df_uk.loc[df_uk["Year"].eq(2011)]
df_uk_sale1 = uk_rows_2011.groupby(["DayName", "OfficeTime"], as_index=False)["Sale"].sum()
df_uk_sale1

In [None]:
# Summed 2011 UK sale per weekday (office-time and free-time rows are added together).
fig = px.histogram(df_uk_sale1, x = "DayName", y = "Sale", color = "DayName",
                   color_discrete_sequence = px.colors.qualitative.G10,
)
fig.update_layout(
    plot_bgcolor = "#ECECEC",
    # The bold markup needs an opening <b>; a leading </b> leaves the title unformatted.
    title = "<b>Total Sales each day in Uk</b>",
    yaxis_title = "Total Sale",
    xaxis_title = "Day")
fig.show()

In [None]:
# 2011 UK sale per office-time flag; each weekday row is one stacked segment.
fig = px.bar(df_uk_sale1, x = "OfficeTime", y = "Sale")
fig.update_layout(
    plot_bgcolor = "#ECECEC",
    # The bold markup needs an opening <b>; a leading </b> leaves the title unformatted.
    title = "<b>Distribution of sales according to office time</b>",
    yaxis_title = "Total Sale",
    xaxis_title = "Office Time")
fig.show()

This notebook tries to generate as many questions as possible for the given dataset. Instead of focusing on data visualization, I have tried to ask questions of the dataset. <br>
If you have other questions that could also be asked, please write them in the comment section. <br>
I will also be publishing a topic modelling notebook based on the Description column of the dataframe. If you have any suggestions or preferred NLP libraries, do comment!!