# Analyse Datasets

## Import Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import sys

In [2]:
# Load the order export into df_or and preview its first five rows.
order_report_csv = 'datasets/OrderReport.csv'
df_or = pd.read_csv(order_report_csv)
print(df_or.head(n=5))

                  Date  Order # N. Revenue (formatted)      Status  \
0  2023-10-26 14:50:17    16698                RM57.80  processing   
1  2023-10-26 13:53:47    16697                RM32.90  processing   
2  2023-10-26 13:04:17    16696                RM32.90  processing   
3  2023-10-26 13:00:49    16695                RM82.70  processing   
4  2023-10-26 12:47:36    16694                RM32.90  processing   

                               Customer Customer type  \
0  AINAA SYAFIQAH ABD RAHMAN ABD RAHMAN           new   
1                              Ja Hasim           new   
2                             nasy anis           new   
3   Herny erdawati binti Mohamad Rashed           new   
4                      Suhaina Sulaiman           new   

                                          Product(s)  Items sold  Coupon(s)  \
0  1× LIP BOOSTER - WATERMELON, 1× LIP BOOSTER - ...           2        NaN   
1                             1× LIP BOOSTER - MELON           1        NaN   

In [3]:
# Load the product list into df_pl and preview its first five rows.
product_list_csv = 'datasets/ProductList.csv'
df_pl = pd.read_csv(product_list_csv)
print(df_pl.head(n=5))

                      Product / Variation title    SKU  Items sold  \
0                      LIP BOOSTER - WATERMELON  00597        2590   
1                      LIP BOOSTER - STRAWBERRY  00602        2563   
2  COLLAGEN SERUM FOUNDATION ALHA ALFA - MEDIUM  00608        1067   
3   COLLAGEN SERUM FOUNDATION ALHA ALFA - LIGHT  00607         728   
4                           LIP BOOSTER - MELON  00610         651   

   N. Revenue  Orders    Status  Stock  
0     63270.9    2299  In stock     11  
1     63768.9    2280  In stock    134  
2      7435.1    1005  In stock     33  
3      7135.7     690  In stock     41  
4     16209.9     587  In stock    177  


In [4]:
# Load the daily revenue report into df_rr and preview its first five rows.
revenue_report_csv = 'datasets/RevenueReport.csv'
df_rr = pd.read_csv(revenue_report_csv)
print(df_rr.head(n=5))

                  Date  Orders  Gross sales  Returns  Coupons  Net sales  \
0  2023-10-26 00:00:00      12        917.7      0.0      0.0      917.7   
1  2023-10-25 00:00:00      31       1664.7      0.0      0.0     1664.7   
2  2023-10-24 00:00:00      11        803.0      0.0      0.0      803.0   
3  2023-10-23 00:00:00      16        867.5      0.0      0.0      867.5   
4  2023-10-22 00:00:00      17        986.9      0.0      0.0      986.9   

   Taxes  Shipping  Total sales  
0    0.0     100.0       1017.7  
1    0.0     264.0       1928.7  
2    0.0      96.0        899.0  
3    0.0     136.0       1003.5  
4    0.0     144.0       1130.9  


In [5]:
# Report the (rows, columns) shape of each dataset, one line per dataset.
for label, frame in (
    ("OrderReport: ", df_or),
    ("ProductList: ", df_pl),
    ("RevenueReport: ", df_rr),
):
    print(label, frame.shape)

OrderReport:  (8015, 11)
ProductList:  (111, 7)
RevenueReport:  (299, 9)


In [6]:
# Count the missing values in every column of each dataset.
for label, frame in (
    ("OrderReport: \n", df_or),
    ("ProductList: \n", df_pl),
    ("RevenueReport: \n", df_rr),
):
    missing_per_column = frame.isna().sum()
    print(label, missing_per_column)

OrderReport: 
 Date                         0
Order #                      0
N. Revenue (formatted)       0
Status                       0
Customer                     0
Customer type                0
Product(s)                   0
Items sold                   0
Coupon(s)                 8015
N. Revenue                   0
State                        0
dtype: int64
ProductList: 
 Product / Variation title    0
SKU                          2
Items sold                   0
N. Revenue                   0
Orders                       0
Status                       0
Stock                        0
dtype: int64
RevenueReport: 
 Date           0
Orders         0
Gross sales    0
Returns        0
Coupons        0
Net sales      0
Taxes          0
Shipping       0
Total sales    0
dtype: int64


In [7]:
# Count the distinct values in every column of each dataset.
for label, frame in (
    ("OrderReport: \n", df_or),
    ("ProductList: \n", df_pl),
    ("RevenueReport: \n", df_rr),
):
    distinct_per_column = frame.nunique()
    print(label, distinct_per_column)

OrderReport: 
 Date                      8010
Order #                   8015
N. Revenue (formatted)     212
Status                       2
Customer                  7240
Customer type                2
Product(s)                1115
Items sold                  16
Coupon(s)                    0
N. Revenue                 170
State                       16
dtype: int64
ProductList: 
 Product / Variation title    111
SKU                          109
Items sold                    50
N. Revenue                    81
Orders                        54
Status                         2
Stock                         48
dtype: int64
RevenueReport: 
 Date           290
Orders          80
Gross sales    281
Returns          1
Coupons          1
Net sales      281
Taxes            1
Shipping       118
Total sales    287
dtype: int64


In [8]:
# Show the dtype pandas assigned to every column of each dataset.
for label, frame in (
    ("OrderReport: \n", df_or),
    ("ProductList: \n", df_pl),
    ("RevenueReport: \n", df_rr),
):
    print(label, frame.dtypes)


OrderReport: 
 Date                       object
Order #                     int64
N. Revenue (formatted)     object
Status                     object
Customer                   object
Customer type              object
Product(s)                 object
Items sold                  int64
Coupon(s)                 float64
N. Revenue                float64
State                      object
dtype: object
ProductList: 
 Product / Variation title     object
SKU                           object
Items sold                     int64
N. Revenue                   float64
Orders                         int64
Status                        object
Stock                          int64
dtype: object
RevenueReport: 
 Date            object
Orders           int64
Gross sales    float64
Returns        float64
Coupons        float64
Net sales      float64
Taxes          float64
Shipping       float64
Total sales    float64
dtype: object


In [9]:
# Count the rows in each dataset that exactly repeat an earlier row.
for label, frame in (
    ("OrderReport: ", df_or),
    ("ProductList: ", df_pl),
    ("RevenueReport: ", df_rr),
):
    duplicate_rows = frame.duplicated().sum()
    print(label, duplicate_rows)


OrderReport:  0
ProductList:  0
RevenueReport:  0


In [10]:
# Drop every column that contains nothing but nulls, in all three datasets.
df_or, df_pl, df_rr = [
    frame.dropna(axis='columns', how='all')
    for frame in (df_or, df_pl, df_rr)
]

In [11]:
# Preview the first five rows of each dataset.
for label, frame in (
    ("OrderReport: \n", df_or),
    ("ProductList: \n", df_pl),
    ("RevenueReport: \n", df_rr),
):
    print(label, frame.head(n=5))


OrderReport: 
                   Date  Order # N. Revenue (formatted)      Status  \
0  2023-10-26 14:50:17    16698                RM57.80  processing   
1  2023-10-26 13:53:47    16697                RM32.90  processing   
2  2023-10-26 13:04:17    16696                RM32.90  processing   
3  2023-10-26 13:00:49    16695                RM82.70  processing   
4  2023-10-26 12:47:36    16694                RM32.90  processing   

                               Customer Customer type  \
0  AINAA SYAFIQAH ABD RAHMAN ABD RAHMAN           new   
1                              Ja Hasim           new   
2                             nasy anis           new   
3   Herny erdawati binti Mohamad Rashed           new   
4                      Suhaina Sulaiman           new   

                                          Product(s)  Items sold  N. Revenue  \
0  1× LIP BOOSTER - WATERMELON, 1× LIP BOOSTER - ...           2        49.8   
1                             1× LIP BOOSTER - MELON         

In [12]:
# In the OrderReport dataset, all products bought in one order are packed into
# the single "Product(s)" column as one comma-separated string, so convert each
# cell into a list with one entry per product line.
#
# - Every entry is stripped: the raw text separates products with ", ", so a
#   plain split(',') would leave a leading space on each entry after the first.
# - Cells that are not strings (e.g. already a list because this cell was run
#   before) are passed through unchanged, so re-running the cell does not raise.
#
# NOTE(review): assumes a product title never contains a comma itself --
# confirm against ProductList before relying on the split.
df_or['Product(s)'] = df_or['Product(s)'].apply(
    lambda cell: [item.strip() for item in cell.split(',')]
    if isinstance(cell, str)
    else cell
)
print(df_or.head())

                  Date  Order # N. Revenue (formatted)      Status  \
0  2023-10-26 14:50:17    16698                RM57.80  processing   
1  2023-10-26 13:53:47    16697                RM32.90  processing   
2  2023-10-26 13:04:17    16696                RM32.90  processing   
3  2023-10-26 13:00:49    16695                RM82.70  processing   
4  2023-10-26 12:47:36    16694                RM32.90  processing   

                               Customer Customer type  \
0  AINAA SYAFIQAH ABD RAHMAN ABD RAHMAN           new   
1                              Ja Hasim           new   
2                             nasy anis           new   
3   Herny erdawati binti Mohamad Rashed           new   
4                      Suhaina Sulaiman           new   

                                          Product(s)  Items sold  N. Revenue  \
0  [1× LIP BOOSTER - WATERMELON,  1× LIP BOOSTER ...           2        49.8   
1                           [1× LIP BOOSTER - MELON]           1        24.9