# MAR21 - Data Cleaning

## Data Gather

### Import libraries and load the dataset

In [1]:
from datetime import datetime
import pandas as pd
from openpyxl import Workbook
import numpy as np

In [2]:
# Load the raw March 2021 sales sheet.
# Forward slashes keep this relative path valid on Windows, macOS and Linux;
# the previous backslash form only resolved on Windows.
df = pd.read_excel("../../data/gatherData/MAR21_DimsumTJOEAN.xlsx")

In [3]:
# Summarise column dtypes and non-null counts of the freshly loaded sheet
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 98 entries, 0 to 97
Data columns (total 10 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   DATE        83 non-null     datetime64[ns]
 1   DEBIT       7 non-null      object        
 2   CREDIT      89 non-null     object        
 3   RECIPIENT   87 non-null     object        
 4   30          48 non-null     float64       
 5   20          20 non-null     float64       
 6   10          20 non-null     float64       
 7   L           62 non-null     float64       
 8   PIC         84 non-null     object        
 9   Unnamed: 9  13 non-null     object        
dtypes: datetime64[ns](1), float64(4), object(5)
memory usage: 7.8+ KB


## Data Cleaning

### Inconsistent Data **#1**

#### Delete `NaN` rows and the inconsistent rows that we don't use

##### Delete `NaN` rows

In [4]:
# Peek at the last rows of the sheet (the displayed tail rows have no DATE)
df.tail()

Unnamed: 0,DATE,DEBIT,CREDIT,RECIPIENT,30,20,10,L,PIC,Unnamed: 9
93,NaT,manda,40 dimsum,TOTAL PENJUALAN TJOEAN,254.0,83.0,93.0,158.0,,
94,NaT,,50 hekeng,11790,7620.0,1660.0,930.0,1580.0,,
95,NaT,ambun,20 dimsum,,,,,,,
96,NaT,,20 hekeng,,,,,,,
97,NaT,PIUTANG,,600,450.0,80.0,70.0,,,


In [5]:
# Index labels of the rows in which every single column is missing
rowsToDrop = df.index[df.isna().all(axis=1)]
rowsToDrop
# Rows near the top can also hold NaN in some columns, but only fully empty rows are selected here

Index([84, 87, 88, 91], dtype='int64')

In [6]:
# Remove the fully empty rows whose labels are held in rowsToDrop
df = df.drop(labels=rowsToDrop, axis="index")

In [7]:
# Re-check the last rows after dropping the fully empty ones
df.tail()
# None of the displayed rows has NaN in every column

Unnamed: 0,DATE,DEBIT,CREDIT,RECIPIENT,30,20,10,L,PIC,Unnamed: 9
93,NaT,manda,40 dimsum,TOTAL PENJUALAN TJOEAN,254.0,83.0,93.0,158.0,,
94,NaT,,50 hekeng,11790,7620.0,1660.0,930.0,1580.0,,
95,NaT,ambun,20 dimsum,,,,,,,
96,NaT,,20 hekeng,,,,,,,
97,NaT,PIUTANG,,600,450.0,80.0,70.0,,,


In [8]:
# Re-check the entry count and non-null counts after dropping the fully empty rows
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 94 entries, 0 to 97
Data columns (total 10 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   DATE        83 non-null     datetime64[ns]
 1   DEBIT       7 non-null      object        
 2   CREDIT      89 non-null     object        
 3   RECIPIENT   87 non-null     object        
 4   30          48 non-null     float64       
 5   20          20 non-null     float64       
 6   10          20 non-null     float64       
 7   L           62 non-null     float64       
 8   PIC         84 non-null     object        
 9   Unnamed: 9  13 non-null     object        
dtypes: datetime64[ns](1), float64(4), object(5)
memory usage: 8.1+ KB


##### Delete the inconsistent rows that we don't use

In [9]:
# Inspect the first rows; in the displayed output row 0 has neither a DATE nor a PIC
df.head()

Unnamed: 0,DATE,DEBIT,CREDIT,RECIPIENT,30,20,10,L,PIC,Unnamed: 9
0,NaT,,29525000,February's,104.0,35.0,31.0,59.0,,
1,2021-03-01,,85000,Intan,,,1.0,1.0,Manda,
2,2021-03-02,,135000,Celvin,,,,3.0,Manda,
3,2021-03-01,,380000,Reni,2.0,,,4.0,Manda,
4,2021-03-05,,135000,Nabila,,,,3.0,Manda,


In [10]:
# Inspect the last six rows; in the displayed output none of them has a DATE or a PIC
df.tail(6)

Unnamed: 0,DATE,DEBIT,CREDIT,RECIPIENT,30,20,10,L,PIC,Unnamed: 9
92,NaT,UTANG,,,,,,,,
93,NaT,manda,40 dimsum,TOTAL PENJUALAN TJOEAN,254.0,83.0,93.0,158.0,,
94,NaT,,50 hekeng,11790,7620.0,1660.0,930.0,1580.0,,
95,NaT,ambun,20 dimsum,,,,,,,
96,NaT,,20 hekeng,,,,,,,
97,NaT,PIUTANG,,600,450.0,80.0,70.0,,,


In [11]:
# Index labels of rows whose 'PIC' is neither Manda nor Ambun
# (a missing PIC also fails the isin() test, so those rows are selected too)
validPIC = ['Manda', 'Ambun']
rowsToDropIsNotPIC = df.index[~df['PIC'].isin(validPIC)]
rowsToDropIsNotPIC

Index([0, 30, 47, 70, 73, 77, 85, 86, 89, 90, 92, 93, 94, 95, 96, 97], dtype='int64')

In [12]:
# Drop the rows without a valid PIC, then renumber the index so it runs
# consecutively from 0 (the old labels are discarded, not kept as a column)
df = df.drop(labels=rowsToDropIsNotPIC, axis="index").reset_index(drop=True)

In [13]:
# Verify the result of the drop and index reset
df.info()
# The index is a contiguous RangeIndex again

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 78 entries, 0 to 77
Data columns (total 10 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   DATE        78 non-null     datetime64[ns]
 1   DEBIT       0 non-null      object        
 2   CREDIT      78 non-null     object        
 3   RECIPIENT   78 non-null     object        
 4   30          42 non-null     float64       
 5   20          14 non-null     float64       
 6   10          13 non-null     float64       
 7   L           54 non-null     float64       
 8   PIC         78 non-null     object        
 9   Unnamed: 9  13 non-null     object        
dtypes: datetime64[ns](1), float64(4), object(5)
memory usage: 6.2+ KB


In [14]:
df.head()
# The unused leading row has been removed

Unnamed: 0,DATE,DEBIT,CREDIT,RECIPIENT,30,20,10,L,PIC,Unnamed: 9
0,2021-03-01,,85000,Intan,,,1.0,1.0,Manda,
1,2021-03-02,,135000,Celvin,,,,3.0,Manda,
2,2021-03-01,,380000,Reni,2.0,,,4.0,Manda,
3,2021-03-05,,135000,Nabila,,,,3.0,Manda,
4,2021-03-01,,100000,Gery,1.0,,,,Manda,


In [15]:
df.tail()
# The unused trailing rows have been removed

Unnamed: 0,DATE,DEBIT,CREDIT,RECIPIENT,30,20,10,L,PIC,Unnamed: 9
73,2021-03-28,,145000,Kak tya (IG),1.0,,,1.0,Manda,
74,2021-03-28,,100000,Mba didi,1.0,,,,Manda,
75,2021-03-28,,100000,Uti for Tante frisbi,1.0,,,,Ambun,
76,2021-03-29,,345000,Ichay,2.0,,3.0,1.0,Ambun,
77,2021-03-31,,200000,Ibu Melina (IG),2.0,,,,Manda,NO SEAFOOD


#### Rename the `30`, `20`, `10`, and `L` columns to make them easier to understand

In [16]:
# Preview the four product columns under their original headers
# (30, 20 and 10 are integer labels; 'L' is a string label)
df[[30, 20, 10, 'L']].head()

Unnamed: 0,30,20,10,L
0,,,1.0,1.0
1,,,,3.0
2,2.0,,,4.0
3,,,,3.0
4,1.0,,,


In [17]:
# Map the terse sheet headers to descriptive product names
productColumnNames = {
    30: 'Shumai 30 Pcs',
    20: 'Shumai 20 Pcs',
    10: 'Shumai 10 Pcs',
    'L': 'Chicken Lumpia 10 Pcs',
}
df = df.rename(columns=productColumnNames)

In [18]:
# Display the product columns under their new names
df[['Shumai 30 Pcs', 'Shumai 20 Pcs', 'Shumai 10 Pcs', 'Chicken Lumpia 10 Pcs']]
# The headers are now self-explanatory

Unnamed: 0,Shumai 30 Pcs,Shumai 20 Pcs,Shumai 10 Pcs,Chicken Lumpia 10 Pcs
0,,,1.0,1.0
1,,,,3.0
2,2.0,,,4.0
3,,,,3.0
4,1.0,,,
...,...,...,...,...
73,1.0,,,1.0
74,1.0,,,
75,1.0,,,
76,2.0,,3.0,1.0


In [19]:
# Confirm the renamed columns appear in the full frame
df.head()

Unnamed: 0,DATE,DEBIT,CREDIT,RECIPIENT,Shumai 30 Pcs,Shumai 20 Pcs,Shumai 10 Pcs,Chicken Lumpia 10 Pcs,PIC,Unnamed: 9
0,2021-03-01,,85000,Intan,,,1.0,1.0,Manda,
1,2021-03-02,,135000,Celvin,,,,3.0,Manda,
2,2021-03-01,,380000,Reni,2.0,,,4.0,Manda,
3,2021-03-05,,135000,Nabila,,,,3.0,Manda,
4,2021-03-01,,100000,Gery,1.0,,,,Manda,


### Missing Value

#### Impute missing values in the `DATE` column with the mode

In [20]:
# Review the non-null counts before touching the DATE column
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 78 entries, 0 to 77
Data columns (total 10 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   DATE                   78 non-null     datetime64[ns]
 1   DEBIT                  0 non-null      object        
 2   CREDIT                 78 non-null     object        
 3   RECIPIENT              78 non-null     object        
 4   Shumai 30 Pcs          42 non-null     float64       
 5   Shumai 20 Pcs          14 non-null     float64       
 6   Shumai 10 Pcs          13 non-null     float64       
 7   Chicken Lumpia 10 Pcs  54 non-null     float64       
 8   PIC                    78 non-null     object        
 9   Unnamed: 9             13 non-null     object        
dtypes: datetime64[ns](1), float64(4), object(5)
memory usage: 6.2+ KB


Unnamed: 0,DATE,DEBIT,CREDIT,RECIPIENT,Shumai 30 Pcs,Shumai 20 Pcs,Shumai 10 Pcs,Chicken Lumpia 10 Pcs,PIC,Unnamed: 9
0,2021-03-01,,85000,Intan,,,1.0,1.0,Manda,
1,2021-03-02,,135000,Celvin,,,,3.0,Manda,
2,2021-03-01,,380000,Reni,2.0,,,4.0,Manda,
3,2021-03-05,,135000,Nabila,,,,3.0,Manda,
4,2021-03-01,,100000,Gery,1.0,,,,Manda,


In [21]:
# Most frequent value(s) of DATE; mode() returns a Series because ties are possible
df['DATE'].mode()

0   2021-03-04
Name: DATE, dtype: datetime64[ns]

In [22]:
# Replace any missing DATE with the most frequent date (first entry of the mode Series).
# NOTE: the preceding df.info() reports no nulls in DATE, so this changes nothing on the current data
mostCommonDate = df['DATE'].mode().iloc[0]
df['DATE'] = df['DATE'].fillna(mostCommonDate)

In [23]:
# DATE shows no missing values after the fill
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 78 entries, 0 to 77
Data columns (total 10 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   DATE                   78 non-null     datetime64[ns]
 1   DEBIT                  0 non-null      object        
 2   CREDIT                 78 non-null     object        
 3   RECIPIENT              78 non-null     object        
 4   Shumai 30 Pcs          42 non-null     float64       
 5   Shumai 20 Pcs          14 non-null     float64       
 6   Shumai 10 Pcs          13 non-null     float64       
 7   Chicken Lumpia 10 Pcs  54 non-null     float64       
 8   PIC                    78 non-null     object        
 9   Unnamed: 9             13 non-null     object        
dtypes: datetime64[ns](1), float64(4), object(5)
memory usage: 6.2+ KB


Unnamed: 0,DATE,DEBIT,CREDIT,RECIPIENT,Shumai 30 Pcs,Shumai 20 Pcs,Shumai 10 Pcs,Chicken Lumpia 10 Pcs,PIC,Unnamed: 9
0,2021-03-01,,85000,Intan,,,1.0,1.0,Manda,
1,2021-03-02,,135000,Celvin,,,,3.0,Manda,
2,2021-03-01,,380000,Reni,2.0,,,4.0,Manda,
3,2021-03-05,,135000,Nabila,,,,3.0,Manda,
4,2021-03-01,,100000,Gery,1.0,,,,Manda,


#### Impute missing values in the `Shumai 10 Pcs`, `Shumai 20 Pcs`, `Shumai 30 Pcs`, and `Chicken Lumpia 10 Pcs` columns with 0

In [24]:
# The four product columns still contain nulls: their non-null counts are below the entry count
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 78 entries, 0 to 77
Data columns (total 10 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   DATE                   78 non-null     datetime64[ns]
 1   DEBIT                  0 non-null      object        
 2   CREDIT                 78 non-null     object        
 3   RECIPIENT              78 non-null     object        
 4   Shumai 30 Pcs          42 non-null     float64       
 5   Shumai 20 Pcs          14 non-null     float64       
 6   Shumai 10 Pcs          13 non-null     float64       
 7   Chicken Lumpia 10 Pcs  54 non-null     float64       
 8   PIC                    78 non-null     object        
 9   Unnamed: 9             13 non-null     object        
dtypes: datetime64[ns](1), float64(4), object(5)
memory usage: 6.2+ KB


Unnamed: 0,DATE,DEBIT,CREDIT,RECIPIENT,Shumai 30 Pcs,Shumai 20 Pcs,Shumai 10 Pcs,Chicken Lumpia 10 Pcs,PIC,Unnamed: 9
0,2021-03-01,,85000,Intan,,,1.0,1.0,Manda,
1,2021-03-02,,135000,Celvin,,,,3.0,Manda,
2,2021-03-01,,380000,Reni,2.0,,,4.0,Manda,
3,2021-03-05,,135000,Nabila,,,,3.0,Manda,
4,2021-03-01,,100000,Gery,1.0,,,,Manda,


In [25]:
# Product quantity columns
listOfProducts = ['Shumai 10 Pcs', 'Shumai 20 Pcs', 'Shumai 30 Pcs', 'Chicken Lumpia 10 Pcs']

# Impute missing quantities with 0 (presumably an empty cell means the product was not ordered).
# fillna already operates on whole columns, so a single vectorised call replaces the
# previous per-row loop, which repeated the same column-wide fill once for every row.
df[listOfProducts] = df[listOfProducts].fillna(0)

df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 78 entries, 0 to 77
Data columns (total 10 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   DATE                   78 non-null     datetime64[ns]
 1   DEBIT                  0 non-null      object        
 2   CREDIT                 78 non-null     object        
 3   RECIPIENT              78 non-null     object        
 4   Shumai 30 Pcs          78 non-null     float64       
 5   Shumai 20 Pcs          78 non-null     float64       
 6   Shumai 10 Pcs          78 non-null     float64       
 7   Chicken Lumpia 10 Pcs  78 non-null     float64       
 8   PIC                    78 non-null     object        
 9   Unnamed: 9             13 non-null     object        
dtypes: datetime64[ns](1), float64(4), object(5)
memory usage: 6.2+ KB


Unnamed: 0,DATE,DEBIT,CREDIT,RECIPIENT,Shumai 30 Pcs,Shumai 20 Pcs,Shumai 10 Pcs,Chicken Lumpia 10 Pcs,PIC,Unnamed: 9
0,2021-03-01,,85000,Intan,0.0,0.0,1.0,1.0,Manda,
1,2021-03-02,,135000,Celvin,0.0,0.0,0.0,3.0,Manda,
2,2021-03-01,,380000,Reni,2.0,0.0,0.0,4.0,Manda,
3,2021-03-05,,135000,Nabila,0.0,0.0,0.0,3.0,Manda,
4,2021-03-01,,100000,Gery,1.0,0.0,0.0,0.0,Manda,


### Inconsistent Data **#2**

#### Convert the product sales columns to integers

In [26]:
# Before converting the dtypes, confirm that the product columns are not already integers
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 78 entries, 0 to 77
Data columns (total 10 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   DATE                   78 non-null     datetime64[ns]
 1   DEBIT                  0 non-null      object        
 2   CREDIT                 78 non-null     object        
 3   RECIPIENT              78 non-null     object        
 4   Shumai 30 Pcs          78 non-null     float64       
 5   Shumai 20 Pcs          78 non-null     float64       
 6   Shumai 10 Pcs          78 non-null     float64       
 7   Chicken Lumpia 10 Pcs  78 non-null     float64       
 8   PIC                    78 non-null     object        
 9   Unnamed: 9             13 non-null     object        
dtypes: datetime64[ns](1), float64(4), object(5)
memory usage: 6.2+ KB


In [27]:
# Product quantity columns
listOfProducts = ['Shumai 10 Pcs', 'Shumai 20 Pcs', 'Shumai 30 Pcs', 'Chicken Lumpia 10 Pcs']

# Cast the quantities from float64 to int64 in one call.
# astype converts an entire column at once, so the previous per-row loop only repeated
# the same conversion; a {column: dtype} mapping handles all four columns together.
# The cast requires the columns to be free of NaN, which the zero-imputation step guarantees.
df = df.astype({col: 'int64' for col in listOfProducts})

df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 78 entries, 0 to 77
Data columns (total 10 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   DATE                   78 non-null     datetime64[ns]
 1   DEBIT                  0 non-null      object        
 2   CREDIT                 78 non-null     object        
 3   RECIPIENT              78 non-null     object        
 4   Shumai 30 Pcs          78 non-null     int64         
 5   Shumai 20 Pcs          78 non-null     int64         
 6   Shumai 10 Pcs          78 non-null     int64         
 7   Chicken Lumpia 10 Pcs  78 non-null     int64         
 8   PIC                    78 non-null     object        
 9   Unnamed: 9             13 non-null     object        
dtypes: datetime64[ns](1), int64(4), object(5)
memory usage: 6.2+ KB


Unnamed: 0,DATE,DEBIT,CREDIT,RECIPIENT,Shumai 30 Pcs,Shumai 20 Pcs,Shumai 10 Pcs,Chicken Lumpia 10 Pcs,PIC,Unnamed: 9
0,2021-03-01,,85000,Intan,0,0,1,1,Manda,
1,2021-03-02,,135000,Celvin,0,0,0,3,Manda,
2,2021-03-01,,380000,Reni,2,0,0,4,Manda,
3,2021-03-05,,135000,Nabila,0,0,0,3,Manda,
4,2021-03-01,,100000,Gery,1,0,0,0,Manda,


## Data Selection

### Select certain data

In [28]:
# Review all columns before choosing which ones to keep
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 78 entries, 0 to 77
Data columns (total 10 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   DATE                   78 non-null     datetime64[ns]
 1   DEBIT                  0 non-null      object        
 2   CREDIT                 78 non-null     object        
 3   RECIPIENT              78 non-null     object        
 4   Shumai 30 Pcs          78 non-null     int64         
 5   Shumai 20 Pcs          78 non-null     int64         
 6   Shumai 10 Pcs          78 non-null     int64         
 7   Chicken Lumpia 10 Pcs  78 non-null     int64         
 8   PIC                    78 non-null     object        
 9   Unnamed: 9             13 non-null     object        
dtypes: datetime64[ns](1), int64(4), object(5)
memory usage: 6.2+ KB


Unnamed: 0,DATE,DEBIT,CREDIT,RECIPIENT,Shumai 30 Pcs,Shumai 20 Pcs,Shumai 10 Pcs,Chicken Lumpia 10 Pcs,PIC,Unnamed: 9
0,2021-03-01,,85000,Intan,0,0,1,1,Manda,
1,2021-03-02,,135000,Celvin,0,0,0,3,Manda,
2,2021-03-01,,380000,Reni,2,0,0,4,Manda,
3,2021-03-05,,135000,Nabila,0,0,0,3,Manda,
4,2021-03-01,,100000,Gery,1,0,0,0,Manda,


In [29]:
# Discard the columns that are not used downstream
unusedColumns = ['DEBIT', 'CREDIT', 'RECIPIENT', 'PIC']
df = df.drop(labels=unusedColumns, axis="columns")

# Discard every column whose header starts with 'Unnamed'
isNamedColumn = ~df.columns.str.startswith('Unnamed')
df = df.loc[:, isNamedColumn]

df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 78 entries, 0 to 77
Data columns (total 5 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   DATE                   78 non-null     datetime64[ns]
 1   Shumai 30 Pcs          78 non-null     int64         
 2   Shumai 20 Pcs          78 non-null     int64         
 3   Shumai 10 Pcs          78 non-null     int64         
 4   Chicken Lumpia 10 Pcs  78 non-null     int64         
dtypes: datetime64[ns](1), int64(4)
memory usage: 3.2 KB


Unnamed: 0,DATE,Shumai 30 Pcs,Shumai 20 Pcs,Shumai 10 Pcs,Chicken Lumpia 10 Pcs
0,2021-03-01,0,0,1,1
1,2021-03-02,0,0,0,3
2,2021-03-01,2,0,0,4
3,2021-03-05,0,0,0,3
4,2021-03-01,1,0,0,0


## Export the DataFrame to an Excel file

In [30]:
# Write the cleaned DataFrame to an Excel workbook, omitting the index column.
# to_excel returns None, so its result is deliberately NOT assigned back to df
# (the previous `df = df.to_excel(...)` replaced the DataFrame with None).
# Forward slashes keep the relative path valid on Windows, macOS and Linux.
df.to_excel("../../data/dataCleaningOne/MAR21_dataCleaning.xlsx", index=False)

In [31]:
# selecting columns where column name contains 'Average' string
# df.filter(like='Average')