# JAN21 - Data Cleaning

## Data Gathering

### Import the libraries and load the dataset

In [1]:
from datetime import datetime
import pandas as pd
from openpyxl import Workbook
import numpy as np

In [2]:
# Read the JAN21 workbook into a DataFrame.
# NOTE(review): the backslash-separated relative path presumably only resolves on Windows — confirm the target platform.
jan21_workbook = r"..\..\data\gatherData\JAN21_DimsumTJOEAN.xlsx"
df = pd.read_excel(jan21_workbook)

In [3]:
# Inspect column names, non-null counts and dtypes of the freshly loaded frame
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 121 entries, 0 to 120
Data columns (total 8 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   DATE       73 non-null     datetime64[ns]
 1   DEBIT      2 non-null      object        
 2   CREDIT     115 non-null    object        
 3   RECIPIENT  112 non-null    object        
 4   30         63 non-null     float64       
 5   20         29 non-null     float64       
 6   10         26 non-null     float64       
 7   PIC        105 non-null    object        
dtypes: datetime64[ns](1), float64(3), object(4)
memory usage: 7.7+ KB


## Data Cleaning

### Inconsistent Data **#1**

#### Delete the rows from the first through the last all-`NaN` row

In [4]:
# Show the last 20 rows to inspect what sits at the bottom of the sheet
df.tail(20)

Unnamed: 0,DATE,DEBIT,CREDIT,RECIPIENT,30,20,10,PIC
101,2021-01-31,,100000,Mba diandra,1.0,,,Manda
102,2021-01-31,,300000,Melina (IG),2.0,1.0,1.0,Manda
103,2021-01-31,,100000,Nacil (temen ilham),1.0,,,Ambun
104,NaT,,,,,,,
105,NaT,,13010000,,92.0,32.0,35.0,TOTAL
106,NaT,#TJOEAN,,,,,,
107,NaT,,,,,,,
108,NaT,,,,,,,
109,NaT,,,,,,,
110,NaT,Promotion Budget,10,Ayu,,,,


In [5]:
# Index labels of the rows in which every column is NaN
rowsToDrop = df[df.isnull().all(axis=1)].index

# Drop everything from the first all-NaN row through the last all-NaN row
# (inclusive); this also removes the non-empty rows sandwiched between them.
# The label range relies on the integer RangeIndex the frame was loaded with.
# Guard against "no all-NaN rows" so that re-running this cell is a no-op
# instead of raising IndexError on rowsToDrop[0].
if len(rowsToDrop) > 0:
    df = df.drop(index=range(rowsToDrop[0], rowsToDrop[-1] + 1))

In [6]:
df.tail(3)
# No row with every column NaN is left; the index labels now jump from 103 to 120

Unnamed: 0,DATE,DEBIT,CREDIT,RECIPIENT,30,20,10,PIC
102,2021-01-31,,300000,Melina (IG),2.0,1.0,1.0,Manda
103,2021-01-31,,100000,Nacil (temen ilham),1.0,,,Ambun
120,NaT,,tante naz+buica udah di tambahin di debit un ?,,,,,


In [7]:
# Renumber the index 0..n-1 so it is contiguous again; drop=True discards the
# old labels instead of keeping them as a new column
df = df.reset_index(drop=True)

In [8]:
df.tail(3)
# The index was renumbered successfully: the last labels are contiguous again

Unnamed: 0,DATE,DEBIT,CREDIT,RECIPIENT,30,20,10,PIC
102,2021-01-31,,300000,Melina (IG),2.0,1.0,1.0,Manda
103,2021-01-31,,100000,Nacil (temen ilham),1.0,,,Ambun
104,NaT,,tante naz+buica udah di tambahin di debit un ?,,,,,


#### Delete the last row, which we don't use

In [9]:
# Remove the final row (the trailing free-text note) by its index label
last_label = df.index[-1]
df = df.drop(index=last_label)

In [10]:
df.tail(3)
# The unused last row has been removed

Unnamed: 0,DATE,DEBIT,CREDIT,RECIPIENT,30,20,10,PIC
101,2021-01-31,,100000,Mba diandra,1.0,,,Manda
102,2021-01-31,,300000,Melina (IG),2.0,1.0,1.0,Manda
103,2021-01-31,,100000,Nacil (temen ilham),1.0,,,Ambun


#### Rename the `30`, `20`, `10` columns to make them easier to understand

In [11]:
# Preview the three columns whose labels are the integers 30, 20 and 10
df[[30, 20, 10]].head()

Unnamed: 0,30,20,10
0,1.0,,
1,2.0,,
2,1.0,,
3,1.0,,
4,,1.0,


In [12]:
# Give the integer-labelled columns descriptive product names
product_names = {
    30: 'Shumai 30 Pcs',
    20: 'Shumai 20 Pcs',
    10: 'Shumai 10 Pcs',
}
df = df.rename(columns=product_names)

In [13]:
df[['Shumai 30 Pcs', 'Shumai 20 Pcs', 'Shumai 10 Pcs']]
# The product columns now carry descriptive names

Unnamed: 0,Shumai 30 Pcs,Shumai 20 Pcs,Shumai 10 Pcs
0,1.0,,
1,2.0,,
2,1.0,,
3,1.0,,
4,,1.0,
...,...,...,...
99,2.0,,
100,1.0,,
101,1.0,,
102,2.0,1.0,1.0


In [14]:
# Preview the first rows of the full frame after the rename
df.head()

Unnamed: 0,DATE,DEBIT,CREDIT,RECIPIENT,Shumai 30 Pcs,Shumai 20 Pcs,Shumai 10 Pcs,PIC
0,NaT,,100000,Intan (pack of 30's),1.0,,,Manda
1,NaT,,200000,Steffie (pack of 60's),2.0,,,Manda
2,NaT,,100000,Ajis (pack of 30's),1.0,,,Manda
3,NaT,,100000,Riri (pack of 30's),1.0,,,Manda
4,2021-01-14,,80000,Afuza (pack of 20's),,1.0,,Ambun


### Missing Value

#### Impute missing values in the `DATE` column with the mode

In [15]:
# Check the non-null counts before imputing, then preview the first rows
df.info();
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 104 entries, 0 to 103
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   DATE           73 non-null     datetime64[ns]
 1   DEBIT          0 non-null      object        
 2   CREDIT         104 non-null    object        
 3   RECIPIENT      104 non-null    object        
 4   Shumai 30 Pcs  62 non-null     float64       
 5   Shumai 20 Pcs  28 non-null     float64       
 6   Shumai 10 Pcs  25 non-null     float64       
 7   PIC            104 non-null    object        
dtypes: datetime64[ns](1), float64(3), object(4)
memory usage: 6.6+ KB


Unnamed: 0,DATE,DEBIT,CREDIT,RECIPIENT,Shumai 30 Pcs,Shumai 20 Pcs,Shumai 10 Pcs,PIC
0,NaT,,100000,Intan (pack of 30's),1.0,,,Manda
1,NaT,,200000,Steffie (pack of 60's),2.0,,,Manda
2,NaT,,100000,Ajis (pack of 30's),1.0,,,Manda
3,NaT,,100000,Riri (pack of 30's),1.0,,,Manda
4,2021-01-14,,80000,Afuza (pack of 20's),,1.0,,Ambun


In [16]:
# List the most frequent DATE value(s); mode() returns every value tied for the top count
df['DATE'].mode()

0   2021-01-29
1   2021-01-31
Name: DATE, dtype: datetime64[ns]

In [17]:
# Fill the missing `DATE` entries with the first value returned by mode()
most_frequent_date = df['DATE'].mode()[0]
df['DATE'] = df['DATE'].fillna(most_frequent_date)

In [18]:
# Verify that DATE no longer has missing values, then preview the first rows
df.info();
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 104 entries, 0 to 103
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   DATE           104 non-null    datetime64[ns]
 1   DEBIT          0 non-null      object        
 2   CREDIT         104 non-null    object        
 3   RECIPIENT      104 non-null    object        
 4   Shumai 30 Pcs  62 non-null     float64       
 5   Shumai 20 Pcs  28 non-null     float64       
 6   Shumai 10 Pcs  25 non-null     float64       
 7   PIC            104 non-null    object        
dtypes: datetime64[ns](1), float64(3), object(4)
memory usage: 6.6+ KB


Unnamed: 0,DATE,DEBIT,CREDIT,RECIPIENT,Shumai 30 Pcs,Shumai 20 Pcs,Shumai 10 Pcs,PIC
0,2021-01-29,,100000,Intan (pack of 30's),1.0,,,Manda
1,2021-01-29,,200000,Steffie (pack of 60's),2.0,,,Manda
2,2021-01-29,,100000,Ajis (pack of 30's),1.0,,,Manda
3,2021-01-29,,100000,Riri (pack of 30's),1.0,,,Manda
4,2021-01-14,,80000,Afuza (pack of 20's),,1.0,,Ambun


#### Impute missing values of the `Shumai 10 Pcs`, `Shumai 20 Pcs`, and `Shumai 30 Pcs` columns with 0

In [19]:
df.info();
df.head()
# The three product columns still contain null values

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 104 entries, 0 to 103
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   DATE           104 non-null    datetime64[ns]
 1   DEBIT          0 non-null      object        
 2   CREDIT         104 non-null    object        
 3   RECIPIENT      104 non-null    object        
 4   Shumai 30 Pcs  62 non-null     float64       
 5   Shumai 20 Pcs  28 non-null     float64       
 6   Shumai 10 Pcs  25 non-null     float64       
 7   PIC            104 non-null    object        
dtypes: datetime64[ns](1), float64(3), object(4)
memory usage: 6.6+ KB


Unnamed: 0,DATE,DEBIT,CREDIT,RECIPIENT,Shumai 30 Pcs,Shumai 20 Pcs,Shumai 10 Pcs,PIC
0,2021-01-29,,100000,Intan (pack of 30's),1.0,,,Manda
1,2021-01-29,,200000,Steffie (pack of 60's),2.0,,,Manda
2,2021-01-29,,100000,Ajis (pack of 30's),1.0,,,Manda
3,2021-01-29,,100000,Riri (pack of 30's),1.0,,,Manda
4,2021-01-14,,80000,Afuza (pack of 20's),,1.0,,Ambun


In [20]:
# Product quantity columns
listOfProducts = ['Shumai 10 Pcs', 'Shumai 20 Pcs', 'Shumai 30 Pcs']

# Replace missing quantities with 0 in the product columns only.
# fillna already works on whole columns, so one call with a per-column
# mapping is enough; repeating it once per row did redundant work.
df = df.fillna(value=dict.fromkeys(listOfProducts, 0))

df.info();
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 104 entries, 0 to 103
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   DATE           104 non-null    datetime64[ns]
 1   DEBIT          0 non-null      object        
 2   CREDIT         104 non-null    object        
 3   RECIPIENT      104 non-null    object        
 4   Shumai 30 Pcs  104 non-null    float64       
 5   Shumai 20 Pcs  104 non-null    float64       
 6   Shumai 10 Pcs  104 non-null    float64       
 7   PIC            104 non-null    object        
dtypes: datetime64[ns](1), float64(3), object(4)
memory usage: 6.6+ KB


Unnamed: 0,DATE,DEBIT,CREDIT,RECIPIENT,Shumai 30 Pcs,Shumai 20 Pcs,Shumai 10 Pcs,PIC
0,2021-01-29,,100000,Intan (pack of 30's),1.0,0.0,0.0,Manda
1,2021-01-29,,200000,Steffie (pack of 60's),2.0,0.0,0.0,Manda
2,2021-01-29,,100000,Ajis (pack of 30's),1.0,0.0,0.0,Manda
3,2021-01-29,,100000,Riri (pack of 30's),1.0,0.0,0.0,Manda
4,2021-01-14,,80000,Afuza (pack of 20's),0.0,1.0,0.0,Ambun


### Inconsistent Data **#2**

#### Convert the product sales columns to integers

In [21]:
# Before converting the dtypes, confirm that the product columns are not already int
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 104 entries, 0 to 103
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   DATE           104 non-null    datetime64[ns]
 1   DEBIT          0 non-null      object        
 2   CREDIT         104 non-null    object        
 3   RECIPIENT      104 non-null    object        
 4   Shumai 30 Pcs  104 non-null    float64       
 5   Shumai 20 Pcs  104 non-null    float64       
 6   Shumai 10 Pcs  104 non-null    float64       
 7   PIC            104 non-null    object        
dtypes: datetime64[ns](1), float64(3), object(4)
memory usage: 6.6+ KB


In [22]:
# Product quantity columns
listOfProducts = ['Shumai 10 Pcs', 'Shumai 20 Pcs', 'Shumai 30 Pcs']

# Cast the product columns to int64 in one pass.
# astype accepts a column -> dtype mapping, so no per-column or per-row loop
# is needed; the other columns keep their dtypes.
df = df.astype(dict.fromkeys(listOfProducts, 'int64'))

df.info();
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 104 entries, 0 to 103
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   DATE           104 non-null    datetime64[ns]
 1   DEBIT          0 non-null      object        
 2   CREDIT         104 non-null    object        
 3   RECIPIENT      104 non-null    object        
 4   Shumai 30 Pcs  104 non-null    int64         
 5   Shumai 20 Pcs  104 non-null    int64         
 6   Shumai 10 Pcs  104 non-null    int64         
 7   PIC            104 non-null    object        
dtypes: datetime64[ns](1), int64(3), object(4)
memory usage: 6.6+ KB


Unnamed: 0,DATE,DEBIT,CREDIT,RECIPIENT,Shumai 30 Pcs,Shumai 20 Pcs,Shumai 10 Pcs,PIC
0,2021-01-29,,100000,Intan (pack of 30's),1,0,0,Manda
1,2021-01-29,,200000,Steffie (pack of 60's),2,0,0,Manda
2,2021-01-29,,100000,Ajis (pack of 30's),1,0,0,Manda
3,2021-01-29,,100000,Riri (pack of 30's),1,0,0,Manda
4,2021-01-14,,80000,Afuza (pack of 20's),0,1,0,Ambun


## Data Selection

### Select certain columns

In [25]:
# Review the current columns and dtypes before selecting the ones to keep
df.info();
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 104 entries, 0 to 103
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   DATE           104 non-null    datetime64[ns]
 1   DEBIT          0 non-null      object        
 2   CREDIT         104 non-null    object        
 3   RECIPIENT      104 non-null    object        
 4   Shumai 30 Pcs  104 non-null    int64         
 5   Shumai 20 Pcs  104 non-null    int64         
 6   Shumai 10 Pcs  104 non-null    int64         
 7   PIC            104 non-null    object        
dtypes: datetime64[ns](1), int64(3), object(4)
memory usage: 6.6+ KB


Unnamed: 0,DATE,DEBIT,CREDIT,RECIPIENT,Shumai 30 Pcs,Shumai 20 Pcs,Shumai 10 Pcs,PIC
0,2021-01-29,,100000,Intan (pack of 30's),1,0,0,Manda
1,2021-01-29,,200000,Steffie (pack of 60's),2,0,0,Manda
2,2021-01-29,,100000,Ajis (pack of 30's),1,0,0,Manda
3,2021-01-29,,100000,Riri (pack of 30's),1,0,0,Manda
4,2021-01-14,,80000,Afuza (pack of 20's),0,1,0,Ambun


In [26]:
# Remove `DEBIT`, `CREDIT`, `RECIPIENT` and `PIC`, which are not used from here on
unused_columns = ['DEBIT', 'CREDIT', 'RECIPIENT', 'PIC']
df = df.drop(columns=unused_columns)

In [27]:
df.info();
df.head()
# The four unused columns were removed successfully

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 104 entries, 0 to 103
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   DATE           104 non-null    datetime64[ns]
 1   Shumai 30 Pcs  104 non-null    int64         
 2   Shumai 20 Pcs  104 non-null    int64         
 3   Shumai 10 Pcs  104 non-null    int64         
dtypes: datetime64[ns](1), int64(3)
memory usage: 3.4 KB


Unnamed: 0,DATE,Shumai 30 Pcs,Shumai 20 Pcs,Shumai 10 Pcs
0,2021-01-29,1,0,0
1,2021-01-29,2,0,0
2,2021-01-29,1,0,0
3,2021-01-29,1,0,0
4,2021-01-14,0,1,0


### Create a new column for Chicken Lumpia

In [28]:
# Add the Chicken Lumpia column, with 0 in every row
lumpia_column = 'Chicken Lumpia 10 Pcs'
df[lumpia_column] = 0

In [29]:
# Display the frame to confirm the new column is present
df

Unnamed: 0,DATE,Shumai 30 Pcs,Shumai 20 Pcs,Shumai 10 Pcs,Chicken Lumpia 10 Pcs
0,2021-01-29,1,0,0,0
1,2021-01-29,2,0,0,0
2,2021-01-29,1,0,0,0
3,2021-01-29,1,0,0,0
4,2021-01-14,0,1,0,0
...,...,...,...,...,...
99,2021-01-31,2,0,0,0
100,2021-01-31,1,0,0,0
101,2021-01-31,1,0,0,0
102,2021-01-31,2,1,1,0


In [30]:
# Final check of columns, non-null counts and dtypes before exporting
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 104 entries, 0 to 103
Data columns (total 5 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   DATE                   104 non-null    datetime64[ns]
 1   Shumai 30 Pcs          104 non-null    int64         
 2   Shumai 20 Pcs          104 non-null    int64         
 3   Shumai 10 Pcs          104 non-null    int64         
 4   Chicken Lumpia 10 Pcs  104 non-null    int64         
dtypes: datetime64[ns](1), int64(4)
memory usage: 4.2 KB


## Export the DataFrame to an Excel file

In [31]:
# Write the cleaned frame to an Excel workbook, omitting the index column.
# DataFrame.to_excel returns None, so its result must not be assigned back to
# `df`; doing so would replace the cleaned DataFrame with None.
df.to_excel(r"..\..\data\dataCleaningOne\JAN21_dataCleaning.xlsx", index=False)

In [32]:
# selecting columns where column name contains 'Average' string
# df.filter(like='Average')