# FEB21 - Data Cleaning

## Data Gather

### Import the libraries and load the dataset

In [1]:
from datetime import datetime
import pandas as pd
from openpyxl import Workbook
import numpy as np

In [2]:
# Load the raw FEB21 workbook into a DataFrame.
# Forward slashes keep the relative path valid on Windows, macOS and Linux;
# the previous backslash-separated raw string only resolved on Windows.
df = pd.read_excel("../../data/gatherData/FEB21_DimsumTJOEAN.xlsx")

In [3]:
# Overview of the raw sheet: column names, non-null counts and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 141 entries, 0 to 140
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   DATE         126 non-null    object 
 1   DEBIT        3 non-null      object 
 2   CREDIT       133 non-null    object 
 3   RECIPIENT    133 non-null    object 
 4   30           82 non-null     float64
 5   20           35 non-null     float64
 6   10           30 non-null     float64
 7   L            36 non-null     float64
 8   PIC          132 non-null    object 
 9   Unnamed: 9   1 non-null      float64
 10  Unnamed: 10  0 non-null      float64
dtypes: float64(6), object(5)
memory usage: 12.2+ KB


## Data Cleaning

### Inconsistent Data **#1**

#### Delete `NaN` rows and the inconsistent rows that we don't use

##### Delete `NaN` rows

In [4]:
# Look at the last rows of the raw sheet before dropping the empty ones
df.tail()

Unnamed: 0,DATE,DEBIT,CREDIT,RECIPIENT,30,20,10,L,PIC,Unnamed: 9,Unnamed: 10
136,,#TJOEAN,,,,,,,,,
137,,,,,,,,,,,
138,,,,,5880.0,1340.0,660.0,460.0,,,
139,,Utang Dimsum 100pcs,,,,,,,,,
140,,Promotion Budget,Indira,30pcs,8340.0,,,,,,


In [5]:
# Index labels of the rows in which every column is NaN
rowsToDrop = df.index[df.isna().all(axis=1)]
rowsToDrop
# Fully empty rows also occur near the top of the sheet (label 1), not only at the bottom

Index([1, 133, 134, 137], dtype='int64')

In [6]:
# Remove the fully empty rows collected in `rowsToDrop`
df = df.drop(labels=rowsToDrop, axis=0)

In [7]:
df.tail()
# No row with NaN in every column remains (label 137 is gone)

Unnamed: 0,DATE,DEBIT,CREDIT,RECIPIENT,30,20,10,L,PIC,Unnamed: 9,Unnamed: 10
135,,,29525000,,104.0,35.0,31.0,59.0,TOTAL,,
136,,#TJOEAN,,,,,,,,,
138,,,,,5880.0,1340.0,660.0,460.0,,,
139,,Utang Dimsum 100pcs,,,,,,,,,
140,,Promotion Budget,Indira,30pcs,8340.0,,,,,,


In [8]:
# Re-check the row count and non-null counts after dropping the empty rows
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 137 entries, 0 to 140
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   DATE         126 non-null    object 
 1   DEBIT        3 non-null      object 
 2   CREDIT       133 non-null    object 
 3   RECIPIENT    133 non-null    object 
 4   30           82 non-null     float64
 5   20           35 non-null     float64
 6   10           30 non-null     float64
 7   L            36 non-null     float64
 8   PIC          132 non-null    object 
 9   Unnamed: 9   1 non-null      float64
 10  Unnamed: 10  0 non-null      float64
dtypes: float64(6), object(5)
memory usage: 12.8+ KB


##### Delete the inconsistent rows that we don't use

In [9]:
# Inspect the first rows for entries that are not regular transactions
# (e.g. the "January's" row at label 0)
df.head()

Unnamed: 0,DATE,DEBIT,CREDIT,RECIPIENT,30,20,10,L,PIC,Unnamed: 9,Unnamed: 10
0,,,13010000,January's,92.0,32.0,35.0,,,,
2,2021-02-01 00:00:00,,100000,Kak fira (pack of 30's),1.0,,,,Ambun,0.0,
3,2021-02-02 00:00:00,,80000,Fira (pack of 20's),,1.0,,,Ambun,,
4,2021-02-02 00:00:00,,80000,Audi (pack of 20's),,1.0,,,Manda,,
5,,,40000,Santi,,,1.0,,Manda,,


In [10]:
# Inspect the last rows for entries that are not regular transactions
# (the TOTAL row and the rows that follow it)
df.tail(6)

Unnamed: 0,DATE,DEBIT,CREDIT,RECIPIENT,30,20,10,L,PIC,Unnamed: 9,Unnamed: 10
132,2021-02-28 00:00:00,,200000,Tante Rini,1.0,1.0,1.0,,Manda,,
135,,,29525000,,104.0,35.0,31.0,59.0,TOTAL,,
136,,#TJOEAN,,,,,,,,,
138,,,,,5880.0,1340.0,660.0,460.0,,,
139,,Utang Dimsum 100pcs,,,,,,,,,
140,,Promotion Budget,Indira,30pcs,8340.0,,,,,,


In [11]:
# Index labels of every row whose 'PIC' is neither 'Manda' nor 'Ambun'
# (rows with a missing 'PIC' are selected too, because isin() is False for NaN)
# NOTE(review): besides the rows at the top and bottom of the sheet this also
# selects the mid-sheet labels 13, 59, 60 and 61, and the monthly sums computed
# later (101/35/30/59) differ from the sheet's TOTAL row (104/35/31/59) -
# confirm these rows are not real sales with a missing or differently spelled PIC.
rowsToDropIsNotPIC = df.index[~df['PIC'].isin(['Manda', 'Ambun'])]
rowsToDropIsNotPIC

Index([0, 13, 59, 60, 61, 135, 136, 138, 139, 140], dtype='int64')

In [12]:
# Remove the rows flagged in `rowsToDropIsNotPIC`, then renumber the index
# so it runs from 0 upwards without gaps
df = df.drop(index=rowsToDropIsNotPIC).reset_index(drop=True)

In [13]:
df.info()
# The index is a contiguous RangeIndex again after the reset

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 127 entries, 0 to 126
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   DATE         122 non-null    object 
 1   DEBIT        0 non-null      object 
 2   CREDIT       126 non-null    object 
 3   RECIPIENT    127 non-null    object 
 4   30           75 non-null     float64
 5   20           32 non-null     float64
 6   10           26 non-null     float64
 7   L            34 non-null     float64
 8   PIC          127 non-null    object 
 9   Unnamed: 9   1 non-null      float64
 10  Unnamed: 10  0 non-null      float64
dtypes: float64(6), object(5)
memory usage: 11.0+ KB


In [14]:
df.head()
# The unused rows at the top of the sheet have been removed

Unnamed: 0,DATE,DEBIT,CREDIT,RECIPIENT,30,20,10,L,PIC,Unnamed: 9,Unnamed: 10
0,2021-02-01 00:00:00,,100000,Kak fira (pack of 30's),1.0,,,,Ambun,0.0,
1,2021-02-02 00:00:00,,80000,Fira (pack of 20's),,1.0,,,Ambun,,
2,2021-02-02 00:00:00,,80000,Audi (pack of 20's),,1.0,,,Manda,,
3,,,40000,Santi,,,1.0,,Manda,,
4,2021-02-04 00:00:00,,180000,Tari,1.0,1.0,,,Manda,,


In [15]:
df.tail()
# The unused rows at the bottom of the sheet have been removed

Unnamed: 0,DATE,DEBIT,CREDIT,RECIPIENT,30,20,10,L,PIC,Unnamed: 9,Unnamed: 10
122,2021-02-03 00:00:00,,275000,Aldhira,1.0,,1.0,3.0,Manda,,
123,2021-02-28 00:00:00,,90000,Mba lila,,,,2.0,Manda,,
124,2021-02-28 00:00:00,,125000,Ganis,,1.0,,1.0,Manda,,
125,2021-02-28 00:00:00,,90000,Audi,,,,2.0,Manda,,
126,2021-02-28 00:00:00,,200000,Tante Rini,1.0,1.0,1.0,,Manda,,


#### Rename `30`, `20`, `10`, `L` columns to make it easier to understand 

In [16]:
# The quantity columns still carry the sheet's terse headers
# (30, 20 and 10 are integer labels, 'L' is a string label)
df[[30, 20, 10, 'L']].head()

Unnamed: 0,30,20,10,L
0,1.0,,,
1,,1.0,,
2,,1.0,,
3,,,1.0,
4,1.0,1.0,,


In [17]:
# Give the quantity columns descriptive product names
df = df.rename(
    columns={
        30: 'Shumai 30 Pcs',
        20: 'Shumai 20 Pcs',
        10: 'Shumai 10 Pcs',
        'L': 'Chicken Lumpia 10 Pcs',
    }
)

In [18]:
df[['Shumai 30 Pcs', 'Shumai 20 Pcs', 'Shumai 10 Pcs', 'Chicken Lumpia 10 Pcs']]
# The column names now describe the product they count

Unnamed: 0,Shumai 30 Pcs,Shumai 20 Pcs,Shumai 10 Pcs,Chicken Lumpia 10 Pcs
0,1.0,,,
1,,1.0,,
2,,1.0,,
3,,,1.0,
4,1.0,1.0,,
...,...,...,...,...
122,1.0,,1.0,3.0
123,,,,2.0
124,,1.0,,1.0
125,,,,2.0


In [19]:
# Full frame with the renamed product columns in place
df.head()

Unnamed: 0,DATE,DEBIT,CREDIT,RECIPIENT,Shumai 30 Pcs,Shumai 20 Pcs,Shumai 10 Pcs,Chicken Lumpia 10 Pcs,PIC,Unnamed: 9,Unnamed: 10
0,2021-02-01 00:00:00,,100000,Kak fira (pack of 30's),1.0,,,,Ambun,0.0,
1,2021-02-02 00:00:00,,80000,Fira (pack of 20's),,1.0,,,Ambun,,
2,2021-02-02 00:00:00,,80000,Audi (pack of 20's),,1.0,,,Manda,,
3,,,40000,Santi,,,1.0,,Manda,,
4,2021-02-04 00:00:00,,180000,Tari,1.0,1.0,,,Manda,,


### Missing Value

#### Impute missing values of `DATE` column with mode

In [20]:
# `DATE` has fewer non-null entries than there are rows; row 3 is one of the gaps
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 127 entries, 0 to 126
Data columns (total 11 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   DATE                   122 non-null    object 
 1   DEBIT                  0 non-null      object 
 2   CREDIT                 126 non-null    object 
 3   RECIPIENT              127 non-null    object 
 4   Shumai 30 Pcs          75 non-null     float64
 5   Shumai 20 Pcs          32 non-null     float64
 6   Shumai 10 Pcs          26 non-null     float64
 7   Chicken Lumpia 10 Pcs  34 non-null     float64
 8   PIC                    127 non-null    object 
 9   Unnamed: 9             1 non-null      float64
 10  Unnamed: 10            0 non-null      float64
dtypes: float64(6), object(5)
memory usage: 11.0+ KB


Unnamed: 0,DATE,DEBIT,CREDIT,RECIPIENT,Shumai 30 Pcs,Shumai 20 Pcs,Shumai 10 Pcs,Chicken Lumpia 10 Pcs,PIC,Unnamed: 9,Unnamed: 10
0,2021-02-01 00:00:00,,100000,Kak fira (pack of 30's),1.0,,,,Ambun,0.0,
1,2021-02-02 00:00:00,,80000,Fira (pack of 20's),,1.0,,,Ambun,,
2,2021-02-02 00:00:00,,80000,Audi (pack of 20's),,1.0,,,Manda,,
3,,,40000,Santi,,,1.0,,Manda,,
4,2021-02-04 00:00:00,,180000,Tari,1.0,1.0,,,Manda,,


In [21]:
# Most frequent value among the non-missing `DATE` entries
df['DATE'].mode()

0   2021-02-27
Name: DATE, dtype: datetime64[ns]

In [22]:
# Impute missing values of `DATE` column with mode.
# The column is read as `object`, so parse it to datetime64 explicitly first:
# relying on `fillna` to silently downcast an object column is deprecated in
# pandas 2.2, and without the datetime dtype the `.dt` accessor used on `DATE`
# further down would raise.
df['DATE'] = pd.to_datetime(df['DATE'])
# mode() ignores missing values; [0] takes the first (smallest) mode on ties
df['DATE'] = df['DATE'].fillna(df['DATE'].mode()[0])

In [23]:
df.info()
df.head()
# `DATE` has no missing values left; the gaps were filled with the mode

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 127 entries, 0 to 126
Data columns (total 11 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   DATE                   127 non-null    datetime64[ns]
 1   DEBIT                  0 non-null      object        
 2   CREDIT                 126 non-null    object        
 3   RECIPIENT              127 non-null    object        
 4   Shumai 30 Pcs          75 non-null     float64       
 5   Shumai 20 Pcs          32 non-null     float64       
 6   Shumai 10 Pcs          26 non-null     float64       
 7   Chicken Lumpia 10 Pcs  34 non-null     float64       
 8   PIC                    127 non-null    object        
 9   Unnamed: 9             1 non-null      float64       
 10  Unnamed: 10            0 non-null      float64       
dtypes: datetime64[ns](1), float64(6), object(4)
memory usage: 11.0+ KB


Unnamed: 0,DATE,DEBIT,CREDIT,RECIPIENT,Shumai 30 Pcs,Shumai 20 Pcs,Shumai 10 Pcs,Chicken Lumpia 10 Pcs,PIC,Unnamed: 9,Unnamed: 10
0,2021-02-01,,100000,Kak fira (pack of 30's),1.0,,,,Ambun,0.0,
1,2021-02-02,,80000,Fira (pack of 20's),,1.0,,,Ambun,,
2,2021-02-02,,80000,Audi (pack of 20's),,1.0,,,Manda,,
3,2021-02-27,,40000,Santi,,,1.0,,Manda,,
4,2021-02-04,,180000,Tari,1.0,1.0,,,Manda,,


### Inconsistent Data **#2**

#### Convert `DATE` to a monthly period and sum the selected product columns per month

In [24]:
# Collapse every date to its calendar month, then total the four product
# columns per month; all other columns are left out of the result
df = (
    df.assign(DATE=df['DATE'].dt.to_period('M'))
    .groupby('DATE')[['Shumai 30 Pcs', 'Shumai 20 Pcs', 'Shumai 10 Pcs', 'Chicken Lumpia 10 Pcs']]
    .sum()
    .reset_index()
)

In [25]:
# One row per month with the summed quantity of each product
df.head()

Unnamed: 0,DATE,Shumai 30 Pcs,Shumai 20 Pcs,Shumai 10 Pcs,Chicken Lumpia 10 Pcs
0,2021-02,101.0,35.0,30.0,59.0


In [26]:
# Check the dtypes of the aggregated frame
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1 entries, 0 to 0
Data columns (total 5 columns):
 #   Column                 Non-Null Count  Dtype    
---  ------                 --------------  -----    
 0   DATE                   1 non-null      period[M]
 1   Shumai 30 Pcs          1 non-null      float64  
 2   Shumai 20 Pcs          1 non-null      float64  
 3   Shumai 10 Pcs          1 non-null      float64  
 4   Chicken Lumpia 10 Pcs  1 non-null      float64  
dtypes: float64(4), period[M](1)
memory usage: 172.0 bytes


#### Convert the product sales columns to integers

In [27]:
# Before converting, check the current dtypes of the product columns
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1 entries, 0 to 0
Data columns (total 5 columns):
 #   Column                 Non-Null Count  Dtype    
---  ------                 --------------  -----    
 0   DATE                   1 non-null      period[M]
 1   Shumai 30 Pcs          1 non-null      float64  
 2   Shumai 20 Pcs          1 non-null      float64  
 3   Shumai 10 Pcs          1 non-null      float64  
 4   Chicken Lumpia 10 Pcs  1 non-null      float64  
dtypes: float64(4), period[M](1)
memory usage: 172.0 bytes


In [28]:
# Cast the four product quantity columns to int64
df = df.astype(
    dict.fromkeys(
        ['Shumai 30 Pcs', 'Shumai 20 Pcs', 'Shumai 10 Pcs', 'Chicken Lumpia 10 Pcs'],
        'int64',
    )
)

In [29]:
# Verify the cast: the product columns should now be reported as int64
df.info()
df

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1 entries, 0 to 0
Data columns (total 5 columns):
 #   Column                 Non-Null Count  Dtype    
---  ------                 --------------  -----    
 0   DATE                   1 non-null      period[M]
 1   Shumai 30 Pcs          1 non-null      int64    
 2   Shumai 20 Pcs          1 non-null      int64    
 3   Shumai 10 Pcs          1 non-null      int64    
 4   Chicken Lumpia 10 Pcs  1 non-null      int64    
dtypes: int64(4), period[M](1)
memory usage: 172.0 bytes


Unnamed: 0,DATE,Shumai 30 Pcs,Shumai 20 Pcs,Shumai 10 Pcs,Chicken Lumpia 10 Pcs
0,2021-02,101,35,30,59


## Export the DataFrame to an Excel file

In [30]:
# Write the cleaned monthly totals to an Excel workbook, without the index column.
# `to_excel` returns None, so its result must not be assigned back to `df`:
# the previous `df = df.to_excel(...)` replaced the DataFrame with None.
# Forward slashes keep the relative path valid on Windows, macOS and Linux.
df.to_excel("../../data/dataCleaning/FEB21_dataCleaning.xlsx", index=False)

In [31]:
# NOTE(review): leftover scratch snippet - no column in this notebook has a
# name containing 'Average'; this cell can probably be deleted.
# selecting columns where column name contains 'Average' string
# df.filter(like='Average')