# APR22 - Data Cleaning

## Data Gather

### Import libraries and load the dataset

In [1]:
from datetime import datetime
import pandas as pd
from openpyxl import Workbook
import numpy as np

In [2]:
# Load the raw April 2022 sales workbook into a DataFrame.
# Forward slashes keep this relative path valid on Windows, macOS and Linux;
# a backslash-separated path only resolves on Windows.
df = pd.read_excel("../../data/gatherData/APR22_DimsumTJOEAN.xlsx")

In [3]:
# Inspect the column names, dtypes and non-null counts of the raw sheet before cleaning
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 98 entries, 0 to 97
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   DATE         69 non-null     datetime64[ns]
 1   DEBIT        10 non-null     object        
 2   CREDIT       80 non-null     object        
 3   RECIPIENT    77 non-null     object        
 4   30           28 non-null     float64       
 5   20           15 non-null     float64       
 6   10           27 non-null     float64       
 7   L            41 non-null     float64       
 8   R            27 non-null     float64       
 9   G            6 non-null      float64       
 10  PIC          69 non-null     object        
 11  Unnamed: 11  6 non-null      object        
 12  Unnamed: 12  0 non-null      float64       
 13  Unnamed: 13  0 non-null      float64       
 14  Unnamed: 14  1 non-null      object        
dtypes: datetime64[ns](1), float64(8), object(6)
memory usage: 1

## Data Cleaning

### Inconsistent Data **#1**

#### Delete `NaN` rows and the inconsistent rows that we don't use

##### Delete `NaN` rows

In [4]:
# Look at the last rows of the sheet; in the recorded output they have no DATE (NaT)
df.tail()

Unnamed: 0,DATE,DEBIT,CREDIT,RECIPIENT,30,20,10,L,R,G,PIC,Unnamed: 11,Unnamed: 12,Unnamed: 13,Unnamed: 14
93,NaT,,rendang,,,,,,,,,,,,
94,NaT,,,,,,,,,,,,,,
95,NaT,Ambun,Shumai,280.0,,,,,,,,,,,
96,NaT,,Lumpia,200.0,,,,,,,,,,,
97,NaT,,Rendang,15.0,,,,,,,,,,,


In [5]:
# Index labels of the rows in which every single column is missing
rowsToDrop = df.index[df.isna().all(axis=1)]
rowsToDrop
# Note: rows near the top of the sheet also contain some NaN values,
# but only rows that are NaN in *all* columns are selected here

Index([69, 72, 73, 76, 81, 83, 84, 85, 87, 88, 89, 94], dtype='int64')

In [6]:
# Remove the fully-empty rows collected in `rowsToDrop`
df = df.drop(labels=rowsToDrop, axis='index')

In [7]:
df.tail()
# The fully-empty row (index 94) is gone; each remaining tail row holds at least one value

Unnamed: 0,DATE,DEBIT,CREDIT,RECIPIENT,30,20,10,L,R,G,PIC,Unnamed: 11,Unnamed: 12,Unnamed: 13,Unnamed: 14
92,NaT,,Lumpia,40.0,,,,,,,,,,,
93,NaT,,rendang,,,,,,,,,,,,
95,NaT,Ambun,Shumai,280.0,,,,,,,,,,,
96,NaT,,Lumpia,200.0,,,,,,,,,,,
97,NaT,,Rendang,15.0,,,,,,,,,,,


In [8]:
# Re-check the frame after dropping the fully-empty rows (recorded output: 98 -> 86 entries)
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 86 entries, 0 to 97
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   DATE         69 non-null     datetime64[ns]
 1   DEBIT        10 non-null     object        
 2   CREDIT       80 non-null     object        
 3   RECIPIENT    77 non-null     object        
 4   30           28 non-null     float64       
 5   20           15 non-null     float64       
 6   10           27 non-null     float64       
 7   L            41 non-null     float64       
 8   R            27 non-null     float64       
 9   G            6 non-null      float64       
 10  PIC          69 non-null     object        
 11  Unnamed: 11  6 non-null      object        
 12  Unnamed: 12  0 non-null      float64       
 13  Unnamed: 13  0 non-null      float64       
 14  Unnamed: 14  1 non-null      object        
dtypes: datetime64[ns](1), float64(8), object(6)
memory usage: 10.8+ 

##### Delete the inconsistent rows that we don't use

In [9]:
# Inspect the first rows; in the recorded output row 0 has no PIC
# NOTE(review): row 0 ("March's") looks like a carry-over from the previous month — confirm
df.head()

Unnamed: 0,DATE,DEBIT,CREDIT,RECIPIENT,30,20,10,L,R,G,PIC,Unnamed: 11,Unnamed: 12,Unnamed: 13,Unnamed: 14
0,2022-04-01,,187603025,March's,27.0,5.0,5.0,35.0,16.0,2.0,,,,,
1,2022-04-02,,250000,intan,,1.0,,2.0,,2.0,Manda,,,,
2,2022-04-02,,600000,saras,,,,,5.0,,Manda,Diskon 25,,,
3,2022-04-01,,325000,bu tari,,2.0,2.0,,1.0,,Manda,,,,
4,2022-04-01,,465000,tari,,1.0,4.0,5.0,,,Manda,,,,


In [10]:
# Inspect the last six rows; in the recorded output they have neither DATE nor PIC
# NOTE(review): these look like summary rows rather than individual sales — confirm
df.tail(6)

Unnamed: 0,DATE,DEBIT,CREDIT,RECIPIENT,30,20,10,L,R,G,PIC,Unnamed: 11,Unnamed: 12,Unnamed: 13,Unnamed: 14
91,NaT,Manda,Shumai,40.0,,,,,,,,,,,
92,NaT,,Lumpia,40.0,,,,,,,,,,,
93,NaT,,rendang,,,,,,,,,,,,
95,NaT,Ambun,Shumai,280.0,,,,,,,,,,,
96,NaT,,Lumpia,200.0,,,,,,,,,,,
97,NaT,,Rendang,15.0,,,,,,,,,,,


In [11]:
# Index labels of the rows whose 'PIC' is neither 'Manda' nor 'Ambun'
# (rows with a missing 'PIC' are selected as well, since NaN is not in the list)
rowsToDropIsNotPIC = df.index[~df['PIC'].isin(['Manda', 'Ambun'])]
rowsToDropIsNotPIC

Index([0, 46, 49, 70, 71, 74, 75, 77, 78, 79, 80, 82, 86, 90, 91, 92, 93, 95,
       96, 97],
      dtype='int64')

In [12]:
# Remove the rows collected in `rowsToDropIsNotPIC`, then renumber the
# remaining rows from 0 so the index is contiguous again
df = df.drop(index=rowsToDropIsNotPIC).reset_index(drop=True)

In [13]:
# Verify the result of the row drop and the index reset
df.info()
# The frame now has 66 rows on a fresh 0..65 RangeIndex; DATE, CREDIT, RECIPIENT and PIC are fully populated

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 66 entries, 0 to 65
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   DATE         66 non-null     datetime64[ns]
 1   DEBIT        0 non-null      object        
 2   CREDIT       66 non-null     object        
 3   RECIPIENT    66 non-null     object        
 4   30           23 non-null     float64       
 5   20           10 non-null     float64       
 6   10           21 non-null     float64       
 7   L            37 non-null     float64       
 8   R            23 non-null     float64       
 9   G            3 non-null      float64       
 10  PIC          66 non-null     object        
 11  Unnamed: 11  6 non-null      object        
 12  Unnamed: 12  0 non-null      float64       
 13  Unnamed: 13  0 non-null      float64       
 14  Unnamed: 14  1 non-null      object        
dtypes: datetime64[ns](1), float64(8), object(6)
memory usage: 7

In [14]:
df.head()
# The leading row without a PIC (old index 0) has been removed

Unnamed: 0,DATE,DEBIT,CREDIT,RECIPIENT,30,20,10,L,R,G,PIC,Unnamed: 11,Unnamed: 12,Unnamed: 13,Unnamed: 14
0,2022-04-02,,250000,intan,,1.0,,2.0,,2.0,Manda,,,,
1,2022-04-02,,600000,saras,,,,,5.0,,Manda,Diskon 25,,,
2,2022-04-01,,325000,bu tari,,2.0,2.0,,1.0,,Manda,,,,
3,2022-04-01,,465000,tari,,1.0,4.0,5.0,,,Manda,,,,
4,2022-04-01,,100000,mba tora,1.0,,,,,,Manda,,,,


In [15]:
df.tail()
# The trailing rows without a PIC have been removed

Unnamed: 0,DATE,DEBIT,CREDIT,RECIPIENT,30,20,10,L,R,G,PIC,Unnamed: 11,Unnamed: 12,Unnamed: 13,Unnamed: 14
61,2022-04-27,,190000,Wan kiki,1.0,,,2.0,,,Manda,,,,
62,2022-04-24,,1300000,kak ayu [EID2x3] +normal order,3.0,,9.0,13.0,,,Manda,diskon 25,,,
63,2022-04-25,,270000,tante nana,1.0,,2.0,2.0,,,Manda,,,,
64,2022-04-30,,250000,Tante budi (temen uti),,,,,2.0,,Manda,,,,
65,2022-04-29,,300000,rika,3.0,,,,,,Ambun,,,,


#### Rename the `30`, `20`, `10` and `L` columns to make them easier to understand

In [16]:
# Preview the terse product columns; note that 30, 20 and 10 are integer labels, not strings
df.loc[:, [30, 20, 10, 'L']].head()

Unnamed: 0,30,20,10,L
0,,1.0,,2.0
1,,,,
2,,2.0,2.0,
3,,1.0,4.0,5.0
4,1.0,,,


In [17]:
# Give the terse sheet headers descriptive product names
df = df.rename(
    columns={
        30: 'Shumai 30 Pcs',
        20: 'Shumai 20 Pcs',
        10: 'Shumai 10 Pcs',
        'L': 'Chicken Lumpia 10 Pcs',
    }
)

In [18]:
df[['Shumai 30 Pcs', 'Shumai 20 Pcs', 'Shumai 10 Pcs', 'Chicken Lumpia 10 Pcs']]
# The product columns now carry descriptive names

Unnamed: 0,Shumai 30 Pcs,Shumai 20 Pcs,Shumai 10 Pcs,Chicken Lumpia 10 Pcs
0,,1.0,,2.0
1,,,,
2,,2.0,2.0,
3,,1.0,4.0,5.0
4,1.0,,,
...,...,...,...,...
61,1.0,,,2.0
62,3.0,,9.0,13.0
63,1.0,,2.0,2.0
64,,,,


In [19]:
# Confirm the renamed columns in the context of the full frame
df.head()

Unnamed: 0,DATE,DEBIT,CREDIT,RECIPIENT,Shumai 30 Pcs,Shumai 20 Pcs,Shumai 10 Pcs,Chicken Lumpia 10 Pcs,R,G,PIC,Unnamed: 11,Unnamed: 12,Unnamed: 13,Unnamed: 14
0,2022-04-02,,250000,intan,,1.0,,2.0,,2.0,Manda,,,,
1,2022-04-02,,600000,saras,,,,,5.0,,Manda,Diskon 25,,,
2,2022-04-01,,325000,bu tari,,2.0,2.0,,1.0,,Manda,,,,
3,2022-04-01,,465000,tari,,1.0,4.0,5.0,,,Manda,,,,
4,2022-04-01,,100000,mba tora,1.0,,,,,,Manda,,,,


### Missing Value

#### Impute missing values of `DATE` column with mode

In [20]:
# Check the null counts before imputing; the recorded output shows DATE with 66 non-null of 66
df.info();
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 66 entries, 0 to 65
Data columns (total 15 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   DATE                   66 non-null     datetime64[ns]
 1   DEBIT                  0 non-null      object        
 2   CREDIT                 66 non-null     object        
 3   RECIPIENT              66 non-null     object        
 4   Shumai 30 Pcs          23 non-null     float64       
 5   Shumai 20 Pcs          10 non-null     float64       
 6   Shumai 10 Pcs          21 non-null     float64       
 7   Chicken Lumpia 10 Pcs  37 non-null     float64       
 8   R                      23 non-null     float64       
 9   G                      3 non-null      float64       
 10  PIC                    66 non-null     object        
 11  Unnamed: 11            6 non-null      object        
 12  Unnamed: 12            0 non-null      float64       
 13  Unnamed

Unnamed: 0,DATE,DEBIT,CREDIT,RECIPIENT,Shumai 30 Pcs,Shumai 20 Pcs,Shumai 10 Pcs,Chicken Lumpia 10 Pcs,R,G,PIC,Unnamed: 11,Unnamed: 12,Unnamed: 13,Unnamed: 14
0,2022-04-02,,250000,intan,,1.0,,2.0,,2.0,Manda,,,,
1,2022-04-02,,600000,saras,,,,,5.0,,Manda,Diskon 25,,,
2,2022-04-01,,325000,bu tari,,2.0,2.0,,1.0,,Manda,,,,
3,2022-04-01,,465000,tari,,1.0,4.0,5.0,,,Manda,,,,
4,2022-04-01,,100000,mba tora,1.0,,,,,,Manda,,,,


In [21]:
# Most frequent value(s) of DATE; mode() returns a Series because ties are possible
df['DATE'].mode()

0   2022-04-28
Name: DATE, dtype: datetime64[ns]

In [22]:
# Fill any missing `DATE` with the most frequent date.
# The preceding df.info() output shows DATE already fully populated,
# so with the current data this acts only as a safeguard.
mostFrequentDate = df['DATE'].mode()[0]
df['DATE'] = df['DATE'].fillna(mostFrequentDate)

In [23]:
df.info();
df.head()
# DATE has 66 non-null values after the fill

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 66 entries, 0 to 65
Data columns (total 15 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   DATE                   66 non-null     datetime64[ns]
 1   DEBIT                  0 non-null      object        
 2   CREDIT                 66 non-null     object        
 3   RECIPIENT              66 non-null     object        
 4   Shumai 30 Pcs          23 non-null     float64       
 5   Shumai 20 Pcs          10 non-null     float64       
 6   Shumai 10 Pcs          21 non-null     float64       
 7   Chicken Lumpia 10 Pcs  37 non-null     float64       
 8   R                      23 non-null     float64       
 9   G                      3 non-null      float64       
 10  PIC                    66 non-null     object        
 11  Unnamed: 11            6 non-null      object        
 12  Unnamed: 12            0 non-null      float64       
 13  Unnamed

Unnamed: 0,DATE,DEBIT,CREDIT,RECIPIENT,Shumai 30 Pcs,Shumai 20 Pcs,Shumai 10 Pcs,Chicken Lumpia 10 Pcs,R,G,PIC,Unnamed: 11,Unnamed: 12,Unnamed: 13,Unnamed: 14
0,2022-04-02,,250000,intan,,1.0,,2.0,,2.0,Manda,,,,
1,2022-04-02,,600000,saras,,,,,5.0,,Manda,Diskon 25,,,
2,2022-04-01,,325000,bu tari,,2.0,2.0,,1.0,,Manda,,,,
3,2022-04-01,,465000,tari,,1.0,4.0,5.0,,,Manda,,,,
4,2022-04-01,,100000,mba tora,1.0,,,,,,Manda,,,,


#### Impute missing values of the `Shumai 10 Pcs`, `Shumai 20 Pcs`, `Shumai 30 Pcs` and `Chicken Lumpia 10 Pcs` columns with 0

In [24]:
df.info();
df.head()
# The product columns still contain nulls (e.g. 'Shumai 30 Pcs' has only 23 non-null of 66)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 66 entries, 0 to 65
Data columns (total 15 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   DATE                   66 non-null     datetime64[ns]
 1   DEBIT                  0 non-null      object        
 2   CREDIT                 66 non-null     object        
 3   RECIPIENT              66 non-null     object        
 4   Shumai 30 Pcs          23 non-null     float64       
 5   Shumai 20 Pcs          10 non-null     float64       
 6   Shumai 10 Pcs          21 non-null     float64       
 7   Chicken Lumpia 10 Pcs  37 non-null     float64       
 8   R                      23 non-null     float64       
 9   G                      3 non-null      float64       
 10  PIC                    66 non-null     object        
 11  Unnamed: 11            6 non-null      object        
 12  Unnamed: 12            0 non-null      float64       
 13  Unnamed

Unnamed: 0,DATE,DEBIT,CREDIT,RECIPIENT,Shumai 30 Pcs,Shumai 20 Pcs,Shumai 10 Pcs,Chicken Lumpia 10 Pcs,R,G,PIC,Unnamed: 11,Unnamed: 12,Unnamed: 13,Unnamed: 14
0,2022-04-02,,250000,intan,,1.0,,2.0,,2.0,Manda,,,,
1,2022-04-02,,600000,saras,,,,,5.0,,Manda,Diskon 25,,,
2,2022-04-01,,325000,bu tari,,2.0,2.0,,1.0,,Manda,,,,
3,2022-04-01,,465000,tari,,1.0,4.0,5.0,,,Manda,,,,
4,2022-04-01,,100000,mba tora,1.0,,,,,,Manda,,,,


In [25]:
# Product quantity columns to impute
listOfProducts = ['Shumai 10 Pcs', 'Shumai 20 Pcs', 'Shumai 30 Pcs', 'Chicken Lumpia 10 Pcs']

# Replace missing quantities with 0 in one vectorized call.
# fillna already operates on whole columns, so no per-row (or per-column) loop is needed.
df[listOfProducts] = df[listOfProducts].fillna(0)

# Verify: the product columns should now be fully populated
df.info();
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 66 entries, 0 to 65
Data columns (total 15 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   DATE                   66 non-null     datetime64[ns]
 1   DEBIT                  0 non-null      object        
 2   CREDIT                 66 non-null     object        
 3   RECIPIENT              66 non-null     object        
 4   Shumai 30 Pcs          66 non-null     float64       
 5   Shumai 20 Pcs          66 non-null     float64       
 6   Shumai 10 Pcs          66 non-null     float64       
 7   Chicken Lumpia 10 Pcs  66 non-null     float64       
 8   R                      23 non-null     float64       
 9   G                      3 non-null      float64       
 10  PIC                    66 non-null     object        
 11  Unnamed: 11            6 non-null      object        
 12  Unnamed: 12            0 non-null      float64       
 13  Unnamed

Unnamed: 0,DATE,DEBIT,CREDIT,RECIPIENT,Shumai 30 Pcs,Shumai 20 Pcs,Shumai 10 Pcs,Chicken Lumpia 10 Pcs,R,G,PIC,Unnamed: 11,Unnamed: 12,Unnamed: 13,Unnamed: 14
0,2022-04-02,,250000,intan,0.0,1.0,0.0,2.0,,2.0,Manda,,,,
1,2022-04-02,,600000,saras,0.0,0.0,0.0,0.0,5.0,,Manda,Diskon 25,,,
2,2022-04-01,,325000,bu tari,0.0,2.0,2.0,0.0,1.0,,Manda,,,,
3,2022-04-01,,465000,tari,0.0,1.0,4.0,5.0,,,Manda,,,,
4,2022-04-01,,100000,mba tora,1.0,0.0,0.0,0.0,,,Manda,,,,


### Inconsistent Data **#2**

#### Convert the product sales columns to integers

In [26]:
# Before converting, confirm the current dtypes: the product columns are still float64
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 66 entries, 0 to 65
Data columns (total 15 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   DATE                   66 non-null     datetime64[ns]
 1   DEBIT                  0 non-null      object        
 2   CREDIT                 66 non-null     object        
 3   RECIPIENT              66 non-null     object        
 4   Shumai 30 Pcs          66 non-null     float64       
 5   Shumai 20 Pcs          66 non-null     float64       
 6   Shumai 10 Pcs          66 non-null     float64       
 7   Chicken Lumpia 10 Pcs  66 non-null     float64       
 8   R                      23 non-null     float64       
 9   G                      3 non-null      float64       
 10  PIC                    66 non-null     object        
 11  Unnamed: 11            6 non-null      object        
 12  Unnamed: 12            0 non-null      float64       
 13  Unnamed

In [27]:
# Product quantity columns to convert
listOfProducts = ['Shumai 10 Pcs', 'Shumai 20 Pcs', 'Shumai 30 Pcs', 'Chicken Lumpia 10 Pcs']

# Convert all product columns to int64 in a single call.
# astype works on whole columns, so no per-row loop is needed.
# Note: astype('int64') raises if a column still contains NaN, and truncates any fractional value.
df = df.astype(dict.fromkeys(listOfProducts, 'int64'))

# Verify: the product columns should now be int64
df.info();
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 66 entries, 0 to 65
Data columns (total 15 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   DATE                   66 non-null     datetime64[ns]
 1   DEBIT                  0 non-null      object        
 2   CREDIT                 66 non-null     object        
 3   RECIPIENT              66 non-null     object        
 4   Shumai 30 Pcs          66 non-null     int64         
 5   Shumai 20 Pcs          66 non-null     int64         
 6   Shumai 10 Pcs          66 non-null     int64         
 7   Chicken Lumpia 10 Pcs  66 non-null     int64         
 8   R                      23 non-null     float64       
 9   G                      3 non-null      float64       
 10  PIC                    66 non-null     object        
 11  Unnamed: 11            6 non-null      object        
 12  Unnamed: 12            0 non-null      float64       
 13  Unnamed

Unnamed: 0,DATE,DEBIT,CREDIT,RECIPIENT,Shumai 30 Pcs,Shumai 20 Pcs,Shumai 10 Pcs,Chicken Lumpia 10 Pcs,R,G,PIC,Unnamed: 11,Unnamed: 12,Unnamed: 13,Unnamed: 14
0,2022-04-02,,250000,intan,0,1,0,2,,2.0,Manda,,,,
1,2022-04-02,,600000,saras,0,0,0,0,5.0,,Manda,Diskon 25,,,
2,2022-04-01,,325000,bu tari,0,2,2,0,1.0,,Manda,,,,
3,2022-04-01,,465000,tari,0,1,4,5,,,Manda,,,,
4,2022-04-01,,100000,mba tora,1,0,0,0,,,Manda,,,,


## Data Selection

### Select certain data

In [28]:
# Review the cleaned frame before choosing which columns to keep
df.info();
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 66 entries, 0 to 65
Data columns (total 15 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   DATE                   66 non-null     datetime64[ns]
 1   DEBIT                  0 non-null      object        
 2   CREDIT                 66 non-null     object        
 3   RECIPIENT              66 non-null     object        
 4   Shumai 30 Pcs          66 non-null     int64         
 5   Shumai 20 Pcs          66 non-null     int64         
 6   Shumai 10 Pcs          66 non-null     int64         
 7   Chicken Lumpia 10 Pcs  66 non-null     int64         
 8   R                      23 non-null     float64       
 9   G                      3 non-null      float64       
 10  PIC                    66 non-null     object        
 11  Unnamed: 11            6 non-null      object        
 12  Unnamed: 12            0 non-null      float64       
 13  Unnamed

Unnamed: 0,DATE,DEBIT,CREDIT,RECIPIENT,Shumai 30 Pcs,Shumai 20 Pcs,Shumai 10 Pcs,Chicken Lumpia 10 Pcs,R,G,PIC,Unnamed: 11,Unnamed: 12,Unnamed: 13,Unnamed: 14
0,2022-04-02,,250000,intan,0,1,0,2,,2.0,Manda,,,,
1,2022-04-02,,600000,saras,0,0,0,0,5.0,,Manda,Diskon 25,,,
2,2022-04-01,,325000,bu tari,0,2,2,0,1.0,,Manda,,,,
3,2022-04-01,,465000,tari,0,1,4,5,,,Manda,,,,
4,2022-04-01,,100000,mba tora,1,0,0,0,,,Manda,,,,


In [29]:
# Remove `DEBIT`, `CREDIT`, `RECIPIENT`, `PIC` and `R`, which are not used
df = df.drop(['DEBIT', 'CREDIT', 'RECIPIENT', 'PIC', 'R'], axis='columns')

# Position of the last column to keep
chickenLumpiaIndex = df.columns.get_loc('Chicken Lumpia 10 Pcs')

# Keep every column up to and including 'Chicken Lumpia 10 Pcs';
# all columns to its right are discarded
df = df[df.columns[:chickenLumpiaIndex + 1]]

df.info();
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 66 entries, 0 to 65
Data columns (total 5 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   DATE                   66 non-null     datetime64[ns]
 1   Shumai 30 Pcs          66 non-null     int64         
 2   Shumai 20 Pcs          66 non-null     int64         
 3   Shumai 10 Pcs          66 non-null     int64         
 4   Chicken Lumpia 10 Pcs  66 non-null     int64         
dtypes: datetime64[ns](1), int64(4)
memory usage: 2.7 KB


Unnamed: 0,DATE,Shumai 30 Pcs,Shumai 20 Pcs,Shumai 10 Pcs,Chicken Lumpia 10 Pcs
0,2022-04-02,0,1,0,2
1,2022-04-02,0,0,0,0
2,2022-04-01,0,2,2,0
3,2022-04-01,0,1,4,5
4,2022-04-01,1,0,0,0


## Export the DataFrame to an Excel file

In [30]:
# Write the cleaned April 2022 data to an Excel file, without the index column.
# DataFrame.to_excel returns None, so its result is deliberately not assigned back
# to `df`; doing so would replace the cleaned frame with None.
# Forward slashes keep this relative path valid on Windows, macOS and Linux.
df.to_excel("../../data/dataCleaningOne/APR22_dataCleaning.xlsx", index=False)

In [31]:
# selecting columns where column name contains 'Average' string
# df.filter(like='Average')