#Data Cleansing - Pandas in a Notebook

In [None]:
#Release: 1.2109.1901

In [None]:
import pandas as pd
import numpy as np

# Create Dataframe

In [None]:
# Build the base fruit table from a dictionary of columns.
f = {'name': ["apples", "bananas", "cherries", "pears", "pears"],
     'quantity': [20, 20, 50, 30, 40],
     'price': [1000, 500, 750, 900, 950]}
df = pd.DataFrame(f)

# Add new rows.
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported way
# to stack frames and, like append, keeps each frame's own index labels.
fnew = {'name': ["grapes", "apples", "grapes", "oranges"],
        'quantity': [30, 25, 30, 20],
        'price': [1200, 800, 1200, 700]}

dfnew = pd.DataFrame(fnew, index=[5, 6, 7, 8])
df2 = pd.concat([df, dfnew])

# Combine dataframes: the outer join keeps names that appear in only one of
# the two tables and fills the columns they lack with NaN.
colors = pd.DataFrame({'name': ["apples", "bananas", "pears", "grapes", "cherries", "plum"],
                       'color': ['red', 'yellow', 'green', 'purple', 'red', 'purple']})
df3 = pd.merge(df2, colors, on='name', how='outer')
df3

###Add some rows to create 'dirty' data

In [None]:
# Rows with deliberately messy values: stray punctuation and digits in the
# names, '?' placeholders, and a real NaN.
dirty = {'name': ['Apples?', 'grapes', 'Lemons 2'],
         'quantity': ['?', '?', np.nan],
         'price': [800, '?', 700],
         'color': ['green', '?', 'yellow']}

df_dirty = pd.DataFrame(dirty, index=[10, 11, 12])
# DataFrame.append was removed in pandas 2.0; pd.concat stacks the frames the
# same way and keeps the explicit index labels given above.
df_fruit = pd.concat([df3, df_dirty])
df_fruit

###Quick check the dataframe

In [None]:
# Print a summary of the frame: index, columns, non-null counts and dtypes
df_fruit.info()

In [None]:
# Descriptive statistics, transposed so that each column becomes a row
df_fruit.describe().T

#Missing values - Select

Missing values can cause biased results or even failures in training, because most scikit-learn estimators cannot process missing values.
What we need to do:
1. Identify the missing values representations (`UNK`, `NaN`, `NULL`, `-9999`, `?`, etc.)
2. Decide on what to do with the missing values 
	- Remove all rows with any missing values (must be careful, since some columns may have too many missing values while other columns do not)
	- Remove the column with too many missing values
	- Impute values: this can be done with a simple method (impute with the mean or median value), with statistical methods, or with other methods such as nearest-neighbor values



The `isna` function can be used to check whether there are any missing values (NULL values).

In [None]:
# isna: per-column flag, True when the column holds at least one missing value
df_fruit.isna().any(axis=0)

In [None]:
# Cell-by-cell boolean mask: True where the value is missing
pd.isna(df_fruit)

In [None]:
# Keep every row, but only the columns that contain at least one NaN
df_fruit.loc[:, df_fruit.isna().any()]

###Invalid Values - Clean

Clean the `name` column values: remove all non-alphabetic characters.

In [None]:
# List the distinct names so that invalid entries are easy to spot
print(pd.unique(df_fruit['name']))

In [None]:
# Remove all non-alphabetical characters.
# regex=True is required: since pandas 2.0, Series.str.replace treats the
# pattern as a literal string by default, which would leave the names unchanged.
df_fruit['name'] = df_fruit.name.str.replace(r'[^a-zA-Z]\s?', r'', regex=True)
print(df_fruit['name'].unique())

In [None]:
# Normalise every name to lowercase
df_fruit['name'] = df_fruit['name'].str.lower()
df_fruit

###Convert column type

Set every cell whose value is '?' to a missing value (NaN, i.e. `np.nan`)

In [None]:
# Turn the '?' placeholder into a real missing value, one column at a time
for colname in ('quantity', 'price', 'color'):
    is_placeholder = df_fruit[colname] == '?'
    df_fruit.loc[is_placeholder, colname] = np.nan

df_fruit

In [None]:
# Cast the two numeric columns to float by going through str first
for numeric_col in ("quantity", "price"):
    df_fruit[numeric_col] = df_fruit[numeric_col].astype(str).astype(float)

df_fruit.info()
df_fruit

###Missing values - Remove

In [None]:
# Preview only: rows with NaN in any column are dropped from the returned
# copy; df_fruit itself is not modified
df_fruit.dropna(axis=0, how='any')

In [None]:
# Preview only: drop rows that have NaN in 'price' or 'color'
df_fruit.dropna(axis=0, subset=['price', 'color'])

In [None]:
# Apply the removal to df_fruit itself: drop every row that has a NaN
df_fruit.dropna(axis=0, how='any', inplace=True)
df_fruit

###Duplicate rows 

In [None]:
# Drop fully identical rows in place, keeping the first occurrence of each
df_fruit.drop_duplicates(keep='first', inplace=True)
df_fruit

In [None]:
# Duplicates can also be judged on a single column; this returns a new frame
# and leaves df_fruit unchanged
df_fruit.drop_duplicates(subset="name")

Save to csv format

In [None]:
# Write the frame to 'cleaned_data.csv'; index=False leaves the index out of the file
df_fruit.to_csv('cleaned_data.csv', index=False)

# Revision History

Release: 1.2109.1901

* Code cleanup