# Section 6: Data Cleaning

## Missing Data
- things like a 0, empty cell, or a string for a price value are considered missing data
- like python falsy values

In [1]:
import numpy as np
import pandas as pd

In [30]:
#anything that is not in this list is a Truthy value
falsy_values = (0, False, None, '', [], {})

In [3]:
any(falsy_values)

False

In [5]:
#not a number
np.nan

nan

In [7]:
#anything you do with np.nan becomes np.nan
a = np.array([1,2,3,np.nan,np.nan,4])

In [8]:
a.sum()

nan

In [9]:
a.mean()

nan

In [12]:
#numpy's infinite type, behaves like a virus, just like nan
np.inf

inf

#### checking for nan or inf

In [14]:
#same thing for inf
np.isnan(np.nan)

True

In [15]:
#or checking for both
np.isfinite(np.nan), np.isfinite(np.inf)

(False, False)

These can take arrays as inputs and return boolean arrays

#### filtering them out

In [16]:
a = np.array([1,2,3,np.nan,np.nan,4])

In [20]:
#creates new array with only non nan and non inf elements
a[np.isfinite(a)]

array([1., 2., 3., 4.])

In [21]:
#now can perform normal operations on it
a[np.isfinite(a)].sum()

10.0

## Handling Missing Data with Pandas

In [22]:
import numpy as np
import pandas as pd

### Functions to detect null values

In [24]:
pd.isnull(np.nan), pd.isnull(None)

(True, True)

In [27]:
#is a synonym of the above
pd.isna(np.nan)

True

In [28]:
#opposite one
pd.notnull(np.nan), pd.notna(None)

(False, False)

In [29]:
pd.notnull(3)

True

In [31]:
#can also pass it series and dataframes
pd.isnull(pd.Series([1, np.nan, 7]))

0    False
1     True
2    False
dtype: bool

In [33]:
#they return series and dataframes correspondingly
pd.isnull(pd.DataFrame({
    'Column A': [1, np.nan, 7],
    'Column B': [np.nan, 2, 3],
    'Column C': [np.nan, 2, np.nan]
}))

Unnamed: 0,Column A,Column B,Column C
0,False,True,True
1,True,False,False
2,False,False,True


### Pandas Operations with Missing Values
- unlike numpy, pandas can do operations with missing values by just ignoring them

In [34]:
pd.Series([1,2,np.nan]).count()

2

In [35]:
pd.Series([1,2,np.nan]).mean()

1.5

### Filtering missing data

In [36]:
s = pd.Series([1,2,3,np.nan,np.nan,4])

In [37]:
s.isnull()

0    False
1    False
2    False
3     True
4     True
5    False
dtype: bool

In [38]:
s[s.notnull()]

0    1.0
1    2.0
2    3.0
5    4.0
dtype: float64

### Dropping null values

#### On series

In [39]:
s

0    1.0
1    2.0
2    3.0
3    NaN
4    NaN
5    4.0
dtype: float64

In [42]:
s.dropna()

0    1.0
1    2.0
2    3.0
5    4.0
dtype: float64

#### On DataFrames
- a little bit harder because can't drop single values, can only drop columns and rows

In [43]:
df = pd.DataFrame({
    'Column A': [1, np.nan, 30, np.nan],
    'Column B': [2, 8, 31, np.nan],
    'Column C': [np.nan, 9, 32, 100],
    'Column D': [5, 8, 34, 110],
})
df

Unnamed: 0,Column A,Column B,Column C,Column D
0,1.0,2.0,,5
1,,8.0,9.0,8
2,30.0,31.0,32.0,34
3,,,100.0,110


In [45]:
df.shape

(4, 4)

In [46]:
#know we have 4 rows, so if only have 3 non-null, 1 value is null
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Column A  2 non-null      float64
 1   Column B  3 non-null      float64
 2   Column C  3 non-null      float64
 3   Column D  4 non-null      int64  
dtypes: float64(3), int64(1)
memory usage: 256.0 bytes


In [47]:
#default drops rows with at least one null value
df.dropna()

Unnamed: 0,Column A,Column B,Column C,Column D
2,30.0,31.0,32.0,34


In [48]:
#can also drop columns instead
df.dropna(axis=1)

Unnamed: 0,Column D
0,5
1,8
2,34
3,110


In [50]:
df2 = pd.DataFrame({
    'Column A': [1, np.nan, 30],
    'Column B': [2, np.nan, 31],
    'Column C': [np.nan, np.nan, 100]
})
df

Unnamed: 0,Column A,Column B,Column C,Column D
0,1.0,2.0,,5
1,,8.0,9.0,8
2,30.0,31.0,32.0,34
3,,,100.0,110


In [51]:
#only drops row if all values are null
df.dropna(how='all')

Unnamed: 0,Column A,Column B,Column C,Column D
0,1.0,2.0,,5
1,,8.0,9.0,8
2,30.0,31.0,32.0,34
3,,,100.0,110


In [52]:
df.dropna(how='any')  # default behavior

Unnamed: 0,Column A,Column B,Column C,Column D
2,30.0,31.0,32.0,34


In [53]:
#if at least 3 values are non-null, the row is kept
df.dropna(thresh=3)

Unnamed: 0,Column A,Column B,Column C,Column D
0,1.0,2.0,,5
1,,8.0,9.0,8
2,30.0,31.0,32.0,34


### Filling null values
- sometimes can be replaced by a 0 or the mean value for that column, etc

In [54]:
s

0    1.0
1    2.0
2    3.0
3    NaN
4    NaN
5    4.0
dtype: float64

In [56]:
#fill with arbitrary value
s.fillna(0)

0    1.0
1    2.0
2    3.0
3    0.0
4    0.0
5    4.0
dtype: float64

In [57]:
#fill with mean
s.fillna(s.mean())

0    1.0
1    2.0
2    3.0
3    2.5
4    2.5
5    4.0
dtype: float64

In [58]:
#filling with close value from above
s.fillna(method='ffill')

0    1.0
1    2.0
2    3.0
3    3.0
4    3.0
5    4.0
dtype: float64

In [59]:
#fill with close value from below
s.fillna(method='bfill')

0    1.0
1    2.0
2    3.0
3    4.0
4    4.0
5    4.0
dtype: float64

the bad thing about using close values is that cell might be left blank if it is at extreme

### Filling null values on DataFrames
- similar but specify axis (row or column) to use to fill the values

In [60]:
df

Unnamed: 0,Column A,Column B,Column C,Column D
0,1.0,2.0,,5
1,,8.0,9.0,8
2,30.0,31.0,32.0,34
3,,,100.0,110


In [62]:
df.fillna({'Column A': 0, 'Column B': 99, 'Column C': df['Column C'].mean()})

Unnamed: 0,Column A,Column B,Column C,Column D
0,1.0,2.0,47.0,5
1,0.0,8.0,9.0,8
2,30.0,31.0,32.0,34
3,0.0,99.0,100.0,110


In [64]:
#close values by column
df.fillna(method='ffill', axis=0)

Unnamed: 0,Column A,Column B,Column C,Column D
0,1.0,2.0,,5
1,1.0,8.0,9.0,8
2,30.0,31.0,32.0,34
3,30.0,31.0,100.0,110


In [65]:
#close values by rows
df.fillna(method='ffill', axis=1)

Unnamed: 0,Column A,Column B,Column C,Column D
0,1.0,2.0,2.0,5.0
1,,8.0,9.0,8.0
2,30.0,31.0,32.0,34.0
3,,,100.0,110.0


### Checking if there are NAs

#### Checking the length
- if there are missing values, s.dropna() will have less elements than s

In [70]:
len(s), len(s.dropna())

(6, 4)

In [68]:
missing_values = len(s.dropna()) != len(s)
missing_values

True

#### More Pythony solution 'any'

In [72]:
#are any of the values True
pd.Series([True,False,False]).any()

True

In [73]:
#are all the values True
pd.Series([True,False,False]).all()

False

In [76]:
#this tells us at least one value in s is a null value
s.isnull().any()

True

## Cleaning not-null values
- if values are not null values, but are invalid for some reason

In [77]:
import numpy as np
import pandas as pd

In [79]:
#have invalid 290 for age and D and ? for sex
df = pd.DataFrame({
    'Sex': ['M', 'F', 'F', 'D', '?'],
    'Age': [29, 30, 24, 290, 25],
})
df

Unnamed: 0,Sex,Age
0,M,29
1,F,30
2,F,24
3,D,290
4,?,25


In [80]:
#first get all values of specific column
df['Sex'].unique()

array(['M', 'F', 'D', '?'], dtype=object)

In [81]:
#replace invalid values ('to be replaced', 'replace with')
df['Sex'].replace('D', 'F')

0    M
1    F
2    F
3    F
4    ?
Name: Sex, dtype: object

In [82]:
#can have a dictionary to replace all invalids at once
df['Sex'].replace({'D': 'F', 'N': 'M'})

0    M
1    F
2    F
3    F
4    ?
Name: Sex, dtype: object

In [83]:
#if wanted to replace values in many columns, could use dataframe type
df.replace({
    'Sex': {
        'D': 'F',
        'N': 'M'
    },
    'Age': {
        290: 29
    }
})

Unnamed: 0,Sex,Age
0,M,29
1,F,30
2,F,24
3,F,29
4,?,25


In [85]:
#if saw that some ages had an extra 0 at the end
df.loc[df['Age']>100, 'Age'] = df.loc[df['Age']>100, 'Age']/10
df

Unnamed: 0,Sex,Age
0,M,29.0
1,F,30.0
2,F,24.0
3,D,29.0
4,?,25.0


### Duplicates in Series

In [86]:
#can only invite one ambassador per country
ambassadors = pd.Series([
    'France',
    'United Kingdom',
    'United Kingdom',
    'Italy',
    'Germany',
    'Germany',
    'Germany',
], index=[
    'Gérard Araud',
    'Kim Darroch',
    'Peter Westmacott',
    'Armando Varricchio',
    'Peter Wittig',
    'Peter Ammon',
    'Klaus Scharioth '
])
ambassadors

Gérard Araud                  France
Kim Darroch           United Kingdom
Peter Westmacott      United Kingdom
Armando Varricchio             Italy
Peter Wittig                 Germany
Peter Ammon                  Germany
Klaus Scharioth              Germany
dtype: object

In [87]:
#tells you which values are duplicated
ambassadors.duplicated()

Gérard Araud          False
Kim Darroch           False
Peter Westmacott       True
Armando Varricchio    False
Peter Wittig          False
Peter Ammon            True
Klaus Scharioth        True
dtype: bool

In [88]:
#default it keeps the first appearance as the non duplicate, can change it though
ambassadors.duplicated(keep='last')

Gérard Araud          False
Kim Darroch            True
Peter Westmacott      False
Armando Varricchio    False
Peter Wittig           True
Peter Ammon            True
Klaus Scharioth       False
dtype: bool

In [89]:
#or mark all of them as duplicates
ambassadors.duplicated(keep=False)

Gérard Araud          False
Kim Darroch            True
Peter Westmacott       True
Armando Varricchio    False
Peter Wittig           True
Peter Ammon            True
Klaus Scharioth        True
dtype: bool

In [92]:
#to drop the duplicates, keep parameters work the same way
ambassadors.drop_duplicates(keep='last')

Gérard Araud                  France
Peter Westmacott      United Kingdom
Armando Varricchio             Italy
Klaus Scharioth              Germany
dtype: object

### Duplicates in DataFrames
- happen at row level. 2 rows with same values are duplicates.

In [93]:
players = pd.DataFrame({
    'Name': [
        'Kobe Bryant',
        'LeBron James',
        'Kobe Bryant',
        'Carmelo Anthony',
        'Kobe Bryant',
    ],
    'Pos': [
        'SG',
        'SF',
        'SG',
        'SF',
        'SF'
    ]
})
players

Unnamed: 0,Name,Pos
0,Kobe Bryant,SG
1,LeBron James,SF
2,Kobe Bryant,SG
3,Carmelo Anthony,SF
4,Kobe Bryant,SF


In [94]:
#default = all column values need to be the same
players.duplicated()

0    False
1    False
2     True
3    False
4    False
dtype: bool

In [95]:
#choose which columns to look at to find duplicates
players.duplicated(subset=['Name'], keep='last')

0     True
1    False
2     True
3    False
4    False
dtype: bool

the drop_duplicates works the same way and takes the same parameters

### Text Handling
- most complex type, these are some of the most common cases:

#### Splitting Columns
- when data comes as one single value, but actually have multiple pieces of info in it

In [96]:
#should have columns for year, sex, coutry, number of children
df = pd.DataFrame({
    'Data': [
        '1987_M_US _1',
        '1990?_M_UK_1',
        '1992_F_US_2',
        '1970?_M_   IT_1',
        '1985_F_I  T_2'
]})
df

Unnamed: 0,Data
0,1987_M_US _1
1,1990?_M_UK_1
2,1992_F_US_2
3,1970?_M_ IT_1
4,1985_F_I T_2


In [102]:
#split method splits acording to (1st argument), _ in this case
df['Data'].str.split('_', expand=True)

KeyError: 'Data'

In [101]:
df

Unnamed: 0,0,1,2,3
0,1987,M,US,1
1,1990?,M,UK,1
2,1992,F,US,2
3,1970?,M,IT,1
4,1985,F,I T,2


In [104]:
df.columns = ['Year','Sex','Country','# Children']
df

Unnamed: 0,Year,Sex,Country,# Children
0,1987,M,US,1
1,1990?,M,UK,1
2,1992,F,US,2
3,1970?,M,IT,1
4,1985,F,I T,2


In [105]:
#checking if a string contains a certain thing
df['Year'].str.contains('\?')

0    False
1     True
2    False
3     True
4    False
Name: Year, dtype: bool

In [106]:
df['Country'].str.contains('U')

0     True
1     True
2     True
3    False
4    False
Name: Country, dtype: bool

In [107]:
#removing spaces in country names
df['Country'].str.replace(' ', '')

0    US
1    UK
2    US
3    IT
4    IT
Name: Country, dtype: object