# Cleaning Non-Null Values

In [1]:
# Import libraries
import numpy as np
import pandas as pd


In [3]:
# Build a small demo frame from (Sex, Age) rows.
# The last two rows carry the invalid entries ('?', 'Q' and an age of 124).
df = pd.DataFrame(
    [
        ('M', 24),
        ('M', 20),
        ('F', 19),
        ('M', 19),
        ('F', 20),
        ('F', 20),
        ('?', 124),
        ('Q', 21),
    ],
    columns=['Sex', 'Age'],
)

In [4]:
# Show the freshly built frame; rows 6 and 7 hold the invalid entries
df

Unnamed: 0,Sex,Age
0,M,24
1,M,20
2,F,19
3,M,19
4,F,20
5,F,20
6,?,124
7,Q,21


In [5]:
# Preview the corrected frame using per-column maps of invalid value -> replacement.
# replace() returns a new DataFrame; the result is only displayed, df is not modified.
df.replace(to_replace={
    'Sex': {'?': 'F', 'Q': 'F'},
    'Age': {124: 24},
})

Unnamed: 0,Sex,Age
0,M,24
1,M,20
2,F,19
3,M,19
4,F,20
5,F,20
6,F,24
7,F,21


# Handling String Data

In [6]:
# Build a one-column frame of '_'-delimited records.
# Some records contain stray spaces and '?' characters that need cleaning.
df = pd.Series(
    [
        '1987_M_US _1',
        '1990?_M_UK_1',
        '1992_F_US_2',
        '1970?_M_   IT_1',
        '1985_F_I  T_2',
    ],
    name='Data',
).to_frame()

In [7]:
# Inspect the raw packed strings before splitting them
df

Unnamed: 0,Data
0,1987_M_US _1
1,1990?_M_UK_1
2,1992_F_US_2
3,1970?_M_ IT_1
4,1985_F_I T_2


In [8]:
# Split each record on '_'; without expand=True every row becomes a list of parts
df['Data'].str.split(pat='_')

0       [1987, M, US , 1]
1       [1990?, M, UK, 1]
2        [1992, F, US, 2]
3    [1970?, M,    IT, 1]
4      [1985, F, I  T, 2]
Name: Data, dtype: object

In [9]:
# With expand=True the parts are spread across separate columns (preview only, not stored)
df['Data'].str.split(pat='_', expand=True)

Unnamed: 0,0,1,2,3
0,1987,M,US,1
1,1990?,M,UK,1
2,1992,F,US,2
3,1970?,M,IT,1
4,1985,F,I T,2


In [10]:
# Keep the expanded columns by rebinding df to the split result.
# NOTE: df no longer has a 'Data' column afterwards, so re-running this cell
# requires re-creating the original frame first.
df = df['Data'].str.split(pat='_', expand=True)

In [11]:
# After the split, df holds four columns labelled 0-3
df

Unnamed: 0,0,1,2,3
0,1987,M,US,1
1,1990?,M,UK,1
2,1992,F,US,2
3,1970?,M,IT,1
4,1985,F,I T,2


In [12]:
# Give the four positional columns descriptive labels (modifies df in place)
df.columns = pd.Index(['Year', 'Sex', 'Country', 'No Children'])

In [13]:
# Confirm the new column labels
df

Unnamed: 0,Year,Sex,Country,No Children
0,1987,M,US,1
1,1990?,M,UK,1
2,1992,F,US,2
3,1970?,M,IT,1
4,1985,F,I T,2


In [14]:
# strip() trims only leading/trailing whitespace, so inner spaces (e.g. 'I  T') remain.
# The result is displayed but not assigned, so df is unchanged.
df['Country'].str.strip()

0      US
1      UK
2      US
3      IT
4    I  T
Name: Country, dtype: object

In [15]:
# The column still holds the original padded values: the stripped Series was never stored
df['Country']

0      US 
1       UK
2       US
3       IT
4     I  T
Name: Country, dtype: object

In [16]:
# Remove every space character (leading, trailing and internal) from Country.
# regex=False forces a literal match, so the result does not depend on the
# pandas version's default for `regex` (the default changed in pandas 2.0).
# The cleaned Series is only displayed here, not stored.
df['Country'].str.replace(" ", "", regex=False)

0    US
1    UK
2    US
3    IT
4    IT
Name: Country, dtype: object

In [17]:
# df is still untouched here: str.replace returned a new Series instead of modifying the column
df

Unnamed: 0,Year,Sex,Country,No Children
0,1987,M,US,1
1,1990?,M,UK,1
2,1992,F,US,2
3,1970?,M,IT,1
4,1985,F,I T,2


In [18]:
# Persist the cleanup: drop every space (leading, trailing and internal) from Country.
# regex=False forces a literal match, so the result does not depend on the
# pandas version's default for `regex` (the default changed in pandas 2.0).
df['Country'] = df['Country'].str.replace(" ", "", regex=False)
df

Unnamed: 0,Year,Sex,Country,No Children
0,1987,M,US,1
1,1990?,M,UK,1
2,1992,F,US,2
3,1970?,M,IT,1
4,1985,F,IT,2
