<a href="https://colab.research.google.com/github/soralee2821/coding/blob/master/2021_05_21.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Cleaning not-null values

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Small demo frame holding a few suspicious entries ('D', '?', an age of 290)
# that the following cells inspect.
df = pd.DataFrame(
    dict(
        Sex=["M", "F", "F", "D", "?"],
        Age=[29, 30, 24, 290, 25],
    )
)
df

Unnamed: 0,Sex,Age
0,M,29
1,F,30
2,F,24
3,D,290
4,?,25


In [3]:
# List the distinct values present in the Sex column.
df["Sex"].unique()

array(['M', 'F', 'D', '?'], dtype=object)

In [4]:
# Count how many times each Sex value occurs.
df["Sex"].value_counts()

F    2
D    1
?    1
M    1
Name: Sex, dtype: int64

In [5]:
# Swap every 'D' for 'F'. The resulting Series is only displayed, not
# assigned, so df itself keeps its original values.
df["Sex"].replace(to_replace="D", value="F")

0    M
1    F
2    F
3    F
4    ?
Name: Sex, dtype: object

In [6]:
# A dict replaces several values in one call (old value -> new value).
# The column holds no 'N', so that entry matches nothing here.
df["Sex"].replace(
    to_replace={"D": "F", "N": "M"},
)

0    M
1    F
2    F
3    F
4    ?
Name: Sex, dtype: object

In [7]:
# Frame-wide replacement: the outer keys name columns, and each inner dict
# maps old value -> new value within that column. The result is a new frame.
df.replace(
    to_replace={
        "Sex": {"D": "F", "N": "M"},
        "Age": {290: 29},
    }
)

Unnamed: 0,Sex,Age
0,M,29
1,F,30
2,F,24
3,F,29
4,?,25


In [8]:
# Select the rows whose Age exceeds 100.
df.loc[df["Age"] > 100]

Unnamed: 0,Sex,Age
3,D,290


In [9]:
# Rows with an Age above 100 get that age divided by 10, written back into df.
# The division result is float; in the recorded run the whole Age column
# displays as float afterwards.
age_over_100 = df["Age"] > 100
df.loc[age_over_100, "Age"] = df.loc[age_over_100, "Age"] / 10

In [10]:
# Display df again to confirm the out-of-range age was overwritten in place.
df

Unnamed: 0,Sex,Age
0,M,29.0
1,F,30.0
2,F,24.0
3,D,29.0
4,?,25.0


### Checking duplicate values in Series

In [11]:
# Country names indexed by ambassador name. Several countries repeat so the
# cells below have duplicate values to detect and drop.
ambassadors = pd.Series(
    [
        'France',
        'United Kingdom',
        'United Kingdom',
        'Italy',
        'Germany',
        'Germany',
        'Germany',
    ],
    index=[
        'Gérard Araud',
        'Kim Darroch',
        'Peter Westmacott',
        'Armando Varricchio',
        'Peter Wittig',
        'Peter Ammon',
        # Stray trailing space removed so the label matches the exact name
        # when used for lookups such as ambassadors['Klaus Scharioth'].
        'Klaus Scharioth',
    ],
)

In [12]:
# Display the Series: index labels on the left, country values on the right.
ambassadors

Gérard Araud                  France
Kim Darroch           United Kingdom
Peter Westmacott      United Kingdom
Armando Varricchio             Italy
Peter Wittig                 Germany
Peter Ammon                  Germany
Klaus Scharioth              Germany
dtype: object

In [13]:
# Default behaviour (keep='first'): the first occurrence of each value is not
# flagged; every later repeat is marked True.
ambassadors.duplicated(keep="first")

Gérard Araud          False
Kim Darroch           False
Peter Westmacott       True
Armando Varricchio    False
Peter Wittig          False
Peter Ammon            True
Klaus Scharioth        True
dtype: bool

In [14]:
# keep='last': the final occurrence of each value is not flagged; every
# earlier repeat is marked True.
ambassadors.duplicated(keep="last")

Gérard Araud          False
Kim Darroch            True
Peter Westmacott      False
Armando Varricchio    False
Peter Wittig           True
Peter Ammon            True
Klaus Scharioth       False
dtype: bool

In [15]:
# keep=False: every value that occurs more than once is marked True,
# including its first and last occurrence.
ambassadors.duplicated(
    keep=False,
)

Gérard Araud          False
Kim Darroch            True
Peter Westmacott       True
Armando Varricchio    False
Peter Wittig           True
Peter Ammon            True
Klaus Scharioth        True
dtype: bool

In [16]:
# Keep only the first entry for each country (the default, spelled out).
ambassadors.drop_duplicates(keep="first")

Gérard Araud                  France
Kim Darroch           United Kingdom
Armando Varricchio             Italy
Peter Wittig                 Germany
dtype: object

In [17]:
# Keep only the last entry for each country.
ambassadors.drop_duplicates(keep="last")

Gérard Araud                  France
Peter Westmacott      United Kingdom
Armando Varricchio             Italy
Klaus Scharioth              Germany
dtype: object

In [18]:
# Drop every country that appears more than once, leaving only the values
# that occur exactly one time.
ambassadors.drop_duplicates(
    keep=False,
)

Gérard Araud          France
Armando Varricchio     Italy
dtype: object

### Checking duplicate values in DataFrames

In [19]:
# One (Name, Pos) record per row. 'Kobe Bryant' appears three times, twice
# with the same position and once with a different one.
players = pd.DataFrame(
    [
        ("Kobe Bryant", "SG"),
        ("LeBron James", "SF"),
        ("Kobe Bryant", "SG"),
        ("Carmelo Anthony", "SF"),
        ("Kobe Bryant", "SF"),
    ],
    columns=["Name", "Pos"],
)

In [20]:
# Display the full players frame (five rows, columns Name and Pos).
players

Unnamed: 0,Name,Pos
0,Kobe Bryant,SG
1,LeBron James,SF
2,Kobe Bryant,SG
3,Carmelo Anthony,SF
4,Kobe Bryant,SF


In [21]:
# With no subset given, a row is flagged as a duplicate only when it matches
# an earlier row in every column.
players.duplicated(keep="first")

0    False
1    False
2     True
3    False
4    False
dtype: bool

In [22]:
# subset restricts the comparison to the listed column(s); here only Name is
# compared when looking for duplicates.
players.duplicated(subset="Name")

0    False
1    False
2     True
3    False
4     True
dtype: bool

In [23]:
# Compare on Name only, leaving the last occurrence of each name unflagged.
players.duplicated(subset="Name", keep="last")

0     True
1    False
2     True
3    False
4    False
dtype: bool

In [24]:
# Remove rows that repeat an earlier row in every column, keeping the first.
players.drop_duplicates(keep="first")

Unnamed: 0,Name,Pos
0,Kobe Bryant,SG
1,LeBron James,SF
3,Carmelo Anthony,SF
4,Kobe Bryant,SF


In [25]:
# Keep the first row for each Name, ignoring differences in Pos.
players.drop_duplicates(subset="Name")

Unnamed: 0,Name,Pos
0,Kobe Bryant,SG
1,LeBron James,SF
3,Carmelo Anthony,SF


In [26]:
# Keep the last row for each Name, ignoring differences in Pos.
players.drop_duplicates(subset="Name", keep="last")

Unnamed: 0,Name,Pos
1,LeBron James,SF
3,Carmelo Anthony,SF
4,Kobe Bryant,SF
