![pandas logo](pandas.png "pandas")

# Duplicates

In [1]:
import pandas as pd
import numpy as np

In [4]:
# Build a small demo frame that contains repeated rows:
#   - rows 2 and 3 are identical ('F', 23)
#   - rows 4 and 5 are identical (missing gender, 24)
#   - rows 0 and 1 share a gender but differ in age, so they are NOT duplicates
rawdata = dict(
    gender=['M', 'M', 'F', 'F', np.nan, np.nan],
    age=[20, 21, 23, 23, 24, 24],
)

df = pd.DataFrame(data=rawdata)
df

Unnamed: 0,gender,age
0,M,20
1,M,21
2,F,23
3,F,23
4,,24
5,,24


In [15]:
# drop_duplicates() returns a new DataFrame without the repeated rows;
# df itself is left untouched (the method is not in-place).
# keep='first' is the default and is spelled out here only for clarity:
# the first occurrence of each duplicated row is the one that survives.
df.drop_duplicates(keep='first')

Unnamed: 0,gender,age
0,M,20
1,M,21
2,F,23
4,,24


Reference: [pandas.DataFrame.drop_duplicates documentation](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.drop_duplicates.html)

In [16]:
# keep=False discards every row that has a duplicate, first occurrence included,
# so only the rows that were unique to begin with remain.
df.drop_duplicates(subset=None, keep=False)

Unnamed: 0,gender,age
0,M,20
1,M,21


In [17]:
# Duplicates can also be judged on a subset of columns: here two rows count as
# duplicates as soon as they share the same 'gender' value.
# As before, the first matching row is the one kept (keep='first' is the default).
df.drop_duplicates(subset=['gender'], keep='first')

Unnamed: 0,gender,age
0,M,20
2,F,23
4,,24


In [14]:
# Reverse view: select only the rows that occur more than once...
# duplicated(keep=False) flags every member of a duplicate group, not just the repeats.
is_repeated = df.duplicated(keep=False)
dups = df.loc[is_repeated]
dups

Unnamed: 0,gender,age
2,F,23
3,F,23
4,,24
5,,24


In [16]:
# ...and another way is to display the rows that are not in another dataframe.
# nodup holds only the rows of df that were never duplicated.
nodup = df.drop_duplicates(keep=False)

# Left-merge on the shared columns; indicator=True adds a '_merge' column that says
# whether each row of df was also found in nodup ('both') or not ('left_only').
dups = pd.merge(df, nodup, how='left', indicator=True)
dups

Unnamed: 0,gender,age,_merge
0,M,20,both
1,M,21,both
2,F,23,left_only
3,F,23,left_only
4,,24,left_only
5,,24,left_only


In [17]:
# ...then keep the rows whose origin is the left frame only,
# showing just the original columns (the helper '_merge' column is left out):
from_left_only = dups['_merge'] == 'left_only'
dups.loc[from_left_only, ['gender', 'age']]

Unnamed: 0,gender,age
2,F,23
3,F,23
4,,24
5,,24


In [23]:
# Side note: two ways to get the distinct values of a Series.
# NOTE(review): set() relies on Python hashing/equality, so repeated NaNs may not
# always collapse into one on other columns -- .unique() looks like the safer choice; confirm.
genders = df['gender']

print(set(genders))        # a plain Python set (unordered)

distinct = genders.unique()
print(distinct)            # a numpy array, which is an iterable

# Either result can be converted to a list, a tuple, etc.
print(tuple(distinct))

{nan, 'F', 'M'}
['M' 'F' nan]
('M', 'F', nan)


__________________________________________________
Nicolas Dupuis, Methodology and Innovation (IDAR C&SP), 2020+