# Mod14 Handling Duplicated Data

In [1]:
import numpy as np
import pandas as pd

In [2]:
# NumPy version the outputs in this notebook were produced with
np.__version__

'1.19.1'

In [3]:
# pandas version the outputs in this notebook were produced with
pd.__version__

'1.0.5'

### Operating on Duplicated data

In [4]:
# Sample records as (name, age, city) tuples.
# The 4th and 5th tuples repeat the 2nd one exactly, so the frame contains
# fully duplicated rows to experiment with.
students = [
    ('Jack', 34, 'Sydeny'),
    ('Riti', 30, 'Delhi'),
    ('Aadi', 16, 'New York'),
    ('Riti', 30, 'Delhi'),
    ('Riti', 30, 'Delhi'),
    ('Riti', 30, 'Mumbai'),
    ('Aadi', 40, 'London'),
    ('Sachin', 30, 'Delhi'),
]

# Build the DataFrame, naming the three tuple positions as columns.
df = pd.DataFrame(data=students, columns=['Name', 'Age', 'City'])

In [5]:
# display the whole frame (only 8 rows, so no need for .head())
df

Unnamed: 0,Name,Age,City
0,Jack,34,Sydeny
1,Riti,30,Delhi
2,Aadi,16,New York
3,Riti,30,Delhi
4,Riti,30,Delhi
5,Riti,30,Mumbai
6,Aadi,40,London
7,Sachin,30,Delhi


In [6]:
# flag, row by row, which records are duplicates:
# True marks a row identical to an earlier row; the first occurrence stays False
df.duplicated()

0    False
1    False
2    False
3     True
4     True
5    False
6    False
7    False
dtype: bool

In [7]:
# collapse the per-row mask to a single bool: True if at least one row is a duplicate
df.duplicated().any()

True

In [12]:
# per-row duplicate mask: True marks a repeat of an earlier row
df.duplicated()

0    False
1    False
2    False
3     True
4     True
5    False
6    False
7    False
dtype: bool

In [9]:
# time the pandas route: .any() called on the boolean Series
%timeit df.duplicated().any()

475 µs ± 6.12 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [11]:
# .values exposes the duplicate mask as a plain NumPy boolean array
df.duplicated().values

array([False, False, False,  True,  True, False, False, False])

In [8]:
df.duplicated().values.any()  # NumPy's method: .any() is called on the ndarray, not the Series

True

In [10]:
# time the NumPy route: .any() called on the ndarray obtained from .values
%timeit df.duplicated().values.any()

440 µs ± 7.75 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [13]:
# count how many rows are flagged as duplicates
# (True counts as 1; the first occurrence of each repeated row is not counted)
df.duplicated().sum()

2

In [18]:
# count how many duplicates, including the first occurrence:
# keep=False flags every row that takes part in a repetition, giving the real number of repeated rows
df.duplicated(keep=False).sum()

3

In [22]:
# list duplicated records (slower than only checking or counting them)
# Suggested inspection checklist for a new dataset:
#   1. shape  2. dtypes  3. info  4. head  5. tail  6. null check
#   7. duplicated().any(), plus duplicated().sum() for the number of duplicated rows
df[df.duplicated()]

Unnamed: 0,Name,Age,City
3,Riti,30,Delhi
4,Riti,30,Delhi


In [23]:
# check duplicates in a particular column:
# a row is flagged when its City value already appeared in an earlier row
df.duplicated(subset='City')

0    False
1    False
2    False
3     True
4     True
5    False
6    False
7     True
dtype: bool

In [24]:
# check duplicates in some columns:
# a row is flagged when its (Name, City) combination already appeared in an earlier row
df.duplicated(subset=['Name','City'])

0    False
1    False
2    False
3     True
4     True
5    False
6    False
7    False
dtype: bool

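Alternatively, you can pass the `keep` argument to indicate which occurrence
within each group of duplicates should be kept (i.e. not treated as a duplicate):
* keep='first', keep the first occurrence (default)
* keep='last', keep the last occurrence
* keep=False, keep none of them, i.e. mark/drop all the duplicates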

In [25]:
# Show the rows flagged as duplicates on the (Name, Age) pair.
# With keep='last' the last occurrence of each pair is treated as the one to
# keep, so only the earlier occurrences are flagged and displayed.
# subset is given as a list (as in the other cells): a tuple can be read by
# pandas as a single column label when the columns themselves contain tuples.
df[df.duplicated(subset=['Name', 'Age'], keep='last')]

Unnamed: 0,Name,Age,City
1,Riti,30,Delhi
3,Riti,30,Delhi
4,Riti,30,Delhi


In [26]:
# Show the rows flagged as duplicates on the (Name, Age) pair.
# With keep='first' the first occurrence of each pair is treated as the one to
# keep, so only the later occurrences are flagged and displayed.
# subset is given as a list (as in the other cells): a tuple can be read by
# pandas as a single column label when the columns themselves contain tuples.
df[df.duplicated(subset=['Name', 'Age'], keep='first')]

Unnamed: 0,Name,Age,City
3,Riti,30,Delhi
4,Riti,30,Delhi
5,Riti,30,Mumbai


In [27]:
# remove duplicated rows, keeping the first occurrence of each;
# the result is a new frame and is not assigned back, so df itself is unchanged
df.drop_duplicates()

Unnamed: 0,Name,Age,City
0,Jack,34,Sydeny
1,Riti,30,Delhi
2,Aadi,16,New York
5,Riti,30,Mumbai
6,Aadi,40,London
7,Sachin,30,Delhi


In [28]:
# drop duplicates judged on selected columns only:
# rows repeating an earlier (Age, City) combination are removed, whatever their Name
df.drop_duplicates(subset=['Age','City'])

Unnamed: 0,Name,Age,City
0,Jack,34,Sydeny
1,Riti,30,Delhi
2,Aadi,16,New York
5,Riti,30,Mumbai
6,Aadi,40,London


In [29]:
# Drop every row whose (Name, Age) pair occurs more than once:
# keep=False retains none of the occurrences, not even the first.
# subset is given as a list (as in the other cells): a tuple can be read by
# pandas as a single column label when the columns themselves contain tuples.
df.drop_duplicates(subset=['Name', 'Age'], keep=False)

Unnamed: 0,Name,Age,City
0,Jack,34,Sydeny
2,Aadi,16,New York
6,Aadi,40,London
7,Sachin,30,Delhi


In [30]:
# Keep only the first row for each (Name, Age) pair (default keep='first').
# subset is given as a list (as in the other cells): a tuple can be read by
# pandas as a single column label when the columns themselves contain tuples.
df.drop_duplicates(subset=['Name', 'Age'])

Unnamed: 0,Name,Age,City
0,Jack,34,Sydeny
1,Riti,30,Delhi
2,Aadi,16,New York
6,Aadi,40,London
7,Sachin,30,Delhi


## Lab

<b>有一個 DataFrame df，試著計算整列重複的資料有幾個?</b>

In [None]:
# Lab data: 'k1' alternates 'one'/'two' three times and ends with an extra 'two';
# 'k2' holds the paired integer values.
df = pd.DataFrame(
    {
        'k1': ['one', 'two'] * 3 + ['two'],
        'k2': [1, 1, 2, 3, 3, 4, 4],
    }
)
df

<b>計算 'k1' 欄有幾個重複的資料</b>

<b>試著將整列重複的拋棄，只留第一筆重複值</b>