In [4]:
#Import pandas and numpy
import pandas as pd
import numpy as np

# Read the small test-scores dataset into a dataframe
test_scores = pd.read_csv(filepath_or_buffer='test_scores.csv')

# Leave the raw dataframe untouched: all cleaning is done on a deep copy
clean_scores = test_scores.copy(deep=True)
clean_scores.head(n=5)

Unnamed: 0,Name,Age,Test A Score
0,Amy Linn,15.0,95.0
1,Marc Fletcher,15.0,50.0
2,Naima Barry,,100.0
3,Kara Davis,15.0,
4,Zeeshan Gibson,14.0,100.0


In [2]:
# Boolean mask: True for each row whose (Name, Age) pair already appeared
# in an earlier row; the first occurrence of a pair stays False.
if_duplicated = clean_scores.duplicated(subset=['Name', 'Age'], keep='first')
if_duplicated

0     False
1     False
2     False
3     False
4     False
5      True
6     False
7     False
8     False
9      True
10     True
11     True
dtype: bool

## Get duplicated rows

In [3]:
# Select the rows that repeat an earlier (Name, Age) combination
duplicate_rows = clean_scores[clean_scores.duplicated(subset=['Name', 'Age'])]
duplicate_rows

Unnamed: 0,Name,Age,Test A Score
5,Amy Linn,15,85
9,Marc Fletcher,15,32
10,Kara Davis,15,79
11,Amy Linn,15,95


In [8]:
# Every row whose Name is 'Amy Linn' (the first occurrence plus its repeats)
Amy = clean_scores[clean_scores['Name'].eq('Amy Linn')]
Amy

Unnamed: 0,Name,Age,Test A Score
0,Amy Linn,15,95
5,Amy Linn,15,85
11,Amy Linn,15,95


## Gather information around duplicated rows

In [11]:
# Count the rows that repeat an earlier (Name, Age) pair
# (summing a boolean mask counts its True entries)
clean_scores.duplicated(subset=['Name', 'Age'], keep='first').sum()

4

In [12]:
# Display the dataframe to look for patterns among the duplicated rows,
# e.g. do duplicates only occur for students who are 15 years old?
clean_scores

Unnamed: 0,Name,Age,Test A Score
0,Amy Linn,15,95
1,Marc Fletcher,15,50
2,Naima Barry,,100
3,Kara Davis,15,
4,Zeeshan Gibson,14,100
5,Amy Linn,15,85
6,Dewey Cobb,Fourteen,Sixty six
7,Zeeshan Gibson,120,108
8,Lie�m Gibson,14,
9,Marc Fletcher,15,32


## Determine which duplicated row to remove

In [13]:
# Rows duplicated on Name that still differ in some of their other values
Amy = clean_scores.loc[clean_scores['Name'].eq('Amy Linn')]
Amy

Unnamed: 0,Name,Age,Test A Score
0,Amy Linn,15,95
5,Amy Linn,15,85
11,Amy Linn,15,95


Steps to potentially remediate:

1. Check with data providers to confirm the data accuracy
2. Remove duplicated data if it is incorrect or keep the duplicated data if it is correct.

In [14]:
# Load a dataframe in which the duplicate scores on Test A are wrong,
# but all scores on Test B (including the duplicate ones) are correct.
multi_test_scores = pd.read_csv(filepath_or_buffer='multiple_test_scores.csv')
multi_test_scores

Unnamed: 0,Name,Age,Test A Score,Test B Score
0,Amy Linn,15,95,34
1,Marc Fletcher,15,50,87
2,Naima Barry,,100,100
3,Kara Davis,15,,3
4,Zeeshan Gibson,14,100,20
5,Amy Linn,15,85,88
6,Dewey Cobb,Fourteen,Sixty six,Fifty three
7,Zeeshan Gibson,120,108,100
8,Lie�m Gibson,14,,75
9,Marc Fletcher,15,32,54


In [15]:
# Show the rows that repeat an earlier (Name, Age) combination
multi_test_scores.loc[
    multi_test_scores.duplicated(subset=['Name', 'Age'], keep='first')
]

Unnamed: 0,Name,Age,Test A Score,Test B Score
5,Amy Linn,15,85,88
9,Marc Fletcher,15,32,54
10,Kara Davis,15,79,90


Steps to potentially remediate:

1. Check with the data providers; see an example response below:
- Duplicated students’ data in “Test A Score” is incorrect, and the incorrect rows should be removed
- Duplicated students’ data in “Test B Score” is correct and should be kept
2. Mark the incorrect duplicate values for “Test A Score” as NaNs, or simplify the data structure by creating a separate table for the repeated values in “Test B Score”.

## Resolve the duplicated rows

In [16]:
# Drop the rows that repeat an earlier (Name, Age) combination.
# keep='first' is the default of drop_duplicates(); it is spelled out here
# to make clear that the first occurrence of each pair is the one retained.
remove_dup = clean_scores.drop_duplicates(subset=['Name', 'Age'], keep='first')
remove_dup

Unnamed: 0,Name,Age,Test A Score
0,Amy Linn,15,95
1,Marc Fletcher,15,50
2,Naima Barry,,100
3,Kara Davis,15,
4,Zeeshan Gibson,14,100
6,Dewey Cobb,Fourteen,Sixty six
7,Zeeshan Gibson,120,108
8,Lie�m Gibson,14,


In [17]:
# With keep='last', the last occurrence of each (Name, Age) pair is retained
# and the earlier occurrences are dropped instead.
clean_scores.drop_duplicates(
    subset=['Name', 'Age'],
    keep='last',
)

Unnamed: 0,Name,Age,Test A Score
2,Naima Barry,,100
4,Zeeshan Gibson,14,100
6,Dewey Cobb,Fourteen,Sixty six
7,Zeeshan Gibson,120,108
8,Lie�m Gibson,14,
9,Marc Fletcher,15,32
10,Kara Davis,15,79
11,Amy Linn,15,95


In [18]:
# Count the (Name, Age) duplicates still present in the de-duplicated dataframe
remove_dup.duplicated(subset=['Name', 'Age'], keep='first').sum()

0

## How to drop rows that are neither the first nor the last occurrence

In [19]:
# Rows sharing the same Name whose other values are not all identical
Amy = clean_scores[clean_scores['Name'].eq('Amy Linn')]
Amy

Unnamed: 0,Name,Age,Test A Score
0,Amy Linn,15,95
5,Amy Linn,15,85
11,Amy Linn,15,95


In [20]:
# Drop one specific row by its index label (here label 5); drop() returns a
# new dataframe, so Amy itself is left unchanged.
row_drop_example = Amy.drop(index=[5])
row_drop_example

Unnamed: 0,Name,Age,Test A Score
0,Amy Linn,15,95
11,Amy Linn,15,95


## How to convert duplicate values to NaNs

In [21]:
# Index labels of the rows that repeat an earlier (Name, Age) combination
dupe_index = multi_test_scores.loc[
    multi_test_scores.duplicated(subset=['Name', 'Age'], keep='first')
].index
dupe_index

Int64Index([5, 9, 10], dtype='int64')

In [22]:
# Set the 'Test A Score' value of every row labelled in dupe_index to NaN.
# NOTE: this assignment modifies multi_test_scores in place; the other
# columns of those rows are not touched.
multi_test_scores.loc[dupe_index, 'Test A Score'] = np.nan

In [23]:
# Display the dataframe to visually confirm that the assignment worked
multi_test_scores

Unnamed: 0,Name,Age,Test A Score,Test B Score
0,Amy Linn,15,95,34
1,Marc Fletcher,15,50,87
2,Naima Barry,,100,100
3,Kara Davis,15,,3
4,Zeeshan Gibson,14,100,20
5,Amy Linn,15,,88
6,Dewey Cobb,Fourteen,Sixty six,Fifty three
7,Zeeshan Gibson,120,108,100
8,Lie�m Gibson,14,,75
9,Marc Fletcher,15,,54
