# Python Pandas identify and drop duplicate data

* identify duplicate rows in Pandas
* find duplicate values in a column
* identify duplicate values in several columns
* drop duplicated data in all columns
* drop duplicated data in several columns

Bonus

* find duplicates in index
* find duplicate data in a row
* delete columns with duplicates

In [1]:
# Dataset: https://www.kaggle.com/carolzhangdc/imdb-5000-movie-dataset#movie_metadata.csv

# Load the movies dataset, keeping only the columns used in this notebook,
# and strip surrounding whitespace from the titles.
import pandas as pd
movies = (
    pd.read_csv(
        '../csv/movie_metadata.csv',
        usecols=['title_year', 'movie_title', 'director_name', 'plot_keywords', 'budget'],
    )
    .assign(movie_title=lambda frame: frame['movie_title'].str.strip())
)
movies.head()

Unnamed: 0,director_name,movie_title,plot_keywords,budget,title_year
0,James Cameron,Avatar,avatar|future|marine|native|paraplegic,237000000.0,2009.0
1,Gore Verbinski,Pirates of the Caribbean: At World's End,goddess|marriage ceremony|marriage proposal|pi...,300000000.0,2007.0
2,Sam Mendes,Spectre,bomb|espionage|sequel|spy|terrorist,245000000.0,2015.0
3,Christopher Nolan,The Dark Knight Rises,deception|imprisonment|lawlessness|police offi...,250000000.0,2012.0
4,Doug Walker,Star Wars: Episode VII - The Force Awakens,,,


## find duplicate rows in Pandas

**subset** : column label or sequence of labels, optional
Only consider certain columns for identifying duplicates, by default use all of the columns

**keep** : {‘first’, ‘last’, False}, default ‘first’
* first : Mark duplicates as True except for the first occurrence.
* last : Mark duplicates as True except for the last occurrence.
* False : Mark all duplicates as True.

In [2]:
# Row/column count of the loaded dataset, before any duplicate handling.
movies.shape

(5043, 5)

In [3]:
# Shape of the rows that repeat an earlier row in every column
# (keep='first' leaves the first occurrence unflagged).
movies.loc[movies.duplicated(keep='first')].shape

(123, 5)

## find duplicate values in a column

In [4]:
# Shape of the rows whose title already appeared in an earlier row.
movies.loc[movies['movie_title'].duplicated(keep='first')].shape

(127, 5)

In [5]:
# Preview the rows whose title already appeared in an earlier row.
movies.loc[movies['movie_title'].duplicated(keep='first')].head()

Unnamed: 0,director_name,movie_title,plot_keywords,budget,title_year
137,David Yates,The Legend of Tarzan,africa|capture|jungle|male objectification|tarzan,180000000.0,2016.0
187,Bill Condon,The Twilight Saga: Breaking Dawn - Part 2,battle|friend|super strength|vampire|vision,120000000.0,2012.0
204,Hideaki Anno,Godzilla Resurgence,blood|godzilla|monster|sequel,,2016.0
303,Joe Wright,Pan,1940s|child hero|fantasy world|orphan|referenc...,150000000.0,2015.0
389,Josh Trank,Fantastic Four,box office flop|critically bashed|portal|telep...,120000000.0,2015.0


In [6]:
# Same count, but keep='last' leaves the final occurrence of each title unflagged.
movies.loc[movies['movie_title'].duplicated(keep='last')].shape

(127, 5)

In [7]:
# keep=False flags every row whose title occurs more than once, first occurrence included.
movies.loc[movies['movie_title'].duplicated(keep=False)].shape

(247, 5)

In [8]:
# Inspect every row stored under one repeated title.
movies.loc[movies['movie_title'].eq('Ben-Hur')]

Unnamed: 0,director_name,movie_title,plot_keywords,budget,title_year
367,Timur Bekmambetov,Ben-Hur,,,2016.0
2613,Timur Bekmambetov,Ben-Hur,chariot race|epic|false accusation|jerusalem|s...,100000000.0,2016.0
3967,Timur Bekmambetov,Ben-Hur,chariot race|epic|false accusation|jerusalem|s...,100000000.0,2016.0


In [9]:
# Inspect every row stored under another repeated title.
movies.loc[movies['movie_title'].eq('The Legend of Tarzan')]

Unnamed: 0,director_name,movie_title,plot_keywords,budget,title_year
63,David Yates,The Legend of Tarzan,africa|capture|jungle|male objectification|tarzan,180000000.0,2016.0
137,David Yates,The Legend of Tarzan,africa|capture|jungle|male objectification|tarzan,180000000.0,2016.0


## find duplicate values in several columns

In [10]:
# Rows that share both title and release year with at least one other row.
movies.loc[
    movies.duplicated(subset=['movie_title', 'title_year'], keep=False)
].head()

Unnamed: 0,director_name,movie_title,plot_keywords,budget,title_year
6,Sam Raimi,Spider-Man 3,sandman|spider man|symbiote|venom|villain,258000000.0,2007.0
17,Joss Whedon,The Avengers,alien invasion|assassin|battle|iron man|soldier,220000000.0,2012.0
25,Peter Jackson,King Kong,animal name in title|ape abducts a woman|goril...,207000000.0,2005.0
30,Sam Mendes,Skyfall,brawl|childhood home|computer cracker|intellig...,200000000.0,2012.0
33,Tim Burton,Alice in Wonderland,alice in wonderland|mistaking reality for drea...,200000000.0,2010.0


In [11]:
# Rows that share title, director and budget with at least one other row.
movies.loc[
    movies.duplicated(subset=['movie_title', 'director_name', 'budget'], keep=False)
].head()

Unnamed: 0,director_name,movie_title,plot_keywords,budget,title_year
6,Sam Raimi,Spider-Man 3,sandman|spider man|symbiote|venom|villain,258000000.0,2007.0
17,Joss Whedon,The Avengers,alien invasion|assassin|battle|iron man|soldier,220000000.0,2012.0
25,Peter Jackson,King Kong,animal name in title|ape abducts a woman|goril...,207000000.0,2005.0
30,Sam Mendes,Skyfall,brawl|childhood home|computer cracker|intellig...,200000000.0,2012.0
33,Tim Burton,Alice in Wonderland,alice in wonderland|mistaking reality for drea...,200000000.0,2010.0


## Drop duplicates

In [12]:
# Shape before dropping anything, for comparison with the shapes shown after each drop.
movies.shape

(5043, 5)

In [13]:
# Remove rows that repeat an earlier row in every column, keeping the first occurrence.
movies = movies.drop_duplicates(keep='first')

In [14]:
# Shape after removing the fully duplicated rows.
movies.shape

(4920, 5)

In [15]:
# Remove rows that share title and director with another row.
# keep=False discards every member of such a group, not just the repeats.
movies = movies.drop_duplicates(subset=['movie_title', 'director_name'], keep=False)

In [16]:
# Shape after removing the title/director duplicate groups.
movies.shape

(4918, 5)

## find duplicate data in an index

In [17]:
# Build a small frame and index it by the column labelled 0, whose label 1 appears twice.
df = pd.DataFrame(
    {
        "X": ["A", "XX", "B", "C"],
        "Y": [11, "XX", 11, 12],
        "Z": ["X", "XX", "Y", "X"],
        0: [0, 1, 1, 2],
    }
).set_index(0)

In [18]:
# Preview the frame; the index label 1 occurs on two rows.
df.head()

Unnamed: 0_level_0,X,Y,Z
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,A,11,X
1,XX,XX,XX
1,B,11,Y
2,C,12,X


In [22]:
# Show every row whose index label occurs more than once.
df.loc[df.index.duplicated(keep=False)]

Unnamed: 0_level_0,X,Y,Z
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,XX,XX,XX
1,B,11,Y


In [23]:
# Keep only the last row for each index label.
df = df.loc[~df.index.duplicated(keep='last')]

In [24]:
# Preview the frame after rows with repeated index labels were reduced to their last occurrence.
df.head()

Unnamed: 0_level_0,X,Y,Z
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,A,11,X
1,B,11,Y
2,C,12,X


## find duplicate data in a row

In [25]:
# Fresh example frame: the row at position 1 holds the same value in every column.
df = pd.DataFrame(
    {
        "X": ["A", "XX", "B", "C"],
        "Y": [11, "XX", 11, 12],
        "Z": ["X", "XX", "Y", "X"],
    }
)

In [26]:
# Preview the example frame before it is transposed.
df.head()

Unnamed: 0,X,Y,Z
0,A,11,X
1,XX,XX,XX
2,B,11,Y
3,C,12,X


In [28]:
# Remember the row labels now; after the transpose below they become the column labels
# that the drop loop iterates over.
indexes = df.index
indexes

RangeIndex(start=0, stop=4, step=1)

In [29]:
# Swap rows and columns so each original row can be examined as a column.
# Note: re-running this cell flips the frame back again.
df = df.transpose()

In [30]:
# Preview the transposed frame: the original row labels are now the column labels.
df.head()

Unnamed: 0,0,1,2,3
X,A,XX,B,C
Y,11,XX,11,12
Z,X,XX,Y,X


In [31]:
# Shape of the rows of the transposed frame that repeat an earlier row in full.
df.loc[df.duplicated(keep='first')].shape

(0, 4)

In [32]:
# Within column 1, flag every repeated value except its last occurrence.
df.loc[:, 1].duplicated(keep='last')

X     True
Y     True
Z    False
Name: 1, dtype: bool

In [33]:
# Within column 1, flag every repeated value except its first occurrence.
df.loc[:, 1].duplicated(keep='first')

X    False
Y     True
Z     True
Name: 1, dtype: bool

In [34]:
# Within column 1, flag every value that occurs more than once.
df.loc[:, 1].duplicated(keep=False)

X    True
Y    True
Z    True
Name: 1, dtype: bool

In [35]:
# Count how many values in column 1 occur more than once.
df.loc[:, 1].duplicated(keep=False).sum()

3

In [36]:
# Number of rows in the transposed frame; a column whose duplicate count equals
# this number consists entirely of repeated values.
len(df)

3

In [37]:
# Drop every column of the transposed frame (i.e. every original row) in which
# all values are flagged as duplicates of another value in the same column.
# NOTE: "every value occurs at least twice" coincides with "all values are equal"
# only while the frame has at most three rows; with four or more rows a column
# such as [a, a, b, b] would also be dropped.
row_count = df.shape[0]  # unaffected by dropping columns, so read it once
for i in indexes:
    # Compute the per-column duplicate count once and reuse it for the printout and the test.
    duplicate_count = df[i].duplicated(keep=False).sum()
    print(duplicate_count)
    if duplicate_count == row_count:
        # Rebind instead of mutating in place; the loop iterates over `indexes`, not df.columns.
        df = df.drop(columns=[i])


0
3
0
0


In [38]:
# Show the transposed frame after the loop above removed the fully duplicated column.
df

Unnamed: 0,0,2,3
X,A,B,C
Y,11,11,12
Z,X,Y,X


In [39]:
# Flip back to the original orientation to display the surviving rows.
df.transpose()

Unnamed: 0,X,Y,Z
0,A,11,X
2,B,11,Y
3,C,12,X
