In [1]:
import pandas as pd
import numpy as np

In [26]:
df = pd.DataFrame({
    'Sex':['M','F','F','D','?'],
    'Age':[29,30,240,290,25]
})

In [3]:
df

Unnamed: 0,Sex,Age
0,M,29
1,F,30
2,F,24
3,D,290
4,?,25


<h3>Finding Unique Values</h3>

In [5]:
df['Sex'].unique()

array(['M', 'F', 'D', '?'], dtype=object)

In [7]:
df['Sex'].value_counts()

F    2
D    1
M    1
?    1
Name: Sex, dtype: int64

In [8]:
df['Sex'].replace('D','F')

0    M
1    F
2    F
3    F
4    ?
Name: Sex, dtype: object

In [9]:
df['Sex'].replace({'D':'F','N':'M'})

0    M
1    F
2    F
3    F
4    ?
Name: Sex, dtype: object

In [12]:
df.replace({
    'Sex':{'D':'F','N':'M'},
    'Age':{290:29}})

Unnamed: 0,Sex,Age
0,M,29
1,F,30
2,F,24
3,F,29
4,?,25


In [27]:
df.loc[df['Age']>100,'Age']

2    240
3    290
Name: Age, dtype: int64

In [28]:
df.loc[df['Age']>100,'Age']= df.loc[df['Age']>100,'Age']/10

In [29]:
df

Unnamed: 0,Sex,Age
0,M,29.0
1,F,30.0
2,F,24.0
3,D,29.0
4,?,25.0


<h3>Duplicates</h3>

In [30]:
ambassadors = pd.Series(
    ['France',
    'United Kingdom',
    'United Kingdom',
    'Italy',
    'Germany',
    'Germany',
    'Germany',],
    
    index = ['Gérard Araud',
    'Kim Darroch',
    'Peter Westmacott',
    'Armando Varricchio',
    'Peter Wittig',
    'Peter Ammon',
    'Klaus Scharioth ']
    )

In [32]:
ambassadors.to_frame()

Unnamed: 0,0
Gérard Araud,France
Kim Darroch,United Kingdom
Peter Westmacott,United Kingdom
Armando Varricchio,Italy
Peter Wittig,Germany
Peter Ammon,Germany
Klaus Scharioth,Germany


In [33]:
ambassadors.duplicated()

Gérard Araud          False
Kim Darroch           False
Peter Westmacott       True
Armando Varricchio    False
Peter Wittig          False
Peter Ammon            True
Klaus Scharioth        True
dtype: bool

In [34]:
ambassadors.duplicated(keep='last')

Gérard Araud          False
Kim Darroch            True
Peter Westmacott      False
Armando Varricchio    False
Peter Wittig           True
Peter Ammon            True
Klaus Scharioth       False
dtype: bool

In [35]:
ambassadors.duplicated(keep=False)

Gérard Araud          False
Kim Darroch            True
Peter Westmacott       True
Armando Varricchio    False
Peter Wittig           True
Peter Ammon            True
Klaus Scharioth        True
dtype: bool

In [36]:
ambassadors.drop_duplicates()

Gérard Araud                  France
Kim Darroch           United Kingdom
Armando Varricchio             Italy
Peter Wittig                 Germany
dtype: object

In [37]:
ambassadors.drop_duplicates(keep='last')

Gérard Araud                  France
Peter Westmacott      United Kingdom
Armando Varricchio             Italy
Klaus Scharioth              Germany
dtype: object

In [39]:
ambassadors.drop_duplicates(keep=False)

Gérard Araud          France
Armando Varricchio     Italy
dtype: object

<h3>Duplicates in Data Frames</h3>

In [40]:
players = pd.DataFrame({
    'Name':['Kobe Bryant',
        'LeBron James',
        'Kobe Bryant',
        'Carmelo Anthony',
        'Kobe Bryant',],
    
    'Pos':['SG',
        'SF',
        'SG',
        'SF',
        'SF'
    ]
})

In [41]:
players

Unnamed: 0,Name,Pos
0,Kobe Bryant,SG
1,LeBron James,SF
2,Kobe Bryant,SG
3,Carmelo Anthony,SF
4,Kobe Bryant,SF


In [45]:
players.duplicated()

0    False
1    False
2     True
3    False
4    False
dtype: bool

In [48]:
players.duplicated(subset='Name')
players['Name'].duplicated()

0    False
1    False
2     True
3    False
4     True
Name: Name, dtype: bool

In [49]:
players['Name'].duplicated(keep='last')

0     True
1    False
2     True
3    False
4    False
Name: Name, dtype: bool

In [50]:
players.drop_duplicates()

Unnamed: 0,Name,Pos
0,Kobe Bryant,SG
1,LeBron James,SF
3,Carmelo Anthony,SF
4,Kobe Bryant,SF


In [51]:
players['Name'].drop_duplicates()

0        Kobe Bryant
1       LeBron James
3    Carmelo Anthony
Name: Name, dtype: object

In [52]:
players['Name'].drop_duplicates(keep='last')

1       LeBron James
3    Carmelo Anthony
4        Kobe Bryant
Name: Name, dtype: object

<h3>Text Handling</h3>

<h4>Split columns</h4>

In [70]:
df = pd.DataFrame({
    'Data':['1987_M_US _1',
        '1990?_M_UK_1',
        '1992_F_US_2',
        '1970?_M_   IT_1',
        '1985_F_I  T_2'],
})

In [71]:
df

Unnamed: 0,Data
0,1987_M_US _1
1,1990?_M_UK_1
2,1992_F_US_2
3,1970?_M_ IT_1
4,1985_F_I T_2


In [72]:
df = df['Data'].str.split('_',expand=True)


In [76]:
df.columns = ['Year', 'Sex', 'Country', 'No. of Children']

In [77]:
df

Unnamed: 0,Year,Sex,Country,No. of Children
0,1987,M,US,1
1,1990?,M,UK,1
2,1992,F,US,2
3,1970?,M,IT,1
4,1985,F,I T,2


In [82]:
df['Year'].str.contains('\?')#escape is used, regular letters not require escape

0    False
1     True
2    False
3     True
4    False
Name: Year, dtype: bool

In [84]:
df['Country'].str.contains('U')

0     True
1     True
2     True
3    False
4    False
Name: Country, dtype: bool

In [85]:
df['Country'].str.replace(' ','')

0    US
1    UK
2    US
3    IT
4    IT
Name: Country, dtype: object

In [86]:
df['Year'].str.replace('\?','')

0    1987
1    1990
2    1992
3    1970
4    1985
Name: Year, dtype: object

In [88]:
df['Country'].str.strip()

0      US
1      UK
2      US
3      IT
4    I  T
Name: Country, dtype: object

In [89]:
df['Year'].str.replace(r'(?P<year>\d{4})\?', lambda m: m.group('year'))

0    1987
1    1990
2    1992
3    1970
4    1985
Name: Year, dtype: object