In [None]:
import numpy as np
import pandas as pd

In [None]:
# Toy frame with a few suspicious entries ('D' and '?' in Sex, 290 in Age)
# that the following cells inspect and correct.
df = pd.DataFrame(dict(
    Sex=['M', 'F', 'F', 'D', '?'],
    Age=[29, 30, 24, 290, 25],
))
df


### Finding Unique Values

In [None]:
# Distinct values present in the Sex column, in order of first appearance.
df.loc[:, 'Sex'].unique()

In [None]:
# How often each Sex value occurs, most frequent first.
df['Sex'].value_counts(sort=True)

In [None]:
# Swap a single value; the result is a new Series and `df` is not modified.
df['Sex'].replace(to_replace='D', value='F')

In [None]:
# Several replacements at once: the dict maps old value -> new value.
df['Sex'].replace(to_replace={'D': 'F', 'N': 'M'})

If you have many columns to replace, you could apply it at "DataFrame level":

In [None]:
# Per-column replacement: outer keys are column names, inner dicts map
# old value -> new value. The result is a new frame; `df` is left untouched.
df.replace(to_replace={
    'Sex': {'D': 'F', 'N': 'M'},
    'Age': {290: 29},
})

In [None]:
# Rows whose Age is implausibly high (above 100).
df.loc[df['Age'] > 100]

In [None]:
# Scale ages above 100 down by a factor of ten (290 -> 29) in place.
# Floor division keeps the corrected values as integers; true division would
# hand floats to an integer column, which upcasts the column on older pandas
# and is rejected as an incompatible dtype on newer ones when the result is
# not a whole number. The augmented assignment also evaluates the mask once.
df.loc[df['Age'] > 100, 'Age'] //= 10

In [None]:
# Show the frame again: the previous cell rewrote the ages above 100 in place.
df

### Duplicates

In [None]:
# Series of countries indexed by person name; several countries appear more
# than once, which the duplicate-handling cells below rely on.
ambassadors = pd.Series({
    'Gérard Araud': 'France',
    'Kim Darroch': 'United Kingdom',
    'Peter Westmacott': 'United Kingdom',
    'Armando Varricchio': 'Italy',
    'Peter Wittig': 'Germany',
    'Peter Ammon': 'Germany',
    'Klaus Scharioth ': 'Germany',
})

In [None]:
# Display the Series: country values indexed by person name.
ambassadors

In [None]:
# Default policy spelled out: the first occurrence of each country is kept
# (False) and every later repeat is flagged as a duplicate (True).
ambassadors.duplicated(keep='first')

In [None]:
# keep='last': every occurrence except the last one is flagged as a duplicate.
ambassadors.duplicated(keep='last')

In [None]:
# keep=False: every value that occurs more than once is flagged, including
# its first and last occurrence.
ambassadors.duplicated(keep=False)

In [None]:
# Default policy spelled out: keep the first entry per country, drop the rest.
# Returns a new Series; `ambassadors` itself is unchanged.
ambassadors.drop_duplicates(keep='first')

In [None]:
# keep='last': retain only the last entry for each repeated country.
ambassadors.drop_duplicates(keep='last')

In [None]:
# keep=False: drop every entry whose country appears more than once, leaving
# only the countries that occur exactly once.
ambassadors.drop_duplicates(keep=False)

### Duplicates in DataFrames

In [None]:
# One (Name, Pos) record per row; 'Kobe Bryant' appears three times, twice
# with the same position and once with a different one.
players = pd.DataFrame(
    [
        ('Kobe Bryant', 'SG'),
        ('LeBron James', 'SF'),
        ('Kobe Bryant', 'SG'),
        ('Carmelo Anthony', 'SF'),
        ('Kobe Bryant', 'SF'),
    ],
    columns=['Name', 'Pos'],
)

In [None]:
# Display the frame of player names and positions.
players

In [None]:
# Defaults spelled out: compare all columns and flag every repeat of a row
# after its first occurrence.
players.duplicated(subset=None, keep='first')

Again, conceptually, "duplicated" means "all the column values should be duplicates". We can customize this with the `subset` parameter:

In [None]:
# Only the Name column decides whether a row counts as a duplicate.
players.duplicated(subset='Name', keep='first')

And the same rules of `keep` still apply:

In [None]:
# Same Name-only comparison, but the last occurrence is the one left unflagged.
players.duplicated(keep='last', subset='Name')

`drop_duplicates` takes the same parameters:

In [None]:
# Drop rows that repeat an earlier row in every column; returns a new frame.
players.drop_duplicates(subset=None, keep='first')

In [None]:
# Keep the first row for each Name, regardless of the other columns.
players.drop_duplicates(subset='Name', keep='first')

In [None]:
# Keep the last row for each Name, regardless of the other columns.
players.drop_duplicates(keep='last', subset='Name')

In [None]:
# Single-column frame of underscore-separated records; some entries carry a
# stray '?' after the year or blanks inside the country code.
# Note: this rebinds `df`, replacing the Sex/Age frame used earlier.
df = pd.DataFrame(
    [
        '1987_M_US _1',
        '1990?_M_UK_1',
        '1992_F_US_2',
        '1970?_M_   IT_1',
        '1985_F_I  T_2',
    ],
    columns=['Data'],
)

In [None]:
# Display the raw single-column frame before splitting it.
df

You know that each entry holds the values "year, sex, country and number of children", but they have all been packed into a single column and separated by underscores. Pandas has a convenient string method named `split` that we can use in these situations:

In [None]:
# Split every entry on underscores; each element of the result is a list.
df['Data'].str.split(pat='_')

In [None]:
# expand=True spreads the pieces over separate columns instead of lists.
df['Data'].str.split(pat='_', expand=True)

In [None]:
# Replace `df` with the expanded columns. After this the 'Data' column no
# longer exists, so re-running this cell on its own fails; rebuild `df` first.
df = df['Data'].str.split(pat='_', expand=True)

In [None]:
# Give the positional columns produced by the split meaningful names.
df.columns = [
    'Year',
    'Sex',
    'Country',
    'No Children',
]

You can also check which values contain a given pattern with the `contains` method:

In [None]:
# Display the frame with its newly named columns.
df

In [None]:
# Flag the years that carry a literal '?'. The pattern is a regex, so '?' is
# escaped; the raw string keeps the backslash intact ('\?' in a plain string
# is an invalid escape sequence and triggers a warning on recent Python).
df['Year'].str.contains(r'\?')

[`contains`](http://pandas.pydata.org/pandas-docs/version/0.22.0/generated/pandas.Series.str.contains.html) takes a regex pattern as its first argument, so we need to escape the `?` symbol, as it has a special meaning in regular expressions. Regular letters don't need escaping:

In [None]:
# True for every country string that contains the letter 'U'.
df['Country'].str.contains(pat='U')

Removing blank spaces (like in `'US '` or `'I  T'`) can be achieved with `strip` (`lstrip` and `rstrip` also exist), which only trims leading and trailing whitespace, or just `replace`, which also removes spaces inside the value:

In [None]:
# Trim leading and trailing whitespace only; blanks inside a value remain.
df['Country'].str.strip(to_strip=None)

In [None]:
# Remove every space, including the ones inside a value.
df['Country'].str.replace(pat=' ', repl='')

As we said, `replace` and `contains` accept regex patterns (in pandas 2.0 and later, `str.replace` needs `regex=True` for that), which can make it easier to replace values in bulk:

In [None]:
# Strip the trailing '?' from four-digit years, keeping the captured digits.
# `regex=True` is required: since pandas 2.0 `str.replace` treats the pattern
# literally by default and rejects a callable replacement in that mode.
df['Year'].str.replace(
    r'(?P<year>\d{4})\?',
    lambda m: m.group('year'),
    regex=True,
)