In [2]:
%pylab inline
import numpy as np
import pandas as pd

Populating the interactive namespace from numpy and matplotlib


## INDEX
* [Remove Duplicates](#duplicates)
* [map](#map)
    * [map with a dict](#map-with-a-dict)
    * [map with a function](#map-with-a-function)
* [apply](#apply)
    * [apply return a series](#apply-return-a-series)
    * [applymap](#applymap)
* [replace](#replace)
* [add single column](#add_single_column)

<a id="duplicates"></a>
## Remove Duplicates
* duplicated returns a boolean Series indicating whether each row is a duplicate or not
* drop_duplicates returns a DataFrame containing only the rows where the duplicated array is False
* drop_duplicates can take a subset of columns to consider when testing for equality
* duplicated and drop_duplicates by default keep the first observed value combination. Passing keep='last' (spelled take_last=True in pandas releases before 0.17) keeps the last one instead

In [3]:
# Small frame containing repeated (k1, k2) pairs, used to demonstrate
# duplicate detection and removal.
data = pd.DataFrame(
    dict(
        k1=['one', 'one', 'one', 'two', 'two', 'two', 'two'],
        k2=[1, 1, 2, 3, 3, 4, 4],
    )
)
data

Unnamed: 0,k1,k2
0,one,1
1,one,1
2,one,2
3,two,3
4,two,3
5,two,4
6,two,4


In [4]:
# Flag every row that repeats an earlier row; the first occurrence stays False.
data.duplicated(keep='first')

0    False
1     True
2    False
3    False
4     True
5    False
6     True
dtype: bool

In [5]:
# Drop rows that repeat an earlier row, comparing across all columns.
data.drop_duplicates(keep='first')

Unnamed: 0,k1,k2
0,one,1
2,one,2
3,two,3
5,two,4


In [6]:
# Only the k1 column is compared, so one row survives per distinct k1 value.
data.drop_duplicates(subset='k1')

Unnamed: 0,k1,k2
0,one,1
3,two,3


In [7]:
# keep='last' retains the final occurrence of each (k1, k2) pair. It replaces
# the old take_last=True flag, which current pandas releases no longer accept.
data.drop_duplicates(subset=['k1', 'k2'], keep='last')

Unnamed: 0,k1,k2
1,one,1
2,one,2
4,two,3
6,two,4


<a id="map"></a>
## map

In [24]:
# Food names deliberately use mixed capitalisation ('Pastrami', 'Bacon') so the
# lookups below have to normalise case before consulting the mapping.
data = pd.DataFrame({
    'food': [
        'bacon', 'pulled pork', 'bacon',
        'Pastrami', 'corned beef', 'Bacon',
        'pastrami', 'honey ham', 'nova lox',
    ],
    'ounces': [4, 3, 12, 6, 7.5, 8, 3, 5, 6],
})

# Lower-case meat name -> animal it comes from.
meat_to_animal = dict([
    ('bacon', 'pig'),
    ('pulled pork', 'pig'),
    ('pastrami', 'cow'),
    ('corned beef', 'cow'),
    ('honey ham', 'pig'),
    ('nova lox', 'salmon'),
])

### map with a dict

In [26]:
# Normalise case with the vectorised .str accessor (missing values pass
# through as NaN instead of raising), then translate each meat to its animal
# via the dict; foods absent from the dict come out as NaN.
data['animal-1'] = data['food'].str.lower().map(meat_to_animal)
data

Unnamed: 0,food,ounces,animal-1
0,bacon,4.0,pig
1,pulled pork,3.0,pig
2,bacon,12.0,pig
3,Pastrami,6.0,cow
4,corned beef,7.5,cow
5,Bacon,8.0,pig
6,pastrami,3.0,cow
7,honey ham,5.0,pig
8,nova lox,6.0,salmon


### map with a function

In [27]:
# Same lookup as the dict-based map, but performed inside a function: each
# food is lower-cased and then used as a key into meat_to_animal.
data['animal-2'] = data['food'].map(
    lambda food: meat_to_animal[food.lower()]
)
data

Unnamed: 0,food,ounces,animal-1,animal-2
0,bacon,4.0,pig,pig
1,pulled pork,3.0,pig,pig
2,bacon,12.0,pig,pig
3,Pastrami,6.0,cow,cow
4,corned beef,7.5,cow,cow
5,Bacon,8.0,pig,pig
6,pastrami,3.0,cow,cow
7,honey ham,5.0,pig,pig
8,nova lox,6.0,salmon,salmon


## apply

In [18]:
# 4x3 frame holding the integers 0..11, labelled by state (rows) and b/d/e (columns).
frame = pd.DataFrame(
    data=np.arange(12).reshape(4, 3),
    index=['Utah', 'Ohio', 'Texas', 'Oregon'],
    columns=['b', 'd', 'e'],
)
frame

Unnamed: 0,b,d,e
Utah,0,1,2
Ohio,3,4,5
Texas,6,7,8
Oregon,9,10,11


In [20]:
def f(x):
    """Return the spread of x: its maximum minus its minimum."""
    return x.max() - x.min()

frame.apply(f)  # default axis=0, so f is applied once to each column

b    9
d    9
e    9
dtype: int64

In [22]:
# axis='columns' (same as axis=1) applies f once to each row.
frame.apply(f, axis='columns')

Utah      2
Ohio      2
Texas     2
Oregon    2
dtype: int64

### apply return a series

In [28]:
# When the applied function returns a Series, the per-column results are
# stacked side by side: the Series labels ('min', 'max') become the row index.
frame.apply(
    lambda col: pd.Series({'min': col.min(), 'max': col.max()})
)

Unnamed: 0,b,d,e
min,0,1,2
max,9,10,11


### applymap
Element-wise Python functions can be used with **applymap**.

In [29]:
# Format every individual cell as a string with two decimal places.
# NOTE: pandas >= 2.1 offers DataFrame.map as the new name for applymap.
frame.applymap(lambda cell: "cell:%3.2f" % cell)

Unnamed: 0,b,d,e
Utah,cell:0.00,cell:1.00,cell:2.00
Ohio,cell:3.00,cell:4.00,cell:5.00
Texas,cell:6.00,cell:7.00,cell:8.00
Oregon,cell:9.00,cell:10.00,cell:11.00


<a id="replace"></a>
## replace

In [9]:
# Float Series in which -999 and -1000 act as sentinel values to be replaced below.
data = pd.Series([1.0, -999.0, 2.0, -999.0, -1000.0, 3.0])
data

0       1
1    -999
2       2
3    -999
4   -1000
5       3
dtype: float64

In [10]:
# Replace one sentinel value with NaN; a new Series is returned and data is unchanged.
data.replace(to_replace=-999, value=np.nan)

0       1
1     NaN
2       2
3     NaN
4   -1000
5       3
dtype: float64

In [11]:
# Replace several sentinel values with the same replacement (NaN) in one call.
data.replace(to_replace=[-999, -1000], value=np.nan)

0     1
1   NaN
2     2
3   NaN
4   NaN
5     3
dtype: float64

In [12]:
# Give each sentinel its own replacement by passing an {old: new} mapping.
data.replace(to_replace={-999: np.nan, -1000: 0})

0     1
1   NaN
2     2
3   NaN
4     0
5     3
dtype: float64

<a id="add_single_column"></a>
## add single column

In [13]:
# 3x3 frame holding the integers 1..9, with named records as the row index.
df = pd.DataFrame(
    data=np.arange(1, 10).reshape(3, 3),
    index=["record1", "record2", "record3"],
    columns=["a", "b", "c"],
)
df

Unnamed: 0,a,b,c
record1,1,2,3
record2,4,5,6
record3,7,8,9


In [14]:
# Gotcha: attribute-style assignment does NOT create a new column. Because "d"
# is not an existing column, this line only attaches an attribute named d to
# the DataFrame object, so the frame displayed below still has just a, b and c.
df.d = df.c**2
df

Unnamed: 0,a,b,c
record1,1,2,3
record2,4,5,6
record3,7,8,9


In [15]:
# Item-style (bracket) assignment is the way to add a new column to the frame.
df["d"] = df["c"] ** 2
df

Unnamed: 0,a,b,c,d
record1,1,2,3,9
record2,4,5,6,36
record3,7,8,9,81
