In [1]:
import numpy as np
import pandas as pd

In [2]:
#Series.unique and value_counts are two methods that tell us what values a Series
#contains (DataFrame has value_counts and nunique, but no unique method)
obj = pd.Series(['c', 'a', 'd', 'a', 'a', 'b', 'b', 'c', 'c'])
obj

0    c
1    a
2    d
3    a
4    a
5    b
6    b
7    c
8    c
dtype: object

In [19]:
#first method is unique, which returns a NumPy array of the distinct values in the
#Series, in order of first appearance (not sorted)
uniques = obj.unique()
uniques

array(['c', 'a', 'd', 'b'], dtype=object)

In [20]:
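#we can sort in place with uniques.sort(); note that it reorders the array itself and
#returns None, so the call alone displays nothing (np.sort(uniques) gives a sorted copy)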


['a', 'b', 'c', 'd']

In [21]:
#the built-in sorted() function also works: it returns a new sorted list and leaves
#uniques itself unchanged
sorted(uniques)

['a', 'b', 'c', 'd']

In [22]:
#value_counts returns a Series whose index holds the distinct values and whose values
#are how often each occurs, ordered by count (most frequent first) by default
obj.value_counts()

c    3
a    3
b    2
d    1
dtype: int64

In [23]:
#value counts also work on NumPy arrays or lists once they are wrapped in a Series
#(the top-level pd.value_counts function is deprecated since pandas 2.1)
pd.Series(np.arange(2)).value_counts()

0    1
1    1
dtype: int64

In [25]:
#the same for a plain Python list: wrap it in a Series and call value_counts
#(list(range(2)) replaces the redundant comprehension and the deprecated pd.value_counts)
pd.Series(list(range(2))).value_counts()

0    1
1    1
dtype: int64

In [26]:
#Series.isin performs a vectorized membership check: for each element, is it one of
#the given values? (obj is shown again for reference)
obj

0    c
1    a
2    d
3    a
4    a
5    b
6    b
7    c
8    c
dtype: object

In [27]:
#boolean Series that is True wherever the value of obj is 'b' or 'c'
mask = obj.isin(['b', 'c'])

In [28]:
#display the boolean mask
mask

0     True
1    False
2    False
3    False
4    False
5     True
6     True
7     True
8     True
dtype: bool

In [32]:
#boolean indexing: keep only the elements of obj where mask is True
obj[mask]

0    c
5    b
6    b
7    c
8    c
dtype: object

In [35]:
#Series.mask (the method, not our variable named mask) replaces the values where the
#condition is True with a value we specify, in this case 0
obj.mask(mask, 0)

0    0
1    a
2    d
3    a
4    a
5    0
6    0
7    0
8    0
dtype: object

In [37]:
#Series.where gives the opposite result: values where the condition is True are kept
#(here 'b' and 'c') and all other values are replaced with 0
obj.where(mask, 0)

0    c
1    0
2    0
3    0
4    0
5    b
6    b
7    c
8    c
dtype: object

In [38]:
#Index.get_indexer method: given an array of possibly non-distinct values, returns an
#array of integer positions into another array of distinct values
to_match = pd.Series(['c', 'a', 'b', 'b', 'c', 'a'])
to_match

0    c
1    a
2    b
3    b
4    c
5    a
dtype: object

In [39]:
#the distinct values that to_match will be looked up in
unique_vals = pd.Series(['c', 'b', 'a'])

In [40]:
#display the distinct values
unique_vals

0    c
1    b
2    a
dtype: object

In [41]:
#for each element of to_match, get_indexer gives the integer position of that value in
#unique_vals (a value that is not present would get -1)
indices = pd.Index(unique_vals).get_indexer(to_match)
indices

array([0, 2, 1, 1, 0, 2])

In [42]:
#histogram on columns in a DataFrame:
#sometimes we want the value counts (a histogram) for several related columns at once
data = pd.DataFrame({
    'Qu1': [1, 3, 4, 3, 4],
    'Qu2': [2, 3, 1, 2, 3],
    'Qu3': [1, 5, 2, 4, 4],
})
data

Unnamed: 0,Qu1,Qu2,Qu3
0,1,2,1
1,3,3,5
2,4,1,2
3,3,2,4
4,4,3,4


In [43]:
#compute value counts for a single column; sort_index orders the result by the
#values (the index) instead of by count
data['Qu1'].value_counts().sort_index()

1    1
3    2
4    2
Name: Qu1, dtype: int64

In [47]:
#same counts for a single column, with the index in descending order
#(ascending=False starts at the largest value)
data['Qu1'].value_counts().sort_index(ascending=False)

4    2
3    2
1    1
Name: Qu1, dtype: int64

In [50]:
#to compute for every column, apply Series.value_counts column by column
#(the top-level pd.value_counts is deprecated since pandas 2.1); a value that never
#occurs in a column shows up as NaN
data.apply(pd.Series.value_counts)

Unnamed: 0,Qu1,Qu2,Qu3
1,1.0,1.0,1.0
2,,2.0,1.0
3,2.0,2.0,
4,2.0,,2.0
5,,,1.0


In [52]:
#to compute for every column:
#the row labels are the distinct values found across all the columns
#the values are how many times that value occurs in a particular column
#(fillna(0) turns "never occurs" from NaN into a count of 0; pd.Series.value_counts
#replaces the top-level pd.value_counts, deprecated since pandas 2.1)
data.apply(pd.Series.value_counts).fillna(0)

Unnamed: 0,Qu1,Qu2,Qu3
1,1.0,1.0,1.0
2,0.0,2.0,1.0
3,2.0,2.0,0.0
4,2.0,0.0,2.0
5,0.0,0.0,1.0


In [53]:
#DataFrame.value_counts treats each row as a tuple and counts how many times each
#distinct row occurs (note: this cell rebinds the name data to a new DataFrame)
data = pd.DataFrame({'a': [1, 1, 1, 2, 2],
                    'b': [0, 0, 1, 0, 0]})
data

Unnamed: 0,a,b
0,1,0
1,1,0
2,1,1
3,2,0
4,2,0


In [54]:
#each distinct (a, b) row with its number of occurrences, most frequent first
data.value_counts()

a  b
1  0    2
2  0    2
1  1    1
dtype: int64