# Chapter 5: Getting Started with pandas

In [1]:
import numpy as np

import pandas as pd

## 5.1 Introduction to pandas Data Structures

### Series

In [7]:
obj = pd.Series(np.arange(0, 21, 2))

In [8]:
obj

0      0
1      2
2      4
3      6
4      8
5     10
6     12
7     14
8     16
9     18
10    20
dtype: int64

In [13]:
# An array form of the Series:

obj.array

<NumpyExtensionArray>
[ np.int64(0),  np.int64(2),  np.int64(4),  np.int64(6),  np.int64(8),
 np.int64(10), np.int64(12), np.int64(14), np.int64(16), np.int64(18),
 np.int64(20)]
Length: 11, dtype: int64

In [14]:
# Index object of the Series:

obj.index

RangeIndex(start=0, stop=11, step=1)

In [43]:
# A Series of random no.s with an index identifying each data point with a label:

obj = pd.Series(np.random.randint(1, 11, size=10), index=list("abcdefghij"))

obj

a     9
b     8
c     7
d     4
e     6
f     1
g    10
h     6
i     2
j     9
dtype: int32

In [40]:
obj.index

Index(['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j'], dtype='object')

In [46]:
# Using labels in the index to select values:

obj["a"]

np.int32(9)

In [49]:
# Here ['a', 'b', 'c', 'd'] is interpreted as a list of indices, even though they're strings instead of integers:

obj[['a', 'b', 'c', 'd']]

a    9
b    8
c    7
d    4
dtype: int32

In [56]:
# Any operations performed on the Series will not affect the index:

obj[obj <= 5]

d    4
f    1
i    2
dtype: int32

In [54]:
obj * 2

a    18
b    16
c    14
d     8
e    12
f     2
g    20
h    12
i     4
j    18
dtype: int32

In [55]:
np.exp(obj)

a     8103.083928
b     2980.957987
c     1096.633158
d       54.598150
e      403.428793
f        2.718282
g    22026.465795
h      403.428793
i        7.389056
j     8103.083928
dtype: float64

In [63]:
# Another way to think about a Series is a Dictionary:

"f" in obj

True

In [62]:
"m" in obj

False

In [68]:
# If you have a data in a Dict then you can create it in a Series (or a DataFrame):

sdata = {"Ohio": 35000, "Texas": 71000, "Oregon": 16000, "Utah": 5000}

obj3 = pd.Series(sdata)

obj3

Ohio      35000
Texas     71000
Oregon    16000
Utah       5000
dtype: int64

In [66]:
# Can also convert it back to a Dict:

obj3.to_dict()

{'Ohio': 35000, 'Texas': 71000, 'Oregon': 16000, 'Utah': 5000}

In [77]:
# Changes the index from it's default:

states = ["California", "Ohio", "Oregon", "Texas"]

obj4 = pd.Series(sdata, index=states)

obj4

# Since "California" doesn't match with the data it's considered as NaN:

California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
dtype: float64

In [83]:
pd.isna(obj4)

California     True
Ohio          False
Oregon        False
Texas         False
dtype: bool

In [84]:
pd.notna(obj4)

California    False
Ohio           True
Oregon         True
Texas          True
dtype: bool

In [85]:
# As instance method:

obj4.isna()

California     True
Ohio          False
Oregon        False
Texas         False
dtype: bool

In [86]:
obj4.notna()

California    False
Ohio           True
Oregon         True
Texas          True
dtype: bool

In [88]:
# Series automatically aligns with other indexes in arithmetic operations:
# Similar to a join operation:

obj3 + obj4

California         NaN
Ohio           70000.0
Oregon         32000.0
Texas         142000.0
Utah               NaN
dtype: float64

In [92]:
# Both Series and its index itself can have a "name":

obj4.name = "population"

obj4.index.name = "state"

obj4

state
California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
Name: population, dtype: float64

In [94]:
# A Series' index can be altered by alignment:

obj

a     9
b     8
c     7
d     4
e     6
f     1
g    10
h     6
i     2
j     9
dtype: int32

In [96]:
obj.index = list("klmnopqrst")

obj

k     9
l     8
m     7
n     4
o     6
p     1
q    10
r     6
s     2
t     9
dtype: int32

### DataFrame

In [99]:
# The DataFrame has both a row and column index:

data = {
    "state" : ["Ohio", "Ohio", "Ohio", "Nevada", "Nevada", "Nevada"],
    "year" : [2000, 2001, 2002, 2001, 2002, 2003],
    "pop" : [1.5, 1.7, 3.6, 2.4, 2.9, 3.2]
}

df = pd.DataFrame(data)

df

Unnamed: 0,state,year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9
5,Nevada,2003,3.2


In [106]:
# First five rows:

df.head()

Unnamed: 0,state,year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9


In [105]:
# Last five rows:

df.tail()

Unnamed: 0,state,year,pop
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9
5,Nevada,2003,3.2


In [104]:
# If you specify a sequence of columns, the DataFrame’s columns will be arranged in that order:

pd.DataFrame(data, columns=["year", "state", "pop"])

Unnamed: 0,year,state,pop
0,2000,Ohio,1.5
1,2001,Ohio,1.7
2,2002,Ohio,3.6
3,2001,Nevada,2.4
4,2002,Nevada,2.9
5,2003,Nevada,3.2


In [109]:
# If you pass a column name that isn't in the data it'll appear with missing values:

df2 = pd.DataFrame(data, columns=["year", "state", "pop", "debt"])

df2

Unnamed: 0,year,state,pop,debt
0,2000,Ohio,1.5,
1,2001,Ohio,1.7,
2,2002,Ohio,3.6,
3,2001,Nevada,2.4,
4,2002,Nevada,2.9,
5,2003,Nevada,3.2,


In [110]:
df2.columns

Index(['year', 'state', 'pop', 'debt'], dtype='object')

In [114]:
# Dictionary-like notation:

df2["state"]

0      Ohio
1      Ohio
2      Ohio
3    Nevada
4    Nevada
5    Nevada
Name: state, dtype: object

In [116]:
# dot attribute notation (only accessible if there's no whitespaces):

df2.year

0    2000
1    2001
2    2002
3    2001
4    2002
5    2003
Name: year, dtype: int64

In [119]:
# Rows can also be retrieved by position or name with the special iloc and loc attributes:

df2.iloc[4]    # index position

year       2002
state    Nevada
pop         2.9
debt        NaN
Name: 4, dtype: object

In [121]:
df2.loc[2]     # row "name"

year     2002
state    Ohio
pop       3.6
debt      NaN
Name: 2, dtype: object

In [122]:
# Columns' values can be modified by assignment:
# Scalar values:

df2['debt'] = 69

df2

Unnamed: 0,year,state,pop,debt
0,2000,Ohio,1.5,69
1,2001,Ohio,1.7,69
2,2002,Ohio,3.6,69
3,2001,Nevada,2.4,69
4,2002,Nevada,2.9,69
5,2003,Nevada,3.2,69


In [129]:
# An array of values (it must match the length of the df):

df2["debt"] = np.random.randint(1,6, size=6)

df2

Unnamed: 0,year,state,pop,debt
0,2000,Ohio,1.5,4
1,2001,Ohio,1.7,5
2,2002,Ohio,3.6,4
3,2001,Nevada,2.4,2
4,2002,Nevada,2.9,1
5,2003,Nevada,3.2,1


In [130]:
# While assigning a Series' values to an empty column, it's index will allign with the df:

val = pd.Series([3, 6, 9], index=[1, 3, 5])

df2["debt"] = val

df2

Unnamed: 0,year,state,pop,debt
0,2000,Ohio,1.5,
1,2001,Ohio,1.7,3.0
2,2002,Ohio,3.6,
3,2001,Nevada,2.4,6.0
4,2002,Nevada,2.9,
5,2003,Nevada,3.2,9.0


In [136]:
# Creating a new Boolean column with True where ever there's a specified value:

df2["eastern"] = df2["state"] == "Ohio"

df2

Unnamed: 0,year,state,pop,debt,eastern
0,2000,Ohio,1.5,,True
1,2001,Ohio,1.7,3.0,True
2,2002,Ohio,3.6,,True
3,2001,Nevada,2.4,6.0,False
4,2002,Nevada,2.9,,False
5,2003,Nevada,3.2,9.0,False


In [137]:
# Deleting a column:

del df2["eastern"]

In [138]:
df2

Unnamed: 0,year,state,pop,debt
0,2000,Ohio,1.5,
1,2001,Ohio,1.7,3.0
2,2002,Ohio,3.6,
3,2001,Nevada,2.4,6.0
4,2002,Nevada,2.9,
5,2003,Nevada,3.2,9.0


In [139]:
# Another common form of data is a nested dictionary of dictionaries:

populations = {
    "Ohio" : {2000: 1.5, 2001: 1.7, 2002: 1.3},
    "Nevada" : {2001: 1.2, 2002: 2.1}
}

In [140]:
# pandas will interpret the outer keys as columns and the inner key as row indices:

df3 = pd.DataFrame(populations)

df3

Unnamed: 0,Ohio,Nevada
2000,1.5,
2001,1.7,1.2
2002,1.3,2.1


In [142]:
# Transposing (swapping rows and columns):
# Note that transposing discards the column data types. The columns become arrays of pure Python objects in this case.

df3.T

Unnamed: 0,2000,2001,2002
Ohio,1.5,1.7,1.3
Nevada,,1.2,2.1


In [143]:
# The inner keys are combined to form the index unless the index is specified differently:

pd.DataFrame(populations, index=[2001, 2002, 2003])

Unnamed: 0,Ohio,Nevada
2001,1.7,1.2
2002,1.3,2.1
2003,,


In [145]:
# If a DataFrame’s index and columns have their name attributes set, these will also be displayed:

df3.index.name = "year"

df3.columns.name = "state"

df3

state,Ohio,Nevada
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2000,1.5,
2001,1.7,1.2
2002,1.3,2.1


In [147]:
# Unlike Series the DataFrame can't have a name attribute.

# to_numpy method converts the df into a 2d array:

df3.to_numpy()

array([[1.5, nan],
       [1.7, 1.2],
       [1.3, 2.1]])

In [149]:
# If the columns are of different data types, the returned array will be chosen to accommodate all of the columns:

df2.to_numpy()

array([[2000, 'Ohio', 1.5, nan],
       [2001, 'Ohio', 1.7, 3.0],
       [2002, 'Ohio', 3.6, nan],
       [2001, 'Nevada', 2.4, 6.0],
       [2002, 'Nevada', 2.9, nan],
       [2003, 'Nevada', 3.2, 9.0]], dtype=object)

### Index Objects

In [150]:
# Index objects are responsible for holding the axis labels

# Any array or sequence you use when constructing a df is internally converted to an Index:

obj = pd.Series(np.random.randint(1,10, size=3), index=list("abc"))

obj

a    9
b    1
c    8
dtype: int32

In [151]:
index = obj.index

index

Index(['a', 'b', 'c'], dtype='object')

In [153]:
index[1:]

Index(['b', 'c'], dtype='object')

In [154]:
# Index objects are immutable:

index[1] = "d"

TypeError: Index does not support mutable operations

In [155]:
# Immutability makes it safer to share index among data structures:

labels = pd.Index(np.arange(3))

labels

Index([0, 1, 2], dtype='int64')

In [156]:
obj2 = pd.Series([1.5, 3, 4.5], index=labels)

obj2

0    1.5
1    3.0
2    4.5
dtype: float64

In [159]:
labels is obj2.index

True

In [160]:
# In addition to being array-like, an Index also behaves like a fixed-size set:

df3

state,Ohio,Nevada
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2000,1.5,
2001,1.7,1.2
2002,1.3,2.1


In [161]:
"Ohio" in df3.columns

True

In [162]:
2002 in df3.index

True

In [163]:
2003 in df3.index

False

In [165]:
# Unlike sets, an Index can contain duplicate labels:
# Selections with duplicate labels will select all occurrences of that label:

pd.Index(["foo", "foo", "bar", "bar"])

Index(['foo', 'foo', 'bar', 'bar'], dtype='object')

## 5.2 Essential Functionality

### Reindexing

In [9]:
# To create a new object with the values rearranged to align with the new index:

obj = pd.Series([4.5, 7.2, -5.3, 3.6], index=list("dbac"))

obj

d    4.5
b    7.2
a   -5.3
c    3.6
dtype: float64

In [6]:
# Calling "reindex" rearranges the data according to the new index, with missing values for new index:

obj2 = obj.reindex(list("abcde"))

obj2

a   -5.3
b    7.2
c    3.6
d    4.5
e    NaN
dtype: float64

In [11]:
obj3 = pd.Series(["blue", "yellow", "purple", "pink"], index=[0, 2, 4, 6])

obj3

0      blue
2    yellow
4    purple
6      pink
dtype: object

In [16]:
# The method="ffil" (forward fill) function can fill the missing values based on its previous values:

obj3.reindex(np.arange(6), method="ffill")

0      blue
1      blue
2    yellow
3    yellow
4    purple
5    purple
dtype: object

In [27]:
# With df, reindex can alter both rows(index) and columns:

df = pd.DataFrame(np.arange(1, 10).reshape(3,3), index=["a", "c", "d"],
                  columns=["Ohio", "Texas", "California"])

df

Unnamed: 0,Ohio,Texas,California
a,1,2,3
c,4,5,6
d,7,8,9


In [28]:
# When only a sequence is passed it reindexes the rows:

df.reindex(list("abcd"))

Unnamed: 0,Ohio,Texas,California
a,1.0,2.0,3.0
b,,,
c,4.0,5.0,6.0
d,7.0,8.0,9.0


In [30]:
# The columns can be reindexed with the columns keyword:

# Since "Ohio" wasn't in list, the column is dropped entirely:

df.reindex(columns=["Texas", "Utah", "California"])

Unnamed: 0,Texas,Utah,California
a,2,,3
c,5,,6
d,8,,9


In [35]:
# Another way to alter rows or colums is by axis argument:

df.reindex(["Texas", "Utah", "California"], axis=1)     # axis="columns"

Unnamed: 0,Texas,Utah,California
a,2,,3
c,5,,6
d,8,,9


In [39]:
df

Unnamed: 0,Ohio,Texas,California
a,1,2,3
c,4,5,6
d,7,8,9


In [42]:
# The "loc" reindexing works only if all the new index labels already exist in the df:
# whereas reindex inserts missing values for new labels:

df.loc[["a", "d", "c"], ["California", "Ohio"]]

Unnamed: 0,California,Ohio
a,3,1
d,9,7
c,6,4


### Dropping Entries from an Axis

In [49]:
# Dropping entries with "reindex" or "loc" requires a list without those entries

# The drop method will return a new object with the indicated value or values deleted from an axis:

obj = pd.Series(np.arange(1., 6.), index=list("abcde"))

obj

a    1.0
b    2.0
c    3.0
d    4.0
e    5.0
dtype: float64

In [46]:
obj.drop("c")

a    0.0
b    1.0
d    3.0
e    4.0
dtype: float64

In [50]:
obj.drop(["b", "d"])

a    1.0
c    3.0
e    5.0
dtype: float64

In [51]:
df = pd.DataFrame(np.arange(1, 17).reshape(4, 4), index=["Ohio", "Colarado", "Utah", "New York"],
                  columns=["one", "two", "three", "four"])

df

Unnamed: 0,one,two,three,four
Ohio,1,2,3,4
Colarado,5,6,7,8
Utah,9,10,11,12
New York,13,14,15,16


In [58]:
# Calling drop without the axis drops rows values (axis=0):

df.drop(["Colarado", "New York"])

Unnamed: 0,one,two,three,four
Ohio,1,2,3,4
Utah,9,10,11,12


In [59]:
# Calling drop with axis=1 drops values from columns:

df.drop(["two", "four"], axis=1)

Unnamed: 0,one,three
Ohio,1,3
Colarado,5,7
Utah,9,11
New York,13,15


In [56]:
# Another way:

df.drop(["one", "three"], axis="columns")

Unnamed: 0,two,four
Ohio,2,4
Colarado,6,8
Utah,10,12
New York,14,16


### Indexing, Selection, and Filtering

In [4]:
# Series indexing works the same way as NumPy, except you can also use the Series's index values:

obj = pd.Series(np.arange(4.), index=list("abcd"))

obj

a    0.0
b    1.0
c    2.0
d    3.0
dtype: float64

In [5]:
obj["b"]

np.float64(1.0)

In [8]:
obj[1]

  obj[1]


np.float64(1.0)

In [9]:
obj[2:4]

c    2.0
d    3.0
dtype: float64

In [10]:
obj[["b", "a", "d"]]

b    1.0
a    0.0
d    3.0
dtype: float64

In [12]:
obj[obj >= 2]

c    2.0
d    3.0
dtype: float64

In [14]:
# The preferred way to select index values is with the special loc operator:

obj.loc[["b", "a", "d"]]

b    1.0
a    0.0
d    3.0
dtype: float64

In [15]:
obj.iloc[[1, 0, 3]]

b    1.0
a    0.0
d    3.0
dtype: float64

In [19]:
# Regular []-based indexing will treat integers as labels if the index contains integers:

obj1 = pd.Series([1, 2, 3], index=[2, 0, 1])

obj1

2    1
0    2
1    3
dtype: int64

In [20]:
obj2 = pd.Series([1, 2, 3], index=["a", "b", "c"])

obj2

a    1
b    2
c    3
dtype: int64

In [26]:
# This assumes the index list as labels:

obj1[[0, 1, 2]]

0    2
1    3
2    1
dtype: int64

In [25]:
obj2[[0, 1, 2]]

  obj2[[0, 1, 2]]


a    1
b    2
c    3
dtype: int64

In [27]:
# When using loc, the expression obj.loc[[0, 1, 2]] will fail when the index does not contain integers:

obj2.loc[[0, 1, 2]]

KeyError: "None of [Index([0, 1, 2], dtype='int64')] are in the [index]"

In [28]:
# The iloc operator indexes only with integers:

obj1.iloc[[0, 1, 2]]

2    1
0    2
1    3
dtype: int64

In [29]:
obj2.iloc[[0, 1, 2]]

a    1
b    2
c    3
dtype: int64

In [32]:
# Slicing with labels (the endpoint is inclusive):

obj.loc["a":"d"]

a    0.0
b    1.0
c    2.0
d    3.0
dtype: float64

In [37]:
# Assigning values modifies the Series:

obj2.loc["b":"d"] = 5

obj2

a    1
b    5
c    5
dtype: int64

In [38]:
df = pd.DataFrame(np.arange(1, 17).reshape(4,4), index=["Ohio", "Colarado", "Utah", "New York"],
                  columns=["one", "two", "three", "four"])

df

Unnamed: 0,one,two,three,four
Ohio,1,2,3,4
Colarado,5,6,7,8
Utah,9,10,11,12
New York,13,14,15,16


In [45]:
# Indexing columns:

df["one"]

Ohio         1
Colarado     5
Utah         9
New York    13
Name: one, dtype: int64

In [52]:
df[["two", "four"]]

Unnamed: 0,two,four
Ohio,2,4
Colarado,6,8
Utah,10,12
New York,14,16


In [49]:
# Indexing rows

df[1:3]

Unnamed: 0,one,two,three,four
Colarado,5,6,7,8
Utah,9,10,11,12


In [56]:
# Slicing or selecting data with a Boolean array:

df[df["three"] > 5]

Unnamed: 0,one,two,three,four
Colarado,5,6,7,8
Utah,9,10,11,12
New York,13,14,15,16


In [65]:
# DataFrame with all Boolean values produced by comparing with a scalar value:

df < 5

Unnamed: 0,one,two,three,four
Ohio,True,True,True,True
Colarado,False,False,False,False
Utah,False,False,False,False
New York,False,False,False,False


In [66]:
# Assigning 0 to each location where there's a True

df[df < 5] = 0

df

Unnamed: 0,one,two,three,four
Ohio,0,0,0,0
Colarado,5,6,7,8
Utah,9,10,11,12
New York,13,14,15,16


In [72]:
# Selecting rows:

df.loc["Colarado"]

one      5
two      6
three    7
four     8
Name: Colarado, dtype: int64

In [70]:
df.loc[["Colarado", "New York"]]

Unnamed: 0,one,two,three,four
Colarado,5,6,7,8
New York,13,14,15,16


In [75]:
# To select both rows and columns pass the selctions with a comma df[["rows"], ["columns"]]:

df.loc[["Ohio", 'Utah'], ["one", "three"]]

Unnamed: 0,one,three
Ohio,0,0
Utah,9,11


In [76]:
# Using iloc:

df.iloc[3]

one      13
two      14
three    15
four     16
Name: New York, dtype: int64

In [78]:
df.iloc[[0, 2]]

Unnamed: 0,one,two,three,four
Ohio,0,0,0,0
Utah,9,10,11,12


In [81]:
# Selecting both with iloc:

df.iloc[[1, 3], [0, 2]]

Unnamed: 0,one,three
Colarado,5,7
New York,13,15


In [85]:
# Slicing with indexing (:[1, 2])

df.loc[:"Utah", ["one", "two"]]

Unnamed: 0,one,two
Ohio,0,0
Colarado,5,6
Utah,9,10


In [91]:
# Slicing and indexing along with filtering: 

df.iloc[:, [0, 3]][df["one"] > 5]

Unnamed: 0,one,four
Utah,9,12
New York,13,16


In [95]:
# Boolean arrays can be used with loc but not iloc:

df.loc[df["three"] > 7]

Unnamed: 0,one,two,three,four
Utah,9,10,11,12
New York,13,14,15,16


#### Integer indexing pitfalls

In [96]:
ser = pd.Series(np.arange(3.))

ser

0    0.0
1    1.0
2    2.0
dtype: float64

In [106]:
# Error occurs again for the same reason that pandas doesn't know if it's label-based indexing or position-based:
# Although this would work (-1) if indexes were labeled(a,b,c):

ser[-1]

KeyError: -1

In [100]:
# Use loc or iloc to get exactly what you want:

ser.iloc[-1]

np.float64(2.0)

In [105]:
# Slicing is always integer oriented so no need to use loc or iloc:

ser[:2]

0    0.0
1    1.0
dtype: float64

In [107]:
# Use loc or iloc to avoid ambiguity.

#### Pitfalls with chained indexing

In [110]:
# Assigning values:

df.loc[:, "one"] = 1

df

Unnamed: 0,one,two,three,four
Ohio,1,0,0,0
Colarado,1,6,7,8
Utah,1,10,11,12
New York,1,14,15,16


In [112]:
df.loc["Utah"] = 5

df

Unnamed: 0,one,two,three,four
Ohio,1,0,0,0
Colarado,1,6,7,8
Utah,5,5,5,5
New York,1,14,15,16


In [113]:
df.loc[df["four"] > 5] = 3

df

Unnamed: 0,one,two,three,four
Ohio,1,0,0,0
Colarado,3,3,3,3
Utah,5,5,5,5
New York,3,3,3,3


In [114]:
df.loc[df["three"] == 5]["three"] = 6

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[df["three"] == 5]["three"] = 6


In [116]:
df.loc[df["three"] == 5, "three"] = 6

df

Unnamed: 0,one,two,three,four
Ohio,1,0,0,0
Colarado,3,3,3,3
Utah,5,5,6,5
New York,3,3,3,3


### Arithematic and Data Alignment

In [2]:
s1 = pd.Series([7.3, -2.5, 3.4, 1.5], index=list("acde"))

s1

a    7.3
c   -2.5
d    3.4
e    1.5
dtype: float64

In [3]:
s2 = pd.Series([-2.1, 3.6, -1.5, 4, 3.1], index=list("acefg"))

s2

a   -2.1
c    3.6
e   -1.5
f    4.0
g    3.1
dtype: float64

In [5]:
# The label locations that don't overlap becomes missing values (NaN):

s1 + s2

a    5.2
c    1.1
d    NaN
e    0.0
f    NaN
g    NaN
dtype: float64

In [7]:
# For DataFrame:

df1 = pd.DataFrame(np.arange(1., 10.).reshape(3,3), index=["Ohio", "Texas", "Colarado"],
                   columns=list("bcd"))

df1

Unnamed: 0,b,c,d
Ohio,1.0,2.0,3.0
Texas,4.0,5.0,6.0
Colarado,7.0,8.0,9.0


In [10]:
df2 = pd.DataFrame(np.arange(1., 13.).reshape(4,3), index=["Utah", "Ohio", "Texas", "Oregon"],
                   columns=list("bde"))

df2

Unnamed: 0,b,d,e
Utah,1.0,2.0,3.0
Ohio,4.0,5.0,6.0
Texas,7.0,8.0,9.0
Oregon,10.0,11.0,12.0


In [12]:
# Only the rows and columns that are common on both dfs will appear:

df1 + df2

Unnamed: 0,b,c,d,e
Colarado,,,,
Ohio,5.0,,8.0,
Oregon,,,,
Texas,11.0,,14.0,
Utah,,,,


In [13]:
df1 = pd.DataFrame({"A": [1, 2]})

df1

Unnamed: 0,A
0,1
1,2


In [14]:
df2 = pd.DataFrame({"B": [3, 4]})

df2

Unnamed: 0,B
0,3
1,4


In [16]:
# If no column or a row is common, the result will contain all null values:

df1 + df2

Unnamed: 0,A,B
0,,
1,,


#### Arithematic methods with fill values

In [18]:
# We can fill the missing value with 0 so the result is any one value (df1(value) + 0):

df1 = pd.DataFrame(np.arange(1., 13.).reshape(3, 4), columns=list("abcd"))

df1

Unnamed: 0,a,b,c,d
0,1.0,2.0,3.0,4.0
1,5.0,6.0,7.0,8.0
2,9.0,10.0,11.0,12.0


In [21]:
df2 = pd.DataFrame(np.arange(1., 21.).reshape(4, 5), columns=list("abcde"))

df2.loc[2, "c"] = np.nan

df2

Unnamed: 0,a,b,c,d,e
0,1.0,2.0,3.0,4.0,5.0
1,6.0,7.0,8.0,9.0,10.0
2,11.0,12.0,,14.0,15.0
3,16.0,17.0,18.0,19.0,20.0


In [23]:
# The result if there's no common row or column is NaN:

df1 + df2

Unnamed: 0,a,b,c,d,e
0,2.0,4.0,6.0,8.0,
1,11.0,13.0,15.0,17.0,
2,20.0,22.0,,26.0,
3,,,,,


In [25]:
# The result if there's no common row or column if just keeping the value:

df1.add(df2, fill_value=0)

Unnamed: 0,a,b,c,d,e
0,2.0,4.0,6.0,8.0,5.0
1,11.0,13.0,15.0,17.0,10.0
2,20.0,22.0,11.0,26.0,15.0
3,16.0,17.0,18.0,19.0,20.0


In [30]:
# fill_value = 1 for multiplication:

df1.mul(df2, fill_value=1)

Unnamed: 0,a,b,c,d,e
0,1.0,4.0,9.0,16.0,5.0
1,30.0,42.0,56.0,72.0,10.0
2,99.0,120.0,11.0,168.0,15.0
3,16.0,17.0,18.0,19.0,20.0


In [32]:
# Also when reindexing the NaN values can be changes for new columns or rows:

df1.reindex(list("abcde"), fill_value=0, axis=1)

Unnamed: 0,a,b,c,d,e
0,1.0,2.0,3.0,4.0,0
1,5.0,6.0,7.0,8.0,0
2,9.0,10.0,11.0,12.0,0


#### Operations between DataFrame and Series

In [45]:
# The difference between a two-dimensional array and one of its rows:

arr = np.arange(12.).reshape(3, 4)

arr

array([[ 0.,  1.,  2.,  3.],
       [ 4.,  5.,  6.,  7.],
       [ 8.,  9., 10., 11.]])

In [42]:
arr[0]

array([0., 1., 2., 3.])

In [46]:
# The subtraction is performed once for each row. This is referred to as broadcasting:

arr - arr[0]

array([[0., 0., 0., 0.],
       [4., 4., 4., 4.],
       [8., 8., 8., 8.]])

In [47]:
# Operations between a DataFrame and a Series are similar:

df = pd.DataFrame(np.arange(12.).reshape(4, 3), index=["Utah", "Ohio", "Texas", "Oregon"],
                  columns=list("bde"))

df

Unnamed: 0,b,d,e
Utah,0.0,1.0,2.0
Ohio,3.0,4.0,5.0
Texas,6.0,7.0,8.0
Oregon,9.0,10.0,11.0


In [50]:
series = df.loc["Utah"]

series

b    0.0
d    1.0
e    2.0
Name: Utah, dtype: float64

In [53]:
# Arithmetic between DataFrame and Series matches the index of the Series on the columns of the DataFrame:

df - series

Unnamed: 0,b,d,e
Utah,0.0,0.0,0.0
Ohio,3.0,3.0,3.0
Texas,6.0,6.0,6.0
Oregon,9.0,9.0,9.0


In [56]:
# New columns will be created and reindexed to create a union:

series2 = pd.Series(np.arange(3.), index=list("bef"))

series2

b    0.0
e    1.0
f    2.0
dtype: float64

In [55]:
df - series2

Unnamed: 0,b,d,e,f
Utah,0.0,,1.0,
Ohio,3.0,,4.0,
Texas,6.0,,7.0,
Oregon,9.0,,10.0,


In [57]:
series3 = df["d"]

series3

Utah       1.0
Ohio       4.0
Texas      7.0
Oregon    10.0
Name: d, dtype: float64

In [65]:
# The axis that you pass is the axis that you want to match on:

df.sub(series3, axis="index")

Unnamed: 0,b,d,e
Utah,-1.0,0.0,1.0
Ohio,-1.0,0.0,1.0
Texas,-1.0,0.0,1.0
Oregon,-1.0,0.0,1.0


### Function Application and Mapping

In [69]:
# NumPy ufuncs (element-wise array methods) also work with pandas objects:

df = pd.DataFrame(np.random.standard_normal((4, 3)), columns=list("bde"),
                  index=["Utah", "Ohio", "Texas", "Oregon"])

df

Unnamed: 0,b,d,e
Utah,-0.951909,-0.786862,0.681383
Ohio,2.198861,-1.091415,-0.044632
Texas,-1.696547,-0.587466,-0.151345
Oregon,-0.07539,0.552271,0.550573


In [75]:
df = np.abs(df)

df

Unnamed: 0,b,d,e
Utah,0.951909,0.786862,0.681383
Ohio,2.198861,1.091415,0.044632
Texas,1.696547,0.587466,0.151345
Oregon,0.07539,0.552271,0.550573


In [71]:
# Applying a function on one-dimensional arrays to each column or row:

def f1(x):
    return x.max() - x.min()

In [78]:
# DataFrame’s apply method:

# Computes the difference between the maximum and minimum of each column:

df.apply(f1)     # applies across the rows

b    2.123471
d    0.539143
e    0.636751
dtype: float64

In [79]:
# Difference between each row:

df.apply(f1, axis="columns")     # applies across the columns

Utah      0.270526
Ohio      2.154229
Texas     1.545202
Oregon    0.476881
dtype: float64

In [80]:
# Many of the most common array statistics (like sum and mean) are DataFrame methods, so using apply is not necessary.

In [83]:
# Not necessary that apply should return scalar values, it can also return a Series:

def f2(x):
    return pd.Series([x.max(), x.min()], index=["max", "min"])

In [84]:
df.apply(f2)

Unnamed: 0,b,d,e
max,2.198861,1.091415,0.681383
min,0.07539,0.552271,0.044632


In [86]:
def my_format(x):
    return f"{x:.2f}"

In [94]:
# For element-wise Python functions you need to use map():

df.map(my_format)

Unnamed: 0,b,d,e
Utah,0.95,0.79,0.68
Ohio,2.2,1.09,0.04
Texas,1.7,0.59,0.15
Oregon,0.08,0.55,0.55


### Sorting and Ranking

In [95]:
obj = pd.Series(np.arange(4.), index=list("dabc"))

obj

d    0.0
a    1.0
b    2.0
c    3.0
dtype: float64

In [99]:
# Returns a new sorted object:

obj.sort_index()

a    1.0
b    2.0
c    3.0
d    0.0
dtype: float64

In [103]:
df = pd.DataFrame(np.arange(8.).reshape(2, 4), index=["three", "one"], columns=list("dabc"))

df

Unnamed: 0,d,a,b,c
three,0.0,1.0,2.0,3.0
one,4.0,5.0,6.0,7.0


In [109]:
df.sort_index()

Unnamed: 0,d,a,b,c
one,4.0,5.0,6.0,7.0
three,0.0,1.0,2.0,3.0


In [110]:
df.sort_index(axis=1)

Unnamed: 0,a,b,c,d
three,1.0,2.0,3.0,0.0
one,5.0,6.0,7.0,4.0


In [112]:
# Descending order of columns:

df.sort_index(axis="columns", ascending=False)

Unnamed: 0,d,c,b,a
three,0.0,3.0,2.0,1.0
one,4.0,7.0,6.0,5.0


In [130]:
# Sorting Series by its values:

obj = pd.Series(np.random.randint(-5, 6, size=10))

obj

0   -2
1    5
2    5
3   -2
4   -1
5   -1
6    3
7    0
8    4
9    5
dtype: int32

In [131]:
obj.sort_values()

0   -2
3   -2
5   -1
4   -1
7    0
6    3
8    4
2    5
1    5
9    5
dtype: int32

In [133]:
# Missing values (NaN) are sorted at the by default:

obj2 = pd.Series([4, np.nan, 7, np.nan, -3, 2])

obj2.sort_values()

4   -3.0
5    2.0
0    4.0
2    7.0
1    NaN
3    NaN
dtype: float64

In [138]:
# Missing values can be sorted at the start:

obj2.sort_values(na_position="first")

1    NaN
3    NaN
4   -3.0
5    2.0
0    4.0
2    7.0
dtype: float64

In [141]:
# Sorting only a single column's values:

df = pd.DataFrame({"b": [4, 7, -3, 2], "a": [0, 1, 0, 1], "c": [3, 2, 0, 1]})

df

Unnamed: 0,b,a,c
0,4,0,3
1,7,1,2
2,-3,0,0
3,2,1,1


In [142]:
df.sort_values("b")

Unnamed: 0,b,a,c
2,-3,0,0
3,2,1,1
0,4,0,3
1,7,1,2


In [144]:
# Multiple columns:

df.sort_values(["b", "c"])

Unnamed: 0,b,a,c
2,-3,0,0
3,2,1,1
0,4,0,3
1,7,1,2


In [147]:
# Ranking assigns ranks starting from the lowest value:

obj = pd.Series([7, -5, 7, 4, 2, 0, 4])

obj

0    7
1   -5
2    7
3    4
4    2
5    0
6    4
dtype: int64

In [148]:
# By default, rank breaks ties by assigning each group the mean rank:

obj.rank()

0    6.5
1    1.0
2    6.5
3    4.5
4    3.0
5    2.0
6    4.5
dtype: float64

In [153]:
# Ranks can also be assigned according to the order in which they’re observed in the data.

# Instead of using the average rank 6.5, they instead have been set to 6 and 7 because label based on the order:

obj.rank(method="first")    # other tie-braking methods: "average", "min", "max", "first", "dense"

0    6.0
1    1.0
2    7.0
3    4.0
4    3.0
5    2.0
6    5.0
dtype: float64

In [152]:
# You can rank in descending order, too:

obj.rank(ascending=False, method="first")

0    1.0
1    7.0
2    2.0
3    3.0
4    5.0
5    6.0
6    4.0
dtype: float64

In [154]:
# DataFrame can compute ranks over the rows or the columns:

df = pd.DataFrame({"b": [4.3, 7, -3, 2], "a": [0, 1, 0, 1], "c": [-2, 5, 8, -2.5]})

df

Unnamed: 0,b,a,c
0,4.3,0,-2.0
1,7.0,1,5.0
2,-3.0,0,8.0
3,2.0,1,-2.5


In [158]:
df.rank()     # across the rows

Unnamed: 0,b,a,c
0,3.0,1.5,2.0
1,4.0,3.5,3.0
2,1.0,1.5,4.0
3,2.0,3.5,1.0


In [157]:
df.rank(axis="columns")     # across the columns

Unnamed: 0,b,a,c
0,3.0,2.0,1.0
1,3.0,1.0,2.0
2,1.0,2.0,3.0
3,3.0,2.0,1.0


### Axis Indexes with Duplicate Labels

In [159]:
# A Series with duplicate indices:

obj = pd.Series(np.arange(5), index=["a", "a", "b", "b", "c"])

obj

a    0
a    1
b    2
b    3
c    4
dtype: int64

In [161]:
# The is_unique property of the index can tell you whether or not its labels are unique:

obj.index.is_unique

False

In [167]:
# Indexing a label with multiple entries returns a Series, while single entries return a scalar value:

# This can make your code more complicated, as the output type from indexing can vary based on whether or not a label is repeated.

obj["a"]

a    0
a    1
dtype: int64

In [163]:
obj["c"]

np.int64(4)

In [168]:
# The same logic extends to indexing rows (or columns) in a DataFrame:

df = pd.DataFrame(np.random.standard_normal((5, 3)), index=["a", "a", "b", "b", "c"])

df

Unnamed: 0,0,1,2
a,0.098816,1.050824,-0.537554
a,-1.457744,1.976516,-0.41751
b,1.192819,-0.276804,-0.092372
b,-0.256957,-0.42745,0.110542
c,-0.292717,-0.491442,1.163696


In [169]:
df.index.is_unique

False

In [173]:
# Returns a df:

df.loc["a"]

Unnamed: 0,0,1,2
a,0.098816,1.050824,-0.537554
a,-1.457744,1.976516,-0.41751


In [176]:
# Reurns a Series:

df.loc["c"]

0   -0.292717
1   -0.491442
2    1.163696
Name: c, dtype: float64

## 5.3 Summarizing and Computing Descriptive Statistics

In [3]:
# Most of the mathematical and statistical methods are reductions or summary statistics
# These methods extract a single value (like the sum or mean) from a Series, or a Series of values from the rows or columns of a df
# They have built-in handling for missing data

df = pd.DataFrame([[1.4, np.nan], [7.1, -4.5], [np.nan, np.nan], [0.75, -1.3]], index=list("abcd"), 
                  columns=["one", "two"])

df

Unnamed: 0,one,two
a,1.4,
b,7.1,-4.5
c,,
d,0.75,-1.3


In [13]:
df.sum()     # sum of each column

one    9.25
two   -5.80
dtype: float64

In [10]:
df.sum(axis="columns")     # across the columns

a    1.40
b    2.60
c    0.00
d   -0.55
dtype: float64

In [16]:
# It skips the NA values as default
# This can be changed and any NA value present in a column or a row would result in NA entirely:

df.sum(axis="index", skipna=False)

one   NaN
two   NaN
dtype: float64

In [15]:
df.sum(axis="columns", skipna=False)

a     NaN
b    2.60
c     NaN
d   -0.55
dtype: float64

In [17]:
df.mean()

one    3.083333
two   -2.900000
dtype: float64

In [20]:
# Some aggregations, like mean, require at least one non-NA (valid) value to yield a value result:

df.mean(axis="columns")

a    1.400
b    1.300
c      NaN
d   -0.275
dtype: float64

In [26]:
# Some methods, like idxmin and idxmax, return indirect statistics.

# Index positions where the minimum value is attained:

df.idxmin()

one    d
two    b
dtype: object

In [23]:
# Index positions where the maximum value is attained:

df.idxmax()

one    b
two    d
dtype: object

In [28]:
# Other methods are accumulations:

df.cumsum()

Unnamed: 0,one,two
a,1.4,
b,8.5,-4.5
c,,
d,9.25,-5.8


In [30]:
# Some methods are neither reductions nor accumulations. 
# describe is one such example, producing multiple summary statistics in one shot:

df.describe()

Unnamed: 0,one,two
count,3.0,2.0
mean,3.083333,-2.9
std,3.493685,2.262742
min,0.75,-4.5
25%,1.075,-3.7
50%,1.4,-2.9
75%,4.25,-2.1
max,7.1,-1.3


In [33]:
# On nonnumeric data, describe produces alternative summary statistics:

obj = pd.Series(list("aabc") * 4)

obj

0     a
1     a
2     b
3     c
4     a
5     a
6     b
7     c
8     a
9     a
10    b
11    c
12    a
13    a
14    b
15    c
dtype: object

In [32]:
obj.describe()

count     16
unique     3
top        a
freq       8
dtype: object

In [36]:
# Number of non-NA values:

df.count()

one    3
two    2
dtype: int64

### Correlation and Covariance

In [None]:
# Covariance only indicates the direction of the relationship (positive or negative) 
# while correlation provides a standardized measure of both the direction and strength of the linear relationship, 
# ranging from -1 to +1. 


In [45]:
# Time-Series data:

price = pd.read_pickle("yahoo_price.pkl")

volume = pd.read_pickle("yahoo_volume.pkl")

In [43]:
price.head()

Unnamed: 0_level_0,AAPL,GOOG,IBM,MSFT
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2010-01-04,27.990226,313.062468,113.304536,25.884104
2010-01-05,28.038618,311.683844,111.935822,25.892466
2010-01-06,27.592626,303.826685,111.208683,25.733566
2010-01-07,27.541619,296.753749,110.823732,25.465944
2010-01-08,27.724725,300.709808,111.935822,25.641571


In [44]:
volume.head()

Unnamed: 0_level_0,AAPL,GOOG,IBM,MSFT
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2010-01-04,123432400,3927000,6155300,38409100
2010-01-05,150476200,6031900,6841400,49749600
2010-01-06,138040000,7987100,5605300,58182400
2010-01-07,119282800,12876600,5840600,50559700
2010-01-08,111902700,9483900,4197200,51197400


In [47]:
# This function is mostly useful in the time-series data.
# pct_change() computes percentage changes from the immediately previous row:

returns = price.pct_change()

returns.tail()

Unnamed: 0_level_0,AAPL,GOOG,IBM,MSFT
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2016-10-17,-0.00068,0.001837,0.002072,-0.003483
2016-10-18,-0.000681,0.019616,-0.026168,0.00769
2016-10-19,-0.002979,0.007846,0.003583,-0.002255
2016-10-20,-0.000512,-0.005652,0.001719,-0.004867
2016-10-21,-0.00393,0.003011,-0.012474,0.042096


In [53]:
# The cov method of "Series" computes the covariance of the overlapping, non-NA, aligned-by-index values in two Series

returns["AAPL"].cov(returns["MSFT"])     # positive covariance

np.float64(9.511357581430651e-05)

In [55]:
# Relatedly, corr computes the correlation:

returns["AAPL"].corr(returns["MSFT"])     # positive correlation

np.float64(0.38969458628057846)

In [59]:
# DataFrame’s cov and corr methods, on the other hand, return a full covariance or correlation matrix:

returns.cov()

Unnamed: 0,AAPL,GOOG,IBM,MSFT
AAPL,0.000277,0.000107,7.8e-05,9.5e-05
GOOG,0.000107,0.000251,7.8e-05,0.000108
IBM,7.8e-05,7.8e-05,0.000146,8.9e-05
MSFT,9.5e-05,0.000108,8.9e-05,0.000215


In [61]:
# The correlation of a variable with itself is always 1:

returns.corr()

Unnamed: 0,AAPL,GOOG,IBM,MSFT
AAPL,1.0,0.407919,0.386817,0.389695
GOOG,0.407919,1.0,0.405099,0.465919
IBM,0.386817,0.405099,1.0,0.499764
MSFT,0.389695,0.465919,0.499764,1.0


In [62]:
returns["MSFT"].corr(returns["IBM"])

np.float64(0.49976361144151155)

In [69]:
# Pair-wise correlations between a DataFrame’s columns or rows with another Series or DataFrame.
# Passing a Series returns a Series with the correlation value computed for each column:

returns.corrwith(returns["MSFT"])

AAPL    0.389695
GOOG    0.465919
IBM     0.499764
MSFT    1.000000
dtype: float64

In [72]:
# Passing a DataFrame computes the correlations of matching column names.
# Computes correlations of percent changes with volume:

returns.corrwith(volume)

AAPL   -0.075565
GOOG   -0.007067
IBM    -0.204849
MSFT   -0.092950
dtype: float64

In [74]:
price.corrwith(volume)     # Negative correlation between price and volume

AAPL   -0.587317
GOOG   -0.577326
IBM    -0.353614
MSFT   -0.453355
dtype: float64

In [76]:
# Passing axis="columns" does things row-by-row instead:

returns.corrwith(volume, axis="columns")

Date
2010-01-04         NaN
2010-01-05    0.737298
2010-01-06    0.017069
2010-01-07    0.507614
2010-01-08   -0.779646
                ...   
2016-10-17   -0.881606
2016-10-18   -0.303369
2016-10-19   -0.970723
2016-10-20   -0.304414
2016-10-21    0.927824
Length: 1714, dtype: float64

### Unique Values, Value Counts, and Membership

In [77]:
# Extracts information about the values contained in a one-dimensional Series.

obj = pd.Series(["c", "a", "d", "a", "a", "b", "b", "c", "c"])

obj

0    c
1    a
2    d
3    a
4    a
5    b
6    b
7    c
8    c
dtype: object

In [88]:
# unique gives you an array of the unique values in a Series:

uniques = obj.unique()     # not returned in a sorted order

uniques.sort()     # hence sorting it after taking out unique values

uniques

array(['a', 'b', 'c', 'd'], dtype=object)

In [95]:
# value_counts computes a Series containing value frequencies:

obj.value_counts()     # sorts by value in descending but can be changed with sort=False

c    3
a    3
b    2
d    1
Name: count, dtype: int64

In [91]:
# can also be used with NumPy arrays or other Python sequences:

pd.value_counts(obj.to_numpy())

  pd.value_counts(obj.to_numpy())


c    3
a    3
b    2
d    1
Name: count, dtype: int64

In [94]:
# Correct way:

pd.Series(obj.to_numpy()).value_counts(sort=False)

c    3
a    3
d    1
b    2
Name: count, dtype: int64

In [97]:
# isin performs a vectorized set membership check
# can be useful in filtering a dataset down to a subset of values

obj

0    c
1    a
2    d
3    a
4    a
5    b
6    b
7    c
8    c
dtype: object

In [100]:
mask = obj.isin(["b", "c"])

mask

0     True
1    False
2    False
3    False
4    False
5     True
6     True
7     True
8     True
dtype: bool

In [104]:
# Filtered a list of particular values of choice from a Series (can also be used on df):

obj[mask]

0    c
5    b
6    b
7    c
8    c
dtype: object

In [105]:
to_match = pd.Series(["c", "a", "b", "b", "c", "a"])

to_match

0    c
1    a
2    b
3    b
4    c
5    a
dtype: object

In [106]:
unique_vals = pd.Series(["c", "b", "a"])

unique_vals

0    c
1    b
2    a
dtype: object

In [117]:
# Gives you an index array from an array of possibly nondistinct values into another array of distinct values:

pd.Index(unique_vals).get_indexer(to_match)

array([0, 2, 1, 1, 0, 2])

In [119]:
# If you want to compute a histogram on multiple related columns:

df = pd.DataFrame({"Qu1": [1, 3, 4, 3, 4], "Qu2": [2, 3, 1, 2, 3], "Qu3": [1, 5, 2, 4, 4]})

df

Unnamed: 0,Qu1,Qu2,Qu3
0,1,2,1
1,3,3,5
2,4,1,2
3,3,2,4
4,4,3,4


In [123]:
# Computes the value counts for a single column:

df["Qu1"].value_counts().sort_index()

Qu1
1    1
3    2
4    2
Name: count, dtype: int64

In [128]:
# Computes the values counts for all columns (apply method):

df.apply(pd.value_counts).fillna(0)     # the indexes are the distinct values occurring in all of the columns

  df.apply(pd.value_counts).fillna(0)     # the indexes are the distinct values occurring in all of the columns


Unnamed: 0,Qu1,Qu2,Qu3
1,1.0,1.0,1.0
2,0.0,2.0,1.0
3,2.0,2.0,0.0
4,2.0,0.0,2.0
5,0.0,0.0,1.0


In [129]:
df1 = pd.DataFrame({"a": [1, 1, 1, 2, 2], "b": [0, 0, 1, 0, 0]})

df1

Unnamed: 0,a,b
0,1,0
1,1,0
2,1,1
3,2,0
4,2,0


In [132]:
# Considers each row as a tuple to determine the number of occurrences of each distinct row:

df1.value_counts()     # The index represents distinct rows (tuple)

a  b
1  0    2
2  0    2
1  1    1
Name: count, dtype: int64