In [1]:
import course;course.header()

# Advanced Python Course 
## Mobi Heidelberg WS 2020/21
### by Christian Fufezan 

christian@fufezan.net

https://fufezan.net

<img src="./imgs/cc.png" alt="drawing" width="200" style="float: left;"/>


# Pandas level 1
Data wrangling 101

I'd like to say Pandas is numpy on steriods but it is actually much more.

Pandas is the data science solution for Python and it build ontop of the powerful numpy module.
However, Pandas offers elements that are much more intuitive or go beyond what numpy has ever provided.
Nevertheless, numpy is more performant in some cases (by a lot, yet remember when to optimize!) 

The perfect is the dead of the good.
 -- M. Gunner

Pandas was create [Wes McKinney](https://wesmckinney.com/pages/about.html) in the early 2008 at AQR capital management and I can recommend "Python for Data Analysis" from Wes, which was published via O'Reilly and "Pandas for Everyone" by Daniel Y. Chen. The following Pandas chapters are inspired by the books.

Pandas offers the two basic data structures
* Series
* Dataframes


In [3]:
import pandas as pd
import numpy as np

In [5]:
c = pd.Series(
    np.random.randn(4), 
    index=['r1', 'r2', 'r3', 'r4']
)
c

r1    0.780804
r2   -0.470563
r3    1.089913
r4    0.375409
dtype: float64

Selecting from Series works like a dict :)

In [6]:
c['r2']

-0.4705626404973415

In [7]:
mask = c > 0
mask

r1     True
r2    False
r3     True
r4     True
dtype: bool

In [8]:
c[mask]

r1    0.780804
r3    1.089913
r4    0.375409
dtype: float64

Masks can be additive!

In [9]:
mask2 = c < 1
c[mask & mask2]

r1    0.780804
r4    0.375409
dtype: float64

In [10]:
c * 10

r1     7.808043
r2    -4.705626
r3    10.899127
r4     3.754090
dtype: float64

In [11]:
np.exp(c)

r1    2.183228
r2    0.624651
r3    2.974014
r4    1.455587
dtype: float64

Remember to use numpy functions as much as possible so data remains on the "C side". More below!

Operations conserve index!

Series are like ordered Dicts!

In [11]:
'r1' in c

True

np.nan is the missing value indicator

In [12]:
d = pd.Series({'r1': np.nan, 'r2': 0.2, 'r3': 0.2, 'r4': 0.4})

In [13]:
d

r1    NaN
r2    0.2
r3    0.2
r4    0.4
dtype: float64

In [14]:
d.isna()

r1     True
r2    False
r3    False
r4    False
dtype: bool

In [15]:
# inverting with ~!
~d.isna()

r1    False
r2     True
r3     True
r4     True
dtype: bool

In [17]:
d.notnull()

r1    False
r2     True
r3     True
r4     True
dtype: bool

## indices are aligned automatically!

In [18]:
c

r1    0.780804
r2   -0.470563
r3    1.089913
r4    0.375409
dtype: float64

In [19]:
d = pd.Series(
    np.random.randn(4), 
    index=['r2', 'r3', 'r4', 'r5']
)
d

r2   -1.383775
r3   -1.329587
r4   -1.170031
r5   -1.428835
dtype: float64

In [20]:
c + d

r1         NaN
r2   -1.854337
r3   -0.239675
r4   -0.794622
r5         NaN
dtype: float64

## Renaming index

In [21]:
d.index = ['r1', 'r2', 'r3', 'r4']

In [22]:
c + d

r1   -0.602970
r2   -1.800150
r3   -0.080118
r4   -1.053426
dtype: float64

Naming things will help you to get your data organised better. Explicit is better than implicit! And remember to choose your names variable wisely - you will code read often than you write.  

In [23]:
d.index.name = "variable"
d.name = "probability"
d

variable
r1   -1.383775
r2   -1.329587
r3   -1.170031
r4   -1.428835
Name: probability, dtype: float64

In [24]:
d.reset_index()

Unnamed: 0,variable,probability
0,r1,-1.383775
1,r2,-1.329587
2,r3,-1.170031
3,r4,-1.428835


Turns it into a DataFrame as the index is now a series!

In [25]:
type(d.reset_index())

pandas.core.frame.DataFrame

# Data frames 
Data frames are the pandas 2d data containers (if there is only one index dimension). 
In principle data frames are a list of Series, whereas each row is a series. 

In [26]:
df = pd.DataFrame(
    [
        c, 
        d, # this one we named :)
        pd.Series(np.random.randn(4), index=['r2', 'r3', 'r4', 'r5'])
    ]
)
df

Unnamed: 0,r1,r2,r3,r4,r5
Unnamed 0,0.780804,-0.470563,1.089913,0.375409,
probability,-1.383775,-1.329587,-1.170031,-1.428835,
Unnamed 1,,-1.762979,-1.476588,0.94807,-0.821829


In [27]:
# accessing a value
df.loc['probability', 'r2']

-1.3295873140607783

Note: How pandas aligns your data automatically.

If you want each series to be treated as column, just transpose

DataFrames can be constructed in many different ways, see docu for more details
https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.html?highlight=dataframe#pandas.DataFrame

In [28]:
df = df.T
df

Unnamed: 0,Unnamed 0,probability,Unnamed 1
r1,0.780804,-1.383775,
r2,-0.470563,-1.329587,-1.762979
r3,1.089913,-1.170031,-1.476588
r4,0.375409,-1.428835,0.94807
r5,,,-0.821829


Renaming columns in a data frame

In [29]:
df.columns = ['p1', 'p2', 'p3']
df

Unnamed: 0,p1,p2,p3
r1,0.780804,-1.383775,
r2,-0.470563,-1.329587,-1.762979
r3,1.089913,-1.170031,-1.476588
r4,0.375409,-1.428835,0.94807
r5,,,-0.821829


Dataframes can equally be named, for your sanity, name them :)

In [30]:
df.columns.name = "probability"
df.index.name = "variable"
df

probability,p1,p2,p3
variable,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
r1,0.780804,-1.383775,
r2,-0.470563,-1.329587,-1.762979
r3,1.089913,-1.170031,-1.476588
r4,0.375409,-1.428835,0.94807
r5,,,-0.821829


Now that you feel happy in the pandas world, some modules/functions require numpy arrays, how do you convert them ?

In [31]:
np_df = df.values
np_df

array([[ 0.78080431, -1.38377481,         nan],
       [-0.47056264, -1.32958731, -1.76297912],
       [ 1.08991269, -1.17003064, -1.47658846],
       [ 0.37540905, -1.42883509,  0.94807041],
       [        nan,         nan, -0.82182859]])

In [32]:
type(np_df)

numpy.ndarray

If you need to work "longer" on the numpy side, I suggest to transform the pandas dataframe to a numpy recarray, as names are preserved; 

In [33]:
# np_df = df.values # 
np_df = df.to_records()
np_df

rec.array([('r1',  0.78080431, -1.38377481,         nan),
           ('r2', -0.47056264, -1.32958731, -1.76297912),
           ('r3',  1.08991269, -1.17003064, -1.47658846),
           ('r4',  0.37540905, -1.42883509,  0.94807041),
           ('r5',         nan,         nan, -0.82182859)],
          dtype=[('variable', 'O'), ('p1', '<f8'), ('p2', '<f8'), ('p3', '<f8')])

In [34]:
np_df['variable']

array(['r1', 'r2', 'r3', 'r4', 'r5'], dtype=object)

In [35]:
np_df[0]

('r1', 0.78080431, -1.38377481, nan)

In [36]:
np_df[0][2]

-1.3837748088896147

## C-side and Python side 

**Note**:
Regular Python floats live in the Python world - Numpy and Pandas live in the "C world", hence their fast vectorized operations. If you can avoid it, don't cast between the worlds! 

In [37]:
long_series = pd.Series(
    np.random.randn(1000000), 
)

In [38]:
%%timeit -n 1
a = long_series.to_list()  # to python list!
print(f"a is a {type(a)} now!")
pd.Series(a)

a is a <class 'list'> now!
a is a <class 'list'> now!
a is a <class 'list'> now!
a is a <class 'list'> now!
a is a <class 'list'> now!
a is a <class 'list'> now!
a is a <class 'list'> now!
195 ms ± 22.2 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [39]:
%%timeit -n 1
a = long_series.to_numpy()
print(f"a is a {type(a)} now!")
pd.Series(a)

a is a <class 'numpy.ndarray'> now!
a is a <class 'numpy.ndarray'> now!
a is a <class 'numpy.ndarray'> now!
a is a <class 'numpy.ndarray'> now!
a is a <class 'numpy.ndarray'> now!
a is a <class 'numpy.ndarray'> now!
a is a <class 'numpy.ndarray'> now!
The slowest run took 4.02 times longer than the fastest. This could mean that an intermediate result is being cached.
312 µs ± 154 µs per loop (mean ± std. dev. of 7 runs, 1 loop each)


# Operations between DataFrame and Series

In [40]:
df_small = pd.DataFrame([c, d])
df_small

Unnamed: 0,r1,r2,r3,r4
Unnamed 0,0.780804,-0.470563,1.089913,0.375409
probability,-1.383775,-1.329587,-1.170031,-1.428835


In [41]:
c

r1    0.780804
r2   -0.470563
r3    1.089913
r4    0.375409
dtype: float64

In [42]:
df_small - c

Unnamed: 0,r1,r2,r3,r4
Unnamed 0,0.0,0.0,0.0,0.0
probability,-2.164579,-0.859025,-2.259943,-1.804244


Next time you want to normalize each row of a data frame, one can define the correction factors as a series and just e.g. subtract it. 

In [43]:
# renaming columns

In [44]:
df

probability,p1,p2,p3
variable,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
r1,0.780804,-1.383775,
r2,-0.470563,-1.329587,-1.762979
r3,1.089913,-1.170031,-1.476588
r4,0.375409,-1.428835,0.94807
r5,,,-0.821829


In [45]:
df.rename(columns={'p1':'VLC'}, inplace=True)

In [46]:
df

probability,VLC,p2,p3
variable,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
r1,0.780804,-1.383775,
r2,-0.470563,-1.329587,-1.762979
r3,1.089913,-1.170031,-1.476588
r4,0.375409,-1.428835,0.94807
r5,,,-0.821829


In [47]:
# subselecting a set of columns! 
df[["VLC", 'p2']]

probability,VLC,p2
variable,Unnamed: 1_level_1,Unnamed: 2_level_1
r1,0.780804,-1.383775
r2,-0.470563,-1.329587
r3,1.089913,-1.170031
r4,0.375409,-1.428835
r5,,


**Note:**
This only creates a view of the data! 

# Pandas IO
Pandas comes with a wide array of input output modules see
https://pandas.pydata.org/pandas-docs/stable/user_guide/io.html

**NOTE:** reading xlsx is _much_ slower than csv

Your request: Scraping websites! 

Today with Pandas scraping wikipedia. In particular the oldest universities!

Alternatively beautiful soup https://www.crummy.com/software/BeautifulSoup/bs4/doc/ or Scrapy https://scrapy.org/

In [50]:
url = "https://en.wikipedia.org/wiki/List_of_oldest_universities_in_continuous_operation"

In [51]:
import ssl
ssl._create_default_https_context = ssl._create_unverified_context

In [52]:
dfs = pd.read_html(url) # do you get SSL: CERTIFICATE_VERIFY_FAILED ?

In [53]:
len(dfs)

9

In [55]:
dfs[1]

Unnamed: 0_level_0,Year,University,Location,Location,Notes
Unnamed: 0_level_1,Year,University,Original,Current,Notes
0,1088(charter granted 1158),University of Bologna,"Kingdom of Italy, Holy Roman Empire","Bologna, Italy",The oldest university in the world. A universi...
1,1096–1167(charter granted in 1248)[11],University of Oxford,Kingdom of England,"Oxford, United Kingdom","Oxford claims its founding (""...teaching exist..."
2,1134 (charter granted in 1218),University of Salamanca,Kingdom of León,"Salamanca, Spain",The oldest university in the Hispanic world. T...
3,1209(charter granted in 1231)[16],University of Cambridge,Kingdom of England,"Cambridge, United Kingdom",Founded by scholars leaving Oxford after a dis...
4,1222(probably older),University of Padua,Medieval commune of Padua,"Padua, Italy",Founded by scholars and professors after leavi...
5,1224 (1258),University of Naples Federico II,Kingdom of Sicily,"Naples, Italy","The first public university,[20] founded by Fr..."
6,1240–1357,University of Siena,Republic of Siena,"Siena, Italy",Originally founded in 1240 by the Commune of S...
7,1290,University of Coimbra[23],Kingdom of Portugal,"Coimbra, Portugal",It began its existence in Lisbon with the name...
8,1290,University of Macerata[23],Papal States,"Macerata, Italy","Founded in 1290, possibly as a private law sch..."
9,1293,University of Valladolid,Crown of Castile,"Valladolid, Spain",Founded in the late 13th century (first docume...


In [57]:
udf = dfs[1]

In [58]:
udf.columns

MultiIndex([(      'Year',       'Year'),
            ('University', 'University'),
            (  'Location',   'Original'),
            (  'Location',    'Current'),
            (     'Notes',      'Notes')],
           )

In [59]:
udf.columns = [ e[0] for e in udf.columns ]

In [60]:
udf.head(2)

Unnamed: 0,Year,University,Location,Location.1,Notes
0,1088(charter granted 1158),University of Bologna,"Kingdom of Italy, Holy Roman Empire","Bologna, Italy",The oldest university in the world. A universi...
1,1096–1167(charter granted in 1248)[11],University of Oxford,Kingdom of England,"Oxford, United Kingdom","Oxford claims its founding (""...teaching exist..."


In [61]:
udf.columns = ['Year', 'University', 'H-Location', 'G-Location', 'Notes' ]

In [62]:
udf.head()

Unnamed: 0,Year,University,H-Location,G-Location,Notes
0,1088(charter granted 1158),University of Bologna,"Kingdom of Italy, Holy Roman Empire","Bologna, Italy",The oldest university in the world. A universi...
1,1096–1167(charter granted in 1248)[11],University of Oxford,Kingdom of England,"Oxford, United Kingdom","Oxford claims its founding (""...teaching exist..."
2,1134 (charter granted in 1218),University of Salamanca,Kingdom of León,"Salamanca, Spain",The oldest university in the Hispanic world. T...
3,1209(charter granted in 1231)[16],University of Cambridge,Kingdom of England,"Cambridge, United Kingdom",Founded by scholars leaving Oxford after a dis...
4,1222(probably older),University of Padua,Medieval commune of Padua,"Padua, Italy",Founded by scholars and professors after leavi...


## Gather some basic information around the dataframe

In [63]:
udf.describe()

Unnamed: 0,Year,University,H-Location,G-Location,Notes
count,38,38,38,38,36
unique,34,38,23,38,36
top,1495,University of Glasgow,Holy Roman Empire,"Madrid, Spain",Originally founded in 1240 by the Commune of S...
freq,2,1,8,1,1


In [64]:
udf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 38 entries, 0 to 37
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Year        38 non-null     object
 1   University  38 non-null     object
 2   H-Location  38 non-null     object
 3   G-Location  38 non-null     object
 4   Notes       36 non-null     object
dtypes: object(5)
memory usage: 1.6+ KB


We need to clean-up the Year

Accessing the str properties!

In [65]:
udf['Year'].str.match(r'^(?P<year>[0-9]{4})')

0     True
1     True
2     True
3     True
4     True
5     True
6     True
7     True
8     True
9     True
10    True
11    True
12    True
13    True
14    True
15    True
16    True
17    True
18    True
19    True
20    True
21    True
22    True
23    True
24    True
25    True
26    True
27    True
28    True
29    True
30    True
31    True
32    True
33    True
34    True
35    True
36    True
37    True
Name: Year, dtype: bool

In [66]:
udf['year'] = udf.Year.str.extract(r'(?P<year>[0-9]{4})')

In [67]:
udf.head()

Unnamed: 0,Year,University,H-Location,G-Location,Notes,year
0,1088(charter granted 1158),University of Bologna,"Kingdom of Italy, Holy Roman Empire","Bologna, Italy",The oldest university in the world. A universi...,1088
1,1096–1167(charter granted in 1248)[11],University of Oxford,Kingdom of England,"Oxford, United Kingdom","Oxford claims its founding (""...teaching exist...",1096
2,1134 (charter granted in 1218),University of Salamanca,Kingdom of León,"Salamanca, Spain",The oldest university in the Hispanic world. T...,1134
3,1209(charter granted in 1231)[16],University of Cambridge,Kingdom of England,"Cambridge, United Kingdom",Founded by scholars leaving Oxford after a dis...,1209
4,1222(probably older),University of Padua,Medieval commune of Padua,"Padua, Italy",Founded by scholars and professors after leavi...,1222


In [68]:
udf.shape
# (rows, columns)

(38, 6)

One cannot visualize all columns straight away in jupyter :( However redefining some options helps!

In [69]:
pd.set_option("max_columns", 2000)

# Sorting

In [70]:
udf.head()

Unnamed: 0,Year,University,H-Location,G-Location,Notes,year
0,1088(charter granted 1158),University of Bologna,"Kingdom of Italy, Holy Roman Empire","Bologna, Italy",The oldest university in the world. A universi...,1088
1,1096–1167(charter granted in 1248)[11],University of Oxford,Kingdom of England,"Oxford, United Kingdom","Oxford claims its founding (""...teaching exist...",1096
2,1134 (charter granted in 1218),University of Salamanca,Kingdom of León,"Salamanca, Spain",The oldest university in the Hispanic world. T...,1134
3,1209(charter granted in 1231)[16],University of Cambridge,Kingdom of England,"Cambridge, United Kingdom",Founded by scholars leaving Oxford after a dis...,1209
4,1222(probably older),University of Padua,Medieval commune of Padua,"Padua, Italy",Founded by scholars and professors after leavi...,1222


In [71]:
udf.sort_values(['year'])

Unnamed: 0,Year,University,H-Location,G-Location,Notes,year
0,1088(charter granted 1158),University of Bologna,"Kingdom of Italy, Holy Roman Empire","Bologna, Italy",The oldest university in the world. A universi...,1088
1,1096–1167(charter granted in 1248)[11],University of Oxford,Kingdom of England,"Oxford, United Kingdom","Oxford claims its founding (""...teaching exist...",1096
2,1134 (charter granted in 1218),University of Salamanca,Kingdom of León,"Salamanca, Spain",The oldest university in the Hispanic world. T...,1134
3,1209(charter granted in 1231)[16],University of Cambridge,Kingdom of England,"Cambridge, United Kingdom",Founded by scholars leaving Oxford after a dis...,1209
4,1222(probably older),University of Padua,Medieval commune of Padua,"Padua, Italy",Founded by scholars and professors after leavi...,1222
5,1224 (1258),University of Naples Federico II,Kingdom of Sicily,"Naples, Italy","The first public university,[20] founded by Fr...",1224
6,1240–1357,University of Siena,Republic of Siena,"Siena, Italy",Originally founded in 1240 by the Commune of S...,1240
8,1290,University of Macerata[23],Papal States,"Macerata, Italy","Founded in 1290, possibly as a private law sch...",1290
7,1290,University of Coimbra[23],Kingdom of Portugal,"Coimbra, Portugal",It began its existence in Lisbon with the name...,1290
9,1293,University of Valladolid,Crown of Castile,"Valladolid, Spain",Founded in the late 13th century (first docume...,1293


Sort_values has kwargs like ascending = True|False and values are defined by a list, ie sort first by, then by ...

In [72]:
udf.sort_values(['year', 'G-Location'])

Unnamed: 0,Year,University,H-Location,G-Location,Notes,year
0,1088(charter granted 1158),University of Bologna,"Kingdom of Italy, Holy Roman Empire","Bologna, Italy",The oldest university in the world. A universi...,1088
1,1096–1167(charter granted in 1248)[11],University of Oxford,Kingdom of England,"Oxford, United Kingdom","Oxford claims its founding (""...teaching exist...",1096
2,1134 (charter granted in 1218),University of Salamanca,Kingdom of León,"Salamanca, Spain",The oldest university in the Hispanic world. T...,1134
3,1209(charter granted in 1231)[16],University of Cambridge,Kingdom of England,"Cambridge, United Kingdom",Founded by scholars leaving Oxford after a dis...,1209
4,1222(probably older),University of Padua,Medieval commune of Padua,"Padua, Italy",Founded by scholars and professors after leavi...,1222
5,1224 (1258),University of Naples Federico II,Kingdom of Sicily,"Naples, Italy","The first public university,[20] founded by Fr...",1224
6,1240–1357,University of Siena,Republic of Siena,"Siena, Italy",Originally founded in 1240 by the Commune of S...,1240
7,1290,University of Coimbra[23],Kingdom of Portugal,"Coimbra, Portugal",It began its existence in Lisbon with the name...,1290
8,1290,University of Macerata[23],Papal States,"Macerata, Italy","Founded in 1290, possibly as a private law sch...",1290
10,1293,Complutense University of Madrid,Crown of Castile,"Madrid, Spain",The University of Alcalá was founded by Sancho...,1293


Let split the G-location into city and country!

In [73]:
tmp_df = udf['G-Location'].str.split(",")
display(tmp_df.head())   # not quite what we want .. we want two columns!

0               [Bologna,  Italy]
1       [Oxford,  United Kingdom]
2             [Salamanca,  Spain]
3    [Cambridge,  United Kingdom]
4                 [Padua,  Italy]
Name: G-Location, dtype: object

How to get two columns?

In [74]:
tmp_df = udf['G-Location'].str.split(",", expand=True)
tmp_df.columns = ['G-City', 'G-Country']

In [75]:
tmp_df

Unnamed: 0,G-City,G-Country
0,Bologna,Italy
1,Oxford,United Kingdom
2,Salamanca,Spain
3,Cambridge,United Kingdom
4,Padua,Italy
5,Naples,Italy
6,Siena,Italy
7,Coimbra,Portugal
8,Macerata,Italy
9,Valladolid,Spain


In [76]:
udf = udf.join(tmp_df)
# there are many options to join frames 

In [77]:
udf.head()

Unnamed: 0,Year,University,H-Location,G-Location,Notes,year,G-City,G-Country
0,1088(charter granted 1158),University of Bologna,"Kingdom of Italy, Holy Roman Empire","Bologna, Italy",The oldest university in the world. A universi...,1088,Bologna,Italy
1,1096–1167(charter granted in 1248)[11],University of Oxford,Kingdom of England,"Oxford, United Kingdom","Oxford claims its founding (""...teaching exist...",1096,Oxford,United Kingdom
2,1134 (charter granted in 1218),University of Salamanca,Kingdom of León,"Salamanca, Spain",The oldest university in the Hispanic world. T...,1134,Salamanca,Spain
3,1209(charter granted in 1231)[16],University of Cambridge,Kingdom of England,"Cambridge, United Kingdom",Founded by scholars leaving Oxford after a dis...,1209,Cambridge,United Kingdom
4,1222(probably older),University of Padua,Medieval commune of Padua,"Padua, Italy",Founded by scholars and professors after leavi...,1222,Padua,Italy


# Deleting things

In [79]:
udf.head()

Unnamed: 0,Year,University,H-Location,G-Location,Notes,year,G-City,G-Country
0,1088(charter granted 1158),University of Bologna,"Kingdom of Italy, Holy Roman Empire","Bologna, Italy",The oldest university in the world. A universi...,1088,Bologna,Italy
1,1096–1167(charter granted in 1248)[11],University of Oxford,Kingdom of England,"Oxford, United Kingdom","Oxford claims its founding (""...teaching exist...",1096,Oxford,United Kingdom
2,1134 (charter granted in 1218),University of Salamanca,Kingdom of León,"Salamanca, Spain",The oldest university in the Hispanic world. T...,1134,Salamanca,Spain
3,1209(charter granted in 1231)[16],University of Cambridge,Kingdom of England,"Cambridge, United Kingdom",Founded by scholars leaving Oxford after a dis...,1209,Cambridge,United Kingdom
4,1222(probably older),University of Padua,Medieval commune of Padua,"Padua, Italy",Founded by scholars and professors after leavi...,1222,Padua,Italy


In [78]:
udf.drop(1)

Unnamed: 0,Year,University,H-Location,G-Location,Notes,year,G-City,G-Country
0,1088(charter granted 1158),University of Bologna,"Kingdom of Italy, Holy Roman Empire","Bologna, Italy",The oldest university in the world. A universi...,1088,Bologna,Italy
2,1134 (charter granted in 1218),University of Salamanca,Kingdom of León,"Salamanca, Spain",The oldest university in the Hispanic world. T...,1134,Salamanca,Spain
3,1209(charter granted in 1231)[16],University of Cambridge,Kingdom of England,"Cambridge, United Kingdom",Founded by scholars leaving Oxford after a dis...,1209,Cambridge,United Kingdom
4,1222(probably older),University of Padua,Medieval commune of Padua,"Padua, Italy",Founded by scholars and professors after leavi...,1222,Padua,Italy
5,1224 (1258),University of Naples Federico II,Kingdom of Sicily,"Naples, Italy","The first public university,[20] founded by Fr...",1224,Naples,Italy
6,1240–1357,University of Siena,Republic of Siena,"Siena, Italy",Originally founded in 1240 by the Commune of S...,1240,Siena,Italy
7,1290,University of Coimbra[23],Kingdom of Portugal,"Coimbra, Portugal",It began its existence in Lisbon with the name...,1290,Coimbra,Portugal
8,1290,University of Macerata[23],Papal States,"Macerata, Italy","Founded in 1290, possibly as a private law sch...",1290,Macerata,Italy
9,1293,University of Valladolid,Crown of Castile,"Valladolid, Spain",Founded in the late 13th century (first docume...,1293,Valladolid,Spain
10,1293,Complutense University of Madrid,Crown of Castile,"Madrid, Spain",The University of Alcalá was founded by Sancho...,1293,Madrid,Spain


In [139]:
udf.head(3)

Unnamed: 0,Year,University,H-Location,G-Location,Notes,year,G-City,G-Country
0,1088(charter granted 1158),University of Bologna,"Kingdom of Italy, Holy Roman Empire","Bologna, Italy",The oldest university in the world. A universi...,1088,Bologna,Italy
1,1096–1167(charter granted in 1248)[11],University of Oxford,Kingdom of England,"Oxford, United Kingdom","Oxford claims its founding (""...teaching exist...",1096,Oxford,United Kingdom
2,1134 (charter granted in 1218),University of Salamanca,Kingdom of León,"Salamanca, Spain",The oldest university in the Hispanic world. T...,1134,Salamanca,Spain


In [80]:
udf.drop(columns=['G-Location', 'Year'])

Unnamed: 0,University,H-Location,Notes,year,G-City,G-Country
0,University of Bologna,"Kingdom of Italy, Holy Roman Empire",The oldest university in the world. A universi...,1088,Bologna,Italy
1,University of Oxford,Kingdom of England,"Oxford claims its founding (""...teaching exist...",1096,Oxford,United Kingdom
2,University of Salamanca,Kingdom of León,The oldest university in the Hispanic world. T...,1134,Salamanca,Spain
3,University of Cambridge,Kingdom of England,Founded by scholars leaving Oxford after a dis...,1209,Cambridge,United Kingdom
4,University of Padua,Medieval commune of Padua,Founded by scholars and professors after leavi...,1222,Padua,Italy
5,University of Naples Federico II,Kingdom of Sicily,"The first public university,[20] founded by Fr...",1224,Naples,Italy
6,University of Siena,Republic of Siena,Originally founded in 1240 by the Commune of S...,1240,Siena,Italy
7,University of Coimbra[23],Kingdom of Portugal,It began its existence in Lisbon with the name...,1290,Coimbra,Portugal
8,University of Macerata[23],Papal States,"Founded in 1290, possibly as a private law sch...",1290,Macerata,Italy
9,University of Valladolid,Crown of Castile,Founded in the late 13th century (first docume...,1293,Valladolid,Spain


Dataframe or series are not automatically "adjusted" except you use `inpace=True`

In [160]:
udf

Unnamed: 0,Year,University,H-Location,G-Location,Notes,year,G-City,G-Country
0,1088(charter granted 1158),University of Bologna,"Kingdom of Italy, Holy Roman Empire","Bologna, Italy",The oldest university in the world. A universi...,1088,Bologna,Italy
1,1096–1167(charter granted in 1248)[11],University of Oxford,Kingdom of England,"Oxford, United Kingdom","Oxford claims its founding (""...teaching exist...",1096,Oxford,United Kingdom
2,1134 (charter granted in 1218),University of Salamanca,Kingdom of León,"Salamanca, Spain",The oldest university in the Hispanic world. T...,1134,Salamanca,Spain
3,1209(charter granted in 1231)[16],University of Cambridge,Kingdom of England,"Cambridge, United Kingdom",Founded by scholars leaving Oxford after a dis...,1209,Cambridge,United Kingdom
4,1222(probably older),University of Padua,Medieval commune of Padua,"Padua, Italy",Founded by scholars and professors after leavi...,1222,Padua,Italy
5,1224 (1258),University of Naples Federico II,Kingdom of Sicily,"Naples, Italy","The first public university,[20] founded by Fr...",1224,Naples,Italy
6,1240–1357,University of Siena,Republic of Siena,"Siena, Italy",Originally founded in 1240 by the Commune of S...,1240,Siena,Italy
7,1290,University of Coimbra[23],Kingdom of Portugal,"Coimbra, Portugal",It began its existence in Lisbon with the name...,1290,Coimbra,Portugal
8,1290,University of Macerata[23],Papal States,"Macerata, Italy","Founded in 1290, possibly as a private law sch...",1290,Macerata,Italy
9,1293,University of Valladolid,Crown of Castile,"Valladolid, Spain",Founded in the late 13th century (first docume...,1293,Valladolid,Spain


In [81]:
udf.drop(columns=['G-Location', 'Year'], inplace=True)

In [83]:
udf


Unnamed: 0,University,H-Location,Notes,year,G-City,G-Country
0,University of Bologna,"Kingdom of Italy, Holy Roman Empire",The oldest university in the world. A universi...,1088,Bologna,Italy
1,University of Oxford,Kingdom of England,"Oxford claims its founding (""...teaching exist...",1096,Oxford,United Kingdom
2,University of Salamanca,Kingdom of León,The oldest university in the Hispanic world. T...,1134,Salamanca,Spain
3,University of Cambridge,Kingdom of England,Founded by scholars leaving Oxford after a dis...,1209,Cambridge,United Kingdom
4,University of Padua,Medieval commune of Padua,Founded by scholars and professors after leavi...,1222,Padua,Italy
5,University of Naples Federico II,Kingdom of Sicily,"The first public university,[20] founded by Fr...",1224,Naples,Italy
6,University of Siena,Republic of Siena,Originally founded in 1240 by the Commune of S...,1240,Siena,Italy
7,University of Coimbra[23],Kingdom of Portugal,It began its existence in Lisbon with the name...,1290,Coimbra,Portugal
8,University of Macerata[23],Papal States,"Founded in 1290, possibly as a private law sch...",1290,Macerata,Italy
9,University of Valladolid,Crown of Castile,Founded in the late 13th century (first docume...,1293,Valladolid,Spain


# slicing and dicing

In [84]:
udf[:3] # df[:'r3'] works as well

Unnamed: 0,University,H-Location,Notes,year,G-City,G-Country
0,University of Bologna,"Kingdom of Italy, Holy Roman Empire",The oldest university in the world. A universi...,1088,Bologna,Italy
1,University of Oxford,Kingdom of England,"Oxford claims its founding (""...teaching exist...",1096,Oxford,United Kingdom
2,University of Salamanca,Kingdom of León,The oldest university in the Hispanic world. T...,1134,Salamanca,Spain


In [85]:
# selecting one column!
udf['G-Country']

0               Italy
1      United Kingdom
2               Spain
3      United Kingdom
4               Italy
5               Italy
6               Italy
7            Portugal
8               Italy
9               Spain
10              Spain
11              Italy
12              Italy
13              Italy
14              Italy
15     Czech Republic
16              Italy
17             Poland
18            Austria
19            Germany
20              Italy
21              Italy
22            Germany
23     United Kingdom
24              Italy
25              Spain
26     United Kingdom
27            Germany
28            Germany
29        Switzerland
30            Germany
31            Germany
32             Sweden
33            Denmark
34              Italy
35     United Kingdom
36              Spain
37              Spain
Name: G-Country, dtype: object

In [86]:
# selecting one row
udf.loc[1]

University                                 University of Oxford
H-Location                                   Kingdom of England
Notes         Oxford claims its founding ("...teaching exist...
year                                                       1096
G-City                                                   Oxford
G-Country                                        United Kingdom
Name: 1, dtype: object

In [91]:
udf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 38 entries, 0 to 37
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   University  38 non-null     object
 1   H-Location  38 non-null     object
 2   Notes       36 non-null     object
 3   year        38 non-null     int64 
 4   G-City      38 non-null     object
 5   G-Country   38 non-null     object
dtypes: int64(1), object(5)
memory usage: 1.9+ KB


In [87]:
# mask also work on df!
mask = udf['year'] < 1400
mask.head(10)

TypeError: '<' not supported between instances of 'str' and 'int'

In [97]:
udf.year = udf.year.astype(int)

In [95]:
_udf = udf.convert_dtypes()

In [96]:
_udf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 38 entries, 0 to 37
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   University  38 non-null     string
 1   H-Location  38 non-null     string
 2   Notes       36 non-null     string
 3   year        38 non-null     string
 4   G-City      38 non-null     string
 5   G-Country   38 non-null     string
dtypes: string(6)
memory usage: 1.9 KB


In [98]:
# mask also work on df!
mask = udf.year < 1400
mask.head(10)

0    True
1    True
2    True
3    True
4    True
5    True
6    True
7    True
8    True
9    True
Name: year, dtype: bool

In [99]:
udf[mask]

Unnamed: 0,University,H-Location,Notes,year,G-City,G-Country
0,University of Bologna,"Kingdom of Italy, Holy Roman Empire",The oldest university in the world. A universi...,1088,Bologna,Italy
1,University of Oxford,Kingdom of England,"Oxford claims its founding (""...teaching exist...",1096,Oxford,United Kingdom
2,University of Salamanca,Kingdom of León,The oldest university in the Hispanic world. T...,1134,Salamanca,Spain
3,University of Cambridge,Kingdom of England,Founded by scholars leaving Oxford after a dis...,1209,Cambridge,United Kingdom
4,University of Padua,Medieval commune of Padua,Founded by scholars and professors after leavi...,1222,Padua,Italy
5,University of Naples Federico II,Kingdom of Sicily,"The first public university,[20] founded by Fr...",1224,Naples,Italy
6,University of Siena,Republic of Siena,Originally founded in 1240 by the Commune of S...,1240,Siena,Italy
7,University of Coimbra[23],Kingdom of Portugal,It began its existence in Lisbon with the name...,1290,Coimbra,Portugal
8,University of Macerata[23],Papal States,"Founded in 1290, possibly as a private law sch...",1290,Macerata,Italy
9,University of Valladolid,Crown of Castile,Founded in the late 13th century (first docume...,1293,Valladolid,Spain


In [100]:
udf[udf['year'] < 1300] # reduces the data frame, again note! that is just a view, not a copy!

Unnamed: 0,University,H-Location,Notes,year,G-City,G-Country
0,University of Bologna,"Kingdom of Italy, Holy Roman Empire",The oldest university in the world. A universi...,1088,Bologna,Italy
1,University of Oxford,Kingdom of England,"Oxford claims its founding (""...teaching exist...",1096,Oxford,United Kingdom
2,University of Salamanca,Kingdom of León,The oldest university in the Hispanic world. T...,1134,Salamanca,Spain
3,University of Cambridge,Kingdom of England,Founded by scholars leaving Oxford after a dis...,1209,Cambridge,United Kingdom
4,University of Padua,Medieval commune of Padua,Founded by scholars and professors after leavi...,1222,Padua,Italy
5,University of Naples Federico II,Kingdom of Sicily,"The first public university,[20] founded by Fr...",1224,Naples,Italy
6,University of Siena,Republic of Siena,Originally founded in 1240 by the Commune of S...,1240,Siena,Italy
7,University of Coimbra[23],Kingdom of Portugal,It began its existence in Lisbon with the name...,1290,Coimbra,Portugal
8,University of Macerata[23],Papal States,"Founded in 1290, possibly as a private law sch...",1290,Macerata,Italy
9,University of Valladolid,Crown of Castile,Founded in the late 13th century (first docume...,1293,Valladolid,Spain


In [101]:
udf[udf['year'] < 1300].loc[1]

University                                 University of Oxford
H-Location                                   Kingdom of England
Notes         Oxford claims its founding ("...teaching exist...
year                                                       1096
G-City                                                   Oxford
G-Country                                        United Kingdom
Name: 1, dtype: object

In [102]:
udf[udf['year'] > 1300].loc[1]

KeyError: 1

In [103]:
udf[udf['year'] > 1300].head(3)

Unnamed: 0,University,H-Location,Notes,year,G-City,G-Country
11,Sapienza University of Rome,Papal States,"Founded by Pope Boniface VIII, but became a st...",1303,Rome,Italy
12,University of Perugia,Papal States,Attested by the Bull of Pope Clement V.,1308,Perugia,Italy
13,University of Florence,Republic of Florence,The University of Florence evolved from the St...,1321,Florence,Italy


In [104]:
udf[udf['year'] > 1300].iloc[1]

University                      University of Perugia
H-Location                               Papal States
Notes         Attested by the Bull of Pope Clement V.
year                                             1308
G-City                                        Perugia
G-Country                                       Italy
Name: 12, dtype: object

## more natural query - or isn't it?

In [105]:
udf.query("year > 1300").head(5)

Unnamed: 0,University,H-Location,Notes,year,G-City,G-Country
11,Sapienza University of Rome,Papal States,"Founded by Pope Boniface VIII, but became a st...",1303,Rome,Italy
12,University of Perugia,Papal States,Attested by the Bull of Pope Clement V.,1308,Perugia,Italy
13,University of Florence,Republic of Florence,The University of Florence evolved from the St...,1321,Florence,Italy
14,University of Pisa,Republic of Pisa,It was formally founded on 3 September 1343 by...,1343,Pisa,Italy
15,Charles University,"Kingdom of Bohemia, Holy Roman Empire",Three of four faculties closed in 1419. Merged...,1348,Prague,Czech Republic


In [106]:
udf.query("1349 > year > 1320")

Unnamed: 0,University,H-Location,Notes,year,G-City,G-Country
13,University of Florence,Republic of Florence,The University of Florence evolved from the St...,1321,Florence,Italy
14,University of Pisa,Republic of Pisa,It was formally founded on 3 September 1343 by...,1343,Pisa,Italy
15,Charles University,"Kingdom of Bohemia, Holy Roman Empire",Three of four faculties closed in 1419. Merged...,1348,Prague,Czech Republic


In [108]:
# Using local variables in queries
upper_limit = 1400
udf.query("@upper_limit > year > 1320")

Unnamed: 0,University,H-Location,Notes,year,G-City,G-Country
13,University of Florence,Republic of Florence,The University of Florence evolved from the St...,1321,Florence,Italy
14,University of Pisa,Republic of Pisa,It was formally founded on 3 September 1343 by...,1343,Pisa,Italy
15,Charles University,"Kingdom of Bohemia, Holy Roman Empire",Three of four faculties closed in 1419. Merged...,1348,Prague,Czech Republic
16,University of Pavia,Domain of the House of Visconti,Closed for short periods during the Italian Wa...,1361,Pavia,Italy
17,Jagiellonian University,Kingdom of Poland,Founded by Casimir the Great under the name St...,1364,Kraków,Poland
18,University of Vienna,Holy Roman Empire,Modelled on the University of Paris.,1365,Vienna,Austria
19,Ruprecht Karl University of Heidelberg,Holy Roman Empire,"Founded by Rupert I, Elector Palatine. The old...",1386,Heidelberg,Germany
20,University of Ferrara,House of Este,Founded by Marquis Alberto d'Este.,1391,Ferrara,Italy


## Find the maximum for a given series or dataframe

In [109]:
udf['year'].idxmax()

37

## Unique values and their count

In [110]:
udf['G-Country'].unique()

array([' Italy', ' United Kingdom', ' Spain', ' Portugal',
       ' Czech Republic', ' Poland', ' Austria', ' Germany',
       ' Switzerland', ' Sweden', ' Denmark'], dtype=object)

In [111]:
udf['G-Country'].nunique()

11

In [112]:
udf['G-Country'].value_counts()

 Italy             14
 Spain              6
 Germany            6
 United Kingdom     5
 Czech Republic     1
 Portugal           1
 Sweden             1
 Switzerland        1
 Denmark            1
 Austria            1
 Poland             1
Name: G-Country, dtype: int64

In [114]:
_udf = udf.set_index('University')

In [115]:
_udf.sample(5)

Unnamed: 0_level_0,H-Location,Notes,year,G-City,G-Country
University,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
University of Coimbra[23],Kingdom of Portugal,It began its existence in Lisbon with the name...,1290,Coimbra,Portugal
University of Florence,Republic of Florence,The University of Florence evolved from the St...,1321,Florence,Italy
Uppsala University,Kingdom of Sweden within the Kalmar Union,"Uppsala's bull, which granted the university i...",1477,Uppsala,Sweden
University of Santiago de Compostela,"Kingdom of Galicia, Crown of Castile","The university traces its roots to 1495, when ...",1495,Santiago de Compostela,Spain
Ruprecht Karl University of Heidelberg,Holy Roman Empire,"Founded by Rupert I, Elector Palatine. The old...",1386,Heidelberg,Germany


In [123]:
_udf.loc['University of Florence', ['Notes', 'year']]

Notes    The University of Florence evolved from the St...
year                                                  1321
Name: University of Florence, dtype: object

In [124]:
_udf.loc['University of Florence', :]

H-Location                                 Republic of Florence
Notes         The University of Florence evolved from the St...
year                                                       1321
G-City                                                 Florence
G-Country                                                 Italy
Name: University of Florence, dtype: object

## Done with Basics!
Take a look at the cheat sheet for a summary
https://pandas.pydata.org/Pandas_Cheat_Sheet.pdf

# Hierarchical indexing

In [199]:
s = pd.Series(
        np.random.randn(5), 
        index = [
            ['p1','p1','p2','p2','p3'],
            ['a','b','a','d','a']
        ]
)
s

p1  a   -0.378161
    b   -0.579655
p2  a   -0.171922
    d   -2.058234
p3  a    1.650458
dtype: float64

In [200]:
s.index

MultiIndex([('p1', 'a'),
            ('p1', 'b'),
            ('p2', 'a'),
            ('p2', 'd'),
            ('p3', 'a')],
           )

In [201]:
s.index.names = ['probability', 'type']

In [202]:
s

probability  type
p1           a      -0.378161
             b      -0.579655
p2           a      -0.171922
             d      -2.058234
p3           a       1.650458
dtype: float64

In [203]:
s['p1']

type
a   -0.378161
b   -0.579655
dtype: float64

In [204]:
s[:, 'a'] # lower level 

probability
p1   -0.378161
p2   -0.171922
p3    1.650458
dtype: float64

In [205]:
s2 = s.unstack()
print(type(s2))
s2

<class 'pandas.core.frame.DataFrame'>


type,a,b,d
probability,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
p1,-0.378161,-0.579655,
p2,-0.171922,,-2.058234
p3,1.650458,,


In [206]:
s3 = s2.stack()
print(type(s3))
s3

<class 'pandas.core.series.Series'>


probability  type
p1           a      -0.378161
             b      -0.579655
p2           a      -0.171922
             d      -2.058234
p3           a       1.650458
dtype: float64

## Multindex with Dataframes

In [207]:
df = pd.DataFrame(
    [
        c, 
        c * 20, 
        d,
        np.exp(d),
        pd.Series(np.random.randn(4), index=['r2', 'r3', 'r4', 'r5'])
    ],
    index = [
        ['p1','p1','p2','p2','p3'],
        ['a','b','a','d','a']
    ]
)
df.index.names = ['probability', 'type']
df

Unnamed: 0_level_0,Unnamed: 1_level_0,r1,r2,r3,r4,r5
probability,type,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
p1,a,-0.100742,0.257149,-1.845719,2.738763,
p1,b,-2.01484,5.142975,-36.91438,54.775253,
p2,a,0.559196,1.095669,0.304911,0.710247,
p2,d,1.749266,2.991184,1.356505,2.034493,
p3,a,,0.128138,0.310731,1.687249,-0.16519


In [208]:
df = df.fillna(0)
df

Unnamed: 0_level_0,Unnamed: 1_level_0,r1,r2,r3,r4,r5
probability,type,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
p1,a,-0.100742,0.257149,-1.845719,2.738763,0.0
p1,b,-2.01484,5.142975,-36.91438,54.775253,0.0
p2,a,0.559196,1.095669,0.304911,0.710247,0.0
p2,d,1.749266,2.991184,1.356505,2.034493,0.0
p3,a,0.0,0.128138,0.310731,1.687249,-0.16519


**Note**:
You can create multi indeces from a regular dataframe!

In [209]:
df2 = df.reset_index()

In [210]:
df2

Unnamed: 0,probability,type,r1,r2,r3,r4,r5
0,p1,a,-0.100742,0.257149,-1.845719,2.738763,0.0
1,p1,b,-2.01484,5.142975,-36.91438,54.775253,0.0
2,p2,a,0.559196,1.095669,0.304911,0.710247,0.0
3,p2,d,1.749266,2.991184,1.356505,2.034493,0.0
4,p3,a,0.0,0.128138,0.310731,1.687249,-0.16519


In [211]:
df2.set_index(['probability', 'type'])

Unnamed: 0_level_0,Unnamed: 1_level_0,r1,r2,r3,r4,r5
probability,type,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
p1,a,-0.100742,0.257149,-1.845719,2.738763,0.0
p1,b,-2.01484,5.142975,-36.91438,54.775253,0.0
p2,a,0.559196,1.095669,0.304911,0.710247,0.0
p2,d,1.749266,2.991184,1.356505,2.034493,0.0
p3,a,0.0,0.128138,0.310731,1.687249,-0.16519


In [212]:
df2 = df.swaplevel('probability', 'type')
df2

Unnamed: 0_level_0,Unnamed: 1_level_0,r1,r2,r3,r4,r5
type,probability,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
a,p1,-0.100742,0.257149,-1.845719,2.738763,0.0
b,p1,-2.01484,5.142975,-36.91438,54.775253,0.0
a,p2,0.559196,1.095669,0.304911,0.710247,0.0
d,p2,1.749266,2.991184,1.356505,2.034493,0.0
a,p3,0.0,0.128138,0.310731,1.687249,-0.16519


In [213]:
df2.sort_index(axis=0, level=0)

Unnamed: 0_level_0,Unnamed: 1_level_0,r1,r2,r3,r4,r5
type,probability,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
a,p1,-0.100742,0.257149,-1.845719,2.738763,0.0
a,p2,0.559196,1.095669,0.304911,0.710247,0.0
a,p3,0.0,0.128138,0.310731,1.687249,-0.16519
b,p1,-2.01484,5.142975,-36.91438,54.775253,0.0
d,p2,1.749266,2.991184,1.356505,2.034493,0.0


## Natural slicing using `pandas.IndexSlice`  objects

In [214]:
idx = pd.IndexSlice
df2.loc[idx[:, ["p1", "p2"]], :]

Unnamed: 0_level_0,Unnamed: 1_level_0,r1,r2,r3,r4,r5
type,probability,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
a,p1,-0.100742,0.257149,-1.845719,2.738763,0.0
b,p1,-2.01484,5.142975,-36.91438,54.775253,0.0
a,p2,0.559196,1.095669,0.304911,0.710247,0.0
d,p2,1.749266,2.991184,1.356505,2.034493,0.0


## long and wide formats

In [215]:
df3 = df2.reset_index()
df3

Unnamed: 0,type,probability,r1,r2,r3,r4,r5
0,a,p1,-0.100742,0.257149,-1.845719,2.738763,0.0
1,b,p1,-2.01484,5.142975,-36.91438,54.775253,0.0
2,a,p2,0.559196,1.095669,0.304911,0.710247,0.0
3,d,p2,1.749266,2.991184,1.356505,2.034493,0.0
4,a,p3,0.0,0.128138,0.310731,1.687249,-0.16519


In [216]:
df4 = df3.melt(
    id_vars=['type','probability'],
    var_name='r_stage',
    value_name='score'
)
print(df4.shape)
df4.head()

(25, 4)


Unnamed: 0,type,probability,r_stage,score
0,a,p1,r1,-0.100742
1,b,p1,r1,-2.01484
2,a,p2,r1,0.559196
3,d,p2,r1,1.749266
4,a,p3,r1,0.0


In [217]:
df5 = df4.pivot_table(index=['type', 'probability'], columns='r_stage', values="score")
df5 

Unnamed: 0_level_0,r_stage,r1,r2,r3,r4,r5
type,probability,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
a,p1,-0.100742,0.257149,-1.845719,2.738763,0.0
a,p2,0.559196,1.095669,0.304911,0.710247,0.0
a,p3,0.0,0.128138,0.310731,1.687249,-0.16519
b,p1,-2.01484,5.142975,-36.91438,54.775253,0.0
d,p2,1.749266,2.991184,1.356505,2.034493,0.0
