In [9]:
import numpy as np
import pandas as pd

In [10]:
from numpy.random import randn

In [3]:
# Seed NumPy's global random generator so the randn draws are repeatable
# (101 is the seed used in the tutorial video).
SEED = 101
np.random.seed(SEED)

In [60]:
# Build a 3x3 frame of random draws with labelled rows (A-C) and columns (One-Three).
dataf = pd.DataFrame(
    data=randn(3, 3),
    index=['A', 'B', 'C'],
    columns=['One', 'Two', 'Three'],
)
dataf

Unnamed: 0,One,Two,Three
A,1.482495,0.961458,-2.141212
B,0.992573,1.192241,-1.04678
C,1.292765,-1.467514,-0.494095


In [61]:
# Select columns by passing a list of column labels inside []; the result is a DataFrame.
# Row labels cannot be used this way (rows are selected with .loc / .iloc).
dataf[['One', 'Two', 'Three']]

Unnamed: 0,One,Two,Three
A,1.482495,0.961458,-2.141212
B,0.992573,1.192241,-1.04678
C,1.292765,-1.467514,-0.494095


In [62]:
# Add columns the way you would add keys to a dict: assign to a label that
# does not exist yet and pandas creates the column.
# A single column takes a 1-D array with one value per row, so draw
# len(dataf) values instead of an (n, 1) 2-D array with a hard-coded n.
dataf['ooh'] = randn(len(dataf))
dataf['new'] = randn(len(dataf))
dataf

Unnamed: 0,One,Two,Three,ooh,new
A,1.482495,0.961458,-2.141212,-0.162535,0.221491
B,0.992573,1.192241,-1.04678,0.485809,-0.855196
C,1.292765,-1.467514,-0.494095,0.392489,1.54199


In [63]:
# Remove columns with .drop. columns=[...] is the explicit spelling of axis=1
# (axis works like in NumPy) and accepts several labels at once, so a single
# call removes both columns.
# NOTE: by default .drop returns a new frame and leaves dataf untouched;
# inplace=True makes it modify dataf directly.
dataf.drop(columns=['ooh', 'new'], inplace=True)
dataf

Unnamed: 0,One,Two,Three
A,1.482495,0.961458,-2.141212
B,0.992573,1.192241,-1.04678
C,1.292765,-1.467514,-0.494095


In [64]:
# Remove a row by its index label; no axis is passed, so .drop works on axis 0 (rows).
# inplace=True changes dataf itself, so row 'B' is gone for the cells that follow.
dataf.drop('B', inplace=True)
dataf

Unnamed: 0,One,Two,Three
A,1.482495,0.961458,-2.141212
C,1.292765,-1.467514,-0.494095


In [65]:
# Select a row by its index label with .loc; the row comes back as a Series
# whose entries are labelled by the column names.
dataf.loc['A']

One      1.482495
Two      0.961458
Three   -2.141212
Name: A, dtype: float64

In [None]:
# .loc with a single row label returns a Series!

In [66]:
# .iloc selects a row by integer position, even when the index labels are not numbers;
# position 0 is the first row ('A').
dataf.iloc[0]

One      1.482495
Two      0.961458
Three   -2.141212
Name: A, dtype: float64

In [69]:
# .loc can also pull out a subset: pass a list of row labels, then a list of column labels.
# The result follows the order given in the lists (here 'Two' comes before 'One').
dataf.loc[['A','C'],['Two','One']]

Unnamed: 0,Two,One
A,0.961458,1.482495
C,-1.467514,1.292765


In [70]:
# Comparing a DataFrame with a scalar works element-wise, like a NumPy array,
# and yields a DataFrame of booleans.
dataf > 0.5

Unnamed: 0,One,Two,Three
A,True,True,False
C,True,False,False


In [71]:
# Indexing with that boolean DataFrame keeps the frame's shape: entries where the
# condition is False show up as NaN (blank in the output) instead of being removed.
dataf[dataf > 0.5]

Unnamed: 0,One,Two,Three
A,1.482495,0.961458,
C,1.292765,,


In [74]:
# The same comparison on a single column returns a boolean Series, one value per row.
dataf['One'] > 3

A    False
C    False
Name: One, dtype: bool

In [77]:
# Indexing with a boolean Series filters rows: only rows where column 'One' satisfies
# the condition are returned. No value in 'One' exceeds 3, so the result is empty.
dataf[dataf['One'] > 3]

Unnamed: 0,One,Two,Three


In [79]:
# To combine conditions, use the element-wise operators & (and) / | (or) rather than
# Python's `and` / `or`, and wrap each comparison in parentheses.
# Every 'Three' value here is negative, so no row passes and the result is empty.
dataf[(dataf['One'] > 0.2) & (dataf['Three'] > 0.3)]

Unnamed: 0,One,Two,Three


In [80]:
# Replace the row labels with the default integer index (0, 1, ...).
# The result is not assigned and inplace is not passed, so this only displays the reset frame.
dataf.reset_index()

Unnamed: 0,index,One,Two,Three
0,A,1.482495,0.961458,-2.141212
1,C,1.292765,-1.467514,-0.494095


In [89]:
# (re: reset_index) the previous index labels end up in a regular column, and the
# change only sticks if we ask for it to be done in place

# An existing column can also become the index. First add a column holding
# the new labels, one per remaining row:
newind = ['Mari', 'Ani']
dataf['newind'] = newind
dataf

Unnamed: 0_level_0,One,Two,Three,newind
newind,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Mari,1.482495,0.961458,-2.141212,Mari
Ani,1.292765,-1.467514,-0.494095,Ani


In [90]:
# Promote the 'newind' column to be the index; inplace=True applies the change to dataf
# itself, and the column moves out of the data and into the index.
dataf.set_index('newind', inplace=True)
dataf

Unnamed: 0_level_0,One,Two,Three
newind,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Mari,1.482495,0.961458,-2.141212
Ani,1.292765,-1.467514,-0.494095


In [92]:
# the otherwise blank header row in the output above just holds the name of the index ('newind')!

In [11]:
# Outer index level: each group label repeated three times.
outside = [group for group in ('G1', 'G2') for _ in range(3)]
# Inner index level: the numbers 1..3, once per group.
inside = [num for _ in range(2) for num in (1, 2, 3)]

# Pair the two levels element-wise into (group, number) tuples.
hier_index = [(group, num) for group, num in zip(outside, inside)]

In [20]:
# Turn the list of (group, number) pairs into a pandas MultiIndex.
# Note: this rebinds hier_index from the list of tuples to the MultiIndex.
hier_index = pd.MultiIndex.from_tuples(hier_index)
hier_index

MultiIndex([('G1', 1),
            ('G1', 2),
            ('G1', 3),
            ('G2', 1),
            ('G2', 2),
            ('G2', 3)],
           )

In [25]:
# 6x2 frame of random draws whose rows are labelled by the two-level index.
df = pd.DataFrame(data=randn(6, 2), index=hier_index, columns=['A', 'B'])
df

Unnamed: 0,Unnamed: 1,A,B
G1,1,-0.169428,-0.316986
G1,2,0.795312,0.708911
G1,3,-1.530902,0.456714
G2,1,-2.513599,1.426188
G2,2,0.259005,0.790414
G2,3,0.293964,-0.349003


In [26]:
# KEY TAKEAWAY: a multi-level index basically lets us group rows.
# To reach a single row, index level by level:
# .loc['G1'] returns the sub-frame for group G1, then .loc[1] picks row 1 inside it.

df.loc['G1'].loc[1] # the row comes back as a Series

# KEY TAKEAWAY: chain the .loc calls, NOT double brackets!

A   -0.169428
B   -0.316986
Name: 1, dtype: float64

In [29]:
# Index levels can be given names: 'Groups' for the outer level, 'Num' for the inner one.
df.index.names = ['Groups', 'Num']
df

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B
Groups,Num,Unnamed: 2_level_1,Unnamed: 3_level_1
G1,1,-0.169428,-0.316986
G1,2,0.795312,0.708911
G1,3,-1.530902,0.456714
G2,1,-2.513599,1.426188
G2,2,0.259005,0.790414
G2,3,0.293964,-0.349003


In [36]:
# Grab a single cell: the (group, num) tuple addresses the row across both index
# levels and the second argument picks the column, all in one .loc call.
# Same value as chaining .loc['G2'].loc[2].loc['B'], without building the
# intermediate sub-frame and row Series.
df.loc[('G2', 2), 'B']

0.7904141293796607

In [40]:
# Get everything under G1.
# Only the last expression in a cell is displayed, so these two results are computed but not shown.
df.loc['G1']
df.xs('G1') # xs = cross-section; here it selects the same rows as .loc['G1']

# xs can also select across groups in one go:
df.xs(1, level='Num') # every row whose 'Num' level equals 1, one per group

Unnamed: 0_level_0,A,B
Groups,Unnamed: 1_level_1,Unnamed: 2_level_1
G1,-0.169428,-0.316986
G2,-2.513599,1.426188
