In [1]:
import pandas as pd
import numpy as np

# Hierarchical indexing: a Series whose index has two levels.
# Seeded generator so the random values are the same on every run.
gen = np.random.default_rng(42)

outer_keys = list('aaabbccdd')              # outer (first) index level
inner_keys = [1, 2, 3, 1, 2, 1, 2, 1, 2]    # inner (second) index level

# Passing a list of two equal-length lists as `index` builds the two-level index.
data = pd.Series(gen.uniform(size=9), index=[outer_keys, inner_keys])
print(data)

a  1    0.773956
   2    0.438878
   3    0.858598
b  1    0.697368
   2    0.094177
c  1    0.975622
   2    0.761140
d  1    0.786064
   2    0.128114
dtype: float64


In [4]:
# Partial indexing: selecting on the outer level only.
# First selection: the rows under outer label 'a' (a Series keyed by the inner level).
# Second selection: every row whose outer label lies in the range 'a' through 'c'.
for selection in (data['a'], data['a':'c']):
    print(selection)

# unstack() pivots the inner index level into columns, giving a DataFrame;
# (outer, inner) combinations that do not exist show up as NaN.
data_df = data.unstack()
print(data_df)

# The reverse conversion (DataFrame back to a two-level Series) is done with stack().

1    0.773956
2    0.438878
3    0.858598
dtype: float64
a  1    0.773956
   2    0.438878
   3    0.858598
b  1    0.697368
   2    0.094177
c  1    0.975622
   2    0.761140
dtype: float64
          1         2         3
a  0.773956  0.438878  0.858598
b  0.697368  0.094177       NaN
c  0.975622  0.761140       NaN
d  0.786064  0.128114       NaN


In [7]:
# Hierarchical indexes on both axes of a DataFrame.
row_labels = [list('aabb'),                     # outer row level
              [1, 2, 1, 2]]                     # inner row level
column_labels = [['Ohio', 'Ohio', 'Colorado'],  # outer column level
                 ['Green', 'Red', 'Green']]     # inner column level

frame = pd.DataFrame(
    np.arange(12).reshape(4, 3),
    index=row_labels,
    columns=column_labels,
)
print(frame)

# Give the levels of each axis a name; each list is ordered [outer, inner].
frame.columns.names = ['state', 'color']
frame.index.names = ['key1', 'key2']
print(frame)

     Ohio     Colorado
    Green Red    Green
a 1     0   1        2
  2     3   4        5
b 1     6   7        8
  2     9  10       11
state      Ohio     Colorado
color     Green Red    Green
key1 key2                   
a    1        0   1        2
     2        3   4        5
b    1        6   7        8
     2        9  10       11


In [9]:
# Partial indexing on a DataFrame with hierarchical axes.
# Selecting the outer column label 'Ohio' keeps only the columns grouped under it.
ohio_columns = frame.loc[:, 'Ohio']
print(ohio_columns)

# Selecting the outer row label 'a' keeps only the rows grouped under it.
rows_under_a = frame.loc['a']
print(rows_under_a)

color      Green  Red
key1 key2            
a    1         0    1
     2         3    4
b    1         6    7
     2         9   10
state  Ohio     Colorado
color Green Red    Green
key2                    
1         0   1        2
2         3   4        5


In [16]:
# JOINING DATAFRAMES IN PANDAS
#
# pd.merge() performs an SQL-style join between two DataFrames.

df1 = pd.DataFrame({
    'key': list('bbacaab'),
    'data1': pd.Series(range(7), dtype='Int64'),
})
df2 = pd.DataFrame({
    'key': list('abdb'),
    'data2': pd.Series(range(4), dtype='Int64'),
})

# Called as pd.merge(df1, df2) with no further arguments, the join is made on
# the column the two frames have in common and only key values present in
# both frames are kept.

# Pass `on` to state the join column explicitly.
print(pd.merge(df1, df2, on='key'))

# When the join columns are named differently in each frame, name them
# separately with `left_on` and `right_on`.
df3 = pd.DataFrame({
    'lkey': list('bbacaab'),
    'data1': pd.Series(range(7), dtype='Int64'),
})
df4 = pd.DataFrame({
    'rkey': list('abd'),
    'data2': pd.Series(range(3), dtype='Int64'),
})
print(pd.merge(df3, df4, left_on='lkey', right_on='rkey'))

  key  data1  data2
0   b      0      1
1   b      0      3
2   b      1      1
3   b      1      3
4   b      6      1
5   b      6      3
6   a      2      0
7   a      4      0
8   a      5      0
  lkey  data1 rkey  data2
0    b      0    b      1
1    b      1    b      1
2    b      6    b      1
3    a      2    a      0
4    a      4    a      0
5    a      5    a      0


<h2>Concatenation</h2>
<p>Concatenation allows you to stack tables along an axis.</p>