In [2]:
import pandas as pd
import numpy as np

# Seeded generator so the random values below are reproducible across runs.
gen = np.random.default_rng(42)

# Hierarchical indexing: a Series whose index has two levels
# (outer letter, inner integer), built from two parallel label arrays.
outer_labels = list('aaabbccdd')
inner_labels = [1, 2, 3, 1, 2, 1, 2, 1, 2]
two_level_index = pd.MultiIndex.from_arrays([outer_labels, inner_labels])
data = pd.Series(gen.uniform(size=9), index=two_level_index)
print(data)

a  1    0.773956
   2    0.438878
   3    0.858598
b  1    0.697368
   2    0.094177
c  1    0.975622
   2    0.761140
d  1    0.786064
   2    0.128114
dtype: float64


In [3]:
# Partial indexing: selecting on the outer level gives a lower-dimensional result
# Every entry stored under outer label 'a' (the inner level becomes the index)
print(data.loc['a'])

# Label slice on the outer level: everything from 'a' through 'c'
print(data.loc['a':'c'])

# unstack() moves the inner index level into the columns, producing a DataFrame;
# (outer, inner) combinations that are absent from the Series appear as NaN
data_df = data.unstack(level=-1)
print(data_df)

# Note: DataFrame.stack() is the inverse operation and turns such a frame back into a Series

1    0.773956
2    0.438878
3    0.858598
dtype: float64
a  1    0.773956
   2    0.438878
   3    0.858598
b  1    0.697368
   2    0.094177
c  1    0.975622
   2    0.761140
dtype: float64
          1         2         3
a  0.773956  0.438878  0.858598
b  0.697368  0.094177       NaN
c  0.975622  0.761140       NaN
d  0.786064  0.128114       NaN


In [4]:
# Hierarchical indexes on both axes of a DataFrame
row_levels = [list('aabb'),                   # outer row level
              [1, 2, 1, 2]]                   # inner row level
col_levels = [['Ohio', 'Ohio', 'Colorado'],   # outer column level
              ['Green', 'Red', 'Green']]      # inner column level
frame = pd.DataFrame(np.arange(12).reshape((4, 3)),
                     index=row_levels,
                     columns=col_levels)
print(frame)

# Name the levels of each axis, listed as [outer, inner]
frame.index = frame.index.set_names(['key1', 'key2'])
frame.columns = frame.columns.set_names(['state', 'color'])
print(frame)

     Ohio     Colorado
    Green Red    Green
a 1     0   1        2
  2     3   4        5
b 1     6   7        8
  2     9  10       11
state      Ohio     Colorado
color     Green Red    Green
key1 key2                   
a    1        0   1        2
     2        3   4        5
b    1        6   7        8
     2        9  10       11


In [5]:
# Partial indexing on a DataFrame with hierarchical axes
# Selecting the outer column label 'Ohio' leaves only its inner 'color' columns
print(frame['Ohio'])

# Selecting the outer row label 'a' leaves only its inner 'key2' rows
print(frame.xs('a'))

color      Green  Red
key1 key2            
a    1         0    1
     2         3    4
b    1         6    7
     2         9   10
state  Ohio     Colorado
color Green Red    Green
key2                    
1         0   1        2
2         3   4        5


In [6]:
# JOINING DATAFRAMES IN PANDAS

# pd.merge() / DataFrame.merge() performs an SQL-style join between two frames.

df1 = pd.DataFrame({
    'key': list('bbacaab'),
    'data1': pd.Series(range(7), dtype='Int64'),
})
df2 = pd.DataFrame({
    'key': list('abdb'),
    'data2': pd.Series(range(4), dtype='Int64'),
})

# Called without `on`, pd.merge(df1, df2) joins on the column the two frames
# have in common and keeps only the key values present in both.

# Name the join column explicitly with the `on` parameter
print(df1.merge(df2, on='key'))

# When the key column has a different name on each side,
# pass the two names separately via left_on / right_on
df3 = pd.DataFrame({
    'lkey': ['b', 'b', 'a', 'c', 'a', 'a', 'b'],
    'data1': pd.Series(range(7), dtype='Int64'),
})

df4 = pd.DataFrame({
    'rkey': ['a', 'b', 'd'],
    'data2': pd.Series(range(3), dtype='Int64'),
})
print(df3.merge(df4, left_on='lkey', right_on='rkey'))

  key  data1  data2
0   b      0      1
1   b      0      3
2   b      1      1
3   b      1      3
4   b      6      1
5   b      6      3
6   a      2      0
7   a      4      0
8   a      5      0
  lkey  data1 rkey  data2
0    b      0    b      1
1    b      1    b      1
2    b      6    b      1
3    a      2    a      0
4    a      4    a      0
5    a      5    a      0


<h2>Concatenation</h2>
<p>Concatenation lets you stack arrays, Series, or DataFrames together along an axis.</p>

In [7]:
# Concatenation with NumPy: glue arrays together along a chosen axis
arr = np.arange(12).reshape(3, 4)
print(arr)

# Joining the array to itself side by side (along axis 1) doubles the column count
arr2 = np.hstack((arr, arr))
print(arr2)

[[ 0  1  2  3]
 [ 4  5  6  7]
 [ 8  9 10 11]]
[[ 0  1  2  3  0  1  2  3]
 [ 4  5  6  7  4  5  6  7]
 [ 8  9 10 11  8  9 10 11]]


In [10]:
# Concatenating labelled data
# Three Series whose index labels do not overlap at all
s1 = pd.Series([0, 1], index=list('ab'), dtype='Int64')
s2 = pd.Series([2, 3, 4], index=list('cde'), dtype='Int64')
s3 = pd.Series([5, 6], index=list('fg'), dtype='Int64')
series_parts = [s1, s2, s3]

# Default axis (rows): the Series are placed end to end
s4 = pd.concat(series_parts)
print(s4)

# axis=1: each Series becomes its own column. join='inner' keeps only labels
# found in every input; these indexes share none, so the result has no rows.
s5 = pd.concat(series_parts, axis=1, join='inner')
print(s5)

a    0
b    1
c    2
d    3
e    4
f    5
g    6
dtype: Int64
Empty DataFrame
Columns: [0, 1, 2]
Index: []


In [12]:
# Keeping the combined pieces identifiable: each entry of `keys` labels one
# input and becomes the outer level of a hierarchical index on the result
piece_labels = ['one', 'two', 'three']
s6 = pd.concat([s1, s2, s3], keys=piece_labels)
print(s6)

# Concatenating DataFrames
df1 = pd.DataFrame(np.arange(6).reshape(3, 2),
                   index=['a', 'b', 'c'], columns=['one', 'two'])
df2 = pd.DataFrame(np.arange(5, 9).reshape(2, 2),
                   index=['a', 'c'], columns=['three', 'four'])
# axis=1 lines the frames up side by side on their row labels; `keys` adds an
# outer column level, and row 'b' (absent from df2) is filled with NaN there
df3 = pd.concat([df1, df2], axis=1, keys=['level1', 'level2'])
print(df3)

one    a    0
       b    1
two    c    2
       d    3
       e    4
three  f    5
       g    6
dtype: Int64
  level1     level2     
     one two  three four
a      0   1    5.0  6.0
b      2   3    NaN  NaN
c      4   5    7.0  8.0


In [None]:
# combine_first: fill the gaps in one Series with values from another,
# matching entries by index label rather than by position
a = pd.Series([np.nan, 2.5, 0.0, 3.5, 4.5, np.nan],
              index=['f', 'e', 'd', 'c', 'b', 'a'], dtype=float)
b = pd.Series([0, np.nan, 2.0, np.nan, np.nan, 5.0],
              index=['a', 'b', 'c', 'd', 'e', 'f'], dtype=float)
# Values of `a` take priority; `b` supplies a value wherever `a` is NaN
c = a.combine_first(b)
print(c)

In [14]:
# Reshaping data with stack and unstack

# stack: rotate the columns into the innermost level of the row index,
# giving a hierarchically indexed Series
position_labels = pd.Index(['one', 'two'], name='position')
rank_labels = pd.Index(['a', 'b', 'c'], name='rank')
data = pd.DataFrame(np.arange(6).reshape((2, 3)),
                    index=position_labels,
                    columns=rank_labels)
print(data)
print('Stacked version is:')
data2 = data.stack()
print(data2)

rank      a  b  c
position         
one       0  1  2
two       3  4  5
Stacked version is:
position  rank
one       a       0
          b       1
          c       2
two       a       3
          b       4
          c       5
dtype: int64


In [17]:
# unstack: take a hierarchically indexed Series and turn it into a DataFrame by
# moving one level of the MultiIndex out of the rows and into the columns.
# The level can be chosen by name as well as by position.
data3 = data2.unstack(level='position')
print('Data2 unstack on level 0 (same as unstack(level=position)):')
print(data3)

data4 = data2.unstack(level='rank')
print('Data2 unstack on level 1 (same as unstack on level=rank):')
print(data4)

Data2 unstack on level 0 (same as unstack(level=position)):
position  one  two
rank              
a           0    3
b           1    4
c           2    5
Data2 unstack on level 1 (same as unstack on level=rank):
rank      a  b  c
position         
one       0  1  2
two       3  4  5


In [26]:
# Unstacking a DataFrame that has a hierarchical row index
rank_position_index = pd.MultiIndex.from_arrays(
    [list('aabbcc'), [0, 1, 0, 1, 0, 1]],
    names=['rank', 'position'],
)
# Two columns side by side: 0..5 on the left, 5..10 on the right
left_right_values = np.column_stack((np.arange(6), np.arange(5, 11)))
df1 = pd.DataFrame(left_right_values,
                   index=rank_position_index,
                   columns=pd.Index(['left', 'right'], name='side'))
print(df1)

# Moving 'rank' out of the rows makes it the inner column level
print('df1 after unstacking on level rank')
df2 = df1.unstack(level='rank')
print(df2)

# Stacking 'rank' again returns it to the rows, now as the inner row level
print('df2 after stacking on level=rank')
df3 = df2.stack(level='rank')
print(df3)

side           left  right
rank position             
a    0            0      5
     1            1      6
b    0            2      7
     1            3      8
c    0            4      9
     1            5     10
df1 after unstacking on level rank
side     left       right       
rank        a  b  c     a  b   c
position                        
0           0  2  4     5  7   9
1           1  3  5     6  8  10
df2 after stacking on level=rank
side           left  right
position rank             
0        a        0      5
         b        2      7
         c        4      9
1        a        1      6
         b        3      8
         c        5     10


In [35]:
# Pivoting long to wide format: first build a long-format table to pivot later
timedf = pd.read_csv('pydata-book/examples/macrodata.csv')
timedf2 = timedf.reindex(columns=['year', 'quarter', 'realgdp', 'infl', 'unemp'])
print(timedf2.head())

# Combine the year and quarter columns into a single quarterly PeriodIndex.
# pop() removes each column from timedf2 as it is consumed.
years = timedf2.pop('year')
quarters = timedf2.pop('quarter')
if hasattr(pd.PeriodIndex, 'from_fields'):
    # pandas >= 2.2 deprecates passing year=/quarter= to the PeriodIndex
    # constructor; from_fields is the documented replacement. It takes no
    # `name` argument, so the name is attached afterwards.
    periods = pd.PeriodIndex.from_fields(year=years, quarter=quarters).rename('date')
else:
    # Older pandas has no from_fields; the constructor accepts the fields directly
    periods = pd.PeriodIndex(year=years, quarter=quarters, name='date')
print(periods)

# Index the rows by the timestamp at the start of each quarter
timedf2.index = periods.to_timestamp('D')
timedf2.columns.name = 'item'
print(timedf2.head())

# Long format: one row per (date, item) pair with the measurement in 'value'
ldata = timedf2.stack().reset_index().rename(columns={0: 'value'})
print(ldata.head())

   year  quarter   realgdp  infl  unemp
0  1959        1  2710.349  0.00    5.8
1  1959        2  2778.801  2.34    5.1
2  1959        3  2775.488  2.74    5.3
3  1959        4  2785.204  0.27    5.6
4  1960        1  2847.699  2.31    5.2
PeriodIndex(['1959Q1', '1959Q2', '1959Q3', '1959Q4', '1960Q1', '1960Q2',
             '1960Q3', '1960Q4', '1961Q1', '1961Q2',
             ...
             '2007Q2', '2007Q3', '2007Q4', '2008Q1', '2008Q2', '2008Q3',
             '2008Q4', '2009Q1', '2009Q2', '2009Q3'],
            dtype='period[Q-DEC]', name='date', length=203)
item         realgdp  infl  unemp
date                             
1959-01-01  2710.349  0.00    5.8
1959-04-01  2778.801  2.34    5.1
1959-07-01  2775.488  2.74    5.3
1959-10-01  2785.204  0.27    5.6
1960-01-01  2847.699  2.31    5.2
        date     item     value
0 1959-01-01  realgdp  2710.349
1 1959-01-01     infl     0.000
2 1959-01-01    unemp     5.800
3 1959-04-01  realgdp  2778.801
4 1959-04-01     infl     2.340


In [36]:
# Pivoting: spread the long-format rows back out into wide format, with one
# row per date and one column per item
wdata = pd.pivot(ldata, index='date', columns='item', values='value')
print('ldata conveted to wide format is:')
print(wdata)

ldata conveted to wide format is:
item        infl    realgdp  unemp
date                              
1959-01-01  0.00   2710.349    5.8
1959-04-01  2.34   2778.801    5.1
1959-07-01  2.74   2775.488    5.3
1959-10-01  0.27   2785.204    5.6
1960-01-01  2.31   2847.699    5.2
...          ...        ...    ...
2008-07-01 -3.16  13324.600    6.0
2008-10-01 -8.79  13141.920    6.9
2009-01-01  0.94  12925.410    8.1
2009-04-01  3.37  12901.504    9.2
2009-07-01  3.56  12990.341    9.6

[203 rows x 3 columns]


In [37]:
# Going from wide format to long format with melt
wide_rows = [('foo', 1, 5),
             ('bar', 2, 6),
             ('baz', 3, 7)]
wdf = pd.DataFrame(wide_rows, columns=['key', 'A', 'B'])
print('Wide Format is: ')
print(wdf)

# 'key' identifies each row; every other column is folded into a
# (variable, value) pair, one output row per original cell
print('Long Format is: ')
ldf = wdf.melt(id_vars='key')
print(ldf)

Wide Format is: 
   key  A  B
0  foo  1  5
1  bar  2  6
2  baz  3  7
Long Format is: 
   key variable  value
0  foo        A      1
1  bar        A      2
2  baz        A      3
3  foo        B      5
4  bar        B      6
5  baz        B      7
