# Pandas

### 1. Indexing

In [4]:
import pandas as pd
import numpy as np

In [5]:
# Seed NumPy's global generator so the random demo frame is identical on every
# Restart & Run All; without a seed each run produces different numbers.
np.random.seed(0)
# Row index: eight consecutive daily timestamps starting 2000-01-01.
dates = pd.date_range('1/1/2000', periods=8)
# 8x4 frame of standard-normal draws, one column per letter A-D.
df = pd.DataFrame(np.random.randn(8, 4),
    index=dates, columns=['A', 'B', 'C', 'D'])
df

Unnamed: 0,A,B,C,D
2000-01-01,-0.247783,-0.241313,-0.103228,1.22307
2000-01-02,-0.708817,0.92418,0.126232,-2.099197
2000-01-03,0.242145,-0.300782,0.023417,-1.681101
2000-01-04,0.854024,0.742149,-1.434302,0.371415
2000-01-05,-1.398726,-0.264851,0.624474,-1.448959
2000-01-06,-0.217082,-0.267047,0.656395,-0.41146
2000-01-07,2.81364,-0.2369,-0.089739,1.42046
2000-01-08,0.703742,0.078719,0.472809,0.344106


In [6]:
# Pull column 'A' out as a Series, then look up one value by its index label
# (dates[5] is the sixth timestamp of the index, 2000-01-06).
s=df['A']
s[dates[5]]

-0.21708175315663325

#### swapping

In [8]:
# Assigning through plain [] with a list of columns writes the right-hand
# values positionally, so the contents of columns A and B trade places.
df[['B', 'A']] = df[['A', 'B']]
df

Unnamed: 0,A,B,C,D
2000-01-01,-0.241313,-0.247783,-0.103228,1.22307
2000-01-02,0.92418,-0.708817,0.126232,-2.099197
2000-01-03,-0.300782,0.242145,0.023417,-1.681101
2000-01-04,0.742149,0.854024,-1.434302,0.371415
2000-01-05,-0.264851,-1.398726,0.624474,-1.448959
2000-01-06,-0.267047,-0.217082,0.656395,-0.41146
2000-01-07,-0.2369,2.81364,-0.089739,1.42046
2000-01-08,0.078719,0.703742,0.472809,0.344106


In [9]:
# When a Series or DataFrame is assigned through .loc (and .iloc), pandas aligns it
# with the target on ALL axes. Column labels are therefore matched up before any
# value is written, so this statement leaves df unchanged (A and B are not swapped).
df.loc[:, ['B', 'A']] = df[['A', 'B']]
df

Unnamed: 0,A,B,C,D
2000-01-01,-0.241313,-0.247783,-0.103228,1.22307
2000-01-02,0.92418,-0.708817,0.126232,-2.099197
2000-01-03,-0.300782,0.242145,0.023417,-1.681101
2000-01-04,0.742149,0.854024,-1.434302,0.371415
2000-01-05,-0.264851,-1.398726,0.624474,-1.448959
2000-01-06,-0.267047,-0.217082,0.656395,-0.41146
2000-01-07,-0.2369,2.81364,-0.089739,1.42046
2000-01-08,0.078719,0.703742,0.472809,0.344106


In [10]:
# The correct way to swap column values through .loc is to assign the raw values:
# the NumPy array carries no column labels, so nothing is re-aligned and the
# values are written in positional order.
df.loc[:, ['B', 'A']] = df[['A', 'B']].to_numpy()
df

Unnamed: 0,A,B,C,D
2000-01-01,-0.247783,-0.241313,-0.103228,1.22307
2000-01-02,-0.708817,0.92418,0.126232,-2.099197
2000-01-03,0.242145,-0.300782,0.023417,-1.681101
2000-01-04,0.854024,0.742149,-1.434302,0.371415
2000-01-05,-1.398726,-0.264851,0.624474,-1.448959
2000-01-06,-0.217082,-0.267047,0.656395,-0.41146
2000-01-07,2.81364,-0.2369,-0.089739,1.42046
2000-01-08,0.703742,0.078719,0.472809,0.344106


#### Attribute access

In [11]:
# Attribute-style access: reads column 'A' as a Series, the same data as df['A'].
df.A

2000-01-01   -0.247783
2000-01-02   -0.708817
2000-01-03    0.242145
2000-01-04    0.854024
2000-01-05   -1.398726
2000-01-06   -0.217082
2000-01-07    2.813640
2000-01-08    0.703742
Freq: D, Name: A, dtype: float64

In [12]:
# Add an integer column E that numbers the rows 0, 1, 2, ... in index order.
df['E'] = list(range(len(df)))
df

Unnamed: 0,A,B,C,D,E
2000-01-01,-0.247783,-0.241313,-0.103228,1.22307,0
2000-01-02,-0.708817,0.92418,0.126232,-2.099197,1
2000-01-03,0.242145,-0.300782,0.023417,-1.681101,2
2000-01-04,0.854024,0.742149,-1.434302,0.371415,3
2000-01-05,-1.398726,-0.264851,0.624474,-1.448959,4
2000-01-06,-0.217082,-0.267047,0.656395,-0.41146,5
2000-01-07,2.81364,-0.2369,-0.089739,1.42046,6
2000-01-08,0.703742,0.078719,0.472809,0.344106,7


#### Slicing ranges

In [13]:
# Slicing with [] works on rows: a step of 2 keeps every second row.
df[::2]

Unnamed: 0,A,B,C,D,E
2000-01-01,-0.247783,-0.241313,-0.103228,1.22307,0
2000-01-03,0.242145,-0.300782,0.023417,-1.681101,2
2000-01-05,-1.398726,-0.264851,0.624474,-1.448959,4
2000-01-07,2.81364,-0.2369,-0.089739,1.42046,6


In [14]:
# A step of -1 returns all rows in reverse order.
df[::-1]

Unnamed: 0,A,B,C,D,E
2000-01-08,0.703742,0.078719,0.472809,0.344106,7
2000-01-07,2.81364,-0.2369,-0.089739,1.42046,6
2000-01-06,-0.217082,-0.267047,0.656395,-0.41146,5
2000-01-05,-1.398726,-0.264851,0.624474,-1.448959,4
2000-01-04,0.854024,0.742149,-1.434302,0.371415,3
2000-01-03,0.242145,-0.300782,0.023417,-1.681101,2
2000-01-02,-0.708817,0.92418,0.126232,-2.099197,1
2000-01-01,-0.247783,-0.241313,-0.103228,1.22307,0


In [20]:
# The first four rows (positions 0 through 3).
df[:4]

Unnamed: 0,A,B,C,D,E
2000-01-01,-0.247783,-0.241313,-0.103228,1.22307,0
2000-01-02,-0.708817,0.92418,0.126232,-2.099197,1
2000-01-03,0.242145,-0.300782,0.023417,-1.681101,2
2000-01-04,0.854024,0.742149,-1.434302,0.371415,3


In [21]:
# Rows at positions 2 and 3; the stop position (4) is excluded.
df[2:4]

Unnamed: 0,A,B,C,D,E
2000-01-03,0.242145,-0.300782,0.023417,-1.681101,2
2000-01-04,0.854024,0.742149,-1.434302,0.371415,3


In [28]:
# Purely positional selection: rows at positions 2-3 and the first two columns (A, B).
df.iloc[2:4,:2]

Unnamed: 0,A,B
2000-01-03,0.242145,-0.300782
2000-01-04,0.854024,0.742149


In [30]:
# Label-based slice on the date index using date strings; unlike the positional
# slices above, both endpoints (2000-01-03 and 2000-01-06) are included.
df.loc['20000103':'20000106']

Unnamed: 0,A,B,C,D,E
2000-01-03,0.242145,-0.300782,0.023417,-1.681101,2
2000-01-04,0.854024,0.742149,-1.434302,0.371415,3
2000-01-05,-1.398726,-0.264851,0.624474,-1.448959,4
2000-01-06,-0.217082,-0.267047,0.656395,-0.41146,5


In [37]:
# Seed the global generator so df1 -- and every selection made from it in the
# following cells, several of which depend on the sign of the values -- is
# reproducible across kernel restarts.
np.random.seed(1)
# 6x4 frame of standard-normal draws with string row labels a-f and columns A-D.
df1 = pd.DataFrame(np.random.randn(6, 4),
                   index=list('abcdef'), columns=list('ABCD'))
# .loc with a list of row labels returns exactly those rows, in the listed order.
df1.loc[['a','d','f'],:]

Unnamed: 0,A,B,C,D
a,1.241567,-0.501064,0.239784,-0.602825
d,1.337925,-0.542097,-0.304777,-1.697849
f,0.255263,2.252035,1.663554,0.283796


In [38]:
# Label slices on both axes: rows from 'd' to the end, columns 'A' through 'C'
# (the end label 'C' is included).
df1.loc['d':, 'A':'C']

Unnamed: 0,A,B,C
d,1.337925,-0.542097,-0.304777
e,1.536916,1.338247,-2.857636
f,0.255263,2.252035,1.663554


In [39]:
# Boolean column selection: df1.loc['a'] > 0 is a mask over the columns, so only
# the columns whose value in row 'a' is positive are kept (all rows are returned).
df1.loc[:, df1.loc['a'] > 0]

Unnamed: 0,A,C
a,1.241567,0.239784
b,0.67815,0.151768
c,0.251065,-1.273973
d,1.337925,-0.304777
e,1.536916,-2.857636
f,0.255263,1.663554


In [40]:
# One row label plus one column label returns a single scalar value.
df1.loc['a', 'A']

1.2415667241100148

In [42]:
# A callable passed to .loc is called with the frame being indexed and must
# return a valid indexer; here it builds a boolean mask that keeps the rows
# whose B value is positive.
df1.loc[lambda frame: frame['B'] > 0, :]

Unnamed: 0,A,B,C,D
e,1.536916,1.338247,-2.857636,1.177048
f,0.255263,2.252035,1.663554,0.283796


In [43]:
# The callable may also supply the column indexer; this one ignores its
# argument and always selects columns A and B.
df1.loc[:, lambda _frame: ['A', 'B']]

Unnamed: 0,A,B
a,1.241567,-0.501064
b,0.67815,-0.452368
c,0.251065,-0.31067
d,1.337925,-0.542097
e,1.536916,1.338247
f,0.255263,2.252035


In [45]:
# NOTE(review): the frame returned by reindex() is neither assigned nor displayed,
# so this call has no visible effect -- the df1 shown by the next line still has
# its original rows a-f. The requested labels (1, 3, 9, ...) are not among df1's
# row labels. TODO confirm whether the reindexed result was meant to be shown.
df1.reindex([1,3,9,27,81,100])
df1

Unnamed: 0,A,B,C,D
a,1.241567,-0.501064,0.239784,-0.602825
b,0.67815,-0.452368,0.151768,2.904643
c,0.251065,-0.31067,-1.273973,-1.736657
d,1.337925,-0.542097,-0.304777,-1.697849
e,1.536916,1.338247,-2.857636,1.177048
f,0.255263,2.252035,1.663554,0.283796


## 2. Merge, Join, Concat

In [46]:
def _labelled_frame(first, last):
    """Build a demo frame with integer row labels first..last (inclusive).

    The frame has columns A-D and every cell holds its column letter followed
    by its row label, e.g. 'B5'.
    """
    row_labels = list(range(first, last + 1))
    cells = {letter: [letter + str(label) for label in row_labels]
             for letter in 'ABCD'}
    return pd.DataFrame(cells, index=row_labels)


# Three frames with identical columns and consecutive, non-overlapping row labels.
df1 = _labelled_frame(0, 3)
df2 = _labelled_frame(4, 7)
df3 = _labelled_frame(8, 11)

# With the default axis=0, concat stacks the frames on top of each other.
frames = [df1, df2, df3]
result = pd.concat(frames)
result

Unnamed: 0,A,B,C,D
0,A0,B0,C0,D0
1,A1,B1,C1,D1
2,A2,B2,C2,D2
3,A3,B3,C3,D3
4,A4,B4,C4,D4
5,A5,B5,C5,D5
6,A6,B6,C6,D6
7,A7,B7,C7,D7
8,A8,B8,C8,D8
9,A9,B9,C9,D9


##### pd.concat
    (objs, axis=0, join='outer', ignore_index=False, keys=None,
          levels=None, names=None, verify_integrity=False, copy=True)

In [49]:
 # keys= labels each input frame, producing a two-level row index whose outer
 # level is 'x', 'y' or 'z' and whose inner level is the original row label.
 result = pd.concat(frames, keys=['x', 'y', 'z'])
 result

Unnamed: 0,Unnamed: 1,A,B,C,D
x,0,A0,B0,C0,D0
x,1,A1,B1,C1,D1
x,2,A2,B2,C2,D2
x,3,A3,B3,C3,D3
y,4,A4,B4,C4,D4
y,5,A5,B5,C5,D5
y,6,A6,B6,C6,D6
y,7,A7,B7,C7,D7
z,8,A8,B8,C8,D8
z,9,A9,B9,C9,D9


In [50]:
# Selecting an outer key of the two-level index returns that group's rows,
# indexed by their original labels (4-7).
result.loc['y']

Unnamed: 0,A,B,C,D
4,A4,B4,C4,D4
5,A5,B5,C5,D5
6,A6,B6,C6,D6
7,A7,B7,C7,D7


In [51]:
 # df4 overlaps df1 only partially: it shares row labels 2 and 3 and column
 # names B and D, and adds rows 6, 7 and column F.
 df4 = pd.DataFrame(
     {
         'B': ['B2', 'B3', 'B6', 'B7'],
         'D': ['D2', 'D3', 'D6', 'D7'],
         'F': ['F2', 'F3', 'F6', 'F7'],
     },
     index=[2, 3, 6, 7],
 )
 # axis=1 places the frames side by side; the default is an outer join on the
 # index, so every row label from either frame appears in the result.
 result = pd.concat([df1, df4], axis=1, sort=False)
 result

Unnamed: 0,A,B,C,D,B.1,D.1,F
0,A0,B0,C0,D0,,,
1,A1,B1,C1,D1,,,
2,A2,B2,C2,D2,B2,D2,F2
3,A3,B3,C3,D3,B3,D3,F3
6,,,,,B6,D6,F6
7,,,,,B7,D7,F7


In [52]:
# join='inner' keeps only the row labels present in both frames (2 and 3).
pd.concat([df1, df4], axis=1, join='inner')

Unnamed: 0,A,B,C,D,B.1,D.1,F
2,A2,B2,C2,D2,B2,D2,F2
3,A3,B3,C3,D3,B3,D3,F3


In [54]:
# Join the columns side by side, then keep only df1's row labels (0-3) by
# reindexing the result on df1.index.
pd.concat([df1, df4], axis=1).reindex(df1.index)

Unnamed: 0,A,B,C,D,B.1,D.1,F
0,A0,B0,C0,D0,,,
1,A1,B1,C1,D1,,,
2,A2,B2,C2,D2,B2,D2,F2
3,A3,B3,C3,D3,B3,D3,F3


In [55]:
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0.
# pd.concat is the equivalent: it stacks df2's rows under df1's and returns a
# new frame, leaving both inputs untouched.
pd.concat([df1, df2])

Unnamed: 0,A,B,C,D
0,A0,B0,C0,D0
1,A1,B1,C1,D1
2,A2,B2,C2,D2
3,A3,B3,C3,D3
4,A4,B4,C4,D4
5,A5,B5,C5,D5
6,A6,B6,C6,D6
7,A7,B7,C7,D7


In [56]:
 # DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat
 # is the equivalent. The frames' columns differ, so the result has the union
 # (A, B, C, D, F) with missing cells where a frame lacks a column; sort=False
 # keeps the columns in first-seen order. Row labels 2 and 3 occur in both
 # inputs and therefore appear twice in the result.
 pd.concat([df1, df4], sort=False)

Unnamed: 0,A,B,C,D,F
0,A0,B0,C0,D0,
1,A1,B1,C1,D1,
2,A2,B2,C2,D2,
3,A3,B3,C3,D3,
2,,B2,,D2,F2
3,,B3,,D3,F3
6,,B6,,D6,F6
7,,B7,,D7,F7


In [57]:
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0.
# Appending a list of frames is the same as concatenating them all in order.
pd.concat([df1, df2, df3])

Unnamed: 0,A,B,C,D
0,A0,B0,C0,D0
1,A1,B1,C1,D1
2,A2,B2,C2,D2
3,A3,B3,C3,D3
4,A4,B4,C4,D4
5,A5,B5,C5,D5
6,A6,B6,C6,D6
7,A7,B7,C7,D7
8,A8,B8,C8,D8
9,A9,B9,C9,D9


##### pd.merge
    (left, right, how='inner', on=None, left_on=None, right_on=None,
         left_index=False, right_index=False, sort=True,
         suffixes=('_x', '_y'), copy=True, indicator=False,
         validate=None)

In [66]:
# Two frames sharing the join column 'Key' (only 'K3' occurs in both) and a
# non-key column 'C' that exists on each side.
left = pd.DataFrame(
    {
        'Key': ['K0', 'K1', 'K2', 'K3'],
        'B': ['B0', 'B1', 'B2', 'B3'],
        'C': ['C0', 'C1', 'C2', 'C3'],
    }
)

right = pd.DataFrame(
    {
        'Key': ['K3', 'K4', 'K5', 'K6'],
        'C': ['C4', 'C5', 'C6', 'C7'],
        'D': ['D4', 'D5', 'D6', 'D7'],
    }
)

# how='inner' is the default, so only keys found in both frames survive; the
# clashing 'C' columns come back as C_x (from left) and C_y (from right).
pd.merge(left, right, on='Key')

Unnamed: 0,Key,B,C_x,C_y,D
0,K3,B3,C3,C4,D4


In [67]:
# how='outer' keeps every key found in either frame; cells with no match on the
# other side are left missing (shown blank in the output below).
pd.merge(left, right, on='Key',how='outer')

Unnamed: 0,Key,B,C_x,C_y,D
0,K0,B0,C0,,
1,K1,B1,C1,,
2,K2,B2,C2,,
3,K3,B3,C3,C4,D4
4,K4,,,C5,D5
5,K5,,,C6,D6
6,K6,,,C7,D7
