# Pandas: Basics

In [417]:
import numpy as np
import pandas as pd

In [418]:
# Fix NumPy's global random seed so the np.random.randn calls further down
# produce the same values on every run.
np.random.seed(101)

In [419]:
# Sample values and the integer labels (10, 20, ..., 50) used to index them.
my_index = np.array(range(10, 60, 10))
my_data = ['Italy', 'Germany', 'Iran', 'Turkey', 'USA']

### pd.Series(data,index)

In [420]:
# Build a Series: values come from my_data, labels from my_index.
ser1 = pd.Series(data=my_data, index=my_index)
ser1

10      Italy
20    Germany
30       Iran
40     Turkey
50        USA
dtype: object

### Indexing: my_series[index]

In [421]:
# 20 is an index label here, not a position: this returns the value labelled 20.
ser1[20]

'Germany'

### pd.DataFrame(data,index,columns)

In [422]:
# Row labels A..E and column labels Col1..Col3 for a 5x3 frame of
# standard-normal random values.
ind1 = list('ABCDE')
col1 = [f'Col{k}' for k in range(1, 4)]
my_df = pd.DataFrame(data=np.random.randn(5, 3), index=ind1, columns=col1)
my_df

Unnamed: 0,Col1,Col2,Col3
A,2.70685,0.628133,0.907969
B,0.503826,0.651118,-0.319318
C,-0.848077,0.605965,-2.018168
D,0.740122,0.528813,-0.589001
E,0.188695,-0.758872,-0.933237


### Index: my_DataFrame.index
### Columns: my_DataFrame.columns

In [423]:
# Row labels of the frame.
my_df.index

Index(['A', 'B', 'C', 'D', 'E'], dtype='object')

In [424]:
# Column labels of the frame.
my_df.columns

Index(['Col1', 'Col2', 'Col3'], dtype='object')

### Selecting Columns:
### my_DataFrame [column]
### my_DataFrame [ [ List of Columns ] ] 
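#### my_DataFrame.column (attribute-style access, like SQL's `table.column`; not recommended — it breaks for column names containing spaces or clashing with DataFrame attributes)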

In [425]:
# A single column name inside [] returns that column as a Series.
my_df['Col1']

A    2.706850
B    0.503826
C   -0.848077
D    0.740122
E    0.188695
Name: Col1, dtype: float64

In [426]:
# Confirm the type of a single-column selection.
type(my_df['Col1'])

pandas.core.series.Series

In [427]:
# A list of column names inside [] returns a DataFrame with just those columns.
my_df[['Col1','Col2']]

Unnamed: 0,Col1,Col2
A,2.70685,0.628133
B,0.503826,0.651118
C,-0.848077,0.605965
D,0.740122,0.528813
E,0.188695,-0.758872


In [428]:
# Confirm the type of a list-of-columns selection.
type(my_df[['Col1','Col2']])

pandas.core.frame.DataFrame

### New Column

In [429]:
# Assigning to a column name that does not exist yet adds it as a new column;
# here it holds the element-wise sum of Col1 and Col2.
my_df['NEW Col'] = my_df['Col1'].add(my_df['Col2'])
my_df

Unnamed: 0,Col1,Col2,Col3,NEW Col
A,2.70685,0.628133,0.907969,3.334983
B,0.503826,0.651118,-0.319318,1.154944
C,-0.848077,0.605965,-2.018168,-0.242112
D,0.740122,0.528813,-0.589001,1.268936
E,0.188695,-0.758872,-0.933237,-0.570177


### Remove  a row or column
### my_DataFrame.drop(labels=row/column, axis=0/1, inplace=False/True)
`axis=0` drops rows, `axis=1` drops columns. Not in place unless `inplace=True` is specified!

In [430]:
# axis defaults to 0, so the labels are matched against the row index.
# The result is a new frame; my_df itself still contains rows A and B.
my_df.drop(['A','B'])

Unnamed: 0,Col1,Col2,Col3,NEW Col
C,-0.848077,0.605965,-2.018168,-0.242112
D,0.740122,0.528813,-0.589001,1.268936
E,0.188695,-0.758872,-0.933237,-0.570177


In [431]:
# axis=1 targets columns. inplace=True mutates my_df directly and returns None,
# which is why this cell displays no output.
my_df.drop('NEW Col', axis=1, inplace=True)

In [432]:
# Display the frame to confirm 'NEW Col' is gone.
my_df

Unnamed: 0,Col1,Col2,Col3
A,2.70685,0.628133,0.907969
B,0.503826,0.651118,-0.319318
C,-0.848077,0.605965,-2.018168
D,0.740122,0.528813,-0.589001
E,0.188695,-0.758872,-0.933237


### Selecting Rows:
### my_DataFrame.loc [ row ]
### my_DataFrame.loc [ [ list of row ] ]
### my_DataFrame.iloc [ row_index ]

In [433]:
# A single row label returns that row as a Series indexed by the column names.
my_df.loc['A']

Col1    2.706850
Col2    0.628133
Col3    0.907969
Name: A, dtype: float64

In [434]:
# A list of row labels returns a DataFrame containing those rows.
my_df.loc[['A','B']]

Unnamed: 0,Col1,Col2,Col3
A,2.70685,0.628133,0.907969
B,0.503826,0.651118,-0.319318


In [435]:
# Zero-based integer position 2 is the third row, labelled 'C'.
my_df.iloc[2]

Col1   -0.848077
Col2    0.605965
Col3   -2.018168
Name: C, dtype: float64

### Select a value: my_DataFrame.loc [ row , column ]

In [436]:
# A row label plus a column label returns a single scalar value.
my_df.loc['A','Col1']

2.706849839399938

In [437]:
# The same cell addressed by position: row 0, column 0.
my_df.iloc[0,0]

2.706849839399938

### Select a subset: my_DataFrame.loc [ [ list of rows ] , [ list of columns ] ]

In [438]:
# Wrapping each selector in a list keeps the result a (1x1) DataFrame
# instead of collapsing it to a scalar.
my_df.loc[['A'],['Col1']]

Unnamed: 0,Col1
A,2.70685


In [439]:
# Rows A and C, columns Col2 and Col3, all selected by label.
my_df.loc[['A','C'],['Col2','Col3']]

Unnamed: 0,Col2,Col3
A,0.628133,0.907969
C,0.605965,-2.018168


In [440]:
# Positional equivalent of the previous selection: rows 0 and 2,
# columns from position 1 onward.
my_df.iloc[[0,2],1:]

Unnamed: 0,Col2,Col3
A,0.628133,0.907969
C,0.605965,-2.018168


In [441]:
# Every second row by position, starting at 0 (positions 0, 2, 4 -> labels A, C, E).
my_df.iloc[::2]

Unnamed: 0,Col1,Col2,Col3
A,2.70685,0.628133,0.907969
C,-0.848077,0.605965,-2.018168
E,0.188695,-0.758872,-0.933237


### Conditional Selection

In [442]:
# Element-wise comparison: yields a DataFrame of booleans with the same shape.
my_df>0

Unnamed: 0,Col1,Col2,Col3
A,True,True,True
B,True,True,False
C,False,True,False
D,True,True,False
E,True,False,False


In [443]:
# Masking with a boolean frame keeps the shape; entries where the mask is
# False become missing values (shown blank in the output).
my_df[my_df>0]

Unnamed: 0,Col1,Col2,Col3
A,2.70685,0.628133,0.907969
B,0.503826,0.651118,
C,,0.605965,
D,0.740122,0.528813,
E,0.188695,,


In [444]:
# Rows where Col2 is positive, restricted to Col2 and Col3.
# A single .loc[row_mask, columns] call replaces the chained df[mask][cols]
# form: it does the selection in one step instead of building an intermediate
# frame, and stays valid if the expression is later used for assignment.
my_df.loc[my_df['Col2'] > 0, ['Col2', 'Col3']]

Unnamed: 0,Col2,Col3
A,0.628133,0.907969
B,0.651118,-0.319318
C,0.605965,-2.018168
D,0.528813,-0.589001


#### and --> &
#### or --> |

In [445]:
# Keep rows where both Col2 and Col1 are positive, then skip the first
# matching row and reverse the column order.
my_df[my_df['Col2'].gt(0) & my_df['Col1'].gt(0)].iloc[1:, ::-1]

Unnamed: 0,Col3,Col2,Col1
B,-0.319318,0.651118,0.503826
D,-0.589001,0.528813,0.740122


### my_DataFrame.reset_index(inplace=False/True)
### my_DataFrame.set_index(Column)

In [446]:
# Add a string column with one label per row (Day1 .. Day5).
my_df['NEW'] = [f'Day{n}' for n in range(1, 6)]
my_df

Unnamed: 0,Col1,Col2,Col3,NEW
A,2.70685,0.628133,0.907969,Day1
B,0.503826,0.651118,-0.319318,Day2
C,-0.848077,0.605965,-2.018168,Day3
D,0.740122,0.528813,-0.589001,Day4
E,0.188695,-0.758872,-0.933237,Day5


In [447]:
# Replace the current index with the default 0, 1, 2, ...; the old labels
# (A..E) are kept as an ordinary column named 'index'.
# inplace=True mutates my_df, so re-running this cell alters the frame again.
my_df.reset_index(inplace=True)

In [448]:
# Display the frame: integer index, with the former labels in the 'index' column.
my_df

Unnamed: 0,index,Col1,Col2,Col3,NEW
0,A,2.70685,0.628133,0.907969,Day1
1,B,0.503826,0.651118,-0.319318,Day2
2,C,-0.848077,0.605965,-2.018168,Day3
3,D,0.740122,0.528813,-0.589001,Day4
4,E,0.188695,-0.758872,-0.933237,Day5


In [449]:
# Promote the 'NEW' column to be the row index; the previous 0..4 index is
# discarded. inplace=True mutates my_df, so 'NEW' is no longer a column afterwards.
my_df.set_index('NEW',inplace=True)

In [450]:
# Display the frame: rows are now labelled Day1..Day5.
my_df

Unnamed: 0_level_0,index,Col1,Col2,Col3
NEW,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Day1,A,2.70685,0.628133,0.907969
Day2,B,0.503826,0.651118,-0.319318
Day3,C,-0.848077,0.605965,-2.018168
Day4,D,0.740122,0.528813,-0.589001
Day5,E,0.188695,-0.758872,-0.933237


### Multi-Index and Index Hierarchy

In [451]:
# Three parallel lists with one entry per row, giving that row's label at
# each level of the hierarchy (outermost first).
level_1 = ['G1'] * 5 + ['G2'] * 5
level_2 = ['SG1', 'SG1', 'SG2', 'SG2', 'SG2', 'SG1', 'SG1', 'SG1', 'SG2', 'SG2']
level_3 = [1, 2, 3, 4, 5, 1, 2, 3, 4, 5]
# Column labels 'A'..'J': list() splits the string into single characters.
cols = list('ABCDEFGHIJ')

In [452]:
# Combine the parallel lists element-wise into one
# (level_1, level_2, level_3) tuple per row.
my_hierarchy = [*zip(level_1, level_2, level_3)]

In [453]:
# Build a three-level hierarchical index from the list of per-row tuples.
hier_index=pd.MultiIndex.from_tuples(my_hierarchy)
hier_index

MultiIndex(levels=[['G1', 'G2'], ['SG1', 'SG2'], [1, 2, 3, 4, 5]],
           codes=[[0, 0, 0, 0, 0, 1, 1, 1, 1, 1], [0, 0, 1, 1, 1, 0, 0, 0, 1, 1], [0, 1, 2, 3, 4, 0, 1, 2, 3, 4]])

In [454]:
# 10x10 frame of standard-normal random values, with the three-level
# hierarchical index on the rows and A..J as column labels.
df = pd.DataFrame(data=np.random.randn(10, 10), index=hier_index, columns=cols)
df

Unnamed: 0,Unnamed: 1,Unnamed: 2,A,B,C,D,E,F,G,H,I,J
G1,SG1,1,0.955057,0.190794,1.978757,2.605967,0.683509,0.302665,1.693723,-1.706086,-1.159119,-0.134841
G1,SG1,2,0.390528,0.166905,0.184502,0.807706,0.07296,0.638787,0.329646,-0.497104,-0.75407,-0.943406
G1,SG2,3,0.484752,-0.116773,1.901755,0.238127,1.996652,-0.993263,0.1968,-1.136645,0.000366,1.025984
G1,SG2,4,-0.156598,-0.031579,0.649826,2.154846,-0.610259,-0.755325,-0.346419,0.147027,-0.479448,0.558769
G1,SG2,5,1.02481,-0.925874,1.862864,-1.133817,0.610478,0.38603,2.084019,-0.376519,0.230336,0.681209
G2,SG1,1,1.035125,-0.03116,1.939932,-1.005187,-0.74179,0.187125,-0.732845,-1.38292,1.482495,0.961458
G2,SG1,2,-2.141212,0.992573,1.192241,-1.04678,1.292765,-1.467514,-0.494095,-0.162535,0.485809,0.392489
G2,SG1,3,0.221491,-0.855196,1.54199,0.666319,-0.538235,-0.568581,1.407338,0.641806,-0.9051,-0.391157
G2,SG2,4,1.028293,-1.972605,-0.866885,0.720788,-1.223082,1.60678,-1.11571,-1.385379,-1.32966,0.04146
G2,SG2,5,-0.411055,-0.771329,0.110477,-0.804652,0.253548,0.649148,0.358941,-1.080471,0.902398,0.161781


In [455]:
# Column selection works as before; the resulting Series keeps all three index levels.
df['A']

G1  SG1  1    0.955057
         2    0.390528
    SG2  3    0.484752
         4   -0.156598
         5    1.024810
G2  SG1  1    1.035125
         2   -2.141212
         3    0.221491
    SG2  4    1.028293
         5   -0.411055
Name: A, dtype: float64

In [456]:
# A label from the outermost level selects all of that group's rows;
# the result is indexed by the two remaining levels.
df.loc['G1']

Unnamed: 0,Unnamed: 1,A,B,C,D,E,F,G,H,I,J
SG1,1,0.955057,0.190794,1.978757,2.605967,0.683509,0.302665,1.693723,-1.706086,-1.159119,-0.134841
SG1,2,0.390528,0.166905,0.184502,0.807706,0.07296,0.638787,0.329646,-0.497104,-0.75407,-0.943406
SG2,3,0.484752,-0.116773,1.901755,0.238127,1.996652,-0.993263,0.1968,-1.136645,0.000366,1.025984
SG2,4,-0.156598,-0.031579,0.649826,2.154846,-0.610259,-0.755325,-0.346419,0.147027,-0.479448,0.558769
SG2,5,1.02481,-0.925874,1.862864,-1.133817,0.610478,0.38603,2.084019,-0.376519,0.230336,0.681209


In [457]:
# Select the rows under group 'G1', subgroup 'SG1'. A tuple key addresses
# both outer levels in a single .loc lookup instead of chaining two .loc
# calls; the result is indexed by the remaining innermost level.
df.loc[('G1', 'SG1'), :]

Unnamed: 0,A,B,C,D,E,F,G,H,I,J
1,0.955057,0.190794,1.978757,2.605967,0.683509,0.302665,1.693723,-1.706086,-1.159119,-0.134841
2,0.390528,0.166905,0.184502,0.807706,0.07296,0.638787,0.329646,-0.497104,-0.75407,-0.943406


In [458]:
# Positional access ignores the hierarchy: this is the second row overall,
# and the result's name is its full (G1, SG1, 2) index tuple.
df.iloc[1]

A    0.390528
B    0.166905
C    0.184502
D    0.807706
E    0.072960
F    0.638787
G    0.329646
H   -0.497104
I   -0.754070
J   -0.943406
Name: (G1, SG1, 2), dtype: float64

In [459]:
# Take the 'G2' group by label, then its third row by position; the result's
# name (SG1, 3) shows the two index levels that remain after selecting 'G2'.
df.loc['G2'].iloc[2]

A    0.221491
B   -0.855196
C    1.541990
D    0.666319
E   -0.538235
F   -0.568581
G    1.407338
H    0.641806
I   -0.905100
J   -0.391157
Name: (SG1, 3), dtype: float64

In [460]:
# Column 'C' for the rows under group 'G1', subgroup 'SG1'. One .loc call with
# a (row-tuple, column) key replaces three chained lookups, avoiding the
# intermediate frames; the resulting Series is indexed by the innermost level.
df.loc[('G1', 'SG1'), 'C']

1    1.978757
2    0.184502
Name: C, dtype: float64

### Index Names: my_DataFrame.index.names

In [461]:
# The index levels have no names yet, so each entry is None.
df.index.names

FrozenList([None, None, None])

In [462]:
# Name the three index levels, outermost first.
df.index.names = ['Group','Subgroup','Numbers']

In [463]:
# Display the frame: the level names now appear as headers over the index columns.
df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,A,B,C,D,E,F,G,H,I,J
Group,Subgroup,Numbers,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
G1,SG1,1,0.955057,0.190794,1.978757,2.605967,0.683509,0.302665,1.693723,-1.706086,-1.159119,-0.134841
G1,SG1,2,0.390528,0.166905,0.184502,0.807706,0.07296,0.638787,0.329646,-0.497104,-0.75407,-0.943406
G1,SG2,3,0.484752,-0.116773,1.901755,0.238127,1.996652,-0.993263,0.1968,-1.136645,0.000366,1.025984
G1,SG2,4,-0.156598,-0.031579,0.649826,2.154846,-0.610259,-0.755325,-0.346419,0.147027,-0.479448,0.558769
G1,SG2,5,1.02481,-0.925874,1.862864,-1.133817,0.610478,0.38603,2.084019,-0.376519,0.230336,0.681209
G2,SG1,1,1.035125,-0.03116,1.939932,-1.005187,-0.74179,0.187125,-0.732845,-1.38292,1.482495,0.961458
G2,SG1,2,-2.141212,0.992573,1.192241,-1.04678,1.292765,-1.467514,-0.494095,-0.162535,0.485809,0.392489
G2,SG1,3,0.221491,-0.855196,1.54199,0.666319,-0.538235,-0.568581,1.407338,0.641806,-0.9051,-0.391157
G2,SG2,4,1.028293,-1.972605,-0.866885,0.720788,-1.223082,1.60678,-1.11571,-1.385379,-1.32966,0.04146
G2,SG2,5,-0.411055,-0.771329,0.110477,-0.804652,0.253548,0.649148,0.358941,-1.080471,0.902398,0.161781


### Cross Section: my_DataFrame.xs(key, level=level_name)

In [464]:
# Select every row whose 'Subgroup' level equals 'SG1', across all groups;
# the result is indexed by the remaining 'Group' and 'Numbers' levels.
df.xs('SG1',level='Subgroup')

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B,C,D,E,F,G,H,I,J
Group,Numbers,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
G1,1,0.955057,0.190794,1.978757,2.605967,0.683509,0.302665,1.693723,-1.706086,-1.159119,-0.134841
G1,2,0.390528,0.166905,0.184502,0.807706,0.07296,0.638787,0.329646,-0.497104,-0.75407,-0.943406
G2,1,1.035125,-0.03116,1.939932,-1.005187,-0.74179,0.187125,-0.732845,-1.38292,1.482495,0.961458
G2,2,-2.141212,0.992573,1.192241,-1.04678,1.292765,-1.467514,-0.494095,-0.162535,0.485809,0.392489
G2,3,0.221491,-0.855196,1.54199,0.666319,-0.538235,-0.568581,1.407338,0.641806,-0.9051,-0.391157
