In [1]:
import numpy as np
import pandas as pd

# Series

In [2]:
# A Series built from a plain list receives the default integer index 0..n-1
a = pd.Series(data=[34, 23, 56, 76, 212, 56, 776])
print("a={}".format(a))
index_part, value_part = a.index, a.values
print("Index={}, Values={}\n".format(index_part, value_part))

# With the default index, an integer key is simply the row label
fourth_item = a[3]
print("a[3]={}".format(fourth_item))
middle_items = a[2:6]  # end point excluded, as with numpy slices
print("Slicing is also same as in numpy a[2:6]={}".format(middle_items))

a=0     34
1     23
2     56
3     76
4    212
5     56
6    776
dtype: int64
Index=RangeIndex(start=0, stop=7, step=1), Values=[ 34  23  56  76 212  56 776]

a[3]=76
Slicing is also same as in numpy a[2:6]=2     56
3     76
4    212
5     56
dtype: int64


In [3]:
# Series object with custom (string) indices
a = pd.Series([34,23,56,76,212,56,776], index=["A","B","C","D","E","F","NA"])

print("a={}".format(a))
print("Index={}, Values={}\n".format(a.index, a.values))

# Accessing individual elements of a series with custom indices.
# Positional access goes through .iloc: an integer key passed to plain [] on a
# string-labelled Series only works through a positional fallback that pandas
# has deprecated, so it must not be relied on.
print("a.iloc[3]={}".format(a.iloc[3]))
print("a[NA]={}".format(a["NA"]))
# Positional slices exclude the end point (like numpy) ...
print("Slicing is also same as in numpy a.iloc[2:6]={}".format(a.iloc[2:6]))
# ... whereas label slices include BOTH end points ('B' through 'E').
print("Slicing is also same as in numpy a[B:E]={}".format(a["B":"E"]))

a=A      34
B      23
C      56
D      76
E     212
F      56
NA    776
dtype: int64
Index=Index(['A', 'B', 'C', 'D', 'E', 'F', 'NA'], dtype='object'), Values=[ 34  23  56  76 212  56 776]

a[3]=76
a[NA]=776
Slicing is also same as in numpy a[2:6]=C     56
D     76
E    212
F     56
dtype: int64
Slicing is also same as in numpy a[B:E]=B     23
C     56
D     76
E    212
dtype: int64


# DataFrame

In [4]:
# All complete columns share the same row labels, so define them once.
river_names = ["nile","ganga","indus","brahmaputra","yangtze","mekong","mississippi"]

corigin = pd.Series(["egypt","india","pakistan", "china","india", "tibet","us"], index=river_names)
length = pd.Series([10434,453445,2132,324325,1213,343,5435], index=river_names)
ncountries = pd.Series([3,6,2,8,9,2,1], index=river_names)
significance = pd.Series([10,9,8,7,6,5,4], index=river_names)
# Deliberately incomplete column: 'indus' and 'brahmaputra' are missing, so the
# DataFrame constructor fills those two rows with NaN after aligning on labels.
test = pd.Series([10,9,6,5,4], index=["nile","ganga","yangtze","mekong","mississippi"])

# The columns are aligned on their row labels when the frame is assembled.
rivers=pd.DataFrame({"Origin":corigin, "Length":length, "Num-Countries":ncountries, "Local-Significance":significance, "Test":test})
print(rivers)

# Accessing all the indices and values
print("Indices={}\nValues=\n{}\n".format(rivers.index, rivers.values))
print("rivers['Origin']=\n{}\n".format(rivers['Origin']))

# Access part of the dataframe (a label slice selects rows, end label included)
print("rivers['ganga':'nile']=\n{}\n".format(rivers['ganga':'nile']))

# Accessing via a boolean mask
print("rivers[rivers.Length > 343]=\n{}\n\n".format(rivers[rivers.Length > 343]))

# Accessing an entire row: .loc selects by row label directly, without
# transposing the whole mixed-dtype frame just to read one row.
print("row 'ganga' of rivers=\n{}".format(rivers.loc['ganga']))

               Origin  Length  Num-Countries  Local-Significance  Test
brahmaputra     china  324325              8                   7   NaN
ganga           india  453445              6                   9   9.0
indus        pakistan    2132              2                   8   NaN
mekong          tibet     343              2                   5   5.0
mississippi        us    5435              1                   4   4.0
nile            egypt   10434              3                  10  10.0
yangtze         india    1213              9                   6   6.0
Indices=Index(['brahmaputra', 'ganga', 'indus', 'mekong', 'mississippi', 'nile',
       'yangtze'],
      dtype='object')
Values=
[['china' 324325 8 7 nan]
 ['india' 453445 6 9 9.0]
 ['pakistan' 2132 2 8 nan]
 ['tibet' 343 2 5 5.0]
 ['us' 5435 1 4 4.0]
 ['egypt' 10434 3 10 10.0]
 ['india' 1213 9 6 6.0]]

rivers['Origin']=
brahmaputra       china
ganga             india
indus          pakistan
mekong            tibet
mississippi 

In [5]:
# 2x4 frame of random integers drawn from [1, 100), columns labelled A..D
random_cells = np.random.randint(low=1, high=100, size=(2, 4))
a = pd.DataFrame(data=random_cells, columns=["A", "B", "C", "D"])
a

Unnamed: 0,A,B,C,D
0,64,72,28,32
1,23,24,46,23


## Addition

In [6]:
# Arithmetic between frames aligns on row AND column labels first.
# Case 1: `b` has default integer column labels, so no column matches `a`
# and every cell of the sum is NaN.
a = pd.DataFrame(np.random.randint(low=1, high=20, size=(2, 3)), columns=['A1', 'A2', 'A3'])
b = pd.DataFrame(np.random.randint(low=1, high=20, size=(5, 3)))
mismatch_report = "a=\n{}\n\nb=\n{}\n\na+b=\n{}\n\n"
print(mismatch_report.format(a, b, a + b))

# Case 2: same column labels; rows 2-4 exist only in `b`, so `+` yields NaN
# there, while add(..., fill_value=0) treats the missing side as 0.
b = pd.DataFrame(np.random.randint(low=1, high=20, size=(5, 3)), columns=['A1', 'A2', 'A3'])
plain_sum = a + b
zero_filled_sum = a.add(b, fill_value=0)
print("b=\n{}\n\na+b=\n{}\n\na+b(with zero filling)=\n{}".format(b, plain_sum, zero_filled_sum))

a=
   A1  A2  A3
0   3  13  13
1   4   6   2

b=
    0   1   2
0  18  12   9
1  13  14  11
2   4  10   9
3   8   5   4
4   1  16  19

a+b=
   A1  A2  A3   0   1   2
0 NaN NaN NaN NaN NaN NaN
1 NaN NaN NaN NaN NaN NaN
2 NaN NaN NaN NaN NaN NaN
3 NaN NaN NaN NaN NaN NaN
4 NaN NaN NaN NaN NaN NaN


b=
   A1  A2  A3
0  13  10   5
1  12   3   9
2   8  15   1
3   2  11  18
4   3  18   3

a+b=
     A1    A2    A3
0  16.0  23.0  18.0
1  16.0   9.0  11.0
2   NaN   NaN   NaN
3   NaN   NaN   NaN
4   NaN   NaN   NaN

a+b(with zero filling)=
     A1    A2    A3
0  16.0  23.0  18.0
1  16.0   9.0  11.0
2   8.0  15.0   1.0
3   2.0  11.0  18.0
4   3.0  18.0   3.0


## From numpy arrays

In [7]:
# Wrap plain numpy arrays in DataFrames by supplying row and column labels
row_ids = ["ID1", "ID2", "ID3"]
a = pd.DataFrame(np.random.rand(3, 2), columns=["A", "B"], index=row_ids)
b = pd.DataFrame(np.random.randint(6, size=(3, 2)), columns=["A", "B"], index=row_ids)
print("a=\n{}\n\nb=\n{}\n".format(a, b))

# Plain [] on a DataFrame selects a COLUMN by label. This differs from numpy
# structured arrays, where integer indexing returns a row; row access on a
# DataFrame goes through .loc / .iloc instead.
column_a = a["A"]
print("a['A']=\n{}".format(column_a))

# A pandas Index is immutable: its entries can be read but not reassigned.
i = a.index
print("\nIndices of a=\n{}".format(i))
third_label = i[2]
print("Accessing an index is fine, i[2]={}".format(third_label))
# Assigning to an entry is rejected, e.g. the following raises an error:
#i[2] = "New Label"

a=
            A         B
ID1  0.945854  0.601032
ID2  0.522725  0.311315
ID3  0.311206  0.369053

b=
     A  B
ID1  5  4
ID2  0  1
ID3  2  1

a['A']=
ID1    0.945854
ID2    0.522725
ID3    0.311206
Name: A, dtype: float64

Indices of a=
Index(['ID1', 'ID2', 'ID3'], dtype='object')
Accessing an index is fine, i[2]=ID3


# Accessing using indexes: loc, iloc

In [8]:
# Series with explicit string labels
a = pd.Series([34, 23, 56, 76, 212, 56, 776], index=["A", "B", "C", "D", "E", "F", "NA"])
print(a)
# .loc looks up by label; .iloc looks up by integer position
# (the 'i' stands for integer-location, i.e. the implicit 0..n-1 position).
by_label = a.loc["C"]
by_position = a.iloc[2]
print("a.loc[C]={}, a.iloc[2]={}".format(by_label, by_position))

A      34
B      23
C      56
D      76
E     212
F      56
NA    776
dtype: int64
a.loc[C]=56, a.iloc[2]=56


# Dealing with NaN values

In [9]:
# Example-1: Series data
# NaN is a float, so the Series is created as float64 up front. Writing NaN
# into an int64 Series element by element only works through implicit
# upcasting, which recent pandas versions deprecate.
a = pd.Series(range(9), dtype="float64")
a.loc[[1, 3, 5, 7]] = np.nan  # blank out the odd labels in one vectorised assignment

print("a={}\n\na.dropna()={}\n\na.fillna(437)={}".format(a, a.dropna(), a.fillna(437)))


# Example-2: DataFrames
a = pd.DataFrame(np.random.randint(1,50,size=(4,6)), columns=['A', 'B', 'C', 'D', 'E', 'F'])
print("\n\na={}".format(a))
# Adds a new COLUMN labelled 2. The Series is aligned on the row labels of `a`:
# its entries for labels 4 and 5 have no matching row and are dropped, and the
# NaN at label 3 is carried into the frame.
a[2] = pd.Series([1, 5, 99, np.nan, 33, np.nan])
print("\n\na={}".format(a))

a=0    0.0
1    NaN
2    2.0
3    NaN
4    4.0
5    NaN
6    6.0
7    NaN
8    8.0
dtype: float64

a.dropna()=0    0.0
2    2.0
4    4.0
6    6.0
8    8.0
dtype: float64

a.fillna(437)=0      0.0
1    437.0
2      2.0
3    437.0
4      4.0
5    437.0
6      6.0
7    437.0
8      8.0
dtype: float64


a=    A   B   C   D   E   F
0  10   9   3  43  29   6
1  39  46   9  14  38  46
2   3   7  29  32  10   1
3  13  21  21  20  47   2


a=    A   B   C   D   E   F     2
0  10   9   3  43  29   6   1.0
1  39  46   9  14  38  46   5.0
2   3   7  29  32  10   1  99.0
3  13  21  21  20  47   2   NaN


# MultiIndex
## DataFrame

In [10]:
# Single-column DataFrame indexed by (state, year) pairs
index = [
    ('California', 2000), ('California', 2010),
    ('New York', 2000), ('New York', 2010),
    ('Texas', 2000), ('Texas', 2010),
]
populations = [33871648, 37253956, 18976457, 19378102, 20851820, 25145561]
newIndex = pd.MultiIndex.from_tuples(index)
pop = pd.DataFrame(data=populations, index=newIndex, columns=["Population"])

# Bracket access and attribute access return the same column
population_report = "pop=\n{}\n\npop[\"Population\"]=\n{}\n\npop.Population=\n{}"
print(population_report.format(pop, pop["Population"], pop.Population))

pop=
                 Population
California 2000    33871648
           2010    37253956
New York   2000    18976457
           2010    19378102
Texas      2000    20851820
           2010    25145561

pop["Population"]=
California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
Texas       2000    20851820
            2010    25145561
Name: Population, dtype: int64

pop.Population=
California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
Texas       2000    20851820
            2010    25145561
Name: Population, dtype: int64


In [11]:
# Passing a list of two label arrays as `index` builds a two-level row index
outer_labels = ['a', 'a', 'b', 'c']
inner_labels = [1, 2, 3, 4]
df = pd.DataFrame(
    np.random.rand(4, 2),
    index=[outer_labels, inner_labels],
    columns=['data1', 'data2'],
)
df

Unnamed: 0,Unnamed: 1,data1,data2
a,1,0.901866,0.862136
a,2,0.877617,0.911472
b,3,0.591885,0.182322
c,4,0.041379,0.973236


In [12]:
# from_arrays pairs the label lists element by element: one (top, lower)
# label pair per row and per column.
row_index = pd.MultiIndex.from_arrays(
    [["R1", "R2", "R3"], ["R1.1", "R1.2", "R2.3"]],
    names=["TopRow", "LowerRow"],
)
column_index = pd.MultiIndex.from_arrays(
    [["H1", "H2", "H3", "H4", "H5", "H6"], ["H1.1", "H1.2", "H1.3", "H1.4", "H1.5", "H1.6"]],
    names=["TopCol", "LowerCol"],
)
a = pd.DataFrame(np.random.randint(50, size=(3, 6)), index=row_index, columns=column_index)
a

Unnamed: 0_level_0,TopCol,H1,H2,H3,H4,H5,H6
Unnamed: 0_level_1,LowerCol,H1.1,H1.2,H1.3,H1.4,H1.5,H1.6
TopRow,LowerRow,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
R1,R1.1,44,37,38,14,47,14
R2,R1.2,15,43,33,30,47,26
R3,R2.3,31,34,38,4,47,1


In [13]:
# from_product builds every combination of the given label lists:
# 3 x 2 row labels and 5 x 2 column labels.
row_index = pd.MultiIndex.from_product(
    [["R1", "R2", "R3"], [0, 1]],
    names=["TopRow", "LowerRow"],
)
column_index = pd.MultiIndex.from_product(
    [["H1", "H2", "H3", "H4", "H5"], ['Foo', 'Doo']],
    names=["TopCol", "LowerCol"],
)
a = pd.DataFrame(np.random.randint(60, size=(6, 10)), index=row_index, columns=column_index)
a

Unnamed: 0_level_0,TopCol,H1,H1,H2,H2,H3,H3,H4,H4,H5,H5
Unnamed: 0_level_1,LowerCol,Foo,Doo,Foo,Doo,Foo,Doo,Foo,Doo,Foo,Doo
TopRow,LowerRow,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2
R1,0,21,39,46,51,59,38,48,12,57,41
R1,1,31,12,13,55,5,20,54,28,35,22
R2,0,0,31,15,51,32,5,51,6,44,56
R2,1,27,16,16,30,53,23,5,30,2,24
R3,0,41,32,0,50,46,14,30,5,27,43
R3,1,28,33,49,23,47,45,40,34,40,10


In [14]:
# Select the ('H2', 'Doo') column with a single tuple key instead of chaining
# two [] lookups, which builds an intermediate sub-frame first.
# Note: the resulting Series is named by the full tuple ('H2', 'Doo').
a[("H2", "Doo")]

TopRow  LowerRow
R1      0           51
        1           55
R2      0           51
        1           30
R3      0           50
        1           23
Name: Doo, dtype: int32

## Stacking, reset_index

* axis has the same meaning as in the case of numpy arrays
* Levels are a new concept used in multi-indexed pandas objects:
        * level=0: The upper most level of hierarchy
        * level=1: The hierarchy level one below the upper most one 
        * level=n: The nth level of hierarchy relative to the topmost item
* To represent something uniquely we need the axis and the level at the same time. 
        * Example: axis=0 and level=0, will represent the top most rows
        * Example: axis=1 and level=0, will represent the top most columns
* One can also represent the level in terms of its name. Example: 'TopCol' represents level=0, 'LowerCol' represents level=1.

In [30]:
# Pivot the outermost row level ('TopRow': R1, R2, R3) into the columns;
# naming the level is equivalent to level=0 here.
a.unstack(level="TopRow")

TopCol,H1,H1,H1,H1,H1,H1,H2,H2,H2,H2,...,H4,H4,H4,H4,H5,H5,H5,H5,H5,H5
LowerCol,Foo,Foo,Foo,Doo,Doo,Doo,Foo,Foo,Foo,Doo,...,Foo,Doo,Doo,Doo,Foo,Foo,Foo,Doo,Doo,Doo
TopRow,R1,R2,R3,R1,R2,R3,R1,R2,R3,R1,...,R3,R1,R2,R3,R1,R2,R3,R1,R2,R3
LowerRow,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3,Unnamed: 14_level_3,Unnamed: 15_level_3,Unnamed: 16_level_3,Unnamed: 17_level_3,Unnamed: 18_level_3,Unnamed: 19_level_3,Unnamed: 20_level_3,Unnamed: 21_level_3
0,21,0,41,39,31,32,46,15,0,51,...,30,12,6,5,57,44,27,41,56,43
1,31,27,28,12,16,33,13,16,49,55,...,40,28,30,34,35,2,40,22,24,10


In [16]:
# Pivot the inner row level ('LowerRow': 0 and 1) into the columns;
# naming the level is equivalent to level=1 here.
a.unstack(level="LowerRow")

TopCol,H1,H1,H1,H1,H2,H2,H2,H2,H3,H3,H3,H3,H4,H4,H4,H4,H5,H5,H5,H5
LowerCol,Foo,Foo,Doo,Doo,Foo,Foo,Doo,Doo,Foo,Foo,Doo,Doo,Foo,Foo,Doo,Doo,Foo,Foo,Doo,Doo
LowerRow,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1
TopRow,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3,Unnamed: 14_level_3,Unnamed: 15_level_3,Unnamed: 16_level_3,Unnamed: 17_level_3,Unnamed: 18_level_3,Unnamed: 19_level_3,Unnamed: 20_level_3
R1,21,31,39,12,46,13,51,55,59,5,38,20,48,54,12,28,57,35,41,22
R2,0,27,31,16,15,16,51,30,32,53,5,23,51,5,6,30,44,2,56,24
R3,41,28,32,33,0,49,50,23,46,47,14,45,30,40,5,34,27,40,43,10


In [17]:
# reset_index() turns the row-index levels into ordinary columns and leaves a
# default integer row index behind.
b = a.reset_index()
print(a)
# In the printed result, 'TopRow' and 'LowerRow' have no 'LowerCol' label:
# they came from the row index, which never had a second column level.
print(b)

TopCol           H1      H2      H3      H4      H5    
LowerCol        Foo Doo Foo Doo Foo Doo Foo Doo Foo Doo
TopRow LowerRow                                        
R1     0         21  39  46  51  59  38  48  12  57  41
       1         31  12  13  55   5  20  54  28  35  22
R2     0          0  31  15  51  32   5  51   6  44  56
       1         27  16  16  30  53  23   5  30   2  24
R3     0         41  32   0  50  46  14  30   5  27  43
       1         28  33  49  23  47  45  40  34  40  10
TopCol   TopRow LowerRow  H1      H2      H3      H4      H5    
LowerCol                 Foo Doo Foo Doo Foo Doo Foo Doo Foo Doo
0            R1        0  21  39  46  51  59  38  48  12  57  41
1            R1        1  31  12  13  55   5  20  54  28  35  22
2            R2        0   0  31  15  51  32   5  51   6  44  56
3            R2        1  27  16  16  30  53  23   5  30   2  24
4            R3        0  41  32   0  50  46  14  30   5  27  43
5            R3        1  28  33  49  23 

In [18]:
# 'TopRow' has no 'Foo' sub-column after the reset, so b['TopRow']['Foo'] would
# raise an error. Row label 0 of the column is reachable, though:
b['TopRow'].loc[0]

'R1'

In [21]:
# Promote 'TopRow' and 'LowerRow' back to row-index levels; the result should
# match the original frame `a`.
row_level_columns = ['TopRow', 'LowerRow']
b.set_index(row_level_columns)
# b.set_index(['H1', 'H2']) --> raises an error (most likely because those
# columns still carry the lower labels 'Foo' and 'Doo')

Unnamed: 0_level_0,TopCol,H1,H1,H2,H2,H3,H3,H4,H4,H5,H5
Unnamed: 0_level_1,LowerCol,Foo,Doo,Foo,Doo,Foo,Doo,Foo,Doo,Foo,Doo
TopRow,LowerRow,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2
R1,0,21,39,46,51,59,38,48,12,57,41
R1,1,31,12,13,55,5,20,54,28,35,22
R2,0,0,31,15,51,32,5,51,6,44,56
R2,1,27,16,16,30,53,23,5,30,2,24
R3,0,41,32,0,50,46,14,30,5,27,43
R3,1,28,33,49,23,47,45,40,34,40,10


In [23]:
# Display the original frame `a` again for comparison with the set_index result.
a

Unnamed: 0_level_0,TopCol,H1,H1,H2,H2,H3,H3,H4,H4,H5,H5
Unnamed: 0_level_1,LowerCol,Foo,Doo,Foo,Doo,Foo,Doo,Foo,Doo,Foo,Doo
TopRow,LowerRow,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2
R1,0,21,39,46,51,59,38,48,12,57,41
R1,1,31,12,13,55,5,20,54,28,35,22
R2,0,0,31,15,51,32,5,51,6,44,56
R2,1,27,16,16,30,53,23,5,30,2,24
R3,0,41,32,0,50,46,14,30,5,27,43
R3,1,28,33,49,23,47,45,40,34,40,10


In [24]:
# Mean over the rows that share each 'TopRow' label.
# The `level=` argument of DataFrame.mean was removed in pandas 2.0; grouping
# on the index level is the supported spelling. sort=False keeps the groups in
# their order of appearance.
a.groupby(level='TopRow', sort=False).mean()

TopCol,H1,H1,H2,H2,H3,H3,H4,H4,H5,H5
LowerCol,Foo,Doo,Foo,Doo,Foo,Doo,Foo,Doo,Foo,Doo
TopRow,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
R1,26.0,25.5,29.5,53.0,32.0,29.0,51.0,20.0,46.0,31.5
R2,13.5,23.5,15.5,40.5,42.5,14.0,28.0,18.0,23.0,40.0
R3,34.5,32.5,24.5,36.5,46.5,29.5,35.0,19.5,33.5,26.5


In [27]:
# For every row, average the columns that share each 'LowerCol' label
# (all 'Foo' columns together, all 'Doo' columns together).
# DataFrame.mean(level=...) was removed in pandas 2.0, so group on the column
# level instead: transpose, group by the level, then transpose back.
# sort=False keeps the original Foo, Doo column order.
a.T.groupby(level='LowerCol', sort=False).mean().T

Unnamed: 0_level_0,LowerCol,Foo,Doo
TopRow,LowerRow,Unnamed: 2_level_1,Unnamed: 3_level_1
R1,0,46.2,36.2
R1,1,27.6,27.4
R2,0,28.4,29.8
R2,1,20.6,24.6
R3,0,28.8,28.8
R3,1,40.8,29.0


In [28]:
# Same as grouping on 'LowerCol': column level 1 addressed by position.
# DataFrame.mean(level=...) was removed in pandas 2.0, so group the transposed
# frame on the level and transpose back; sort=False keeps the Foo, Doo order.
a.T.groupby(level=1, sort=False).mean().T

Unnamed: 0_level_0,LowerCol,Foo,Doo
TopRow,LowerRow,Unnamed: 2_level_1,Unnamed: 3_level_1
R1,0,46.2,36.2
R1,1,27.6,27.4
R2,0,28.4,29.8
R2,1,20.6,24.6
R3,0,28.8,28.8
R3,1,40.8,29.0


In [29]:
# For every row, average the 'Foo'/'Doo' pair under each top-level column
# (column level 0, 'TopCol').
# DataFrame.mean(level=...) was removed in pandas 2.0, so group the transposed
# frame on the level and transpose back; sort=False keeps the H1..H5 order.
a.T.groupby(level=0, sort=False).mean().T

Unnamed: 0_level_0,TopCol,H1,H2,H3,H4,H5
TopRow,LowerRow,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
R1,0,30.0,48.5,48.5,30.0,49.0
R1,1,21.5,34.0,12.5,41.0,28.5
R2,0,15.5,33.0,18.5,28.5,50.0
R2,1,21.5,23.0,38.0,17.5,13.0
R3,0,36.5,25.0,30.0,17.5,35.0
R3,1,30.5,36.0,46.0,37.0,25.0


## Concatenation

In [31]:
# Display the frame `a` that the concatenation examples are based on.
a

Unnamed: 0_level_0,TopCol,H1,H1,H2,H2,H3,H3,H4,H4,H5,H5
Unnamed: 0_level_1,LowerCol,Foo,Doo,Foo,Doo,Foo,Doo,Foo,Doo,Foo,Doo
TopRow,LowerRow,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2
R1,0,21,39,46,51,59,38,48,12,57,41
R1,1,31,12,13,55,5,20,54,28,35,22
R2,0,0,31,15,51,32,5,51,6,44,56
R2,1,27,16,16,30,53,23,5,30,2,24
R3,0,41,32,0,50,46,14,30,5,27,43
R3,1,28,33,49,23,47,45,40,34,40,10


In [38]:
# `a1` and `a2` are not defined anywhere else in the notebook, so this cell
# failed on a fresh kernel. They are rebuilt here as two row blocks of `a`.
# NOTE(review): the split is inferred from the recorded outputs (the R1/R2
# rows come from a1, the R3 rows from a2) - confirm it matches the original.
a1 = a.iloc[:4]  # rows under TopRow R1 and R2
a2 = a.iloc[4:]  # rows under TopRow R3
pd.concat([a1, a2]) # Default, axis=0 (meaning concatenation along rows)

Unnamed: 0_level_0,TopCol,H1,H1,H2,H2,H3,H3,H4,H4,H5,H5
Unnamed: 0_level_1,LowerCol,Foo,Doo,Foo,Doo,Foo,Doo,Foo,Doo,Foo,Doo
TopRow,LowerRow,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2
R1,0,21,39,46,51,59,38,48,12,57,41
R1,1,31,12,13,55,5,20,54,28,35,22
R2,0,0,31,15,51,32,5,51,6,44,56
R2,1,27,16,16,30,53,23,5,30,2,24
R3,0,41,32,0,50,46,14,30,5,27,43
R3,1,28,33,49,23,47,45,40,34,40,10


In [40]:
# Side-by-side concatenation: the frames are aligned on their row labels, and
# a row label missing from one frame is filled with NaN on that side.
pd.concat([a1, a2], axis="columns")

Unnamed: 0_level_0,TopCol,H1,H1,H2,H2,H3,H3,H4,H4,H5,H5,H1,H1,H2,H2,H3,H3,H4,H4,H5,H5
Unnamed: 0_level_1,LowerCol,Foo,Doo,Foo,Doo,Foo,Doo,Foo,Doo,Foo,Doo,Foo,Doo,Foo,Doo,Foo,Doo,Foo,Doo,Foo,Doo
TopRow,LowerRow,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
R1,0,21.0,39.0,46.0,51.0,59.0,38.0,48.0,12.0,57.0,41.0,,,,,,,,,,
R1,1,31.0,12.0,13.0,55.0,5.0,20.0,54.0,28.0,35.0,22.0,,,,,,,,,,
R2,0,0.0,31.0,15.0,51.0,32.0,5.0,51.0,6.0,44.0,56.0,,,,,,,,,,
R2,1,27.0,16.0,16.0,30.0,53.0,23.0,5.0,30.0,2.0,24.0,,,,,,,,,,
R3,0,,,,,,,,,,,41.0,32.0,0.0,50.0,46.0,14.0,30.0,5.0,27.0,43.0
R3,1,,,,,,,,,,,28.0,33.0,49.0,23.0,47.0,45.0,40.0,34.0,40.0,10.0


## Series

In [None]:
# Series keyed by (state, year) pairs
index = [
    ('California', 2000), ('California', 2010),
    ('New York', 2000), ('New York', 2010),
    ('Texas', 2000), ('Texas', 2015),
]
populations = [33871648, 37253956, 18976457, 19378102, 20851820, 25145561]
newIndex = pd.MultiIndex.from_tuples(index)
pop = pd.Series(data=populations, index=newIndex)
series_report = "pop=\n{}\n\npop[:,2015]=\n{}\n\npop.unstack()=\n{}\n"
print(series_report.format(pop, pop[:,2015], pop.unstack()))

# Build a DataFrame around the Series; the plain list is matched to the rows by position
under18_counts = [9267089, 9284094, 4687374, 4318033, 5906301, 6879014]
pop_df = pd.DataFrame({'total': pop, 'under18': under18_counts})
print("pop_df=\n{}\n".format(pop_df))
# Plain [] on a DataFrame looks the key up among the COLUMNS, so
# pop_df['Texas', 2015] fails; a row is reached with pop_df.loc[('Texas', 2015)].
# On the Series, the same tuple key addresses a single element directly:
texas_2015 = pop['Texas',2015]
print("pop['Texas',2015]=\n{}\n".format(texas_2015))
# Partial indexing: every state for one year
print("pop[:,2010]=\n{}\n".format(pop[:,2010]))
# Label slices on the outer level, in both directions
print("pop['Texas':'California']=\n{}\n".format(pop['Texas':'California']))
print("pop['California':'Texas']=\n{}\n".format(pop['California':'Texas']))