In [1]:
import numpy as np
import pandas as pd

from IPython.display import display, HTML

# Series

In [2]:
# A Series built from a plain list receives the default RangeIndex (0 .. n-1).
a = pd.Series([34, 23, 56, 76, 212, 56, 776])
print("a={}".format(a))
series_index, series_values = a.index, a.values
print("Index={}, Values={}\n".format(series_index, series_values))

# With the default index, the label 3 and the position 3 refer to the same element.
fourth_item = a[3]
print("a[3]={}".format(fourth_item))
window = a[2:6]  # the stop position is excluded, as with numpy slices
print("Slicing is also same as in numpy a[2:6]={}".format(window))

a=0     34
1     23
2     56
3     76
4    212
5     56
6    776
dtype: int64
Index=RangeIndex(start=0, stop=7, step=1), Values=[ 34  23  56  76 212  56 776]

a[3]=76
Slicing is also same as in numpy a[2:6]=2     56
3     76
4    212
5     56
dtype: int64


In [3]:
# Series object with custom (string) indices.
# "NA" is an ordinary string label here, not a missing-value marker.
a = pd.Series([34,23,56,76,212,56,776], index=["A","B","C","D","E","F","NA"])

print("a={}".format(a))
print("Index={}, Values={}\n".format(a.index, a.values))

# Accessing individual elements of a series with custom indices.
# Positional access goes through .iloc: a plain a[3] on a non-integer index relies on
# the positional fallback of Series.__getitem__, which recent pandas versions deprecate.
print("a[3]={}".format(a.iloc[3]))
# Label access: looks up the element whose index label is the string "NA".
print("a[NA]={}".format(a["NA"]))
# Positional slice: the stop position (6) is excluded.
print("Slicing is also same as in numpy a[2:6]={}".format(a.iloc[2:6]))
# Label slice: unlike a positional slice, BOTH end labels ("B" and "E") are included.
print("Slicing is also same as in numpy a[B:E]={}".format(a.loc["B":"E"]))

a=A      34
B      23
C      56
D      76
E     212
F      56
NA    776
dtype: int64
Index=Index(['A', 'B', 'C', 'D', 'E', 'F', 'NA'], dtype='object'), Values=[ 34  23  56  76 212  56 776]

a[3]=76
a[NA]=776
Slicing is also same as in numpy a[2:6]=C     56
D     76
E    212
F     56
dtype: int64
Slicing is also same as in numpy a[B:E]=B     23
C     56
D     76
E    212
dtype: int64


# DataFrame

In [4]:
# Build a DataFrame from several Series that share (most of) the same row labels.
# The label list is defined once so that all full-length columns are guaranteed to agree.
river_names = ["nile","ganga","indus","brahmaputra","yangtze","mekong","mississippi"]
corigin = pd.Series(["egypt","india","pakistan", "china","india", "tibet","us"], index=river_names)
length = pd.Series([10434,453445,2132,324325,1213,343,5435], index=river_names)
ncountries = pd.Series([3,6,2,8,9,2,1], index=river_names)
significance = pd.Series([10,9,8,7,6,5,4], index=river_names)
# Notice the insufficient data in this column: "indus" and "brahmaputra" are missing,
# so the DataFrame constructor fills those two cells with NaN after aligning on the labels.
test = pd.Series([10,9,6,5,4], index=["nile","ganga","yangtze","mekong","mississippi"])

rivers=pd.DataFrame({"Origin":corigin, "Length":length, "Num-Countries":ncountries, "Local-Significance":significance, "Test":test})
print(rivers)

# Accessing all the indices and values
print("Indices={}\nValues=\n{}\n".format(rivers.index, rivers.values))
# Plain [] with a single label selects a COLUMN of the DataFrame.
print("rivers['Origin']=\n{}\n".format(rivers['Origin']))

# Access part of the dataframe: a label slice over the ROWS (both end labels included).
# .loc makes it explicit that the slice addresses row labels rather than columns.
print("rivers['ganga':'nile']=\n{}\n".format(rivers.loc['ganga':'nile']))

# Accessing via a boolean mask: keep only the rows whose Length exceeds 343.
print("rivers[rivers.Length > 343]=\n{}\n\n".format(rivers[rivers.Length > 343]))

# Accessing an entire row: .loc selects the row by label directly, without having to
# transpose the whole frame first.
print("row 'ganga' of rivers=\n{}".format(rivers.loc['ganga']))

               Origin  Length  Num-Countries  Local-Significance  Test
brahmaputra     china  324325              8                   7   NaN
ganga           india  453445              6                   9   9.0
indus        pakistan    2132              2                   8   NaN
mekong          tibet     343              2                   5   5.0
mississippi        us    5435              1                   4   4.0
nile            egypt   10434              3                  10  10.0
yangtze         india    1213              9                   6   6.0
Indices=Index(['brahmaputra', 'ganga', 'indus', 'mekong', 'mississippi', 'nile',
       'yangtze'],
      dtype='object')
Values=
[['china' 324325 8 7 nan]
 ['india' 453445 6 9 9.0]
 ['pakistan' 2132 2 8 nan]
 ['tibet' 343 2 5 5.0]
 ['us' 5435 1 4 4.0]
 ['egypt' 10434 3 10 10.0]
 ['india' 1213 9 6 6.0]]

rivers['Origin']=
brahmaputra       china
ganga             india
indus          pakistan
mekong            tibet
mississippi 

In [5]:
# 2x4 frame of random integers drawn from [1, 100), with columns labelled A-D.
random_block = np.random.randint(1, 100, size=(2, 4))
a = pd.DataFrame(random_block, columns=["A", "B", "C", "D"])
a

Unnamed: 0,A,B,C,D
0,5,75,98,29
1,33,52,62,14


## Addition

In [6]:
# Case 1: the operands have different column labels ('A1'.. vs 0..2). Addition aligns on
# both row and column labels, so no cell has a partner and the result is entirely NaN.
a = pd.DataFrame(np.random.randint(1, 20, size=(2, 3)), columns=['A1', 'A2', 'A3'])
b = pd.DataFrame(np.random.randint(1, 20, size=(5, 3)))
misaligned_sum = a + b
print("a=\n{}\n\nb=\n{}\n\na+b=\n{}\n\n".format(a, b, misaligned_sum))

# Case 2: the column labels match but `b` has more rows. Rows present in only one operand
# become NaN with `+`; DataFrame.add(fill_value=0) treats the missing operand as 0 instead.
b = pd.DataFrame(np.random.randint(1, 20, size=(5, 3)), columns=['A1', 'A2', 'A3'])
aligned_sum = a + b
filled_sum = a.add(b, fill_value=0)
print("b=\n{}\n\na+b=\n{}\n\na+b(with zero filling)=\n{}".format(b, aligned_sum, filled_sum))

a=
   A1  A2  A3
0  11  10  17
1  13  17  14

b=
    0   1   2
0  16   5   1
1  10  12  19
2  16  19  12
3   9   5   2
4   9  11   6

a+b=
   A1  A2  A3   0   1   2
0 NaN NaN NaN NaN NaN NaN
1 NaN NaN NaN NaN NaN NaN
2 NaN NaN NaN NaN NaN NaN
3 NaN NaN NaN NaN NaN NaN
4 NaN NaN NaN NaN NaN NaN


b=
   A1  A2  A3
0  18  11  13
1   4  16  12
2   1  16   8
3  17  11  16
4  12  18  12

a+b=
     A1    A2    A3
0  29.0  21.0  30.0
1  17.0  33.0  26.0
2   NaN   NaN   NaN
3   NaN   NaN   NaN
4   NaN   NaN   NaN

a+b(with zero filling)=
     A1    A2    A3
0  29.0  21.0  30.0
1  17.0  33.0  26.0
2   1.0  16.0   8.0
3  17.0  11.0  16.0
4  12.0  18.0  12.0


## From numpy arrays

In [7]:
# Two frames built from numpy arrays that share the same row and column labels.
row_ids = ["ID1", "ID2", "ID3"]
a = pd.DataFrame(np.random.rand(3, 2), columns=["A", "B"], index=row_ids)
b = pd.DataFrame(np.random.randint(6, size=(3, 2)), columns=["A", "B"], index=row_ids)
print("a=\n{}\n\nb=\n{}\n".format(a, b))

# Plain [] on a DataFrame selects a column by label. This differs from numpy structured
# arrays, where integer indexing returns a row.
column_a = a["A"]
print("a['A']=\n{}".format(column_a))

# A pandas Index can be read like an array but is immutable.
i = a.index
print("\nIndices of a=\n{}".format(i))
print("Accessing an index is fine, i[2]={}".format(i[2]))
# Assigning to an element is not allowed; uncommenting the next line raises an error:
#i[2] = "New Label"

a=
            A         B
ID1  0.352926  0.946224
ID2  0.136489  0.023194
ID3  0.500360  0.246473

b=
     A  B
ID1  3  5
ID2  5  1
ID3  1  0

a['A']=
ID1    0.352926
ID2    0.136489
ID3    0.500360
Name: A, dtype: float64

Indices of a=
Index(['ID1', 'ID2', 'ID3'], dtype='object')
Accessing an index is fine, i[2]=ID3


# Accessing using indexes: loc, iloc

In [70]:
# ---------------------------------------------------------------------------
# Series
# ---------------------------------------------------------------------------
a = pd.Series([34, 23, 56, 76, 212, 56, 776], index=[11, 22, 33, 44, 55, 66, None])
print(a, '\n')
# 'loc'  --> explicit indexing: looks elements up by their index LABEL
# 'iloc' --> implicit indexing: looks elements up by their integer POSITION (0-based)
by_label = a.loc[33]
by_position = a.iloc[2]
print("Explicit indexing as defined by the index field: a.loc[33]={}".format(by_label))
print("Implicit indexing of the DF: a.iloc[2]={}".format(by_position))


# ---------------------------------------------------------------------------
# DataFrame
# ---------------------------------------------------------------------------
np.random.seed(32)  # fixed seed so the random frame is the same on every run
b = pd.DataFrame(
    np.random.randint(1, 100, size=(4, 6)),
    index=['A', 'B', 'C', 'D'],
    columns=['Ht', 'Wt', 'Gt', 'Ag', 'Qp', 'Zi'],
)
# .loc accepts a row label, a boolean row mask, and optionally a column label or a
# list of column labels as the second argument.
report_template = (
    "b=\n{}\n\n"
    " b.loc['B']=\n{}\n\n"
    " b.loc[b['Gt'] > 80]=\n{} \n\n"
    " b.loc[b['Gt'] > 80, 'Zi']=\n{} \n\n"
    " b.loc[b['Gt'] > 80, ['Zi', 'Wt']]=\n{}"
)
print(report_template.format(
    b,
    b.loc['B'],
    b.loc[b['Gt'] > 80],
    b.loc[b['Gt'] > 80, 'Zi'],
    b.loc[b['Gt'] > 80, ['Zi', 'Wt']],
))

11      34
22      23
33      56
44      76
55     212
66      56
NaN    776
dtype: int64 

Explicit indexing as defined by the index field: a.loc[33]=56
Implicit indexing of the DF: a.iloc[2]=56
b=
   Ht  Wt  Gt  Ag  Qp  Zi
A  88  44   6  55  63  89
B  20  72  90   4  10   5
C  12  82   4  66  35  36
D  25  43  83  11  85  51

 b.loc['B']=
Ht    20
Wt    72
Gt    90
Ag     4
Qp    10
Zi     5
Name: B, dtype: int32

 b.loc[b['Gt'] > 80]=
   Ht  Wt  Gt  Ag  Qp  Zi
B  20  72  90   4  10   5
D  25  43  83  11  85  51 

 b.loc[b['Gt'] > 80, 'Zi']=
B     5
D    51
Name: Zi, dtype: int32 

 b.loc[b['Gt'] > 80, ['Zi', 'Wt']]=
   Zi  Wt
B   5  72
D  51  43


# Dealing with NaN values

In [9]:
# Example-1: Series data
# Blank out every odd position; storing NaN turns the integer Series into float64.
a = pd.Series(range(9))
for idx in range(1, 8, 2):
    a[idx] = np.nan

without_nan = a.dropna()      # drops the NaN entries
with_default = a.fillna(437)  # replaces each NaN with 437
print("a={}\n\na.dropna()={}\n\na.fillna(437)={}".format(a, without_nan, with_default))


# Example-2: DataFrames
a = pd.DataFrame(np.random.randint(1, 50, size=(4, 6)), columns=list('ABCDEF'))
print(a, '\n')
# Adds a new column whose label is the integer 2. The Series is aligned on the row index,
# so only its entries for rows 0-3 are used; the entry for row 3 is NaN.
a[2] = pd.Series([1, 5, 99, np.nan, 33, np.nan])
print(a, '\n')

a=0    0.0
1    NaN
2    2.0
3    NaN
4    4.0
5    NaN
6    6.0
7    NaN
8    8.0
dtype: float64

a.dropna()=0    0.0
2    2.0
4    4.0
6    6.0
8    8.0
dtype: float64

a.fillna(437)=0      0.0
1    437.0
2      2.0
3    437.0
4      4.0
5    437.0
6      6.0
7    437.0
8      8.0
dtype: float64
    A   B   C   D   E   F
0  10  37  15  29  15  40
1  13  30   9  19  32   7
2  21  19  22  46  44  44
3  29  22  40  18  29  40 

    A   B   C   D   E   F     2
0  10  37  15  29  15  40   1.0
1  13  30   9  19  32   7   5.0
2  21  19  22  46  44  44  99.0
3  29  22  40  18  29  40   NaN 



# MultiIndex
## DataFrame

In [10]:
# Single-column DataFrame whose rows are addressed by a two-level (state, year) index.
index = [
    ('California', 2000), ('California', 2010),
    ('New York', 2000), ('New York', 2010),
    ('Texas', 2000), ('Texas', 2010),
]
populations = [33871648, 37253956, 18976457, 19378102, 20851820, 25145561]
newIndex = pd.MultiIndex.from_tuples(index)
pop = pd.DataFrame(populations, index=newIndex, columns=["Population"])

# The column can be read either with [] or as an attribute; both print the same Series.
by_key = pop["Population"]
by_attribute = pop.Population
print("pop=\n{}\n\npop[\"Population\"]=\n{}\n\npop.Population=\n{}".format(pop, by_key, by_attribute))

pop=
                 Population
California 2000    33871648
           2010    37253956
New York   2000    18976457
           2010    19378102
Texas      2000    20851820
           2010    25145561

pop["Population"]=
California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
Texas       2000    20851820
            2010    25145561
Name: Population, dtype: int64

pop.Population=
California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
Texas       2000    20851820
            2010    25145561
Name: Population, dtype: int64


In [11]:
# Passing a list of two label lists as `index` yields a two-level row index.
outer_labels = ['a', 'a', 'b', 'c']
inner_labels = [1, 2, 3, 4]
df = pd.DataFrame(
    np.random.rand(4, 2),
    index=[outer_labels, inner_labels],
    columns=['data1', 'data2'],
)
df

Unnamed: 0,Unnamed: 1,data1,data2
a,1,0.090763,0.655261
a,2,0.220197,0.20723
b,3,0.332463,0.077121
c,4,0.753513,0.863642


In [12]:
# from_arrays pairs the labels element-wise: one list per level, same length per list.
row_index = pd.MultiIndex.from_arrays(
    [["R1", "R2", "R3"], ["R1.1", "R1.2", "R2.3"]],
    names=["TopRow", "LowerRow"],
)
column_index = pd.MultiIndex.from_arrays(
    [["H1", "H2", "H3", "H4", "H5", "H6"], ["H1.1", "H1.2", "H1.3", "H1.4", "H1.5", "H1.6"]],
    names=["TopCol", "LowerCol"],
)
a = pd.DataFrame(np.random.randint(50, size=(3, 6)), index=row_index, columns=column_index)
a

Unnamed: 0_level_0,TopCol,H1,H2,H3,H4,H5,H6
Unnamed: 0_level_1,LowerCol,H1.1,H1.2,H1.3,H1.4,H1.5,H1.6
TopRow,LowerRow,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
R1,R1.1,7,9,49,44,9,20
R2,R1.2,25,27,29,23,29,49
R3,R2.3,26,45,44,41,31,41


In [13]:
# from_product builds every combination of the given level values:
# 3 x 2 = 6 rows and 5 x 2 = 10 columns.
row_index = pd.MultiIndex.from_product(
    [["R1", "R2", "R3"], [0, 1]],
    names=["TopRow", "LowerRow"],
)
column_index = pd.MultiIndex.from_product(
    [["H1", "H2", "H3", "H4", "H5"], ['Foo', 'Doo']],
    names=["TopCol", "LowerCol"],
)
a = pd.DataFrame(np.random.randint(60, size=(6, 10)), index=row_index, columns=column_index)
a

Unnamed: 0_level_0,TopCol,H1,H1,H2,H2,H3,H3,H4,H4,H5,H5
Unnamed: 0_level_1,LowerCol,Foo,Doo,Foo,Doo,Foo,Doo,Foo,Doo,Foo,Doo
TopRow,LowerRow,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2
R1,0,42,44,11,12,30,20,54,30,48,7
R1,1,5,25,57,4,0,53,15,0,21,34
R2,0,7,56,45,1,51,50,56,18,30,25
R2,1,33,40,7,58,38,1,6,58,2,54
R3,0,39,38,18,23,24,41,42,15,42,43
R3,1,0,54,0,46,18,50,28,47,55,33


In [14]:
# Two-step column lookup: first the top-level group 'H2', then its 'Doo' sub-column.
h2_group = a["H2"]
h2_group['Doo']

TopRow  LowerRow
R1      0           12
        1            4
R2      0            1
        1           58
R3      0           23
        1           46
Name: Doo, dtype: int32

## Stacking, reset_index

* axis has the same meaning as in the case of numpy arrays
* levels are a new concept used in multi-indexed pandas objects:
        * level=0: The upper most level of hierarchy
        * level=1: The hierarchy level one below the upper most one 
        * level=n: The nth level of hierarchy relative to the topmost item
* To represent something uniquely we need the axis and the level at the same time. 
        * Example: axis=0 and level=0, will represent the top most rows
        * Example: axis=1 and level=0, will represent the top most columns
* One can also represent the level in terms of its name. Example: 'TopCol' represents level=0, 'LowerCol' represents level=1.

In [15]:
# Unstack the outermost row-index level (level 0, named 'TopRow': R1, R2 and R3),
# moving its labels into the columns.
a.unstack(level='TopRow')

TopCol,H1,H1,H1,H1,H1,H1,H2,H2,H2,H2,...,H4,H4,H4,H4,H5,H5,H5,H5,H5,H5
LowerCol,Foo,Foo,Foo,Doo,Doo,Doo,Foo,Foo,Foo,Doo,...,Foo,Doo,Doo,Doo,Foo,Foo,Foo,Doo,Doo,Doo
TopRow,R1,R2,R3,R1,R2,R3,R1,R2,R3,R1,...,R3,R1,R2,R3,R1,R2,R3,R1,R2,R3
LowerRow,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3,Unnamed: 14_level_3,Unnamed: 15_level_3,Unnamed: 16_level_3,Unnamed: 17_level_3,Unnamed: 18_level_3,Unnamed: 19_level_3,Unnamed: 20_level_3,Unnamed: 21_level_3
0,42,7,39,44,56,38,11,45,18,12,...,42,30,18,15,48,30,42,7,25,43
1,5,33,0,25,40,54,57,7,0,4,...,28,0,58,47,21,2,55,34,54,33


In [16]:
# Unstack row-index level 1 (named 'LowerRow': 0 and 1), moving its labels into the columns.
a.unstack(level='LowerRow')

TopCol,H1,H1,H1,H1,H2,H2,H2,H2,H3,H3,H3,H3,H4,H4,H4,H4,H5,H5,H5,H5
LowerCol,Foo,Foo,Doo,Doo,Foo,Foo,Doo,Doo,Foo,Foo,Doo,Doo,Foo,Foo,Doo,Doo,Foo,Foo,Doo,Doo
LowerRow,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1
TopRow,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3,Unnamed: 14_level_3,Unnamed: 15_level_3,Unnamed: 16_level_3,Unnamed: 17_level_3,Unnamed: 18_level_3,Unnamed: 19_level_3,Unnamed: 20_level_3
R1,42,5,44,25,11,57,12,4,30,0,20,53,54,15,30,0,48,21,7,34
R2,7,33,56,40,45,7,1,58,51,38,50,1,56,6,18,58,30,2,25,54
R3,39,0,38,54,18,0,23,46,24,18,41,50,42,28,15,47,42,55,43,33


In [17]:
# reset_index() removes the row multi-index and turns its levels into ordinary columns.
b = a.reset_index()
print(a)
# In the printed result the new 'TopRow' and 'LowerRow' columns have an empty 'LowerCol'
# entry, because they never had a second column level.
print(b)

TopCol           H1      H2      H3      H4      H5    
LowerCol        Foo Doo Foo Doo Foo Doo Foo Doo Foo Doo
TopRow LowerRow                                        
R1     0         42  44  11  12  30  20  54  30  48   7
       1          5  25  57   4   0  53  15   0  21  34
R2     0          7  56  45   1  51  50  56  18  30  25
       1         33  40   7  58  38   1   6  58   2  54
R3     0         39  38  18  23  24  41  42  15  42  43
       1          0  54   0  46  18  50  28  47  55  33
TopCol   TopRow LowerRow  H1      H2      H3      H4      H5    
LowerCol                 Foo Doo Foo Doo Foo Doo Foo Doo Foo Doo
0            R1        0  42  44  11  12  30  20  54  30  48   7
1            R1        1   5  25  57   4   0  53  15   0  21  34
2            R2        0   7  56  45   1  51  50  56  18  30  25
3            R2        1  33  40   7  58  38   1   6  58   2  54
4            R3        0  39  38  18  23  24  41  42  15  42  43
5            R3        1   0  54   0  46 

In [18]:
# Per the original author's note, b['TopRow']['Foo'] results in an error: after the reset
# there is no 'Foo' sub-column under 'TopRow'.
# Instead, take the 'TopRow' column and read the entry labelled 0.
top_row_column = b['TopRow']
top_row_column[0]

'R1'

In [19]:
# Promote 'TopRow' and 'LowerRow' back to row-index levels; the displayed result matches
# the original frame `a`. set_index returns a new frame, `b` itself is not modified.
# Note from the original author: b.set_index(['H1', 'H2']) results in an error, most
# likely because those columns carry the lower-level labels 'Foo' and 'Doo'.
row_keys = ['TopRow', 'LowerRow']
b.set_index(row_keys)

Unnamed: 0_level_0,TopCol,H1,H1,H2,H2,H3,H3,H4,H4,H5,H5
Unnamed: 0_level_1,LowerCol,Foo,Doo,Foo,Doo,Foo,Doo,Foo,Doo,Foo,Doo
TopRow,LowerRow,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2
R1,0,42,44,11,12,30,20,54,30,48,7
R1,1,5,25,57,4,0,53,15,0,21,34
R2,0,7,56,45,1,51,50,56,18,30,25
R2,1,33,40,7,58,38,1,6,58,2,54
R3,0,39,38,18,23,24,41,42,15,42,43
R3,1,0,54,0,46,18,50,28,47,55,33


In [20]:
# Display `a` again for reference before the per-level aggregations below.
a

Unnamed: 0_level_0,TopCol,H1,H1,H2,H2,H3,H3,H4,H4,H5,H5
Unnamed: 0_level_1,LowerCol,Foo,Doo,Foo,Doo,Foo,Doo,Foo,Doo,Foo,Doo
TopRow,LowerRow,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2
R1,0,42,44,11,12,30,20,54,30,48,7
R1,1,5,25,57,4,0,53,15,0,21,34
R2,0,7,56,45,1,51,50,56,18,30,25
R2,1,33,40,7,58,38,1,6,58,2,54
R3,0,39,38,18,23,24,41,42,15,42,43
R3,1,0,54,0,46,18,50,28,47,55,33


In [21]:
# Mean over the rows that share the same 'TopRow' label (one result row per R1/R2/R3).
# The `level` argument of DataFrame.mean() was deprecated and later removed from pandas;
# grouping on the index level is the supported equivalent. sort=False keeps the group
# labels in their order of appearance instead of sorting them.
a.groupby(level='TopRow', sort=False).mean()

TopCol,H1,H1,H2,H2,H3,H3,H4,H4,H5,H5
LowerCol,Foo,Doo,Foo,Doo,Foo,Doo,Foo,Doo,Foo,Doo
TopRow,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
R1,23.5,34.5,34.0,8.0,15.0,36.5,34.5,15.0,34.5,20.5
R2,20.0,48.0,26.0,29.5,44.5,25.5,31.0,38.0,16.0,39.5
R3,19.5,46.0,9.0,34.5,21.0,45.5,35.0,31.0,48.5,38.0


In [22]:
# Row-wise mean over the columns that share the same 'LowerCol' label: all 'Foo'
# sub-columns are averaged into one column and all 'Doo' sub-columns into another.
# DataFrame.mean(axis=1, level=...) is no longer available in current pandas, so the frame
# is transposed, grouped on the (now row) level, averaged, and transposed back.
# sort=False keeps the labels in their original order ('Foo' before 'Doo').
a.T.groupby(level='LowerCol', sort=False).mean().T

Unnamed: 0_level_0,LowerCol,Foo,Doo
TopRow,LowerRow,Unnamed: 2_level_1,Unnamed: 3_level_1
R1,0,37.0,22.6
R1,1,19.6,23.2
R2,0,37.8,30.0
R2,1,17.2,42.2
R3,0,33.0,32.0
R3,1,20.2,46.0


In [23]:
# Same aggregation as grouping by 'LowerCol', with the column level given by position (1).
# Written with groupby because DataFrame.mean() no longer accepts `level`; the transpose
# pair applies the grouping to the column axis, and sort=False preserves label order.
a.T.groupby(level=1, sort=False).mean().T

Unnamed: 0_level_0,LowerCol,Foo,Doo
TopRow,LowerRow,Unnamed: 2_level_1,Unnamed: 3_level_1
R1,0,37.0,22.6
R1,1,19.6,23.2
R2,0,37.8,30.0
R2,1,17.2,42.2
R3,0,33.0,32.0
R3,1,20.2,46.0


In [24]:
# Row-wise mean within each top-level column group (column level 0, 'TopCol'): the 'Foo'
# and 'Doo' sub-columns of every H1..H5 group are averaged into a single column.
# Written with groupby because DataFrame.mean() no longer accepts `level`; the transpose
# pair applies the grouping to the column axis, and sort=False preserves label order.
a.T.groupby(level=0, sort=False).mean().T

Unnamed: 0_level_0,TopCol,H1,H2,H3,H4,H5
TopRow,LowerRow,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
R1,0,43.0,11.5,25.0,42.0,27.5
R1,1,15.0,30.5,26.5,7.5,27.5
R2,0,31.5,23.0,50.5,37.0,27.5
R2,1,36.5,32.5,19.5,32.0,28.0
R3,0,38.5,20.5,32.5,28.5,42.5
R3,1,27.0,23.0,34.0,37.5,44.0


## Concatenation

In [25]:
# Display `a` once more; the concatenation examples below split it and join the pieces back.
a

Unnamed: 0_level_0,TopCol,H1,H1,H2,H2,H3,H3,H4,H4,H5,H5
Unnamed: 0_level_1,LowerCol,Foo,Doo,Foo,Doo,Foo,Doo,Foo,Doo,Foo,Doo
TopRow,LowerRow,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2
R1,0,42,44,11,12,30,20,54,30,48,7
R1,1,5,25,57,4,0,53,15,0,21,34
R2,0,7,56,45,1,51,50,56,18,30,25
R2,1,33,40,7,58,38,1,6,58,2,54
R3,0,39,38,18,23,24,41,42,15,42,43
R3,1,0,54,0,46,18,50,28,47,55,33


In [26]:
# Split `a` by label slices on the outer row level (both end labels are included):
# Foo holds the R1 and R2 rows, Doo holds the R3 rows.
Foo = a.loc['R1':'R2']
Doo = a.loc['R3':'R3']
print(Foo)
print(Doo)

TopCol           H1      H2      H3      H4      H5    
LowerCol        Foo Doo Foo Doo Foo Doo Foo Doo Foo Doo
TopRow LowerRow                                        
R1     0         42  44  11  12  30  20  54  30  48   7
       1          5  25  57   4   0  53  15   0  21  34
R2     0          7  56  45   1  51  50  56  18  30  25
       1         33  40   7  58  38   1   6  58   2  54
TopCol           H1      H2      H3      H4      H5    
LowerCol        Foo Doo Foo Doo Foo Doo Foo Doo Foo Doo
TopRow LowerRow                                        
R3     0         39  38  18  23  24  41  42  15  42  43
       1          0  54   0  46  18  50  28  47  55  33


In [27]:
# Concatenate along the rows; axis=0 is the default and is spelled out here for clarity.
pieces = [Foo, Doo]
pd.concat(pieces, axis=0)

Unnamed: 0_level_0,TopCol,H1,H1,H2,H2,H3,H3,H4,H4,H5,H5
Unnamed: 0_level_1,LowerCol,Foo,Doo,Foo,Doo,Foo,Doo,Foo,Doo,Foo,Doo
TopRow,LowerRow,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2
R1,0,42,44,11,12,30,20,54,30,48,7
R1,1,5,25,57,4,0,53,15,0,21,34
R2,0,7,56,45,1,51,50,56,18,30,25
R2,1,33,40,7,58,38,1,6,58,2,54
R3,0,39,38,18,23,24,41,42,15,42,43
R3,1,0,54,0,46,18,50,28,47,55,33


In [28]:
# Concatenate along the columns. The two frames have no row labels in common, so with the
# default outer join every row of the result is missing one half, which is shown as NaN.
pieces = [Foo, Doo]
pd.concat(pieces, axis=1, join='outer')

Unnamed: 0_level_0,TopCol,H1,H1,H2,H2,H3,H3,H4,H4,H5,H5,H1,H1,H2,H2,H3,H3,H4,H4,H5,H5
Unnamed: 0_level_1,LowerCol,Foo,Doo,Foo,Doo,Foo,Doo,Foo,Doo,Foo,Doo,Foo,Doo,Foo,Doo,Foo,Doo,Foo,Doo,Foo,Doo
TopRow,LowerRow,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
R1,0,42.0,44.0,11.0,12.0,30.0,20.0,54.0,30.0,48.0,7.0,,,,,,,,,,
R1,1,5.0,25.0,57.0,4.0,0.0,53.0,15.0,0.0,21.0,34.0,,,,,,,,,,
R2,0,7.0,56.0,45.0,1.0,51.0,50.0,56.0,18.0,30.0,25.0,,,,,,,,,,
R2,1,33.0,40.0,7.0,58.0,38.0,1.0,6.0,58.0,2.0,54.0,,,,,,,,,,
R3,0,,,,,,,,,,,39.0,38.0,18.0,23.0,24.0,41.0,42.0,15.0,42.0,43.0
R3,1,,,,,,,,,,,0.0,54.0,0.0,46.0,18.0,50.0,28.0,47.0,55.0,33.0


In [29]:
# ignore_index=True discards the original row labels and numbers the result rows 0..n-1.
pieces = [Foo, Doo]
pd.concat(pieces, ignore_index=True)

TopCol,H1,H1,H2,H2,H3,H3,H4,H4,H5,H5
LowerCol,Foo,Doo,Foo,Doo,Foo,Doo,Foo,Doo,Foo,Doo
0,42,44,11,12,30,20,54,30,48,7
1,5,25,57,4,0,53,15,0,21,34
2,7,56,45,1,51,50,56,18,30,25
3,33,40,7,58,38,1,6,58,2,54
4,39,38,18,23,24,41,42,15,42,43
5,0,54,0,46,18,50,28,47,55,33


In [30]:
# `keys` adds a new outermost row-index level that records which input each row came from.
source_labels = ["Zoo", "Moo"]
pd.concat([Foo, Doo], keys=source_labels)

Unnamed: 0_level_0,Unnamed: 1_level_0,TopCol,H1,H1,H2,H2,H3,H3,H4,H4,H5,H5
Unnamed: 0_level_1,Unnamed: 1_level_1,LowerCol,Foo,Doo,Foo,Doo,Foo,Doo,Foo,Doo,Foo,Doo
Unnamed: 0_level_2,TopRow,LowerRow,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2
Zoo,R1,0,42,44,11,12,30,20,54,30,48,7
Zoo,R1,1,5,25,57,4,0,53,15,0,21,34
Zoo,R2,0,7,56,45,1,51,50,56,18,30,25
Zoo,R2,1,33,40,7,58,38,1,6,58,2,54
Moo,R3,0,39,38,18,23,24,41,42,15,42,43
Moo,R3,1,0,54,0,46,18,50,28,47,55,33


In [31]:
# Input frames for a combination example that was never finished (see the TODO below).
countries = pd.DataFrame({"Countries": ["India", "US", "China", "Russia", "Brazil"]})

states = pd.DataFrame({
    "India": ["AP", "Telangana", "MP", "Rajasthan", "UP", "Bihar", "TN"],
    "US": ["Washington", "California", "Colorado", "NY", "Texas", "NewMexico", "Utah"],
    "China": ["One", "Two", "Three", "Four", "Five", "Six", "Seven"],
    "Russia": ["One", "Two", "Three", "Four", "Five", "Six", "Seven"],
})

pop = pd.DataFrame({
    "India": [400, 500, 600, 700, 800, 900, 1000],
    "US": [1100, 1200, 1300, 1400, 1500, 1600, 1700],
    "China": [2100, 2200, 2300, 2400, 2500, 2600, 2700],
    "Brazil": [2100, 2200, 2300, 2400, 2500, 2600, 2700],
})

# Disabled third input, kept for reference:
# res = pd.DataFrame({"Germany": [400, 500, 600, 700, 800, 900, 1000]
#                     , "US":[1100, 1200, 1300, 1400, 1500, 1600, 1700]
#                     , "China": [2100, 2200, 2300, 2400, 2500, 2600, 2700]})

for frame in (countries, states, pop):
    print(frame)

# TODO: INCOMPLETE EXAMPLE (FIX THIS) - the frames above are only printed; the step that
# was meant to combine them is still missing.

  Countries
0     India
1        US
2     China
3    Russia
4    Brazil
       India          US  China Russia
0         AP  Washington    One    One
1  Telangana  California    Two    Two
2         MP    Colorado  Three  Three
3  Rajasthan          NY   Four   Four
4         UP       Texas   Five   Five
5      Bihar   NewMexico    Six    Six
6         TN        Utah  Seven  Seven
   India    US  China  Brazil
0    400  1100   2100    2100
1    500  1200   2200    2200
2    600  1300   2300    2300
3    700  1400   2400    2400
4    800  1500   2500    2500
5    900  1600   2600    2600
6   1000  1700   2700    2700


In [32]:
# Two frames that share the key column "Countries"; they are the inputs for the merge below.
pop = pd.DataFrame({
    "Countries": ["India", "US", "China", "Russia", "Brazil"],
    "States": [10, 20, 30, 40, 50],
    "Population": [83424234, 234234223, 32664835, 242132425, 4343234244],
})
lang = pd.DataFrame({
    "Countries": ["India", "US", "China", "Russia", "Brazil"],
    "Language": ["Hindi", "English", "Chinese", "Russian", "Portugese"],
    "Other-imp-Language": ["Telugu", "Spanish", "Chinese", "Urdu", "Spanish"],
})

print(pop)
print(lang)

  Countries  States  Population
0     India      10    83424234
1        US      20   234234223
2     China      30    32664835
3    Russia      40   242132425
4    Brazil      50  4343234244
  Countries   Language Other-imp-Language
0     India      Hindi             Telugu
1        US    English            Spanish
2     China    Chinese            Chinese
3    Russia    Russian               Urdu
4    Brazil  Portugese            Spanish


In [33]:
# Join the two frames on their shared "Countries" column. The frames are passed as two
# separate arguments (left, right), not as a list the way pd.concat expects them.
pop.merge(lang, on="Countries")

Unnamed: 0,Countries,States,Population,Language,Other-imp-Language
0,India,10,83424234,Hindi,Telugu
1,US,20,234234223,English,Spanish
2,China,30,32664835,Chinese,Chinese
3,Russia,40,242132425,Russian,Urdu
4,Brazil,50,4343234244,Portugese,Spanish


In [34]:
# Same language table as before, except that the key column is now called "Nations".
lang = pd.DataFrame({
    "Nations": ["India", "US", "China", "Russia", "Brazil"],
    "Language": ["Hindi", "English", "Chinese", "Russian", "Portugese"],
    "Other-imp-Language": ["Telugu", "Spanish", "Chinese", "Urdu", "Spanish"],
})

# When the key column has a different name in each frame, left_on names it in the left
# frame ('pop' -> "Countries") and right_on names it in the right frame ('lang' -> "Nations").
# Both key columns are kept in the result.
pop.merge(lang, left_on="Countries", right_on="Nations")

Unnamed: 0,Countries,States,Population,Nations,Language,Other-imp-Language
0,India,10,83424234,India,Hindi,Telugu
1,US,20,234234223,US,English,Spanish
2,China,30,32664835,China,Chinese,Chinese
3,Russia,40,242132425,Russia,Russian,Urdu
4,Brazil,50,4343234244,Brazil,Portugese,Spanish


In [35]:
# Same merge as above, followed by dropping the redundant "Nations" key column.
merged = pd.merge(pop, lang, left_on="Countries", right_on="Nations")
merged.drop("Nations", axis="columns")

Unnamed: 0,Countries,States,Population,Language,Other-imp-Language
0,India,10,83424234,Hindi,Telugu
1,US,20,234234223,English,Spanish
2,China,30,32664835,Chinese,Chinese
3,Russia,40,242132425,Russian,Urdu
4,Brazil,50,4343234244,Portugese,Spanish


## Series

In [36]:
# Series addressed by a two-level (state, year) index. Texas deliberately uses 2015 instead
# of 2010, so unstack() yields NaN for the (state, year) pairs that do not exist.
index = [
    ('California', 2000), ('California', 2010),
    ('New York', 2000), ('New York', 2010),
    ('Texas', 2000), ('Texas', 2015),
]
populations = [33871648, 37253956, 18976457, 19378102, 20851820, 25145561]
newIndex = pd.MultiIndex.from_tuples(index)
pop = pd.Series(populations, index=newIndex)
year_2015 = pop[:, 2015]   # every state that has an entry for 2015
wide = pop.unstack()       # inner level (year) becomes the columns
print("pop=\n{}\n\npop[:,2015]=\n{}\n\npop.unstack()=\n{}\n".format(pop, year_2015, wide))

# A DataFrame built from the Series plus a plain list; the Series supplies the row index.
under18 = [9267089, 9284094, 4687374, 4318033, 5906301, 6879014]
pop_df = pd.DataFrame({'total': pop, 'under18': under18})
print("pop_df=\n{}\n".format(pop_df))
# Note from the original author: this (outer, inner) lookup is not possible on pop_df:
#print("pop_df['Texas',2015]=\n{}".format(pop_df['Texas',2015]))
print("pop['Texas',2015]=\n{}\n".format(pop['Texas', 2015]))
print("pop[:,2010]=\n{}\n".format(pop[:, 2010]))
# A label slice follows index order: 'Texas' comes after 'California', so the first
# slice below is empty while the second covers every row.
print("pop['Texas':'California']=\n{}\n".format(pop['Texas':'California']))
print("pop['California':'Texas']=\n{}\n".format(pop['California':'Texas']))

pop=
California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
Texas       2000    20851820
            2015    25145561
dtype: int64

pop[:,2015]=
Texas    25145561
dtype: int64

pop.unstack()=
                  2000        2010        2015
California  33871648.0  37253956.0         NaN
New York    18976457.0  19378102.0         NaN
Texas       20851820.0         NaN  25145561.0

pop_df=
                    total  under18
California 2000  33871648  9267089
           2010  37253956  9284094
New York   2000  18976457  4687374
           2010  19378102  4318033
Texas      2000  20851820  5906301
           2015  25145561  6879014

pop['Texas',2015]=
25145561

pop[:,2010]=
California    37253956
New York      19378102
dtype: int64

pop['Texas':'California']=
Series([], dtype: int64)

pop['California':'Texas']=
California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
Texas 

# Recipes

In [37]:
# None among the values is shown as NaN, and so is the None used as a row label.
# The string "NA" is an ordinary label, not a missing-value marker.
values = [34, None, 56, 76, None, 212, 56, None]
labels = ["A", "B", "C", "D", "E", None, "F", "NA"]
a = pd.DataFrame(values, index=labels, columns=['Indices'])
print(a)

     Indices
A       34.0
B        NaN
C       56.0
D       76.0
E        NaN
NaN    212.0
F       56.0
NA       NaN
