In [1]:
import numpy as np
import pandas as pd

# Series

In [2]:
# Simplest possible Series: only the values are given, so pandas attaches
# a default RangeIndex (0, 1, 2, ...)
a = pd.Series([34, 23, 56, 76, 212, 56, 776])

# Collect every report line first, then emit them in a single print
report = [
    "a={}".format(a),
    "Index={}, Values={}\n".format(a.index, a.values),
    # With the default integer index, a[3] is the element labelled 3
    "a[3]={}".format(a[3]),
    "Slicing is also same as in numpy a[2:6]={}".format(a[2:6]),
]
print("\n".join(report))

a=0     34
1     23
2     56
3     76
4    212
5     56
6    776
dtype: int64
Index=RangeIndex(start=0, stop=7, step=1), Values=[ 34  23  56  76 212  56 776]

a[3]=76
Slicing is also same as in numpy a[2:6]=2     56
3     76
4    212
5     56
dtype: int64


In [3]:
# Series object with custom (string) indices
a = pd.Series([34,23,56,76,212,56,776], index=["A","B","C","D","E","F","NA"])

print("a={}".format(a))
print("Index={}, Values={}\n".format(a.index, a.values))

# Accessing individual elements of a series with custom indices.
# With a non-integer index, a plain a[3] relies on pandas' positional fallback,
# which is deprecated (FutureWarning since pandas 2.1). Positions therefore go
# through .iloc and labels through .loc; the printed captions are unchanged.
print("a[3]={}".format(a.iloc[3]))
print("a[NA]={}".format(a.loc["NA"]))
# Positional slices exclude the stop position ...
print("Slicing is also same as in numpy a[2:6]={}".format(a.iloc[2:6]))
# ... whereas label slices include the stop label ("E" is part of the result)
print("Slicing is also same as in numpy a[B:E]={}".format(a.loc["B":"E"]))

a=A      34
B      23
C      56
D      76
E     212
F      56
NA    776
dtype: int64
Index=Index(['A', 'B', 'C', 'D', 'E', 'F', 'NA'], dtype='object'), Values=[ 34  23  56  76 212  56 776]

a[3]=76
a[NA]=776
Slicing is also same as in numpy a[2:6]=C     56
D     76
E    212
F     56
dtype: int64
Slicing is also same as in numpy a[B:E]=B     23
C     56
D     76
E    212
dtype: int64


# DataFrame

In [4]:
# Every fully-populated column uses the same river labels, so define them once
river_names = ["nile","ganga","indus","brahmaputra","yangtze","mekong","mississippi"]

corigin = pd.Series(["egypt","india","pakistan", "china","india", "tibet","us"], index=river_names)
length = pd.Series([10434,453445,2132,324325,1213,343,5435], index=river_names)
ncountries = pd.Series([3,6,2,8,9,2,1], index=river_names)
significance = pd.Series([10,9,8,7,6,5,4], index=river_names)
# Notice the insufficient data in this column: "indus" and "brahmaputra" are
# missing, so the DataFrame holds NaN for those two rows
test = pd.Series([10,9,6,5,4], index=["nile","ganga","yangtze","mekong","mississippi"])

# Each Series becomes one column; rows are aligned on the river labels
rivers=pd.DataFrame({"Origin":corigin, "Length":length, "Num-Countries":ncountries, "Local-Significance":significance, "Test":test})
print(rivers)

# Accessing all the indices and values
print("Indices={}\nValues=\n{}\n".format(rivers.index, rivers.values))
print("rivers['Origin']=\n{}\n".format(rivers['Origin']))

# Access part of the dataframe: .loc makes explicit that the slice bounds are
# row labels (the end label is included)
print("rivers['ganga':'nile']=\n{}\n".format(rivers.loc['ganga':'nile']))

# Accessing via a boolean mask
print("rivers[rivers.Length > 343]=\n{}\n\n".format(rivers[rivers.Length > 343]))

# Accessing an entire row: .loc selects it by label directly, without
# transposing the whole frame first
print("row 'ganga' of rivers=\n{}".format(rivers.loc['ganga']))

               Origin  Length  Num-Countries  Local-Significance  Test
brahmaputra     china  324325              8                   7   NaN
ganga           india  453445              6                   9   9.0
indus        pakistan    2132              2                   8   NaN
mekong          tibet     343              2                   5   5.0
mississippi        us    5435              1                   4   4.0
nile            egypt   10434              3                  10  10.0
yangtze         india    1213              9                   6   6.0
Indices=Index(['brahmaputra', 'ganga', 'indus', 'mekong', 'mississippi', 'nile',
       'yangtze'],
      dtype='object')
Values=
[['china' 324325 8 7 nan]
 ['india' 453445 6 9 9.0]
 ['pakistan' 2132 2 8 nan]
 ['tibet' 343 2 5 5.0]
 ['us' 5435 1 4 4.0]
 ['egypt' 10434 3 10 10.0]
 ['india' 1213 9 6 6.0]]

rivers['Origin']=
brahmaputra       china
ganga             india
indus          pakistan
mekong            tibet
mississippi 

In [5]:
# 2x4 frame of random integers drawn from [1, 100), columns labelled A to D
random_block = np.random.randint(low=1, high=100, size=(2, 4))
a = pd.DataFrame(random_block, columns=["A", "B", "C", "D"])
a

Unnamed: 0,A,B,C,D
0,9,58,14,15
1,99,39,9,26


## Addition

In [6]:
# a carries named columns; the first b keeps the default integer column labels
a = pd.DataFrame(np.random.randint(1, 20, size=(2, 3)), columns=['A1', 'A2', 'A3'])
b = pd.DataFrame(np.random.randint(1, 20, size=(5, 3)))
# No column label is shared between a and b, so every cell of the sum is NaN
mismatched_sum = a + b
print("a=\n{}\n\nb=\n{}\n\na+b=\n{}\n\n".format(a, b, mismatched_sum))

# Rebuild b with the same column labels as a so the columns line up
shared_columns = list(a.columns)
b = pd.DataFrame(np.random.randint(1, 20, size=(5, 3)), columns=shared_columns)
# Plain + leaves NaN in rows that exist only in b ...
aligned_sum = a + b
# ... while add(fill_value=0) treats the missing side as zero
zero_filled_sum = a.add(b, fill_value=0)
print("b=\n{}\n\na+b=\n{}\n\na+b(with zero filling)=\n{}".format(b, aligned_sum, zero_filled_sum))

a=
   A1  A2  A3
0  17   8  17
1  13   1   1

b=
    0   1   2
0   2  17  19
1  13  19  18
2  10  19  11
3  12   6  18
4   4  16  19

a+b=
   A1  A2  A3   0   1   2
0 NaN NaN NaN NaN NaN NaN
1 NaN NaN NaN NaN NaN NaN
2 NaN NaN NaN NaN NaN NaN
3 NaN NaN NaN NaN NaN NaN
4 NaN NaN NaN NaN NaN NaN


b=
   A1  A2  A3
0   9  15   3
1   7  18  12
2  16   8  14
3  13  19  18
4  10   4   3

a+b=
     A1    A2    A3
0  26.0  23.0  20.0
1  20.0  19.0  13.0
2   NaN   NaN   NaN
3   NaN   NaN   NaN
4   NaN   NaN   NaN

a+b(with zero filling)=
     A1    A2    A3
0  26.0  23.0  20.0
1  20.0  19.0  13.0
2  16.0   8.0  14.0
3  13.0  19.0  18.0
4  10.0   4.0   3.0


## From numpy arrays

In [7]:
# Both frames share the same row and column labels
row_ids = ["ID1", "ID2", "ID3"]
col_names = ["A", "B"]
a = pd.DataFrame(np.random.rand(3, 2), columns=col_names, index=row_ids)
b = pd.DataFrame(np.random.randint(6, size=(3, 2)), columns=col_names, index=row_ids)
print("a=\n{}\n\nb=\n{}\n".format(a, b))

# Unlike a numpy structured array, where plain indexing can pick out a row,
# plain [] on a DataFrame selects a column by its label
column_a = a["A"]
print("a['A']=\n{}".format(column_a))

# A pandas Index is immutable: its entries can be read but not reassigned
i = a.index
print("\nIndices of a=\n{}".format(i))
print("Accessing an index is fine, i[2]={}".format(i[2]))
# Changing an index value is not allowed; uncommenting the next line gives an error
#i[2] = "New Label"

a=
            A         B
ID1  0.290654  0.808280
ID2  0.685215  0.026085
ID3  0.843341  0.442259

b=
     A  B
ID1  4  5
ID2  1  1
ID3  3  1

a['A']=
ID1    0.290654
ID2    0.685215
ID3    0.843341
Name: A, dtype: float64

Indices of a=
Index(['ID1', 'ID2', 'ID3'], dtype='object')
Accessing an index is fine, i[2]=ID3


# Accessing using indexes: loc, iloc

In [8]:
# Series data with string labels
labels = ["A", "B", "C", "D", "E", "F", "NA"]
a = pd.Series([34, 23, 56, 76, 212, 56, 776], index=labels)
print(a)
# .loc looks an element up by its label; .iloc looks it up by integer position
# (the 'i' in iloc stands for integer location)
by_label = a.loc["C"]
by_position = a.iloc[2]
print("a.loc[C]={}, a.iloc[2]={}".format(by_label, by_position))

A      34
B      23
C      56
D      76
E     212
F      56
NA    776
dtype: int64
a.loc[C]=56, a.iloc[2]=56


# Dealing with NaN values

In [9]:
# Example-1: Series data
# Build the Series as float64 up front: NaN is a float, and writing it into an
# int64 Series depends on silent upcasting, which pandas has deprecated
# (FutureWarning since pandas 2.1). The printed result is the same float64 data.
a = pd.Series(range(9), dtype="float64")
# Blank out the odd positions in one positional assignment
a.iloc[[1, 3, 5, 7]] = np.nan

# dropna() removes the NaN entries; fillna(437) replaces them with 437
print("a={}\n\na.dropna()={}\n\na.fillna(437)={}".format(a, a.dropna(), a.fillna(437)))


# Example-2: DataFrames
a = pd.DataFrame(np.random.randint(1,50,size=(4,6)), columns=['A', 'B', 'C', 'D', 'E', 'F'])
print("\n\na={}".format(a))
# Adds a new column labelled 2. The assignment aligns on the row index, so the
# Series entries at labels 4 and 5 have no matching row and are dropped, and
# row 3 receives the NaN stored at label 3.
a[2] = pd.Series([1, 5, 99, np.nan, 33, np.nan])
print("\n\na={}".format(a))

a=0    0.0
1    NaN
2    2.0
3    NaN
4    4.0
5    NaN
6    6.0
7    NaN
8    8.0
dtype: float64

a.dropna()=0    0.0
2    2.0
4    4.0
6    6.0
8    8.0
dtype: float64

a.fillna(437)=0      0.0
1    437.0
2      2.0
3    437.0
4      4.0
5    437.0
6      6.0
7    437.0
8      8.0
dtype: float64


a=    A   B   C   D   E   F
0  24   6  21  37   7  13
1  20  39  42  48  16  11
2  24   1  26  43   3  49
3  31  31  10  31   3  48


a=    A   B   C   D   E   F     2
0  24   6  21  37   7  13   1.0
1  20  39  42  48  16  11   5.0
2  24   1  26  43   3  49  99.0
3  31  31  10  31   3  48   NaN


# MultiIndex
## DataFrame

In [10]:
# Creating a DataFrame
# Row labels are (state, year) pairs: each state crossed with both years
index = [(state, year) for state in ('California', 'New York', 'Texas') for year in (2000, 2010)]
populations = [33871648, 37253956, 18976457, 19378102, 20851820, 25145561]
newIndex = pd.MultiIndex.from_tuples(index)
pop = pd.DataFrame({"Population": populations}, index=newIndex)

# The single column can be fetched with [] or as an attribute; both give the same Series
by_key = pop["Population"]
by_attribute = pop.Population
print("pop=\n{}\n\npop[\"Population\"]=\n{}\n\npop.Population=\n{}".format(pop, by_key, by_attribute))

pop=
                 Population
California 2000    33871648
           2010    37253956
New York   2000    18976457
           2010    19378102
Texas      2000    20851820
           2010    25145561

pop["Population"]=
California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
Texas       2000    20851820
            2010    25145561
Name: Population, dtype: int64

pop.Population=
California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
Texas       2000    20851820
            2010    25145561
Name: Population, dtype: int64


In [19]:
# Passing two parallel label lists as the index yields a two-level MultiIndex
outer_labels = ['a', 'a', 'b', 'c']
inner_labels = [1, 2, 3, 4]
df = pd.DataFrame(
    np.random.rand(4, 2),
    index=[outer_labels, inner_labels],
    columns=['data1', 'data2'],
)
df

Unnamed: 0,Unnamed: 1,data1,data2
a,1,0.971156,0.438832
a,2,0.488923,0.38973
b,3,0.897265,0.972312
c,4,0.685634,0.898333


## Series

In [46]:
# Creating a Series
# Pair each state with its year to form the (state, year) row labels
index = list(zip(
    ['California', 'California', 'New York', 'New York', 'Texas', 'Texas'],
    [2000, 2010, 2000, 2010, 2000, 2015],
))
populations = [33871648, 37253956, 18976457, 19378102, 20851820, 25145561]
newIndex = pd.MultiIndex.from_tuples(index)
pop = pd.Series(populations, index=newIndex)
# pop[:, 2015] keeps the entries whose second level is 2015;
# unstack() pivots the second level into columns
entries_2015 = pop[:, 2015]
wide_view = pop.unstack()
print("pop=\n{}\n\npop[:,2015]=\n{}\n\npop.unstack()=\n{}\n".format(pop, entries_2015, wide_view))

# Creating a DataFrame using a series
under18 = [9267089, 9284094, 4687374, 4318033, 5906301, 6879014]
pop_df = pd.DataFrame({'total': pop, 'under18': under18})
print("pop_df=\n{}\n".format(pop_df))
# Note: [] on a DataFrame looks up column labels, so the (state, year) key below
# does not work on pop_df the way it does on the Series
#print("pop_df['Texas',2015]=\n{}".format(pop_df['Texas',2015]))
print("pop['Texas',2015]=\n{}\n".format(pop['Texas', 2015]))
# Label slices follow the index order: Texas comes after California, so the
# first slice is empty while the second spans every row
for caption, selection in (
    ("pop['Texas':'California']=\n{}\n", pop['Texas':'California']),
    ("pop['California':'Texas']=\n{}\n", pop['California':'Texas']),
):
    print(caption.format(selection))

pop=
California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
Texas       2000    20851820
            2015    25145561
dtype: int64

pop[:,2015]=
Texas    25145561
dtype: int64

pop.unstack()=
                  2000        2010        2015
California  33871648.0  37253956.0         NaN
New York    18976457.0  19378102.0         NaN
Texas       20851820.0         NaN  25145561.0

pop_df=
                    total  under18
California 2000  33871648  9267089
           2010  37253956  9284094
New York   2000  18976457  4687374
           2010  19378102  4318033
Texas      2000  20851820  5906301
           2015  25145561  6879014

pop['Texas',2015]=
25145561

pop['Texas':'California']=
Series([], dtype: int64)

pop['California':'Texas']=
California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
Texas       2000    20851820
            2015    25145561
dtype: int64

