**Pandas**

In [79]:
import pandas
pandas.__version__

'2.2.2'

**Introducing pandas objects**

In [80]:
import numpy as np
import pandas as pd


**Series**

In [81]:
counts=pd.Series([426,1256,964,135])
counts

0     426
1    1256
2     964
3     135
dtype: int64

In [82]:
counts.values

array([ 426, 1256,  964,  135])

In [83]:
counts.index

RangeIndex(start=0, stop=4, step=1)

**Assigning labels to index**

In [84]:
bacteria=pd.Series([426,1256,964,135],
    index=['Firmicutes','probacteria','Actinobacteria','Bacteriodetes'])
bacteria

Firmicutes         426
probacteria       1256
Actinobacteria     964
Bacteriodetes      135
dtype: int64

In [85]:
print(bacteria['Actinobacteria'])

964


In [86]:
print(bacteria[0])

426


  print(bacteria[0])


In [87]:
bacteria.name='counts'
bacteria.index.name='phylum'
bacteria

phylum
Firmicutes         426
probacteria       1256
Actinobacteria     964
Bacteriodetes      135
Name: counts, dtype: int64

In [88]:
np.log(bacteria)

phylum
Firmicutes        6.054439
probacteria       7.135687
Actinobacteria    6.871091
Bacteriodetes     4.905275
Name: counts, dtype: float64

In [89]:
bacteria[bacteria>1000]

phylum
probacteria    1256
Name: counts, dtype: int64

In [90]:
bacteria_dict={'Firmicutes':426,'probacteria':1256,'Actinobacteria':964,'Bacteriodetes':135}
print(bacteria_dict)

{'Firmicutes': 426, 'probacteria': 1256, 'Actinobacteria': 964, 'Bacteriodetes': 135}


In [91]:
pd.Series(bacteria_dict)

Firmicutes         426
probacteria       1256
Actinobacteria     964
Bacteriodetes      135
dtype: int64

**DataFrame:bi-dimensional series with two(or more)indices**

In [184]:
data={"province":["FL","FL","NH","NH","ZH"],
      "year":[2013,2014,2013,2014,2014],
      "Literacy":[0.2,0.1,0.5,0.3,0.5]}
print(data)
data=pd.DataFrame(data)
data

{'province': ['FL', 'FL', 'NH', 'NH', 'ZH'], 'year': [2013, 2014, 2013, 2014, 2014], 'Literacy': [0.2, 0.1, 0.5, 0.3, 0.5]}


Unnamed: 0,province,year,Literacy
0,FL,2013,0.2
1,FL,2014,0.1
2,NH,2013,0.5
3,NH,2014,0.3
4,ZH,2014,0.5


In [185]:
df=pd.DataFrame(data,columns=["Literacy","province","year"])
df

Unnamed: 0,Literacy,province,year
0,0.2,FL,2013
1,0.1,FL,2014
2,0.5,NH,2013
3,0.3,NH,2014
4,0.5,ZH,2014


In [186]:
df['nonsense']=df.year/df.Literacy
df

Unnamed: 0,Literacy,province,year,nonsense
0,0.2,FL,2013,10065.0
1,0.1,FL,2014,20140.0
2,0.5,NH,2013,4026.0
3,0.3,NH,2014,6713.333333
4,0.5,ZH,2014,4028.0


In [183]:
df['Serie_aligned']=pd.Series(range(5),index=[0,1,2,3,4])
df

{'aa': {0: 1.0, 1: nan},
 'bb': {0: 2, 1: 3},
 'cc': {0: nan, 1: 6.0},
 'Serie_aligned': 0    0
 1    1
 2    2
 3    3
 4    4
 dtype: int64}

In [190]:
df.to_dict()

{'Literacy': {0: 0.2, 1: 0.1, 2: 0.5, 3: 0.3, 4: 0.5},
 'province': {0: 'FL', 1: 'FL', 2: 'NH', 3: 'NH', 4: 'ZH'},
 'year': {0: 2013, 1: 2014, 2: 2013, 3: 2014, 4: 2014},
 'nonsense': {0: 10065.0,
  1: 20140.0,
  2: 4026.0,
  3: 6713.333333333334,
  4: 4028.0}}

In [191]:
pd.DataFrame(df.to_dict())

Unnamed: 0,Literacy,province,year,nonsense
0,0.2,FL,2013,10065.0
1,0.1,FL,2014,20140.0
2,0.5,NH,2013,4026.0
3,0.3,NH,2014,6713.333333
4,0.5,ZH,2014,4028.0


**DataFrame as specialised dictionary**

**from a list of dicts**

In [153]:
data=[{'a':i,'b':10*i}for i in range(6)]
print(data)
pd.DataFrame(data)

[{'a': 0, 'b': 0}, {'a': 1, 'b': 10}, {'a': 2, 'b': 20}, {'a': 3, 'b': 30}, {'a': 4, 'b': 40}, {'a': 5, 'b': 50}]


Unnamed: 0,a,b
0,0,0
1,1,10
2,2,20
3,3,30
4,4,40
5,5,50


In [154]:
df=pd.DataFrame([{'aa':1,'bb':2},{'bb':3,'cc':6}])
df=df.to_dict()
df

{'aa': {0: 1.0, 1: nan}, 'bb': {0: 2, 1: 3}, 'cc': {0: nan, 1: 6.0}}

**from a two_dimensional numpy array**

In [155]:
pd.DataFrame(np.random.randint(2,12),
             columns=['foo','bar'],
             index=['a','b','c'])

Unnamed: 0,foo,bar
a,8,8
b,8,8
c,8,8


**The pandas Index Object**

In [156]:
ind=pd.Index([20,34,57,7,1,8])
ind

Index([20, 34, 57, 7, 1, 8], dtype='int64')

**Index as immutable array**

In [157]:
ind[1]

np.int64(34)

In [158]:
ind[1:4:2]

Index([34, 7], dtype='int64')

In [159]:
print(ind.size,ind.shape,ind.ndim,ind.dtype)

6 (6,) 1 int64


In [160]:
ind[1]=0

TypeError: Index does not support mutable operations

**Operating on Data in Pandas**

**Ufuncs:Index Preservation**


In [161]:
rng=np.random.RandomState(15)
ser=pd.Series(rng.randint(0,10,4))
ser

0    8
1    5
2    5
3    7
dtype: int32

In [162]:
dfr=pd.DataFrame(rng.randint(0,10,(5,4)),
                 columns=['A','B','C','D'])
dfr

Unnamed: 0,A,B,C,D
0,0,7,5,6
1,1,7,0,4
2,9,7,5,3
3,6,8,2,1
4,1,0,5,2


In [163]:
np.exp(ser)

0    2980.957987
1     148.413159
2     148.413159
3    1096.633158
dtype: float64

In [164]:
np.sin(dfr*np.pi/4)

Unnamed: 0,A,B,C,D
0,0.0,-0.7071068,-0.707107,-1.0
1,0.707107,-0.7071068,0.0,1.224647e-16
2,0.707107,-0.7071068,-0.707107,0.7071068
3,-1.0,-2.449294e-16,1.0,0.7071068
4,0.707107,0.0,-0.707107,1.0


In [165]:
np.sin(dfr*np.pi/4)

Unnamed: 0,A,B,C,D
0,0.0,-0.7071068,-0.707107,-1.0
1,0.707107,-0.7071068,0.0,1.224647e-16
2,0.707107,-0.7071068,-0.707107,0.7071068
3,-1.0,-2.449294e-16,1.0,0.7071068
4,0.707107,0.0,-0.707107,1.0


**Universal Functions:Index Alignment**

**Index alignment in series**

In [166]:
area=pd.Series({'Alaska':1783458,'Texas':675349,
                'California':423785},name='area')
population=pd.Series({'Alaska':12564527,'Texas':68652149,
                'California':56924813},name='population')
print(area)
print(population)

Alaska        1783458
Texas          675349
California     423785
Name: area, dtype: int64
Alaska        12564527
Texas         68652149
California    56924813
Name: population, dtype: int64


In [167]:
population/area

Alaska          7.045037
Texas         101.654328
California    134.324747
dtype: float64

In [168]:
population.index.union(area.index)

Index(['Alaska', 'Texas', 'California'], dtype='object')

In [169]:
A=pd.Series([2,4,6],index=[0,1,2])
B=pd.Series([1,3,5],index=[1,2,3])
print(A)
print(B)
A+B

0    2
1    4
2    6
dtype: int64
1    1
2    3
3    5
dtype: int64


0    NaN
1    5.0
2    9.0
3    NaN
dtype: float64

In [170]:
A.add(B, fill_value=0)

0    2.0
1    5.0
2    9.0
3    5.0
dtype: float64

**Data wrangling**

In [187]:
#merge operation
df


Unnamed: 0,Literacy,province,year,nonsense
0,0.2,FL,2013,10065.0
1,0.1,FL,2014,20140.0
2,0.5,NH,2013,4026.0
3,0.3,NH,2014,6713.333333
4,0.5,ZH,2014,4028.0


In [188]:
df2=pd.DataFrame({"province":["FL","NH","ZH"],"population":["100000","200000","300000"]})
df2

Unnamed: 0,province,population
0,FL,100000
1,NH,200000
2,ZH,300000


In [189]:
df.merge(df2)

Unnamed: 0,Literacy,province,year,nonsense,population
0,0.2,FL,2013,10065.0,100000
1,0.1,FL,2014,20140.0,100000
2,0.5,NH,2013,4026.0,200000
3,0.3,NH,2014,6713.333333,200000
4,0.5,ZH,2014,4028.0,300000


In [192]:
df3=pd.DataFrame({"province":["FL","NH"],"population":["100000","200000"]})
df3
df.merge(df3,right_on='province',left_on='province')

Unnamed: 0,Literacy,province,year,nonsense,population
0,0.2,FL,2013,10065.0,100000
1,0.1,FL,2014,20140.0,100000
2,0.5,NH,2013,4026.0,200000
3,0.3,NH,2014,6713.333333,200000


In [193]:
df4=pd.DataFrame({"province":["FL","NH","UT"],"population":["100000","200000","50000"]})
df.merge(df4,how='outer')

Unnamed: 0,Literacy,province,year,nonsense,population
0,0.2,FL,2013.0,10065.0,100000.0
1,0.1,FL,2014.0,20140.0,100000.0
2,0.5,NH,2013.0,4026.0,200000.0
3,0.3,NH,2014.0,6713.333333,200000.0
4,,UT,,,50000.0
5,0.5,ZH,2014.0,4028.0,


In [196]:
df5=pd.DataFrame({"province":["FL","NH","FL"],"population":["100000","200000","50000"]})
print(df)
df.merge(df5,how='outer')

   Literacy province  year      nonsense
0       0.2       FL  2013  10065.000000
1       0.1       FL  2014  20140.000000
2       0.5       NH  2013   4026.000000
3       0.3       NH  2014   6713.333333
4       0.5       ZH  2014   4028.000000


Unnamed: 0,Literacy,province,year,nonsense,population
0,0.2,FL,2013,10065.0,100000.0
1,0.2,FL,2013,10065.0,50000.0
2,0.1,FL,2014,20140.0,100000.0
3,0.1,FL,2014,20140.0,50000.0
4,0.5,NH,2013,4026.0,200000.0
5,0.3,NH,2014,6713.333333,200000.0
6,0.5,ZH,2014,4028.0,


**Combining data with overlap**

In [199]:
serie_a=pd.Series([np.nan,2.5,np.nan,3.5,4.5,np.nan],
                  index=['f','e','d','c','b','a'])
serie_b=pd.Series(np.arange(len(serie_a),dtype=np.float64),
                  index=['f','e','d','c','b','a'])

In [200]:
serie_a

f    NaN
e    2.5
d    NaN
c    3.5
b    4.5
a    NaN
dtype: float64

In [201]:
serie_b

f    0.0
e    1.0
d    2.0
c    3.0
b    4.0
a    5.0
dtype: float64

In [204]:
pd.Series(np.where(pd.isnull(serie_a),serie_b,serie_a),index=serie_a.index)

f    0.0
e    2.5
d    2.0
c    3.5
b    4.5
a    5.0
dtype: float64

In [205]:
serie_a.combine_first(serie_b)

f    0.0
e    2.5
d    2.0
c    3.5
b    4.5
a    5.0
dtype: float64