### series as dictionary

In [1]:
import pandas as pd

In [3]:
data=pd.Series([0.25,0.5,0.75,1.0],index=['a','b','c','d'])
data

a    0.25
b    0.50
c    0.75
d    1.00
dtype: float64

In [4]:
data['b']

0.5

In [5]:
'a'in data

True

In [6]:
data.keys()

Index(['a', 'b', 'c', 'd'], dtype='object')

In [7]:
list(data.items())

[('a', 0.25), ('b', 0.5), ('c', 0.75), ('d', 1.0)]

In [8]:
data['e']=1.25
data

a    0.25
b    0.50
c    0.75
d    1.00
e    1.25
dtype: float64

In [9]:
data['a':'c']

a    0.25
b    0.50
c    0.75
dtype: float64

In [11]:
data[['a','c']]

a    0.25
c    0.75
dtype: float64

### indexers

In [12]:
data=pd.Series(['a','b','c'],index=[1,3,5])
data

1    a
3    b
5    c
dtype: object

In [13]:
data[1]

'a'

In [14]:
data[1:3]

3    b
5    c
dtype: object

In [15]:
data.loc[1]

'a'

In [16]:
data.loc[1:3]

1    a
3    b
dtype: object

In [17]:
data.iloc[1]

'b'

### dataFrame as a dictionary

In [18]:
area=pd.Series({'California':423967,'Texas':695662,'New York':141297,'Florida':170312,'Illinois':149995})

In [19]:
pop=pd.Series({'California':38332521,'Texas':26448193,'New York':19651127,'Florida':19552860,'Illinois':12882135})

In [20]:
data=pd.DataFrame({'area':area,'pop':pop})
data

Unnamed: 0,area,pop
California,423967,423967
Texas,695662,695662
New York,141297,141297
Florida,170312,170312
Illinois,149995,149995


In [21]:
data['area']

California    423967
Texas         695662
New York      141297
Florida       170312
Illinois      149995
Name: area, dtype: int64

In [22]:
data.area     #both are same

California    423967
Texas         695662
New York      141297
Florida       170312
Illinois      149995
Name: area, dtype: int64

In [23]:
data.area is data['area']

True

In [24]:
data.pop is data['pop']

False

In [27]:
data['density']=data['pop']/data['area']
data

Unnamed: 0,area,pop,density
California,423967,423967,1.0
Texas,695662,695662,1.0
New York,141297,141297,1.0
Florida,170312,170312,1.0
Illinois,149995,149995,1.0


In [28]:
data.values

array([[4.23967e+05, 4.23967e+05, 1.00000e+00],
       [6.95662e+05, 6.95662e+05, 1.00000e+00],
       [1.41297e+05, 1.41297e+05, 1.00000e+00],
       [1.70312e+05, 1.70312e+05, 1.00000e+00],
       [1.49995e+05, 1.49995e+05, 1.00000e+00]])

In [29]:
data.T

Unnamed: 0,California,Texas,New York,Florida,Illinois
area,423967.0,695662.0,141297.0,170312.0,149995.0
pop,423967.0,695662.0,141297.0,170312.0,149995.0
density,1.0,1.0,1.0,1.0,1.0


In [30]:
data['Florida':'Illinois']

Unnamed: 0,area,pop,density
Florida,170312,170312,1.0
Illinois,149995,149995,1.0


In [33]:
data.iloc[0,2]=90
data

Unnamed: 0,area,pop,density
California,423967,423967,90.0
Texas,695662,695662,1.0
New York,141297,141297,1.0
Florida,170312,170312,1.0
Illinois,149995,149995,1.0


### index preservation

In [34]:
import pandas as pd
import numpy as np

In [35]:
rng=np.random.RandomState(42)
ser=pd.Series(rng.randint(1,10,4))
ser

0    7
1    4
2    8
3    5
dtype: int32

In [36]:
df=pd.DataFrame(rng.randint(0,10,(3,4)),columns=['A','B','C','D'])
df

Unnamed: 0,A,B,C,D
0,6,9,2,6
1,7,4,3,7
2,7,2,5,4


### index alignment in series

In [37]:
area=pd.Series({'California':423967,'Texas':695662,'New York':141297,'Florida':170312,'Illinois':149995})

In [38]:
population=pd.Series({'California':38332521,'Texas':26448193,'New York':19651127,'Florida':19552860,'Illinois':12882135})

In [39]:
population/area

California     90.413926
Texas          38.018740
New York      139.076746
Florida       114.806121
Illinois       85.883763
dtype: float64

### operation between DataFrame and Series

In [40]:
A=rng.randint(10,size=(3,4))
A

array([[1, 7, 5, 1],
       [4, 0, 9, 5],
       [8, 0, 9, 2]])

In [41]:
A-A[0]

array([[ 0,  0,  0,  0],
       [ 3, -7,  4,  4],
       [ 7, -7,  4,  1]])

In [42]:
df=pd.DataFrame(A,columns=list('QRST'))
df-df.iloc[0]

Unnamed: 0,Q,R,S,T
0,0,0,0,0
1,3,-7,4,4
2,7,-7,4,1


In [44]:
df.subtract(df['R'],axis=0)

Unnamed: 0,Q,R,S,T
0,-6,0,-2,-6
1,4,0,9,5
2,8,0,9,2


In [45]:
halfrow=df.iloc[0,::2]
halfrow

Q    1
S    5
Name: 0, dtype: int32

In [46]:
df-halfrow

Unnamed: 0,Q,R,S,T
0,0.0,,0.0,
1,3.0,,4.0,
2,7.0,,4.0,


### missing data

In [47]:
vals1=np.array([1,None,3,4])
vals1

array([1, None, 3, 4], dtype=object)

In [48]:
vals1.sum()

TypeError: unsupported operand type(s) for +: 'int' and 'NoneType'

In [49]:
# it shows that addition is not possible for a missing value and integer

In [50]:
vals2=np.array([1,np.nan,3,4])
vals2.dtype

dtype('float64')

In [51]:
1+np.nan

nan

In [52]:
0*np.nan

nan

In [53]:
vals2.sum(),vals2.min(),vals2.max()

(nan, nan, nan)

### difference between NaN and none

In [54]:
pd.Series([1,np.nan,2,None])

0    1.0
1    NaN
2    2.0
3    NaN
dtype: float64

In [57]:
x=pd.Series(range(2),dtype=int)
x

0    0
1    1
dtype: int32

In [58]:
x[0]=None
x

0    NaN
1    1.0
dtype: float64

### deleting null values

In [59]:
data=pd.Series([1,np.nan,'hello',None])

In [60]:
data.isnull()

0    False
1     True
2    False
3     True
dtype: bool

In [61]:
data[data.notnull()]

0        1
2    hello
dtype: object

In [62]:
data.dropna()

0        1
2    hello
dtype: object

In [63]:
ser1=pd.Series(['A','B','C'],index=[1,2,3])
ser2=pd.Series(['D','E','F'],index=[4,5,6])
pd.concat([ser1,ser2])

1    A
2    B
3    C
4    D
5    E
6    F
dtype: object

In [67]:
def make_df(cols,ind):data={c:[str(c)+str(i) for i in ind] for c in cols}
make_df('ABC',range(3))
df1=make_df('AB',[1,2])
df2=make_df('AB',[3,4])
display('df1','df2','pd.concat([df1,df2])')

'df1'

'df2'

'pd.concat([df1,df2])'

In [68]:
rng=np.random.RandomState(42)
ser=pd.Series(rng.rand(5))
ser

0    0.374540
1    0.950714
2    0.731994
3    0.598658
4    0.156019
dtype: float64

In [69]:
ser.sum()

2.811925491708157

In [70]:
ser.mean()

0.5623850983416314

### query() and eval()

In [73]:
x=rng.rand(1000000)
y=rng.rand(1000000)
%timeit x+y

11.4 ms ± 54.2 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [74]:
mask=(x>0.5)&(y<0.5)

In [75]:
temp1=(x>0.5)
temp2=(y<0.5)
mask=temp1 & temp2

In [76]:
df=pd.DataFrame(rng.rand(1000,3),columns=['A','B','C'])
df.head()

Unnamed: 0,A,B,C
0,0.990797,0.654224,0.820193
1,0.904737,0.137785,0.155398
2,0.30714,0.921909,0.707242
3,0.528478,0.318383,0.546101
4,0.84564,0.760747,0.202263


In [77]:
result1=(df['A']+df['B'])/(df['C']-1)
result3=df.eval('(A+B)/(C-1)')
np.allclose(result1,result3)

True