<a href="https://colab.research.google.com/github/ramank123/Pandas/blob/main/Working_with_missing_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Base demo frame: 5 rows x 3 standard-normal float columns, labelled
# a/c/e/f/h so that reindexing over a..h later introduces missing rows.
# The values come from a seeded Generator so every Restart & Run All yields
# the same frame; the unseeded global RNG gave different numbers on each run.
df = pd.DataFrame(
    np.random.default_rng(0).standard_normal((5, 3)),
    index=["a", "c", "e", "f", "h"],
    columns=["one", "two", "three"],
)

# Constant string column, broadcast to every row.
df["four"] = "bar"

# Boolean column: True where "one" is strictly positive.
df["five"] = df["one"] > 0

In [3]:
# Display the base frame: three float columns, one string column, one bool column.
df

Unnamed: 0,one,two,three,four,five
a,0.127331,1.80989,0.228337,bar,True
c,0.371086,0.079907,1.311991,bar,True
e,1.732251,-1.299421,-0.577933,bar,True
f,0.882516,0.755002,0.938911,bar,True
h,0.763561,0.10891,1.607601,bar,True


In [4]:
# Reindex onto the full a..h label range; labels absent from df (b, d, g)
# become rows whose every column is missing.
df2 = df.reindex(list("abcdefgh"))

In [5]:
# Display the reindexed frame; rows b, d and g are entirely missing.
df2

Unnamed: 0,one,two,three,four,five
a,0.127331,1.80989,0.228337,bar,True
b,,,,,
c,0.371086,0.079907,1.311991,bar,True
d,,,,,
e,1.732251,-1.299421,-0.577933,bar,True
f,0.882516,0.755002,0.938911,bar,True
g,,,,,
h,0.763561,0.10891,1.607601,bar,True


In [6]:
# Single float column: the missing rows show up as NaN.
df2["one"]

a    0.127331
b         NaN
c    0.371086
d         NaN
e    1.732251
f    0.882516
g         NaN
h    0.763561
Name: one, dtype: float64

In [7]:
# Top-level pd.isna: boolean mask that is True exactly where the value is missing.
pd.isna(df2["one"])

a    False
b     True
c    False
d     True
e    False
f    False
g     True
h    False
Name: one, dtype: bool

In [8]:
# Method form of the inverse check: True where a value is present.
# It works on the string column too, not only on floats.
df2["four"].notna()

a     True
b    False
c     True
d    False
e     True
f     True
g    False
h     True
Name: four, dtype: bool

In [9]:
# Element-wise missing-value mask for the whole frame.
df2.isna()

Unnamed: 0,one,two,three,four,five
a,False,False,False,False,False
b,True,True,True,True,True
c,False,False,False,False,False
d,True,True,True,True,True
e,False,False,False,False,False
f,False,False,False,False,False
g,True,True,True,True,True
h,False,False,False,False,False


In [10]:
# Pitfall demonstration: NaN never compares equal to anything, itself included,
# so this is False on every row -- even b, d and g, where the value IS missing.
# Use isna() / notna() to detect missing values instead of ==.
df2["one"] == np.nan

a    False
b    False
c    False
d    False
e    False
f    False
g    False
h    False
Name: one, dtype: bool

In [11]:
# Nullable integer dtype: the np.nan in the input is stored as <NA> and the
# Series keeps the Int64 dtype instead of being upcast to float.
pd.Series([1, 2, np.nan, 4], dtype=pd.Int64Dtype())

0       1
1       2
2    <NA>
3       4
dtype: Int64

In [12]:
# Rebind df2 to a fresh copy of the base frame (this discards the reindexed
# version built earlier), so the edits below do not touch df.
df2 = df.copy()
df2

Unnamed: 0,one,two,three,four,five
a,0.127331,1.80989,0.228337,bar,True
c,0.371086,0.079907,1.311991,bar,True
e,1.732251,-1.299421,-0.577933,bar,True
f,0.882516,0.755002,0.938911,bar,True
h,0.763561,0.10891,1.607601,bar,True


In [13]:
# Add a datetime column by broadcasting a single Timestamp to every row.
df2["timestamp"] = pd.Timestamp("20120101")

In [14]:
# Display df2 with the new "timestamp" column.
df2

Unnamed: 0,one,two,three,four,five,timestamp
a,0.127331,1.80989,0.228337,bar,True,2012-01-01
c,0.371086,0.079907,1.311991,bar,True,2012-01-01
e,1.732251,-1.299421,-0.577933,bar,True,2012-01-01
f,0.882516,0.755002,0.938911,bar,True,2012-01-01
h,0.763561,0.10891,1.607601,bar,True,2012-01-01


In [15]:
# Blank out rows a, c and h in one float column and the datetime column.
# The same np.nan is displayed as NaN in "one" and as NaT in "timestamp".
df2.loc[["a", "c", "h"], ["one", "timestamp"]] = np.nan

df2

Unnamed: 0,one,two,three,four,five,timestamp
a,,1.80989,0.228337,bar,True,NaT
c,,0.079907,1.311991,bar,True,NaT
e,1.732251,-1.299421,-0.577933,bar,True,2012-01-01
f,0.882516,0.755002,0.938911,bar,True,2012-01-01
h,,0.10891,1.607601,bar,True,NaT


In [16]:
# Count columns per dtype: inserting missing values did not change the dtype
# of the float or datetime columns.
df2.dtypes.value_counts()

float64           3
object            1
bool              1
datetime64[ns]    1
dtype: int64

In [17]:
# None assigned into a numeric Series is stored as NaN.
# The Series is created as float64 up front: putting a missing value into an
# int64 Series forces an implicit int -> float upcast inside setitem, which
# recent pandas deprecates. The displayed result (NaN, 2.0, 3.0; float64) is
# the same as before.
s = pd.Series([1.0, 2.0, 3.0])
s.loc[0] = None
s

0    NaN
1    2.0
2    3.0
dtype: float64

In [18]:
# Contrast with the numeric case: in the output below, the string Series
# (object dtype) stores the assigned value as given -- None stays None and
# np.nan stays NaN.
s = pd.Series(["a", "b", "c"])
s.loc[0] = None
s.loc[1] = np.nan
s

0    None
1     NaN
2       c
dtype: object

In [19]:
# Re-display the base frame used by the reductions below; it has no missing values.
df

Unnamed: 0,one,two,three,four,five
a,0.127331,1.80989,0.228337,bar,True
c,0.371086,0.079907,1.311991,bar,True
e,1.732251,-1.299421,-0.577933,bar,True
f,0.882516,0.755002,0.938911,bar,True
h,0.763561,0.10891,1.607601,bar,True


In [20]:
# Sum of one float column (no values are missing in df, so nothing is skipped).
df["one"].sum()

3.8767452676502723

In [21]:
# Row-wise mean over the numeric columns.
# The axis is passed by keyword, and numeric_only=True is explicit because the
# string column "four" cannot be averaged: relying on pandas to drop it
# silently emitted a deprecation warning and raises TypeError in newer
# releases. The bool column "five" is still included, counted as 0/1.
df.mean(axis=1, numeric_only=True)

  """Entry point for launching an IPython kernel.


a    0.791389
c    0.690746
e    0.213724
f    0.894107
h    0.870018
dtype: float64

In [22]:
# Running total down each column. As the output shows, the string column is
# accumulated by concatenation and the bool column as integer counts.
df.cumsum()

Unnamed: 0,one,two,three,four,five
a,0.127331,1.80989,0.228337,bar,1
c,0.498417,1.889797,1.540328,barbar,2
e,2.230668,0.590376,0.962395,barbarbar,3
f,3.113184,1.345377,1.901306,barbarbarbar,4
h,3.876745,1.454288,3.508907,barbarbarbarbar,5


In [23]:
# Same running total, but without skipping missing values.
# NOTE(review): df has no missing values at this point, so the output is
# identical to the default call above; skipna=False only makes a visible
# difference on a frame that actually contains NaN.
df.cumsum(skipna=False)

Unnamed: 0,one,two,three,four,five
a,0.127331,1.80989,0.228337,bar,1
c,0.498417,1.889797,1.540328,barbar,2
e,2.230668,0.590376,0.962395,barbarbar,3
f,3.113184,1.345377,1.901306,barbarbarbar,4
h,3.876745,1.454288,3.508907,barbarbarbarbar,5


In [24]:
# The sum of an all-NaN Series is 0.0: missing values are skipped by default,
# which leaves an empty sum.
pd.Series([np.nan]).sum()

0.0

In [25]:
# The product of an empty Series is 1.0 (the multiplicative identity).
# The dtype is given explicitly because an empty list carries no type information.
pd.Series([], dtype="float64").prod()

1.0

In [26]:
# Re-display the base frame before the groupby example.
df

Unnamed: 0,one,two,three,four,five
a,0.127331,1.80989,0.228337,bar,True
c,0.371086,0.079907,1.311991,bar,True
e,1.732251,-1.299421,-0.577933,bar,True
f,0.882516,0.755002,0.938911,bar,True
h,0.763561,0.10891,1.607601,bar,True


In [27]:
# Group by the values of "one" and average the remaining numeric columns.
# numeric_only=True makes the exclusion of the string column "four" explicit:
# older pandas dropped it silently (as in the stored output), while newer
# releases raise TypeError when asked to average strings. The bool column
# "five" is kept and averaged as 0/1.
df.groupby("one").mean(numeric_only=True)

Unnamed: 0_level_0,two,three,five
one,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0.127331,1.80989,0.228337,1.0
0.371086,0.079907,1.311991,1.0
0.763561,0.10891,1.607601,1.0
0.882516,0.755002,0.938911,1.0
1.732251,-1.299421,-0.577933,1.0


In [28]:
# Display df2 again: "one" and "timestamp" are missing in rows a, c and h.
df2

Unnamed: 0,one,two,three,four,five,timestamp
a,,1.80989,0.228337,bar,True,NaT
c,,0.079907,1.311991,bar,True,NaT
e,1.732251,-1.299421,-0.577933,bar,True,2012-01-01
f,0.882516,0.755002,0.938911,bar,True,2012-01-01
h,,0.10891,1.607601,bar,True,NaT


In [29]:
# Replace every missing value with the scalar 0 (returns a new frame; df2 is
# left unchanged).
# NOTE(review): the scalar is applied to every column, including "timestamp".
# The output shows its NaT entries replaced by the integer 0 next to real
# timestamps. Pass a per-column dict if only the numeric columns should be
# filled.
df2.fillna(0)

Unnamed: 0,one,two,three,four,five,timestamp
a,0.0,1.80989,0.228337,bar,True,0
c,0.0,0.079907,1.311991,bar,True,0
e,1.732251,-1.299421,-0.577933,bar,True,2012-01-01 00:00:00
f,0.882516,0.755002,0.938911,bar,True,2012-01-01 00:00:00
h,0.0,0.10891,1.607601,bar,True,0


In [30]:
# Filling a float column with a string: the result can no longer be float64,
# so it comes back with object dtype.
df2["one"].fillna("missing")

a     missing
c     missing
e    1.732251
f    0.882516
h     missing
Name: one, dtype: object

In [31]:
# Re-display the base frame; it is unaffected by the df2 examples above.
df

Unnamed: 0,one,two,three,four,five
a,0.127331,1.80989,0.228337,bar,True
c,0.371086,0.079907,1.311991,bar,True
e,1.732251,-1.299421,-0.577933,bar,True
f,0.882516,0.755002,0.938911,bar,True
h,0.763561,0.10891,1.607601,bar,True


In [32]:
# Ten-row frame of standard-normal draws with staggered runs of missing
# values, one per column: A rows 3-4, B rows 4-5, C rows 5-7.
# The values come from a seeded Generator so the column means used by the
# fillna examples are the same on every run; the unseeded global RNG was not
# reproducible.
dff = pd.DataFrame(
    np.random.default_rng(1).standard_normal((10, 3)),
    columns=list("ABC"),
)
dff.iloc[3:5, 0] = np.nan
dff.iloc[4:6, 1] = np.nan
dff.iloc[5:8, 2] = np.nan
dff

Unnamed: 0,A,B,C
0,0.251466,-0.908926,-0.58256
1,0.068248,-0.449031,1.55036
2,0.291559,-0.312353,0.314748
3,,1.933693,1.738094
4,,,-0.073807
5,0.053037,,
6,-1.566602,-0.040667,
7,0.197223,0.96494,
8,0.35646,0.748786,0.221801
9,-0.0137,-2.338043,0.151998


In [33]:
# Fill each column's missing values with that column's own mean: dff.mean()
# is a Series indexed by column name, and fillna aligns it on those names.
dff.fillna(dff.mean())

Unnamed: 0,A,B,C
0,0.251466,-0.908926,-0.58256
1,0.068248,-0.449031,1.55036
2,0.291559,-0.312353,0.314748
3,-0.045289,1.933693,1.738094
4,-0.045289,-0.0502,-0.073807
5,0.053037,-0.0502,0.474376
6,-1.566602,-0.040667,0.474376
7,0.197223,0.96494,0.474376
8,0.35646,0.748786,0.221801
9,-0.0137,-2.338043,0.151998


In [34]:
# Restrict the fill values to the labels "B" through "C" (label slices include
# both endpoints), so only B and C are filled and column A keeps its NaN.
dff.fillna(dff.mean()["B":"C"])

Unnamed: 0,A,B,C
0,0.251466,-0.908926,-0.58256
1,0.068248,-0.449031,1.55036
2,0.291559,-0.312353,0.314748
3,,1.933693,1.738094
4,,-0.0502,-0.073807
5,0.053037,-0.0502,0.474376
6,-1.566602,-0.040667,0.474376
7,0.197223,0.96494,0.474376
8,0.35646,0.748786,0.221801
9,-0.0137,-2.338043,0.151998


In [35]:
# Re-display the base frame.
df

Unnamed: 0,one,two,three,four,five
a,0.127331,1.80989,0.228337,bar,True
c,0.371086,0.079907,1.311991,bar,True
e,1.732251,-1.299421,-0.577933,bar,True
f,0.882516,0.755002,0.938911,bar,True
h,0.763561,0.10891,1.607601,bar,True
