<a href="https://colab.research.google.com/github/ramank123/Pandas/blob/main/Working_with_missing_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import numpy as np
import pandas as pd

In [3]:
# Build the base example frame from a *seeded* generator so that a fresh
# "Restart & Run All" reproduces exactly the same numbers every time.
# The row labels deliberately skip letters so a later reindex can introduce gaps.
df = pd.DataFrame(
    np.random.default_rng(0).standard_normal((5, 3)),
    index=["a", "c", "e", "f", "h"],
    columns=["one", "two", "three"],
)

# Constant string column.
df["four"] = "bar"

# Boolean column: True where "one" is strictly positive.
df["five"] = df["one"] > 0

In [4]:
# Show the freshly built frame: three float columns plus "four" (string) and "five" (bool).
df

Unnamed: 0,one,two,three,four,five
a,-1.364568,-1.399343,-0.709584,bar,False
c,1.006688,-0.753678,0.300014,bar,True
e,0.237258,-0.627598,-0.313282,bar,True
f,-0.2958,1.380749,-1.515091,bar,False
h,0.36544,2.100392,1.408104,bar,True


In [5]:
# Reindex onto the full a..h label range; labels that df lacks come back as empty rows.
df2 = df.reindex(index=list("abcdefgh"))

In [6]:
# Inspect the reindexed frame; the labels absent from df ("b", "d", "g") appear as all-missing rows.
df2

Unnamed: 0,one,two,three,four,five
a,-1.364568,-1.399343,-0.709584,bar,False
b,,,,,
c,1.006688,-0.753678,0.300014,bar,True
d,,,,,
e,0.237258,-0.627598,-0.313282,bar,True
f,-0.2958,1.380749,-1.515091,bar,False
g,,,,,
h,0.36544,2.100392,1.408104,bar,True


In [7]:
# Pull out a single column; the inserted rows show up as NaN in this float64 Series.
df2["one"]

a   -1.364568
b         NaN
c    1.006688
d         NaN
e    0.237258
f   -0.295800
g         NaN
h    0.365440
Name: one, dtype: float64

In [8]:
# Top-level helper: element-wise mask that is True wherever "one" is missing.
pd.isna(df2["one"])

a    False
b     True
c    False
d     True
e    False
f    False
g     True
h    False
Name: one, dtype: bool

In [9]:
# Method form of the inverse check: True wherever "four" holds a value.
df2["four"].notna()

a     True
b    False
c     True
d    False
e     True
f     True
g    False
h     True
Name: four, dtype: bool

In [10]:
# Missing-value mask for the whole frame at once (same shape, boolean entries).
df2.isna()

Unnamed: 0,one,two,three,four,five
a,False,False,False,False,False
b,True,True,True,True,True
c,False,False,False,False,False
d,True,True,True,True,True
e,False,False,False,False,False
f,False,False,False,False,False
g,True,True,True,True,True
h,False,False,False,False,False


In [11]:
# Pitfall demo: equality against np.nan is False everywhere, even on the rows
# where "one" is missing (NaN never compares equal to anything), so it cannot
# be used to detect missing data -- use isna()/notna() instead.
df2["one"] == np.nan

a    False
b    False
c    False
d    False
e    False
f    False
g    False
h    False
Name: one, dtype: bool

In [12]:
# Nullable integer dtype (requested via its string alias): the missing entry is
# kept as <NA> and the remaining values stay integers.
pd.Series([1, 2, np.nan, 4], dtype="Int64")

0       1
1       2
2    <NA>
3       4
dtype: Int64

In [13]:
# Start over from an independent deep copy of df (this replaces the reindexed df2).
df2 = df.copy(deep=True)
df2

Unnamed: 0,one,two,three,four,five
a,-1.364568,-1.399343,-0.709584,bar,False
c,1.006688,-0.753678,0.300014,bar,True
e,0.237258,-0.627598,-0.313282,bar,True
f,-0.2958,1.380749,-1.515091,bar,False
h,0.36544,2.100392,1.408104,bar,True


In [14]:
# Broadcast a single Timestamp into a new column so the frame also carries a datetime column.
df2["timestamp"] = pd.Timestamp("20120101")

In [15]:
# Display the copy with its new "timestamp" column (same date on every row).
df2

Unnamed: 0,one,two,three,four,five,timestamp
a,-1.364568,-1.399343,-0.709584,bar,False,2012-01-01
c,1.006688,-0.753678,0.300014,bar,True,2012-01-01
e,0.237258,-0.627598,-0.313282,bar,True,2012-01-01
f,-0.2958,1.380749,-1.515091,bar,False,2012-01-01
h,0.36544,2.100392,1.408104,bar,True,2012-01-01


In [16]:
# Blank out rows a, c and h in both a float and a datetime column using np.nan;
# the displayed result shows NaN in "one" and NaT in "timestamp".
df2.loc[["a", "c", "h"], ["one", "timestamp"]] = np.nan

df2

Unnamed: 0,one,two,three,four,five,timestamp
a,,-1.399343,-0.709584,bar,False,NaT
c,,-0.753678,0.300014,bar,True,NaT
e,0.237258,-0.627598,-0.313282,bar,True,2012-01-01
f,-0.2958,1.380749,-1.515091,bar,False,2012-01-01
h,,2.100392,1.408104,bar,True,NaT


In [17]:
# Tally how many columns there are of each dtype after inserting the missing values.
df2.dtypes.value_counts()

float64           3
object            1
bool              1
datetime64[ns]    1
dtype: int64

In [18]:
# Assigning None into an integer Series: per the output below it is stored as
# NaN and the Series becomes float64.
s = pd.Series([1, 2, 3])
s.loc[0] = None
s

0    NaN
1    2.0
2    3.0
dtype: float64

In [20]:
# With string data (object dtype in the output below) the inserted value is
# kept as given: None stays None and np.nan stays NaN.
s = pd.Series(["a", "b", "c"])
s.loc[0] = None
s.loc[1] = np.nan
s

0    None
1     NaN
2       c
dtype: object

In [22]:
# Re-display the original frame as the input for the calculations that follow.
df

Unnamed: 0,one,two,three,four,five
a,-1.364568,-1.399343,-0.709584,bar,False
c,1.006688,-0.753678,0.300014,bar,True
e,0.237258,-0.627598,-0.313282,bar,True
f,-0.2958,1.380749,-1.515091,bar,False
h,0.36544,2.100392,1.408104,bar,True


In [23]:
# Sum of column "one"; df has no missing entries, so every value contributes.
df["one"].sum()

-0.050983653711757726

In [24]:
# Row-wise mean over the numeric and boolean columns only. The string column
# "four" cannot be averaged: relying on pandas to drop it silently was
# deprecated (the source of the warning in the old output) and raises
# TypeError in pandas >= 2.0, so it is excluded explicitly here.
df.mean(axis=1, numeric_only=True)

  """Entry point for launching an IPython kernel.


a   -0.868374
c    0.388256
e    0.074095
f   -0.107536
h    1.218484
dtype: float64

In [25]:
# Cumulative sum down each column. Per the output, the string column "four" is
# concatenated ("bar", "barbar", ...) and the boolean column "five" is counted as 0/1.
df.cumsum()

Unnamed: 0,one,two,three,four,five
a,-1.364568,-1.399343,-0.709584,bar,0
c,-0.357881,-2.153021,-0.40957,barbar,1
e,-0.120623,-2.780619,-0.722851,barbarbar,2
f,-0.416423,-1.399869,-2.237942,barbarbarbar,2
h,-0.050984,0.700523,-0.829838,barbarbarbarbar,3


In [26]:
# Same call with skipna=False. df contains no missing values, so the output is
# identical to the default call; the flag only makes a difference on data with NaN.
df.cumsum(skipna=False)

Unnamed: 0,one,two,three,four,five
a,-1.364568,-1.399343,-0.709584,bar,0
c,-0.357881,-2.153021,-0.40957,barbar,1
e,-0.120623,-2.780619,-0.722851,barbarbar,2
f,-0.416423,-1.399869,-2.237942,barbarbarbar,2
h,-0.050984,0.700523,-0.829838,barbarbarbarbar,3


In [27]:
# The sum of an all-NaN Series is 0.0 (the NaN is skipped, leaving an empty sum).
pd.Series([np.nan]).sum()

0.0

In [28]:
# The product of an empty Series is 1.0.
pd.Series([], dtype="float64").prod()

1.0

In [29]:
# Re-display the original frame as the input for the groupby example.
df

Unnamed: 0,one,two,three,four,five
a,-1.364568,-1.399343,-0.709584,bar,False
c,1.006688,-0.753678,0.300014,bar,True
e,0.237258,-0.627598,-0.313282,bar,True
f,-0.2958,1.380749,-1.515091,bar,False
h,0.36544,2.100392,1.408104,bar,True


In [30]:
# Group on the values of "one" and average the remaining numeric/boolean
# columns. numeric_only=True leaves out the string column "four" explicitly;
# without it pandas >= 2.0 raises TypeError instead of dropping the column.
df.groupby("one").mean(numeric_only=True)

Unnamed: 0_level_0,two,three,five
one,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
-1.364568,-1.399343,-0.709584,0.0
-0.2958,1.380749,-1.515091,0.0
0.237258,-0.627598,-0.313282,1.0
0.36544,2.100392,1.408104,1.0
1.006688,-0.753678,0.300014,1.0


In [31]:
# Re-display the frame with missing values as the input for the fillna examples.
df2

Unnamed: 0,one,two,three,four,five,timestamp
a,,-1.399343,-0.709584,bar,False,NaT
c,,-0.753678,0.300014,bar,True,NaT
e,0.237258,-0.627598,-0.313282,bar,True,2012-01-01
f,-0.2958,1.380749,-1.515091,bar,False,2012-01-01
h,,2.100392,1.408104,bar,True,NaT


In [32]:
# Replace every missing entry with the scalar 0. Note in the output that this
# also puts the integer 0 into the "timestamp" column alongside real timestamps.
df2.fillna(0)

Unnamed: 0,one,two,three,four,five,timestamp
a,0.0,-1.399343,-0.709584,bar,False,0
c,0.0,-0.753678,0.300014,bar,True,0
e,0.237258,-0.627598,-0.313282,bar,True,2012-01-01 00:00:00
f,-0.2958,1.380749,-1.515091,bar,False,2012-01-01 00:00:00
h,0.0,2.100392,1.408104,bar,True,0


In [33]:
# Fill with a string instead; the resulting Series is object dtype because it
# now mixes text and floats.
df2["one"].fillna("missing")

a     missing
c     missing
e    0.237258
f     -0.2958
h     missing
Name: one, dtype: object

In [34]:
# Display the original frame again for reference (it is unchanged by the calls above).
df

Unnamed: 0,one,two,three,four,five
a,-1.364568,-1.399343,-0.709584,bar,False
c,1.006688,-0.753678,0.300014,bar,True
e,0.237258,-0.627598,-0.313282,bar,True
f,-0.2958,1.380749,-1.515091,bar,False
h,0.36544,2.100392,1.408104,bar,True


In [38]:
# Second example frame, drawn from a *seeded* generator so that the fill
# results further down are reproducible on a fresh run.
dff = pd.DataFrame(
    np.random.default_rng(1).standard_normal((10, 3)),
    columns=list("ABC"),
)
# Punch overlapping holes: rows 3-4 of A, rows 4-5 of B, rows 5-7 of C.
dff.iloc[3:5, 0] = np.nan
dff.iloc[4:6, 1] = np.nan
dff.iloc[5:8, 2] = np.nan
dff

Unnamed: 0,A,B,C
0,-0.255367,-0.53923,2.847722
1,0.108969,1.413062,0.272874
2,-0.628343,0.032028,0.253052
3,,1.475702,-2.411105
4,,,1.77239
5,-0.632499,,
6,0.040124,-0.63228,
7,-0.317891,0.979361,
8,-0.958165,0.69749,0.561473
9,1.58048,0.889468,2.156919


In [37]:
# Fill each column's gaps with that column's own mean.
# NOTE(review): the saved output below carries execution count 37, i.e. it was
# produced before dff was rebuilt in the cell numbered 38, so its numbers do
# not match the dff displayed above -- re-run the notebook top to bottom to
# refresh it.
dff.fillna(dff.mean())

Unnamed: 0,A,B,C
0,-1.335023,-2.052158,-0.644278
1,-0.3418,0.495224,-1.467893
2,0.384548,-1.368252,0.919045
3,-0.276416,0.486822,-0.072087
4,-0.276416,-0.550365,-0.14978
5,1.051745,-0.550365,-0.435179
6,0.204043,0.839498,-0.435179
7,-0.99204,-1.832423,-0.435179
8,-1.854172,-1.165221,-1.498277
9,0.67137,0.193584,-0.132981


In [39]:
# Fill only columns B and C with their column means; A lies outside the label
# slice and therefore keeps its NaNs.
dff.fillna(dff.mean().loc["B":"C"])

Unnamed: 0,A,B,C
0,-0.255367,-0.53923,2.847722
1,0.108969,1.413062,0.272874
2,-0.628343,0.032028,0.253052
3,,1.475702,-2.411105
4,,0.53945,1.77239
5,-0.632499,0.53945,0.779046
6,0.040124,-0.63228,0.779046
7,-0.317891,0.979361,0.779046
8,-0.958165,0.69749,0.561473
9,1.58048,0.889468,2.156919


In [40]:
# Re-display the original frame as the input for the dropna examples.
df

Unnamed: 0,one,two,three,four,five
a,-1.364568,-1.399343,-0.709584,bar,False
c,1.006688,-0.753678,0.300014,bar,True
e,0.237258,-0.627598,-0.313282,bar,True
f,-0.2958,1.380749,-1.515091,bar,False
h,0.36544,2.100392,1.408104,bar,True


In [42]:
# Drop every row that contains a missing value (df has none, so all rows remain).
df.dropna(axis="index")

Unnamed: 0,one,two,three,four,five
a,-1.364568,-1.399343,-0.709584,bar,False
c,1.006688,-0.753678,0.300014,bar,True
e,0.237258,-0.627598,-0.313282,bar,True
f,-0.2958,1.380749,-1.515091,bar,False
h,0.36544,2.100392,1.408104,bar,True


In [43]:
# Drop every column that contains a missing value (df has none, so all columns remain).
df.dropna(axis="columns")

Unnamed: 0,one,two,three,four,five
a,-1.364568,-1.399343,-0.709584,bar,False
c,1.006688,-0.753678,0.300014,bar,True
e,0.237258,-0.627598,-0.313282,bar,True
f,-0.2958,1.380749,-1.515091,bar,False
h,0.36544,2.100392,1.408104,bar,True
