# Handling Missing Data

In [1]:
import pandas as pd
import numpy as np

This notebook demonstrates different methods for handling missing data in a dataset using Pandas.

In [2]:
# ## Creating the Dataset
# Instead of loading from a CSV, we define the dataset as a dictionary for reusability.

# Define dataset as a dictionary
weather_data = {
    "day": pd.date_range("1/1/2017", periods=10, freq='D'),
    "temperature": [32, np.nan, 28, np.nan, 32, np.nan, np.nan, 34, 40, np.nan],
    "windspeed": [6, 9, np.nan, 7, np.nan, np.nan, np.nan, 8, 12, np.nan],
    "event": ["Rain", "Sunny", "Snow", np.nan, "Rain", "Sunny", np.nan, "Cloudy", "Sunny", np.nan]
}

# Create DataFrame
df = pd.DataFrame(weather_data)
print(df)

         day  temperature  windspeed   event
0 2017-01-01         32.0        6.0    Rain
1 2017-01-02          NaN        9.0   Sunny
2 2017-01-03         28.0        NaN    Snow
3 2017-01-04          NaN        7.0     NaN
4 2017-01-05         32.0        NaN    Rain
5 2017-01-06          NaN        NaN   Sunny
6 2017-01-07          NaN        NaN     NaN
7 2017-01-08         34.0        8.0  Cloudy
8 2017-01-09         40.0       12.0   Sunny
9 2017-01-10          NaN        NaN     NaN


In [3]:
# ## Setting Index
# We set the 'day' column as the index to facilitate time-series operations.
df.set_index('day', inplace=True)
print(df)

            temperature  windspeed   event
day                                       
2017-01-01         32.0        6.0    Rain
2017-01-02          NaN        9.0   Sunny
2017-01-03         28.0        NaN    Snow
2017-01-04          NaN        7.0     NaN
2017-01-05         32.0        NaN    Rain
2017-01-06          NaN        NaN   Sunny
2017-01-07          NaN        NaN     NaN
2017-01-08         34.0        8.0  Cloudy
2017-01-09         40.0       12.0   Sunny
2017-01-10          NaN        NaN     NaN


In [None]:
# # loading from csv and replacing na values
# df = pd.read_csv("../Datasets/stock_data.csv",  na_values={
#         'eps': ['not available'],
#         'revenue': [-1],
#         'people': ['not available','n.a.']
#     })
# df

## Filling Missing Values

In [4]:
# ### Fill all NaN with a specific value
new_df = df.fillna(0)
# print(new_df)

In [5]:
# ### Fill NaN using a dictionary for specific columns
new_df = df.fillna({
    'temperature': 0,
    'windspeed': 0,
    'event': 'No Event'
})
# print(new_df)

In [6]:
# ### Forward Fill (propagate previous values forward)
new_df = df.fillna(method="ffill")
print(df)
print("-----")
print(new_df)

            temperature  windspeed   event
day                                       
2017-01-01         32.0        6.0    Rain
2017-01-02          NaN        9.0   Sunny
2017-01-03         28.0        NaN    Snow
2017-01-04          NaN        7.0     NaN
2017-01-05         32.0        NaN    Rain
2017-01-06          NaN        NaN   Sunny
2017-01-07          NaN        NaN     NaN
2017-01-08         34.0        8.0  Cloudy
2017-01-09         40.0       12.0   Sunny
2017-01-10          NaN        NaN     NaN
-----
            temperature  windspeed   event
day                                       
2017-01-01         32.0        6.0    Rain
2017-01-02         32.0        9.0   Sunny
2017-01-03         28.0        9.0    Snow
2017-01-04         28.0        7.0    Snow
2017-01-05         32.0        7.0    Rain
2017-01-06         32.0        7.0   Sunny
2017-01-07         32.0        7.0   Sunny
2017-01-08         34.0        8.0  Cloudy
2017-01-09         40.0       12.0   Sunny
2017-

  new_df = df.fillna(method="ffill")


In [8]:
# ### Backward Fill (propagate next values backward)
new_df = df.fillna(method="bfill")
# print(df)
# print("-----")
# print(new_df)

  new_df = df.fillna(method="bfill")


In [9]:
# ### Column-wise Backward Fill
#Uses the rows to fill data
new_df = df.fillna(method="bfill", axis="columns")
# print(df)
# print("-----")
# print(new_df)

  new_df = df.fillna(method="bfill", axis="columns")
  new_df = df.fillna(method="bfill", axis="columns")


## Interpolation

This method estimates missing values based on nearby values.

In [10]:
# Basic interpolation
new_df = df.interpolate()
print(df)
print("-----")
print(new_df)

            temperature  windspeed   event
day                                       
2017-01-01         32.0        6.0    Rain
2017-01-02          NaN        9.0   Sunny
2017-01-03         28.0        NaN    Snow
2017-01-04          NaN        7.0     NaN
2017-01-05         32.0        NaN    Rain
2017-01-06          NaN        NaN   Sunny
2017-01-07          NaN        NaN     NaN
2017-01-08         34.0        8.0  Cloudy
2017-01-09         40.0       12.0   Sunny
2017-01-10          NaN        NaN     NaN
-----
            temperature  windspeed   event
day                                       
2017-01-01    32.000000       6.00    Rain
2017-01-02    30.000000       9.00   Sunny
2017-01-03    28.000000       8.00    Snow
2017-01-04    30.000000       7.00     NaN
2017-01-05    32.000000       7.25    Rain
2017-01-06    32.666667       7.50   Sunny
2017-01-07    33.333333       7.75     NaN
2017-01-08    34.000000       8.00  Cloudy
2017-01-09    40.000000      12.00   Sunny
2017-

  new_df = df.interpolate()


In [12]:
# Time-based interpolation
new_df = df.interpolate(method="time")
# print(df)
# print("-----")
# print(new_df)

  new_df = df.interpolate(method="time")


## Dropping Missing Values

In [None]:
# ### Drop all rows with any NaN values
new_df = df.dropna()
print(new_df)

# ### Drop rows only if all values are NaN
new_df = df.dropna(how='all')
print(new_df)

In [13]:
# ### Drop rows if less than a threshold of non-null values
new_df = df.dropna(thresh=1)
print(df)
print("-----")
print(new_df)

            temperature  windspeed   event
day                                       
2017-01-01         32.0        6.0    Rain
2017-01-02          NaN        9.0   Sunny
2017-01-03         28.0        NaN    Snow
2017-01-04          NaN        7.0     NaN
2017-01-05         32.0        NaN    Rain
2017-01-06          NaN        NaN   Sunny
2017-01-07          NaN        NaN     NaN
2017-01-08         34.0        8.0  Cloudy
2017-01-09         40.0       12.0   Sunny
2017-01-10          NaN        NaN     NaN
-----
            temperature  windspeed   event
day                                       
2017-01-01         32.0        6.0    Rain
2017-01-02          NaN        9.0   Sunny
2017-01-03         28.0        NaN    Snow
2017-01-04          NaN        7.0     NaN
2017-01-05         32.0        NaN    Rain
2017-01-06          NaN        NaN   Sunny
2017-01-08         34.0        8.0  Cloudy
2017-01-09         40.0       12.0   Sunny


## Inserting Missing Dates

In [14]:
# Sometimes, datasets miss certain dates. We can insert missing dates explicitly.

dt = pd.date_range("01-01-2017", "01-11-2017")
idx = pd.DatetimeIndex(dt)
df = df.reindex(idx)
print(df)

            temperature  windspeed   event
2017-01-01         32.0        6.0    Rain
2017-01-02          NaN        9.0   Sunny
2017-01-03         28.0        NaN    Snow
2017-01-04          NaN        7.0     NaN
2017-01-05         32.0        NaN    Rain
2017-01-06          NaN        NaN   Sunny
2017-01-07          NaN        NaN     NaN
2017-01-08         34.0        8.0  Cloudy
2017-01-09         40.0       12.0   Sunny
2017-01-10          NaN        NaN     NaN
2017-01-11          NaN        NaN     NaN


# Additional Data Cleaning Example

In [15]:
# This section includes another dataset showcasing how to handle missing values in different contexts.

people = {
    'first': ['Corey', 'Jane', 'John', 'Chris', np.nan, None, 'NA'], 
    'last': ['Schafer', 'Doe', 'Doe', 'Schafer', np.nan, np.nan, 'Missing'], 
    'email': ['CoreyMSchafer@gmail.com', 'JaneDoe@email.com', 'JohnDoe@email.com', None, np.nan, 'Anonymous@email.com', 'NA'],
    'age': ['33', '55', '63', '36', None, None, 'Missing']
}

df_people = pd.DataFrame(people)
print(df_people)

   first     last                    email      age
0  Corey  Schafer  CoreyMSchafer@gmail.com       33
1   Jane      Doe        JaneDoe@email.com       55
2   John      Doe        JohnDoe@email.com       63
3  Chris  Schafer                     None       36
4    NaN      NaN                      NaN     None
5   None      NaN      Anonymous@email.com     None
6     NA  Missing                       NA  Missing


In [16]:
# Replacing placeholders with NaN
df_people.replace(['NA', 'Missing'], np.nan, inplace=True)
print(df_people)

   first     last                    email   age
0  Corey  Schafer  CoreyMSchafer@gmail.com    33
1   Jane      Doe        JaneDoe@email.com    55
2   John      Doe        JohnDoe@email.com    63
3  Chris  Schafer                     None    36
4    NaN      NaN                      NaN  None
5   None      NaN      Anonymous@email.com  None
6    NaN      NaN                      NaN   NaN


In [17]:
# Drop only if all specified columns have NaN
df_filtered = df_people.dropna(subset=['last', 'email'], how='all')
print(df_filtered)

   first     last                    email   age
0  Corey  Schafer  CoreyMSchafer@gmail.com    33
1   Jane      Doe        JaneDoe@email.com    55
2   John      Doe        JohnDoe@email.com    63
3  Chris  Schafer                     None    36
5   None      NaN      Anonymous@email.com  None


In [18]:
# Checking for missing values
df_people.isna()


Unnamed: 0,first,last,email,age
0,False,False,False,False
1,False,False,False,False
2,False,False,False,False
3,False,False,True,False
4,True,True,True,True
5,True,True,False,True
6,True,True,True,True


In [19]:
# Fill missing values with 0
df_filled = df_people.fillna(0)
print(df_filled)

   first     last                    email age
0  Corey  Schafer  CoreyMSchafer@gmail.com  33
1   Jane      Doe        JaneDoe@email.com  55
2   John      Doe        JohnDoe@email.com  63
3  Chris  Schafer                        0  36
4      0        0                        0   0
5      0        0      Anonymous@email.com   0
6      0        0                        0   0


In [20]:
# Converting age to numeric
df_people['age'] = pd.to_numeric(df_people['age'], errors='coerce')
print(df_people.dtypes)


first     object
last      object
email     object
age      float64
dtype: object
