In [12]:
# Import pandas library for data manipulation
import pandas as pd

# Read CSV file into a DataFrame, parsing the 'day' column as datetime
df = pd.read_csv('/Users/rohankondhalkar/Desktop/Pandas/missing.csv', parse_dates=['day'])

# Set the 'day' column as the index of the DataFrame
df.set_index('day', inplace=True)

# Display the DataFrame
df

Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,32.0,6.0,Rain
2017-01-04,,9.0,Sunny
2017-01-05,28.0,,Snow
2017-01-06,,7.0,
2017-01-07,32.0,,Rain
2017-01-08,,,Sunny
2017-01-09,,,
2017-01-10,34.0,8.0,Cloudy
2017-01-11,40.0,12.0,Sunny


In [9]:
# Access the row with index '2017-01-08' from the DataFrame df
# This selects all columns for the specified date
df.loc['2017-01-08']

temperature      NaN
windspeed        NaN
event          Sunny
Name: 2017-01-08 00:00:00, dtype: object

In [15]:
# Fill all missing values (NaN) in the dataframe with 0
# This creates a new dataframe 'new_df' without modifying the original 'df'
new_df = df.fillna(0)
# Display the new dataframe with filled values
new_df

Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,32.0,6.0,Rain
2017-01-04,0.0,9.0,Sunny
2017-01-05,28.0,0.0,Snow
2017-01-06,0.0,7.0,0
2017-01-07,32.0,0.0,Rain
2017-01-08,0.0,0.0,Sunny
2017-01-09,0.0,0.0,0
2017-01-10,34.0,8.0,Cloudy
2017-01-11,40.0,12.0,Sunny


In [16]:
# Fill missing values in the dataframe with specific values for each column:
# - Replace NaN in 'temperature' column with 0
# - Replace NaN in 'windspeed' column with 0
# - Replace NaN in 'event' column with 'No Event'
new_df = df.fillna({
    'temperature':0,
    'windspeed': 0,
    'event': 'No Event'
})
new_df  # Display the dataframe with filled missing values

Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,32.0,6.0,Rain
2017-01-04,0.0,9.0,Sunny
2017-01-05,28.0,0.0,Snow
2017-01-06,0.0,7.0,No Event
2017-01-07,32.0,0.0,Rain
2017-01-08,0.0,0.0,Sunny
2017-01-09,0.0,0.0,No Event
2017-01-10,34.0,8.0,Cloudy
2017-01-11,40.0,12.0,Sunny


In [18]:
# Fill missing values using forward fill method
filled_df = df.fillna(method="ffill")
filled_df  # Display the dataframe with filled values

  filled_df = df.fillna(method="ffill")


Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,32.0,6.0,Rain
2017-01-04,32.0,9.0,Sunny
2017-01-05,28.0,9.0,Snow
2017-01-06,28.0,7.0,Snow
2017-01-07,32.0,7.0,Rain
2017-01-08,32.0,7.0,Sunny
2017-01-09,32.0,7.0,Sunny
2017-01-10,34.0,8.0,Cloudy
2017-01-11,40.0,12.0,Sunny


In [20]:
# Fill missing values in the dataframe using forward fill method
# This propagates the last valid observation forward to next valid observation
filled_df = df.ffill()
# Display the dataframe with filled values
filled_df

Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,32.0,6.0,Rain
2017-01-04,32.0,9.0,Sunny
2017-01-05,28.0,9.0,Snow
2017-01-06,28.0,7.0,Snow
2017-01-07,32.0,7.0,Rain
2017-01-08,32.0,7.0,Sunny
2017-01-09,32.0,7.0,Sunny
2017-01-10,34.0,8.0,Cloudy
2017-01-11,40.0,12.0,Sunny


In [33]:
# Interpolate missing values in the dataframe using the default method (linear)
# This fills NA/NaN values using interpolation between valid data points
new_df = df.interpolate()
new_df
# Display the dataframe with interpolated values

  new_df = df.interpolate()


Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,32.0,6.0,Rain
2017-01-04,30.0,9.0,Sunny
2017-01-05,28.0,8.0,Snow
2017-01-06,30.0,7.0,
2017-01-07,32.0,7.25,Rain
2017-01-08,32.666667,7.5,Sunny
2017-01-09,33.333333,7.75,
2017-01-10,34.0,8.0,Cloudy
2017-01-11,40.0,12.0,Sunny


In [27]:
# Create a copy of the original dataframe to avoid modifying it
new_df = df.copy()

# Get a list of all numeric columns in the dataframe
num_cols = new_df.select_dtypes(include="number").columns
# Fill missing values in numeric columns using interpolation
new_df[num_cols] = new_df[num_cols].interpolate()

# Display the dataframe with interpolated values
new_df

Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,32.0,6.0,Rain
2017-01-04,30.0,9.0,Sunny
2017-01-05,28.0,8.0,Snow
2017-01-06,30.0,7.0,
2017-01-07,32.0,7.25,Rain
2017-01-08,32.666667,7.5,Sunny
2017-01-09,33.333333,7.75,
2017-01-10,34.0,8.0,Cloudy
2017-01-11,40.0,12.0,Sunny


In [37]:
# Interpolate missing values in the dataframe using time-based interpolation
# This assumes the dataframe index is a datetime type and fills NaN values
# by estimating them based on the time difference between points
new_df = df.interpolate(method = 'time')
new_df  # Display the dataframe with interpolated values

  new_df = df.interpolate(method = 'time')


Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,32.0,6.0,Rain
2017-01-04,29.0,9.0,Sunny
2017-01-05,28.0,8.0,Snow
2017-01-06,30.0,7.0,
2017-01-07,32.0,7.25,Rain
2017-01-08,32.666667,7.5,Sunny
2017-01-09,33.333333,7.75,
2017-01-10,34.0,8.0,Cloudy
2017-01-11,40.0,12.0,Sunny


In [39]:
# Drop rows with any missing values (NaN) from the DataFrame
# and store the result in a new DataFrame called 'new_df'
new_df = df.dropna()
# Display the new DataFrame without missing values
new_df

Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,32.0,6.0,Rain
2017-01-10,34.0,8.0,Cloudy
2017-01-11,40.0,12.0,Sunny


In [43]:
df.reset_index(inplace=True)
df  # Display the DataFrame after resetting the index

Unnamed: 0,index,day,temperature,windspeed,event
0,0,2017-01-01,32.0,6.0,Rain
1,1,2017-01-04,,9.0,Sunny
2,2,2017-01-05,28.0,,Snow
3,3,2017-01-06,,7.0,
4,4,2017-01-07,32.0,,Rain
5,5,2017-01-08,,,Sunny
6,6,2017-01-09,,,
7,7,2017-01-10,34.0,8.0,Cloudy
8,8,2017-01-11,40.0,12.0,Sunny


In [48]:
# Drop rows where all values are NaN (missing)
# 'how = all' means a row is dropped only if all its values are NaN
new_df = df.dropna(how = 'all')

# Display the resulting dataframe after removing rows with all NaN values
new_df

Unnamed: 0_level_0,index,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2017-01-01,0,32.0,6.0,Rain
2017-01-04,1,,9.0,Sunny
2017-01-05,2,28.0,,Snow
2017-01-06,3,,7.0,
2017-01-07,4,32.0,,Rain
2017-01-08,5,,,Sunny
2017-01-09,6,,,
2017-01-10,7,34.0,8.0,Cloudy
2017-01-11,8,40.0,12.0,Sunny


In [50]:
# Drop rows where all values are NaN (keep rows with at least 1 non-NaN value)
new_df = df.dropna(thresh=1)
# Display the resulting dataframe
new_df

Unnamed: 0_level_0,index,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2017-01-01,0,32.0,6.0,Rain
2017-01-04,1,,9.0,Sunny
2017-01-05,2,28.0,,Snow
2017-01-06,3,,7.0,
2017-01-07,4,32.0,,Rain
2017-01-08,5,,,Sunny
2017-01-09,6,,,
2017-01-10,7,34.0,8.0,Cloudy
2017-01-11,8,40.0,12.0,Sunny


In [58]:
# Create a date range from Jan 1, 2017 to Jan 11, 2017
dt = pd.date_range('01-01-2017','01-11-2017')
# Convert the date range to a DatetimeIndex object
idx = pd.DatetimeIndex(dt)
# Reindex the DataFrame with the new DatetimeIndex
# This will align the DataFrame with the new index, filling missing dates with NaN
df =df.reindex(idx)
df

Unnamed: 0,index,temperature,windspeed,event
2017-01-01,0.0,32.0,6.0,Rain
2017-01-02,,,,
2017-01-03,,,,
2017-01-04,1.0,,9.0,Sunny
2017-01-05,2.0,28.0,,Snow
2017-01-06,3.0,,7.0,
2017-01-07,4.0,32.0,,Rain
2017-01-08,5.0,,,Sunny
2017-01-09,6.0,,,
2017-01-10,7.0,34.0,8.0,Cloudy


In [62]:
# Export the DataFrame 'df' to a CSV file at the specified path
# This saves all data from the DataFrame to a local file for later use or sharing
df.to_csv("/Users/rohankondhalkar/Desktop/Pandas/NewDataForMissing.csv")