In [15]:
import pandas as pd

# Define the data
data = {
    'day': pd.date_range(start="2017-01-01", end="2017-01-11"),
    'temperature': [32, None, 28, None, 30, None, 29, None, 31, 33, None],
    'windspeed': [12, 15, None, None, 8, 12, 10, None, 14, 10, 11],
    'event': ['Sunny', 'Cloudy', 'Sunny', None, 'Rainy', 'Sunny', 'Windy', 'Cloudy', 'Sunny', None, None]
}

# Create a DataFrame
df = pd.DataFrame(data)

# Convert 'day' to datetime and set as index
df['day'] = pd.to_datetime(df['day'])
df.set_index('day', inplace=True)

# Save the DataFrame to a CSV file
df.to_csv("Missing_weather_data.csv", index=True)


In [16]:
new_df = df.fillna(0)
print(new_df) # this .fillna function replace all NaN values with 0.0 :
# sometimes replacing 0 is not the best case :
# if you have to replace different values for different data set then we use dictionaries :
new_df = df.fillna({
    'temperature' : 0,
    'windspeed' : 0,
    'event' : 'No event'
})

            temperature  windspeed   event
day                                       
2017-01-01         32.0       12.0   Sunny
2017-01-02          0.0       15.0  Cloudy
2017-01-03         28.0        0.0   Sunny
2017-01-04          0.0        0.0       0
2017-01-05         30.0        8.0   Rainy
2017-01-06          0.0       12.0   Sunny
2017-01-07         29.0       10.0   Windy
2017-01-08          0.0        0.0  Cloudy
2017-01-09         31.0       14.0   Sunny
2017-01-10         33.0       10.0       0
2017-01-11          0.0       11.0       0


In [17]:
print(new_df) # this will replace values by values that we have given in our dictinary :
# the other way of finding best case is  forward or backward values :
new_df = df.fillna(method='ffill') # ffill = forward fill is used to forward values : 

            temperature  windspeed     event
day                                         
2017-01-01         32.0       12.0     Sunny
2017-01-02          0.0       15.0    Cloudy
2017-01-03         28.0        0.0     Sunny
2017-01-04          0.0        0.0  No event
2017-01-05         30.0        8.0     Rainy
2017-01-06          0.0       12.0     Sunny
2017-01-07         29.0       10.0     Windy
2017-01-08          0.0        0.0    Cloudy
2017-01-09         31.0       14.0     Sunny
2017-01-10         33.0       10.0  No event
2017-01-11          0.0       11.0  No event


  new_df = df.fillna(method='ffill') # ffill = forward fill is used to forward values :


In [18]:
# means ex. 1st day temp  is 32 and 4th day temp is 0 so ffill forward 1st day value in 4th day :
print(new_df) 

            temperature  windspeed   event
day                                       
2017-01-01         32.0       12.0   Sunny
2017-01-02         32.0       15.0  Cloudy
2017-01-03         28.0       15.0   Sunny
2017-01-04         28.0       15.0   Sunny
2017-01-05         30.0        8.0   Rainy
2017-01-06         30.0       12.0   Sunny
2017-01-07         29.0       10.0   Windy
2017-01-08         29.0       10.0  Cloudy
2017-01-09         31.0       14.0   Sunny
2017-01-10         33.0       10.0   Sunny
2017-01-11         33.0       11.0   Sunny


In [19]:
# you can also use bfill = backward fill it will backward values :
# means ex. 5th day temp  is 28 and 4th day temp is 0 so bfill forward 1st day value in 4th day :
new_df = df.fillna(method='bfill')
print(new_df)

            temperature  windspeed   event
day                                       
2017-01-01         32.0       12.0   Sunny
2017-01-02         28.0       15.0  Cloudy
2017-01-03         28.0        8.0   Sunny
2017-01-04         30.0        8.0   Rainy
2017-01-05         30.0        8.0   Rainy
2017-01-06         29.0       12.0   Sunny
2017-01-07         29.0       10.0   Windy
2017-01-08         31.0       14.0  Cloudy
2017-01-09         31.0       14.0   Sunny
2017-01-10         33.0       10.0    None
2017-01-11          NaN       11.0    None


  new_df = df.fillna(method='bfill')


In [20]:
# if you want to forward or backward values horizontally so for that you can use axis='columns' :
# new_df = df.fillna(method='ffill',axis='columns')
# print(new_df) # it will forward values horizontally 

new_df = df.fillna(method='bfill',axis='columns')
print(new_df) # it will backward values horizontally 

           temperature windspeed   event
day                                     
2017-01-01        32.0      12.0   Sunny
2017-01-02        15.0      15.0  Cloudy
2017-01-03        28.0     Sunny   Sunny
2017-01-04        None      None    None
2017-01-05        30.0       8.0   Rainy
2017-01-06        12.0      12.0   Sunny
2017-01-07        29.0      10.0   Windy
2017-01-08      Cloudy    Cloudy  Cloudy
2017-01-09        31.0      14.0   Sunny
2017-01-10        33.0      10.0     NaN
2017-01-11        11.0      11.0     NaN


  new_df = df.fillna(method='bfill',axis='columns')
  new_df = df.fillna(method='bfill',axis='columns')


In [21]:
# if you forward or backward value only one time then you can set limint :
# new_df = df.fillna(method='bfill',limit=1)
# print(new_df) # you can see that 4th month value is forward only ones 6th month remains NaN :
# index is day so you can't run this code in this program :
# for best case of replace values of front and rear values avg value for that you can use interpolate() function :
new_df = df.interpolate()
print(new_df) # interpolate() will find the best case to replace your dat

            temperature  windspeed   event
day                                       
2017-01-01         32.0  12.000000   Sunny
2017-01-02         30.0  15.000000  Cloudy
2017-01-03         28.0  12.666667   Sunny
2017-01-04         29.0  10.333333    None
2017-01-05         30.0   8.000000   Rainy
2017-01-06         29.5  12.000000   Sunny
2017-01-07         29.0  10.000000   Windy
2017-01-08         30.0  12.000000  Cloudy
2017-01-09         31.0  14.000000   Sunny
2017-01-10         33.0  10.000000    None
2017-01-11         33.0  11.000000    None


  new_df = df.interpolate()


In [22]:
# ex. 1st day temp is 32 and 5th day temp is 28 and avg temp is 30 so interpolate() function replace midddle value with avg value 30 this is the best case to replace values :
print("***************************************")
new_df = df.interpolate(method="time")
print(new_df) # this is the best case ever when you replace values time by time it will replace 4th day value with avg of 1st and 5th day and then he set value as 4th day and replacde it :

  new_df = df.interpolate(method="time")


***************************************
            temperature  windspeed   event
day                                       
2017-01-01         32.0  12.000000   Sunny
2017-01-02         30.0  15.000000  Cloudy
2017-01-03         28.0  12.666667   Sunny
2017-01-04         29.0  10.333333    None
2017-01-05         30.0   8.000000   Rainy
2017-01-06         29.5  12.000000   Sunny
2017-01-07         29.0  10.000000   Windy
2017-01-08         30.0  12.000000  Cloudy
2017-01-09         31.0  14.000000   Sunny
2017-01-10         33.0  10.000000    None
2017-01-11         33.0  11.000000    None


In [23]:
# if you don't want to print any NaN values data set then you can use .dropna() func :
new_df = df.dropna()
print(new_df)

            temperature  windspeed  event
day                                      
2017-01-01         32.0       12.0  Sunny
2017-01-05         30.0        8.0  Rainy
2017-01-07         29.0       10.0  Windy
2017-01-09         31.0       14.0  Sunny


In [24]:
# if you don't want to print only those data who has all data as NaN then you can use 'how' :
new_df = df.dropna(how='all')
print(new_df) # this will print only those data who don't have any NaN values :

            temperature  windspeed   event
day                                       
2017-01-01         32.0       12.0   Sunny
2017-01-02          NaN       15.0  Cloudy
2017-01-03         28.0        NaN   Sunny
2017-01-05         30.0        8.0   Rainy
2017-01-06          NaN       12.0   Sunny
2017-01-07         29.0       10.0   Windy
2017-01-08          NaN        NaN  Cloudy
2017-01-09         31.0       14.0   Sunny
2017-01-10         33.0       10.0    None
2017-01-11          NaN       11.0    None


In [25]:
# if you want to print only those data who have only one or more than one NaN values then you can use thresold parameter thresh=NaN_counts :
new_df = df.dropna(thresh=1)
print(new_df)

            temperature  windspeed   event
day                                       
2017-01-01         32.0       12.0   Sunny
2017-01-02          NaN       15.0  Cloudy
2017-01-03         28.0        NaN   Sunny
2017-01-05         30.0        8.0   Rainy
2017-01-06          NaN       12.0   Sunny
2017-01-07         29.0       10.0   Windy
2017-01-08          NaN        NaN  Cloudy
2017-01-09         31.0       14.0   Sunny
2017-01-10         33.0       10.0    None
2017-01-11          NaN       11.0    None


In [26]:
# inserting the missing dates :
dt = pd.date_range("01-01-2017","01-11-2017")
idx = pd.DatetimeIndex(dt)
df = df.reindex(idx)
print(df)

            temperature  windspeed   event
2017-01-01         32.0       12.0   Sunny
2017-01-02          NaN       15.0  Cloudy
2017-01-03         28.0        NaN   Sunny
2017-01-04          NaN        NaN    None
2017-01-05         30.0        8.0   Rainy
2017-01-06          NaN       12.0   Sunny
2017-01-07         29.0       10.0   Windy
2017-01-08          NaN        NaN  Cloudy
2017-01-09         31.0       14.0   Sunny
2017-01-10         33.0       10.0    None
2017-01-11          NaN       11.0    None
