In [1]:
# I use my own custom-made datasets to help me learn each step and method better.
# The first dataset, 'weather_data.csv', is used for the preparatory steps
# and for trying out methods of handling missing data.
# For dealing with incorrect values, we go through four datasets one by one:
# 'weather_data1.csv', 'weather_data2.csv', 'weather_data3.csv', and 'weather_data4.csv'.
# This way, it's easier to see how these methods work.

Preparatory Steps

Step #1: Importing Libraries

In [2]:
import pandas as pd
import numpy as np

Step #2: Loading Data

In [3]:
# Read the made-up weather data from 'weather_data.csv' into a DataFrame.
# - sep=';' tells pandas that the columns in the file are separated by semicolons.
# - parse_dates=['date'] converts the 'date' column to datetime values while loading.
df = pd.read_csv(
    "weather_data.csv",
    sep=';',
    parse_dates=['date'],
)

# Show the loaded DataFrame as the cell output.
df

Unnamed: 0,date,temperature,humidity,precipitation,windspeed,event
0,2023-01-01,25.0,25.0,0.0,12.0,Sunny
1,2023-01-02,23.0,,0.0,10.0,
2,2023-01-05,,,9.0,,Rain
3,2023-01-06,26.0,62.0,,15.0,Cloudy
4,2023-01-07,,68.0,,14.0,
5,2023-01-08,21.0,80.0,15.0,20.0,Rain
6,2023-01-09,27.0,,2.0,,Cloudy
7,2023-01-10,,72.0,0.0,,Sunny
8,2023-01-11,,,,,
9,2023-01-12,20.0,,10.0,22.0,Rain


Step #3: Checking Data Type

In [4]:
# We check the type of the first value in the 'date' column. It should be a
# pandas Timestamp, which confirms that 'parse_dates' converted the column
# to datetime values. (The column as a whole is a Series; here we inspect a
# single element of it.)
# '.iloc[0]' selects the first element by position, so the check does not
# depend on the index labels.
type(df['date'].iloc[0])

pandas._libs.tslibs.timestamps.Timestamp

Step #4: Setting Date as Index

In [5]:
# We make the 'date' column the index of the DataFrame with 'set_index'.
# The result is assigned back to 'df' instead of using 'inplace=True', so the
# cell reads as an ordinary assignment and can be extended by method chaining.
df = df.set_index('date')
df

Unnamed: 0_level_0,temperature,humidity,precipitation,windspeed,event
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2023-01-01,25.0,25.0,0.0,12.0,Sunny
2023-01-02,23.0,,0.0,10.0,
2023-01-05,,,9.0,,Rain
2023-01-06,26.0,62.0,,15.0,Cloudy
2023-01-07,,68.0,,14.0,
2023-01-08,21.0,80.0,15.0,20.0,Rain
2023-01-09,27.0,,2.0,,Cloudy
2023-01-10,,72.0,0.0,,Sunny
2023-01-11,,,,,
2023-01-12,20.0,,10.0,22.0,Rain


We have finished the preparatory steps

---

Data Cleaning Methods

Method #1: Handling Missing Values (Fill with 0)

In [6]:
# Build 'new_df' from 'df' with every missing entry, in every column, replaced by 0.
# 'fillna' returns a new DataFrame, so 'df' itself keeps its missing values.
new_df = df.fillna(value=0)
new_df

Unnamed: 0_level_0,temperature,humidity,precipitation,windspeed,event
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2023-01-01,25.0,25.0,0.0,12.0,Sunny
2023-01-02,23.0,0.0,0.0,10.0,0
2023-01-05,0.0,0.0,9.0,0.0,Rain
2023-01-06,26.0,62.0,0.0,15.0,Cloudy
2023-01-07,0.0,68.0,0.0,14.0,0
2023-01-08,21.0,80.0,15.0,20.0,Rain
2023-01-09,27.0,0.0,2.0,0.0,Cloudy
2023-01-10,0.0,72.0,0.0,0.0,Sunny
2023-01-11,0.0,0.0,0.0,0.0,0
2023-01-12,20.0,0.0,10.0,22.0,Rain


Method #2: Handling Missing Values (Custom Fill Values)

In [7]:
# Build 'new_df' by filling missing values with a replacement chosen per column:
# - the four measurement columns ('temperature', 'humidity', 'precipitation',
#   'windspeed') all get 0;
# - the 'event' column gets the text 'no event'.
new_df = df.fillna(value={
    **dict.fromkeys(['temperature', 'humidity', 'precipitation', 'windspeed'], 0),
    'event': 'no event',
})
new_df

Unnamed: 0_level_0,temperature,humidity,precipitation,windspeed,event
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2023-01-01,25.0,25.0,0.0,12.0,Sunny
2023-01-02,23.0,0.0,0.0,10.0,no event
2023-01-05,0.0,0.0,9.0,0.0,Rain
2023-01-06,26.0,62.0,0.0,15.0,Cloudy
2023-01-07,0.0,68.0,0.0,14.0,no event
2023-01-08,21.0,80.0,15.0,20.0,Rain
2023-01-09,27.0,0.0,2.0,0.0,Cloudy
2023-01-10,0.0,72.0,0.0,0.0,Sunny
2023-01-11,0.0,0.0,0.0,0.0,no event
2023-01-12,20.0,0.0,10.0,22.0,Rain


Method #3: Handling Missing Values (Forward Fill)

In [8]:
# We create a new DataFrame 'new_df' and fill missing values by propagating the
# previous valid value forward (down the rows) in the respective column.
# 'df.ffill()' is used instead of 'df.fillna(method="ffill")': the 'method'
# keyword of 'fillna' is deprecated in recent pandas versions, while 'ffill'
# produces the same result.
new_df = df.ffill()
new_df

Unnamed: 0_level_0,temperature,humidity,precipitation,windspeed,event
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2023-01-01,25.0,25.0,0.0,12.0,Sunny
2023-01-02,23.0,25.0,0.0,10.0,Sunny
2023-01-05,23.0,25.0,9.0,10.0,Rain
2023-01-06,26.0,62.0,9.0,15.0,Cloudy
2023-01-07,26.0,68.0,9.0,14.0,Cloudy
2023-01-08,21.0,80.0,15.0,20.0,Rain
2023-01-09,27.0,80.0,2.0,20.0,Cloudy
2023-01-10,27.0,72.0,0.0,20.0,Sunny
2023-01-11,27.0,72.0,0.0,20.0,Sunny
2023-01-12,20.0,72.0,10.0,22.0,Rain


Method #4: Handling Missing Values (Backward Fill)

In [9]:
# We create a new DataFrame 'new_df' and fill missing values by propagating the
# next valid value backward (up the rows) in the respective column.
# Missing values at the end of a column stay missing, because there is no later
# valid value to copy from.
# 'df.bfill()' is used instead of 'df.fillna(method="bfill")': the 'method'
# keyword of 'fillna' is deprecated in recent pandas versions, while 'bfill'
# produces the same result.
new_df = df.bfill()
new_df

Unnamed: 0_level_0,temperature,humidity,precipitation,windspeed,event
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2023-01-01,25.0,25.0,0.0,12.0,Sunny
2023-01-02,23.0,62.0,0.0,10.0,Rain
2023-01-05,26.0,62.0,9.0,15.0,Rain
2023-01-06,26.0,62.0,15.0,15.0,Cloudy
2023-01-07,21.0,68.0,15.0,14.0,Rain
2023-01-08,21.0,80.0,15.0,20.0,Rain
2023-01-09,27.0,72.0,2.0,22.0,Cloudy
2023-01-10,20.0,72.0,0.0,22.0,Sunny
2023-01-11,20.0,,10.0,22.0,Rain
2023-01-12,20.0,,10.0,22.0,Rain


Method #5: Handling Missing Values (Fill Across Columns)

In [10]:
# We create a new DataFrame 'new_df' and fill missing values across columns:
# with axis='columns', each missing cell takes the next valid value to its
# right in the same row.
# Note that this can copy a value from one column into a different kind of
# column (for example, an 'event' label can end up in 'windspeed').
# 'df.bfill(...)' is used instead of 'df.fillna(method="bfill", ...)': the
# 'method' keyword of 'fillna' is deprecated in recent pandas versions.
new_df = df.bfill(axis='columns')
new_df

Unnamed: 0_level_0,temperature,humidity,precipitation,windspeed,event
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2023-01-01,25.0,25.0,0.0,12.0,Sunny
2023-01-02,23.0,0.0,0.0,10.0,
2023-01-05,9.0,9.0,9.0,Rain,Rain
2023-01-06,26.0,62.0,15.0,15.0,Cloudy
2023-01-07,68.0,68.0,14.0,14.0,
2023-01-08,21.0,80.0,15.0,20.0,Rain
2023-01-09,27.0,2.0,2.0,Cloudy,Cloudy
2023-01-10,72.0,72.0,0.0,Sunny,Sunny
2023-01-11,,,,,
2023-01-12,20.0,10.0,10.0,22.0,Rain


Method #6: Handling Missing Values (Fill with Limit)

In [11]:
# We forward fill, but 'limit=1' restricts the fill to at most one consecutive
# missing value in each column; any further missing values in the same gap
# stay missing.
# 'df.ffill(limit=1)' is used instead of 'df.fillna(method="ffill", limit=1)':
# the 'method' keyword of 'fillna' is deprecated in recent pandas versions.
new_df = df.ffill(limit=1)
new_df

Unnamed: 0_level_0,temperature,humidity,precipitation,windspeed,event
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2023-01-01,25.0,25.0,0.0,12.0,Sunny
2023-01-02,23.0,25.0,0.0,10.0,Sunny
2023-01-05,23.0,,9.0,10.0,Rain
2023-01-06,26.0,62.0,9.0,15.0,Cloudy
2023-01-07,26.0,68.0,,14.0,Cloudy
2023-01-08,21.0,80.0,15.0,20.0,Rain
2023-01-09,27.0,80.0,2.0,20.0,Cloudy
2023-01-10,27.0,72.0,0.0,,Sunny
2023-01-11,,72.0,0.0,,Sunny
2023-01-12,20.0,,10.0,22.0,Rain


In [12]:
# You can find additional information about the 'fillna' function in the documentation.
# It provides details on various options and parameters for filling missing values.

Method #7: Handling Missing Values (Interpolation)

In [13]:
# We create a new DataFrame 'new_df' and fill missing values using interpolation,
# which estimates the missing values based on neighboring data points.
# The default method is linear and treats the rows as equally spaced.
#
# Only the numeric columns are interpolated. Text columns such as 'event'
# cannot be interpolated, and calling 'interpolate' on a frame that still
# contains such a column is deprecated in recent pandas versions. The
# non-numeric columns are copied over unchanged.
numeric_cols = df.select_dtypes(include='number').columns
new_df = df.copy()
new_df[numeric_cols] = df[numeric_cols].interpolate()
new_df

Unnamed: 0_level_0,temperature,humidity,precipitation,windspeed,event
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2023-01-01,25.0,25.0,0.0,12.0,Sunny
2023-01-02,23.0,37.333333,0.0,10.0,
2023-01-05,24.5,49.666667,9.0,12.5,Rain
2023-01-06,26.0,62.0,11.0,15.0,Cloudy
2023-01-07,23.5,68.0,13.0,14.0,
2023-01-08,21.0,80.0,15.0,20.0,Rain
2023-01-09,27.0,76.0,2.0,20.5,Cloudy
2023-01-10,24.666667,72.0,0.0,21.0,Sunny
2023-01-11,22.333333,72.0,5.0,21.5,
2023-01-12,20.0,72.0,10.0,22.0,Rain


Method #8: Handling Missing Values (Time-based Interpolation)

In [14]:
# This method estimates missing values based on the time index, making it
# suitable for time series data: gaps between dates that are further apart
# are weighted accordingly. It requires the DataFrame to have a datetime index.
#
# Only the numeric columns are interpolated. Text columns such as 'event'
# cannot be interpolated, and calling 'interpolate' on a frame that still
# contains such a column is deprecated in recent pandas versions. The
# non-numeric columns are copied over unchanged.
numeric_cols = df.select_dtypes(include='number').columns
new_df = df.copy()
new_df[numeric_cols] = df[numeric_cols].interpolate(method='time')
new_df

Unnamed: 0_level_0,temperature,humidity,precipitation,windspeed,event
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2023-01-01,25.0,25.0,0.0,12.0,Sunny
2023-01-02,23.0,32.4,0.0,10.0,
2023-01-05,25.25,54.6,9.0,13.75,Rain
2023-01-06,26.0,62.0,11.0,15.0,Cloudy
2023-01-07,23.5,68.0,13.0,14.0,
2023-01-08,21.0,80.0,15.0,20.0,Rain
2023-01-09,27.0,76.0,2.0,20.5,Cloudy
2023-01-10,24.666667,72.0,0.0,21.0,Sunny
2023-01-11,22.333333,72.0,5.0,21.5,
2023-01-12,20.0,72.0,10.0,22.0,Rain


In [15]:
# You can find additional information about the 'interpolate' method in the pandas documentation.
# It provides details on the available interpolation methods and other parameters.

Method #9: Handling Missing Values (Row Removal)

In [16]:
# Build 'new_df' by keeping only the rows of 'df' that are complete,
# i.e. dropping every row in which at least one value is missing.
new_df = df.dropna(axis='index', how='any')
new_df

Unnamed: 0_level_0,temperature,humidity,precipitation,windspeed,event
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2023-01-01,25.0,25.0,0.0,12.0,Sunny
2023-01-08,21.0,80.0,15.0,20.0,Rain


Method #10: Handling Missing Values (Row Removal - All NaN)

In [17]:
# Build 'new_df' by dropping only the rows of 'df' that are entirely empty,
# i.e. rows in which every value is missing (NaN). Partially filled rows stay.
new_df = df.dropna(axis='index', how='all')
new_df

Unnamed: 0_level_0,temperature,humidity,precipitation,windspeed,event
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2023-01-01,25.0,25.0,0.0,12.0,Sunny
2023-01-02,23.0,,0.0,10.0,
2023-01-05,,,9.0,,Rain
2023-01-06,26.0,62.0,,15.0,Cloudy
2023-01-07,,68.0,,14.0,
2023-01-08,21.0,80.0,15.0,20.0,Rain
2023-01-09,27.0,,2.0,,Cloudy
2023-01-10,,72.0,0.0,,Sunny
2023-01-12,20.0,,10.0,22.0,Rain


Method #11: Handling Missing Values (Row Removal - Threshold)

In [18]:
# Build 'new_df' by keeping only the rows of 'df' that contain at least
# 3 non-missing values; rows with fewer than 3 are dropped.
new_df = df.dropna(axis='index', thresh=3)
new_df

Unnamed: 0_level_0,temperature,humidity,precipitation,windspeed,event
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2023-01-01,25.0,25.0,0.0,12.0,Sunny
2023-01-02,23.0,,0.0,10.0,
2023-01-06,26.0,62.0,,15.0,Cloudy
2023-01-08,21.0,80.0,15.0,20.0,Rain
2023-01-09,27.0,,2.0,,Cloudy
2023-01-10,,72.0,0.0,,Sunny
2023-01-12,20.0,,10.0,22.0,Rain


Additional Step: Creating a Full Date Range DataFrame

In [19]:
# We create a full daily date range with 'pd.date_range', spanning from
# 2023-01-01 to 2023-01-12 (both inclusive).
# The dates are written in ISO 8601 form (YYYY-MM-DD): a string such as
# '01-12-2023' can be read as either 12 January or 1 December, whereas
# '2023-01-12' is unambiguous.
# Then we reindex the original DataFrame 'df' on this range to create 'full_df'.
# Dates that were missing from 'df' appear as rows filled with NaN, so the
# DataFrame covers the complete date range.
dt = pd.date_range(start='2023-01-01', end='2023-01-12', freq='D')
# 'pd.date_range' already returns a DatetimeIndex, so no conversion is needed.
idx = dt
full_df = df.reindex(idx)
full_df

Unnamed: 0,temperature,humidity,precipitation,windspeed,event
2023-01-01,25.0,25.0,0.0,12.0,Sunny
2023-01-02,23.0,,0.0,10.0,
2023-01-03,,,,,
2023-01-04,,,,,
2023-01-05,,,9.0,,Rain
2023-01-06,26.0,62.0,,15.0,Cloudy
2023-01-07,,68.0,,14.0,
2023-01-08,21.0,80.0,15.0,20.0,Rain
2023-01-09,27.0,,2.0,,Cloudy
2023-01-10,,72.0,0.0,,Sunny


---

Data Cleansing: Techniques for Dealing with Incorrect Values

Method #1: Replacing a Specific Value with NaN

In [20]:
# Load the first "incorrect values" dataset: semicolon-separated,
# with the 'date' column parsed as datetime values.
df = pd.read_csv(
    "weather_data1.csv",
    sep=';',
    parse_dates=['date'],
)
df

Unnamed: 0,date,temperature,humidity,precipitation,windspeed,event
0,2023-01-01,25,25,0,12,Sunny
1,2023-01-02,23,1000,0,10,No Event
2,2023-01-05,1000,1000,9,1000,Rain
3,2023-01-06,26,62,1000,15,Cloudy
4,2023-01-07,1000,68,1000,14,No Event
5,2023-01-08,21,80,15,20,Rain
6,2023-01-09,27,1000,2,1000,Cloudy
7,2023-01-10,1000,72,0,1000,Sunny
8,2023-01-11,1000,1000,1000,1000,No Event
9,2023-01-12,20,1000,10,22,Rain


In [21]:
# We replace all instances of a specific placeholder value (here 1000) with NaN
# in the DataFrame, so that these entries are treated as missing values.
# 'np.nan' is used rather than 'np.NaN': the capitalised alias was removed in
# NumPy 2.0, while 'np.nan' works in every NumPy version.
new_df = df.replace(1000, np.nan)
new_df

Unnamed: 0,date,temperature,humidity,precipitation,windspeed,event
0,2023-01-01,25.0,25.0,0.0,12.0,Sunny
1,2023-01-02,23.0,,0.0,10.0,No Event
2,2023-01-05,,,9.0,,Rain
3,2023-01-06,26.0,62.0,,15.0,Cloudy
4,2023-01-07,,68.0,,14.0,No Event
5,2023-01-08,21.0,80.0,15.0,20.0,Rain
6,2023-01-09,27.0,,2.0,,Cloudy
7,2023-01-10,,72.0,0.0,,Sunny
8,2023-01-11,,,,,No Event
9,2023-01-12,20.0,,10.0,22.0,Rain


Method #2: Replacing Multiple Values with NaN

In [22]:
# Load the second "incorrect values" dataset: semicolon-separated,
# with the 'date' column parsed as datetime values.
df = pd.read_csv(
    "weather_data2.csv",
    sep=';',
    parse_dates=['date'],
)
df

Unnamed: 0,date,temperature,humidity,precipitation,windspeed,event
0,2023-01-01,25,25,0,12,Sunny
1,2023-01-02,23,1000,0,10,No Event
2,2023-01-05,1000,1000,9,1000,Rain
3,2023-01-06,26,62,-1000,15,Cloudy
4,2023-01-07,-1000,68,1000,14,No Event
5,2023-01-08,21,80,15,20,Rain
6,2023-01-09,27,1000,2,-1000,Cloudy
7,2023-01-10,1000,72,0,1000,Sunny
8,2023-01-11,1000,1000,-1000,1000,No Event
9,2023-01-12,20,1000,10,22,Rain


In [23]:
# We replace all instances of several placeholder values (here 1000 and -1000)
# with NaN in the DataFrame, so that these entries are treated as missing values.
# 'np.nan' is used rather than 'np.NaN': the capitalised alias was removed in
# NumPy 2.0, while 'np.nan' works in every NumPy version.
new_df = df.replace([1000, -1000], np.nan)
new_df

Unnamed: 0,date,temperature,humidity,precipitation,windspeed,event
0,2023-01-01,25.0,25.0,0.0,12.0,Sunny
1,2023-01-02,23.0,,0.0,10.0,No Event
2,2023-01-05,,,9.0,,Rain
3,2023-01-06,26.0,62.0,,15.0,Cloudy
4,2023-01-07,,68.0,,14.0,No Event
5,2023-01-08,21.0,80.0,15.0,20.0,Rain
6,2023-01-09,27.0,,2.0,,Cloudy
7,2023-01-10,,72.0,0.0,,Sunny
8,2023-01-11,,,,,No Event
9,2023-01-12,20.0,,10.0,22.0,Rain


Method #3: Replacing Specific Values in Columns with NaN

In [24]:
# Load the third "incorrect values" dataset: semicolon-separated,
# with the 'date' column parsed as datetime values.
df = pd.read_csv(
    "weather_data3.csv",
    sep=';',
    parse_dates=['date'],
)
df

Unnamed: 0,date,temperature,humidity,precipitation,windspeed,event
0,2023-01-01,25,25,0,12,Sunny
1,2023-01-02,23,1000,0,10,0
2,2023-01-05,1000,1000,9,1000,Rain
3,2023-01-06,26,62,1000,15,Cloudy
4,2023-01-07,1000,68,1000,14,0
5,2023-01-08,21,80,15,20,Rain
6,2023-01-09,27,1000,2,1000,Cloudy
7,2023-01-10,1000,72,0,1000,Sunny
8,2023-01-11,1000,1000,1000,1000,0
9,2023-01-12,20,1000,10,22,Rain


In [25]:
# We replace a column-specific placeholder value with NaN:
# - 1000 in 'temperature', 'humidity', 'precipitation' and 'windspeed';
# - the text '0' in 'event'.
# Because the value to look for is given per column, a legitimate 0 in a
# measurement column (e.g. 0 precipitation) is not affected.
# 'np.nan' is used rather than 'np.NaN': the capitalised alias was removed in
# NumPy 2.0, while 'np.nan' works in every NumPy version.
new_df = df.replace({
    'temperature': 1000,
    'humidity': 1000,
    'precipitation': 1000,
    'windspeed': 1000,
    'event': '0'
}, np.nan)
new_df

Unnamed: 0,date,temperature,humidity,precipitation,windspeed,event
0,2023-01-01,25.0,25.0,0.0,12.0,Sunny
1,2023-01-02,23.0,,0.0,10.0,
2,2023-01-05,,,9.0,,Rain
3,2023-01-06,26.0,62.0,,15.0,Cloudy
4,2023-01-07,,68.0,,14.0,
5,2023-01-08,21.0,80.0,15.0,20.0,Rain
6,2023-01-09,27.0,,2.0,,Cloudy
7,2023-01-10,,72.0,0.0,,Sunny
8,2023-01-11,,,,,
9,2023-01-12,20.0,,10.0,22.0,Rain


Method #4: Replacing Specific Values Across the DataFrame

In [26]:
# Reload 'weather_data1.csv' (semicolon-separated, 'date' parsed as datetime)
# so this method starts from the unmodified data.
df = pd.read_csv(
    "weather_data1.csv",
    sep=';',
    parse_dates=['date'],
)
df

Unnamed: 0,date,temperature,humidity,precipitation,windspeed,event
0,2023-01-01,25,25,0,12,Sunny
1,2023-01-02,23,1000,0,10,No Event
2,2023-01-05,1000,1000,9,1000,Rain
3,2023-01-06,26,62,1000,15,Cloudy
4,2023-01-07,1000,68,1000,14,No Event
5,2023-01-08,21,80,15,20,Rain
6,2023-01-09,27,1000,2,1000,Cloudy
7,2023-01-10,1000,72,0,1000,Sunny
8,2023-01-11,1000,1000,1000,1000,No Event
9,2023-01-12,20,1000,10,22,Rain


In [27]:
# We replace values across the entire DataFrame using a mapping of
# old value -> new value: 1000 becomes NaN and 'No Event' becomes 'Sunny'.
# 'np.nan' is used rather than 'np.NaN': the capitalised alias was removed in
# NumPy 2.0, while 'np.nan' works in every NumPy version.
new_df = df.replace({
    1000: np.nan,
    'No Event': 'Sunny'
})
new_df

Unnamed: 0,date,temperature,humidity,precipitation,windspeed,event
0,2023-01-01,25.0,25.0,0.0,12.0,Sunny
1,2023-01-02,23.0,,0.0,10.0,Sunny
2,2023-01-05,,,9.0,,Rain
3,2023-01-06,26.0,62.0,,15.0,Cloudy
4,2023-01-07,,68.0,,14.0,Sunny
5,2023-01-08,21.0,80.0,15.0,20.0,Rain
6,2023-01-09,27.0,,2.0,,Cloudy
7,2023-01-10,,72.0,0.0,,Sunny
8,2023-01-11,,,,,Sunny
9,2023-01-12,20.0,,10.0,22.0,Rain


Method #5: Removing Non-Numeric Characters in Specific Columns

In [28]:
# Load the fourth "incorrect values" dataset: semicolon-separated,
# with the 'date' column parsed as datetime values.
df = pd.read_csv(
    "weather_data4.csv",
    sep=';',
    parse_dates=['date'],
)
df

Unnamed: 0,date,temperature,humidity,precipitation,windspeed,event
0,2023-01-01,25 C,25,0 mm,12,Sunny
1,2023-01-02,23 C,1000,0 mm,10,No Event
2,2023-01-05,1000,1000,9 mm,1000,Rain
3,2023-01-06,26 C,62,1000,15,Cloudy
4,2023-01-07,1000,68,1000,14,No Event
5,2023-01-08,21 C,80,15 mm,20,Rain
6,2023-01-09,27 C,1000,2 mm,1000,Cloudy
7,2023-01-10,1000,72,0 mm,1000,Sunny
8,2023-01-11,1000,1000,1000,1000,No Event
9,2023-01-12,20 C,1000,10 mm,22,Rain


In [29]:
# We remove unit suffixes (letters such as 'C' or 'mm') from the 'temperature'
# and 'precipitation' columns using regular expressions.
# The pattern also consumes the whitespace around the letters: removing only
# the letters would turn '25 C' into '25 ' (with a trailing space) instead of '25'.
# Note: the cleaned entries are still text; use 'pd.to_numeric' on these
# columns afterwards if a numeric dtype is required for analysis.
new_df = df.replace({
    'temperature': r'\s*[A-Za-z]+\s*',
    'precipitation': r'\s*[A-Za-z]+\s*'
}, '', regex=True)
new_df

Unnamed: 0,date,temperature,humidity,precipitation,windspeed,event
0,2023-01-01,25,25,0,12,Sunny
1,2023-01-02,23,1000,0,10,No Event
2,2023-01-05,1000,1000,9,1000,Rain
3,2023-01-06,26,62,1000,15,Cloudy
4,2023-01-07,1000,68,1000,14,No Event
5,2023-01-08,21,80,15,20,Rain
6,2023-01-09,27,1000,2,1000,Cloudy
7,2023-01-10,1000,72,0,1000,Sunny
8,2023-01-11,1000,1000,1000,1000,No Event
9,2023-01-12,20,1000,10,22,Rain


Method #6: Replacing Categorical Values with Numeric Codes

In [30]:
# Reload 'weather_data1.csv' (semicolon-separated, 'date' parsed as datetime)
# and use the 'date' column as the index.
# 'set_index' is chained onto the load instead of being called with
# 'inplace=True', so 'df' is produced by a single assignment.
df = (
    pd.read_csv("weather_data1.csv", sep=';', parse_dates=['date'])
    .set_index('date')
)
df

Unnamed: 0_level_0,temperature,humidity,precipitation,windspeed,event
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2023-01-01,25,25,0,12,Sunny
2023-01-02,23,1000,0,10,No Event
2023-01-05,1000,1000,9,1000,Rain
2023-01-06,26,62,1000,15,Cloudy
2023-01-07,1000,68,1000,14,No Event
2023-01-08,21,80,15,20,Rain
2023-01-09,27,1000,2,1000,Cloudy
2023-01-10,1000,72,0,1000,Sunny
2023-01-11,1000,1000,1000,1000,No Event
2023-01-12,20,1000,10,22,Rain


In [31]:
# Turn the categorical weather labels into numeric codes, which are easier to
# use for analysis or modeling. The mapping is applied across the DataFrame:
#   'No Event' -> 0, 'Sunny' -> 1, 'Cloudy' -> 2, 'Rain' -> 3
new_df = df.replace({
    'No Event': 0,
    'Sunny': 1,
    'Cloudy': 2,
    'Rain': 3,
})
new_df

Unnamed: 0_level_0,temperature,humidity,precipitation,windspeed,event
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2023-01-01,25,25,0,12,1
2023-01-02,23,1000,0,10,0
2023-01-05,1000,1000,9,1000,3
2023-01-06,26,62,1000,15,2
2023-01-07,1000,68,1000,14,0
2023-01-08,21,80,15,20,3
2023-01-09,27,1000,2,1000,2
2023-01-10,1000,72,0,1000,1
2023-01-11,1000,1000,1000,1000,0
2023-01-12,20,1000,10,22,3
