In [1]:
import pandas as pd
import numpy as np

# Sample COVID-19 dataset
data = {
    'Date': ['01/05/2020', '02-05-2020', '2020/05/03', '2020-05-04'],
    'Country': ['India', 'U.S.', 'USA', 'India'],
    'Confirmed': [100, -50, 1000000, 200],
    'Recovered': [90, np.nan, 800000, 150],
    'Deaths': [5, 10, 2000000, 20],
    'Vaccinated': [0, 0, np.nan, 60]
}
df = pd.DataFrame(data)

print("Original Data:")
print(df)

# --- RULE-BASED PREPROCESSING (using df.loc) ---

# R1: Handle missing values
for i in range(len(df)):
    for col in ['Confirmed', 'Recovered', 'Deaths', 'Vaccinated']:
        if pd.isnull(df.loc[i, col]):
            df.loc[i, col] = 0

# R2: Replace negative numbers with 0
for i in range(len(df)):
    for col in ['Confirmed', 'Recovered', 'Deaths', 'Vaccinated']:
        if df.loc[i, col] < 0:
            df.loc[i, col] = 0

# R3: Standardize date format
for i in range(len(df)):
    try:
        df.loc[i, 'Date'] = pd.to_datetime(df.loc[i, 'Date']).strftime('%Y-%m-%d')
    except:
        df.loc[i, 'Date'] = np.nan  # If date conversion fails, set as NaN

# R4: Fix country names
for i in range(len(df)):
    if df.loc[i, 'Country'] == 'U.S.':
        df.loc[i, 'Country'] = 'USA'
    elif df.loc[i, 'Country'] == 'Brasil':
        df.loc[i, 'Country'] = 'Brazil'

# R5: Logical consistency (Deaths <= Confirmed)
for i in range(len(df)):
    if df.loc[i, 'Deaths'] > df.loc[i, 'Confirmed']:
        df.loc[i, 'Deaths'] = df.loc[i, 'Confirmed']

# R6: Remove duplicates (same Country + Date)
df = df.drop_duplicates(subset=['Country', 'Date'], keep='last').reset_index(drop=True)

# R7: Detect unrealistic jumps (Confirmed > 10Ã— previous day)
df = df.sort_values(by=['Country', 'Date']).reset_index(drop=True)
df['Flag_Unrealistic'] = False
for i in range(1, len(df)):
    if df.loc[i, 'Country'] == df.loc[i-1, 'Country']:
        if df.loc[i, 'Confirmed'] > 10 * df.loc[i-1, 'Confirmed']:
            df.loc[i, 'Flag_Unrealistic'] = True

print("\nProcessed Data:")
print(df)

Original Data:
         Date Country  Confirmed  Recovered   Deaths  Vaccinated
0  01/05/2020   India        100       90.0        5         0.0
1  02-05-2020    U.S.        -50        NaN       10         0.0
2  2020/05/03     USA    1000000   800000.0  2000000         NaN
3  2020-05-04   India        200      150.0       20        60.0

Processed Data:
         Date Country  Confirmed  Recovered   Deaths  Vaccinated  \
0  2020-01-05   India        100       90.0        5         0.0   
1  2020-05-04   India        200      150.0       20        60.0   
2  2020-02-05     USA          0        0.0        0         0.0   
3  2020-05-03     USA    1000000   800000.0  1000000         0.0   

   Flag_Unrealistic  
0             False  
1             False  
2             False  
3              True  
