In [None]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

# Step 1: Load the Data
def load_data(file_path):
    """Read a CSV source into a pandas DataFrame.

    Args:
        file_path: Path (or any other source accepted by
            ``pandas.read_csv``) of the CSV data; passed through unchanged.

    Returns:
        The DataFrame produced by ``pandas.read_csv``.
    """
    frame = pd.read_csv(file_path)
    return frame

# Step 2: Create a Pipeline
def create_pipeline():
    """Assemble the two-step preprocessing pipeline.

    Returns:
        A scikit-learn ``Pipeline`` whose steps run in this order:
        ``'imputer'`` (``SimpleImputer`` with ``strategy='mean'``) and then
        ``'scaler'`` (``StandardScaler``).
    """
    preprocessing_steps = [
        # Fill missing values first so the scaler never sees gaps.
        ('imputer', SimpleImputer(strategy='mean')),
        # Then standardize the imputed values.
        ('scaler', StandardScaler()),
    ]
    return Pipeline(preprocessing_steps)

# Step 3: Apply the Pipeline
def process_data(data, pipeline, numerical_columns=None):
    """Fit ``pipeline`` on selected columns and write the result back.

    Args:
        data: DataFrame containing the columns to transform. It is modified
            in place (the selected columns are overwritten) and the same
            object is returned.
        pipeline: Object exposing ``fit_transform``; it receives the
            selected columns as a DataFrame and its output is assigned back
            to those columns.
        numerical_columns: A single column label (string) or an iterable of
            labels to transform. ``None`` (the default) keeps the original
            behavior of processing only ``'fever'``.

    Returns:
        ``data`` itself, with the selected columns replaced by the
        pipeline output.

    Raises:
        KeyError: Raised by pandas when a requested column is not present
            in ``data``.
    """
    if numerical_columns is None:
        columns = ['fever']
    elif isinstance(numerical_columns, str):
        # A bare string would otherwise be split into characters by list().
        columns = [numerical_columns]
    else:
        # Normalise tuples/generators: pandas treats a tuple as ONE key.
        columns = list(numerical_columns)

    # Nothing selected: skip fitting rather than hand the pipeline an
    # empty frame.
    if not columns:
        return data

    data[columns] = pipeline.fit_transform(data[columns])
    return data

# Step 4: Analyze the Data
def analyze_data(data):
    """Count COVID-positive rows per city and per gender.

    Args:
        data: DataFrame with ``'has_covid'``, ``'city'`` and ``'gender'``
            columns. Rows whose ``'has_covid'`` value equals ``'Yes'`` are
            treated as positive cases.

    Returns:
        A ``(covid_by_city, covid_by_gender)`` tuple of Series: the number
        of positive rows for each ``'city'`` value and for each ``'gender'``
        value respectively.
    """
    # Both summaries use the same subset of positive rows, so select it once.
    is_positive = data['has_covid'] == 'Yes'
    positive_rows = data[is_positive]

    per_city = positive_rows.groupby('city').size()
    per_gender = positive_rows.groupby('gender').size()
    return per_city, per_gender

# Combine the pipeline steps
def data_pipeline(file_path):
    """Run the load -> preprocess -> analyze sequence for one CSV file.

    Args:
        file_path: Location of the CSV file, forwarded to ``load_data``.

    Returns:
        A ``(data, covid_by_city, covid_by_gender)`` tuple: the DataFrame
        returned by ``process_data`` followed by the two results of
        ``analyze_data``.
    """
    # Load first, then build a fresh pipeline and apply it to the loaded data.
    processed = process_data(load_data(file_path), create_pipeline())
    by_city, by_gender = analyze_data(processed)
    return processed, by_city, by_gender

# Execute the full pipeline against the toy COVID dataset
file_path = '/mnt/data/covid_toy.csv'
processed_data, covid_by_city, covid_by_gender = data_pipeline(file_path)

# Show each result under its heading (heading and value on separate lines)
print("Processed Data:", processed_data.head(), sep="\n")
print("\nCOVID-19 Cases by City:", covid_by_city, sep="\n")
print("\nCOVID-19 Cases by Gender:", covid_by_gender, sep="\n")
