In [1]:
import pandas as pd
import numpy as np

# Read the Excel file
df = pd.read_excel(r'final_cleaned_dataset.xlsx')

# Select the columns of interest (numeric columns)
numeric_columns = df.select_dtypes(include='number').columns

# Set the Z-score threshold for identifying outliers (e.g., threshold of 3)
zscore_threshold = 3

# Initialize a dictionary to store results
results = {}

# Iterate over each numeric column
for column in numeric_columns:
    # Calculate Z-scores for the column
    zscores = np.abs((df[column] - df[column].mean()) / df[column].std())
    
    # Identify outliers based on the Z-score threshold
    outliers = df[zscores >= zscore_threshold]
    
    # Calculate the percentage of outlier values
    percentage_outliers = (len(outliers) / len(df)) * 100
    
    # Determine the range of values that are outliers
    outlier_range_start = df[column][zscores.idxmax()]
    outlier_range_end = df[column][zscores.idxmax()]
    
    for index in outliers.index:
        value = df[column][index]
        if value < outlier_range_start:
            outlier_range_start = value
        elif value > outlier_range_end:
            outlier_range_end = value
    
    # Store results in the dictionary
    results[column] = {
        'percentage_outliers': percentage_outliers,
        'outlier_range_start': outlier_range_start,
        'outlier_range_end': outlier_range_end,
        'outliers': outliers[column].tolist()
    }

# Print results
for column, result in results.items():
    print(f"Column: {column}")
    print(f"Percentage of outlier values: {result['percentage_outliers']:.2f}%")
    print(f"Outlier range: {result['outlier_range_start']} - {result['outlier_range_end']}")
    print(f"Outliers:\n{result['outliers']}\n")


  from pandas.core.computation.check import NUMEXPR_INSTALLED


Column: Unnamed: 0
Percentage of outlier values: 0.00%
Outlier range: 16791 - 16791
Outliers:
[]

Column: Laboratory confirmed, since the beginning of the pandemic TOTAL
Percentage of outlier values: 1.66%
Outlier range: 86653 - 99638
Outliers:
[86653, 86955, 87297, 87534, 87667, 87940, 88435, 88954, 89379, 86929, 89929, 87674, 90548, 88171, 90930, 88571, 91230, 89124, 91572, 89766, 92037, 90095, 92271, 90375, 86870, 92536, 90634, 87338, 92705, 90862, 87749, 92905, 91425, 88079, 93257, 91730, 88302, 93526, 91911, 88478, 93662, 92043, 88673, 93818, 92167, 88826, 93953, 92603, 89110, 94228, 92742, 89318, 94338, 92879, 89403, 94483, 93081, 89588, 94720, 93222, 89702, 94809, 93311, 89831, 94917, 93414, 89910, 95036, 93478, 89987, 95100, 93571, 90161, 95173, 93729, 90304, 95307, 93919, 90375, 95443, 93993, 90443, 95516, 94088, 90497, 95588, 94259, 90621, 95716, 94410, 90775, 95786, 94537, 90876, 95902, 94622, 90949, 95957, 94720, 91011, 96051, 94837, 91127, 96138, 94951, 91198, 96235, 95022