In [1]:
##1. IMPORT THE NECESSARY LIBRARIES AND LOAD DATA
import pandas as pd
df = pd.read_csv('Covid-19.csv')

#preview the first few rows
df.head()
###print(df) - preview all data 

Unnamed: 0,Country/Region,Confirmed,Deaths,Recovered,Active,New cases,New deaths,New recovered,Deaths / 100 Cases,Recovered / 100 Cases,Deaths / 100 Recovered,Confirmed last week,1 week change,1 week % increase,WHO Region
0,Afghanistan,36263,1269,25198,9796,106,10,18,3.5,69.49,5.04,35526,737,2.07,Eastern Mediterranean
1,Albania,4880,144,2745,1991,117,6,63,2.95,56.25,5.25,4171,709,17.0,Europe
2,Algeria,27973,1163,18837,7973,616,8,749,4.16,67.34,6.17,23691,4282,18.07,Africa
3,Andorra,907,52,803,52,10,0,0,5.73,88.53,6.48,884,23,2.6,Europe
4,Angola,950,41,242,667,18,1,0,4.32,25.47,16.94,749,201,26.84,Africa


In [2]:
##2 Correcting inconsistencies
## 2.1 standardize column names/ lowercase,remove spaces)

df.columns = df.columns.str.strip().str.lower().str.replace(' ', '_')

In [3]:
## checking for the duplicate country names
## Sometimes the same country name appears twice under different names ( USA vs United States)

duplicates = df['country/region'].value_counts()
print(duplicates[duplicates > 1]) ##This filters the Series to show only countries that appear more than once

Series([], Name: count, dtype: int64)


In [4]:
### clean up spaces or capitalization in who _region columns

df['who_region'] = df['who_region'].str.strip().str.title() ### str.strip-> removes any leading or trailing whitespace and str.title()-> comvert text and title case each word starts with a capital letter)

In [5]:
## verify the metric columns 
## ensure numeric values like ' 1 week % increases" or death/100 cases are numeric ,not string

df['1_week_%_increase'] = df['1_week_%_increase'].astype(float)

In [6]:
## task 2 handleling missing values
#1 . Identify missing value
df.isnull().sum() # no missing value // ifthere was a missing value use (df.fillna(0, inplace=True) 

country/region            0
confirmed                 0
deaths                    0
recovered                 0
active                    0
new_cases                 0
new_deaths                0
new_recovered             0
deaths_/_100_cases        0
recovered_/_100_cases     0
deaths_/_100_recovered    0
confirmed_last_week       0
1_week_change             0
1_week_%_increase         0
who_region                0
dtype: int64

In [7]:
## Data formatting
df.dtypes

country/region             object
confirmed                   int64
deaths                      int64
recovered                   int64
active                      int64
new_cases                   int64
new_deaths                  int64
new_recovered               int64
deaths_/_100_cases        float64
recovered_/_100_cases     float64
deaths_/_100_recovered    float64
confirmed_last_week         int64
1_week_change               int64
1_week_%_increase         float64
who_region                 object
dtype: object

In [8]:
# convert data types
df['1_week_%_increase'] = df['1_week_%_increase'].astype(float)

In [9]:
# changing the name on the column.
df.rename(columns={
    '1_week_change': 'weekly_case_change',
    '1_week_%_increase': 'weekly_percentage_increase'
}, inplace=True)

In [10]:
df.dtypes

country/region                 object
confirmed                       int64
deaths                          int64
recovered                       int64
active                          int64
new_cases                       int64
new_deaths                      int64
new_recovered                   int64
deaths_/_100_cases            float64
recovered_/_100_cases         float64
deaths_/_100_recovered        float64
confirmed_last_week             int64
weekly_case_change              int64
weekly_percentage_increase    float64
who_region                     object
dtype: object

In [12]:
# Function to clean column names
def clean_column_name(col):
    return (
        col.strip()                      # Remove leading/trailing spaces
        .lower()                         # Convert to lowercase
        .replace(' ', '_')               # Replace spaces with underscores
        .replace('/', '_')               # Replace slashes with underscores
        .replace('%', 'percent')         # Replace percent symbol with 'percent'
        .replace('-', '_')               # Replace dashes with underscores
        .replace('__', '_')              # Replace double underscores with a single one
    )

# Apply the function to all column names
df.columns = [clean_column_name(col) for col in df.columns]

In [13]:
df.dtypes

country_region                 object
confirmed                       int64
deaths                          int64
recovered                       int64
active                          int64
new_cases                       int64
new_deaths                      int64
new_recovered                   int64
deaths_100_cases              float64
recovered_100_cases           float64
deaths_100_recovered          float64
confirmed_last_week             int64
weekly_case_change              int64
weekly_percentage_increase    float64
who_region                     object
dtype: object

In [15]:
df.rename(columns={
    'countryregion': 'Country_Region',
    'confirmed': 'Confirmed',
    'deaths': 'Deaths',
    'recovered': 'Recovered',
    'active': 'Active',
    'newcases': 'New_Cases',
    'newdeaths': 'New_Deaths',
    'newrecovered': 'New_Recovered',
    'deaths100cases': 'Deaths_Per_100_Cases',
    'recovered100cases': 'Recovered_Per_100_Cases',
    'deaths100recovered': 'Deaths_Per_100_Recovered',
    'confirmedlastweek': 'Confirmed_Last_Week',
    'weeklycasechange': 'Weekly_Case_Change',
    'weeklypercentageincrease': 'Weekly_Percentage_Increase',
    'whoregion': 'Who_Region'
}, inplace=True)

# Step 3: Check the updated column names
print(df.columns.tolist())

['country_region', 'Confirmed', 'Deaths', 'Recovered', 'Active', 'new_cases', 'new_deaths', 'new_recovered', 'deaths_100_cases', 'recovered_100_cases', 'deaths_100_recovered', 'confirmed_last_week', 'weekly_case_change', 'weekly_percentage_increase', 'who_region']


In [16]:
df.dtypes

country_region                 object
Confirmed                       int64
Deaths                          int64
Recovered                       int64
Active                          int64
new_cases                       int64
new_deaths                      int64
new_recovered                   int64
deaths_100_cases              float64
recovered_100_cases           float64
deaths_100_recovered          float64
confirmed_last_week             int64
weekly_case_change              int64
weekly_percentage_increase    float64
who_region                     object
dtype: object

In [17]:
## save clean file
df.to_csv('cleaned_covid_data.csv', index=False)