In [1]:
import pandas as pd
import os # We'll use this briefly to confirm the working directory

# Show where the kernel is running so the relative data path below can be verified.
print(f"Current Working Directory: {os.getcwd()}")
print("-" * 30)

# Location of the CSV, expressed relative to the working directory printed above
# (the `data` folder is expected to sit directly inside it).
file_name = 'CONVENIENT_global_confirmed_cases.csv'
file_path = f'data/{file_name}'

# Read the CSV. A missing file is reported with a short hint rather than a
# traceback; in that case `df_confirmed` is NOT defined for later cells.
try:
    df_confirmed = pd.read_csv(file_path)
except FileNotFoundError:
    print(f"❌ Error: Something is still wrong with the path: {file_path}")
    print("Please double-check the file name or folder.")
else:
    # First look at the loaded frame: leading rows, then column names and dtypes.
    print("✅ Data successfully loaded!")
    print("\n--- First 5 Rows (Head) ---")
    print(df_confirmed.head())
    print("\n--- Data Information (Columns and Types) ---")
    df_confirmed.info()  # .info() writes its report to stdout itself

Current Working Directory: C:\Users\user\covid-data-dashboard
------------------------------
✅ Data successfully loaded!

--- First 5 Rows (Head) ---
   Country/Region  Afghanistan  Albania  Algeria  Andorra  Angola  Antarctica  \
0  Province/State          NaN      NaN      NaN      NaN     NaN         NaN   
1         1/23/20          0.0      0.0      0.0      0.0     0.0         0.0   
2         1/24/20          0.0      0.0      0.0      0.0     0.0         0.0   
3         1/25/20          0.0      0.0      0.0      0.0     0.0         0.0   
4         1/26/20          0.0      0.0      0.0      0.0     0.0         0.0   

   Antigua and Barbuda  Argentina  Armenia  ... Uruguay Uzbekistan Vanuatu  \
0                  NaN        NaN      NaN  ...     NaN        NaN     NaN   
1                  0.0        0.0      0.0  ...     0.0        0.0     0.0   
2                  0.0        0.0      0.0  ...     0.0        0.0     0.0   
3                  0.0        0.0      0.0  ...    

In [5]:
# Reshape the wide `df_confirmed` frame (one column per country/region, one row
# per date) into a long frame `df_long` with the columns
# Date / Country/Region / Confirmed_Cases, then clean it.
# Requires `df_confirmed` from the loading cell.

# ----------------------------------------------------------------------
# 1. Rename the Date Column
# ----------------------------------------------------------------------
# The first column is headed 'Country/Region' in the CSV but actually holds the
# dates. Re-running this cell is harmless: renaming a missing key is a no-op.
df_confirmed = df_confirmed.rename(columns={'Country/Region': 'Date'})

# ----------------------------------------------------------------------
# 2. Melt the Data (Convert from Wide to Long Format)
# ----------------------------------------------------------------------
# NOTE(review): the first loaded row is labelled 'Province/State', which suggests
# the CSV carries a second header row. If several columns share a country name,
# pandas would have de-duplicated them as 'Name.1', 'Name.2', ... and those
# suffixed labels would end up in 'Country/Region' -- confirm against the file.
df_melted = df_confirmed.melt(
    id_vars=['Date'],
    var_name='Country/Region',
    value_name='Confirmed_Cases',
)

# ----------------------------------------------------------------------
# 3. Clean and Inspect the Final Data
# ----------------------------------------------------------------------
# Built as a single chain that returns new frames: no `inplace=True`, and no
# column assignment on a filtered slice (which is what raises
# SettingWithCopyWarning on pandas without Copy-on-Write).
df_parsed = (
    df_melted
    .assign(
        # Non-date labels (the 'Province/State' artifact row) become NaT.
        Date=lambda frame: pd.to_datetime(
            frame['Date'],
            format='%m/%d/%y',
            errors='coerce',
        ),
        # Non-numeric strings become NaN; this avoids
        # TypeError: '>' not supported between instances of 'str' and 'int'.
        Confirmed_Cases=lambda frame: pd.to_numeric(
            frame['Confirmed_Cases'],
            errors='coerce',
        ),
    )
    # Drop rows whose date or case count could not be parsed.
    .dropna(subset=['Date', 'Confirmed_Cases'])
)

# Keep only strictly positive counts. Note this removes zero rows AND any
# negative values, not just zeros. The original row index is kept on purpose.
df_long = (
    df_parsed
    .loc[df_parsed['Confirmed_Cases'] > 0]
    .astype({'Confirmed_Cases': int})  # final integer dtype
)


print("✅ Data cleaning complete. The DataFrame is ready for analysis!")
print("\n--- Final Cleaned Data Head ---")
print(df_long.head())
print("\n--- Final Data Information ---")
df_long.info()

✅ Data cleaning complete. The DataFrame is ready for analysis!

--- Final Cleaned Data Head ---
         Date Country/Region  Confirmed_Cases
33 2020-02-24    Afghanistan                5
45 2020-03-07    Afghanistan                3
49 2020-03-11    Afghanistan                3
52 2020-03-14    Afghanistan                3
53 2020-03-15    Afghanistan                6

--- Final Data Information ---
<class 'pandas.core.frame.DataFrame'>
Index: 175728 entries, 33 to 330325
Data columns (total 3 columns):
 #   Column           Non-Null Count   Dtype         
---  ------           --------------   -----         
 0   Date             175728 non-null  datetime64[ns]
 1   Country/Region   175728 non-null  object        
 2   Confirmed_Cases  175728 non-null  int64         
dtypes: datetime64[ns](1), int64(1), object(1)
memory usage: 5.4+ MB
