# Data Cleaning Notebook
This notebook documents the cleaning process for the COVID-19 datasets. The goal is to ensure the data is clean, consistent, and ready for analysis.


In [2]:
#improrting the necessary Libraries 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns 

## Loading the Datasets


In [3]:
#defining the path for the csv files 

confirmed_cases_path = r"C:\Users\sanga\Documents\COVID19_EDA_Project\data\raw\time_series_covid19_confirmed_global.csv"
deaths_path = r"C:\Users\sanga\Documents\COVID19_EDA_Project\data\raw\time_series_covid19_deaths_global.csv"
recovered_path = r"C:\Users\sanga\Documents\COVID19_EDA_Project\data\raw\time_series_covid19_recovered_global.csv"
vaccinations_path = r"C:\Users\sanga\Documents\COVID19_EDA_Project\data\raw\vaccinations.csv"

#converting the csv into the pandas dataframe 
confirmed_cases = pd.read_csv(confirmed_cases_path)
deaths = pd.read_csv(deaths_path)
recovered = pd.read_csv(recovered_path)
vaccinations = pd.read_csv(vaccinations_path)


In [4]:
# Displaying the first few rows and summary of each dataset
confirmed_cases.info()
print(confirmed_cases.head())
deaths.info()
print(deaths.head())
recovered.info()
print(recovered.head())
vaccinations.info()
print(vaccinations.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 289 entries, 0 to 288
Columns: 1147 entries, Province/State to 3/9/23
dtypes: float64(2), int64(1143), object(2)
memory usage: 2.5+ MB
  Province/State Country/Region       Lat       Long  1/22/20  1/23/20  \
0            NaN    Afghanistan  33.93911  67.709953        0        0   
1            NaN        Albania  41.15330  20.168300        0        0   
2            NaN        Algeria  28.03390   1.659600        0        0   
3            NaN        Andorra  42.50630   1.521800        0        0   
4            NaN         Angola -11.20270  17.873900        0        0   

   1/24/20  1/25/20  1/26/20  1/27/20  ...  2/28/23  3/1/23  3/2/23  3/3/23  \
0        0        0        0        0  ...   209322  209340  209358  209362   
1        0        0        0        0  ...   334391  334408  334408  334427   
2        0        0        0        0  ...   271441  271448  271463  271469   
3        0        0        0        0  ...    47866   

### Handling the missing values and cleaning the Datasets

In [5]:
#cleaning procedure for the confirmed,deaths and recovered

#filling the missing province/state values 

for df in [confirmed_cases, deaths, recovered]:
    df['Province/State'].fillna('Unknown', inplace=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Province/State'].fillna('Unknown', inplace=True)


In [6]:
#melting the date column 
for df in [confirmed_cases, deaths, recovered]:
    df_long = df.melt(
        id_vars=['Province/State', 'Country/Region', 'Lat', 'Long'],
        var_name='Date', value_name='Cumulative_Count'
    )


In [7]:
#converting the date to the datetime format 
df_long['Date'] = pd.to_datetime(df_long['Date'])


  df_long['Date'] = pd.to_datetime(df_long['Date'])


In [8]:
#computing the daily chane 
df_long['Daily_Change'] = df_long.groupby(['Province/State', 'Country/Region'])['Cumulative_Count'].diff().fillna(0)


In [9]:
#cleaning the vaccination data 

#converting the vaccination date to the datetime format 
vaccinations['date'] = pd.to_datetime(vaccinations['date'])


In [10]:
#filling the missing columns 
fill_columns = ['total_vaccinations', 'people_vaccinated', 'people_fully_vaccinated']
vaccinations[fill_columns] = vaccinations[fill_columns].fillna(0)


In [11]:
#dropping the unnecessary columns if they exist
if 'iso_code' in vaccinations.columns:
	vaccinations.drop(columns=['iso_code'], inplace=True)


#### Saving the cleaned Datasets 

In [15]:
#saving the cleanded data 
confirmed_cases.to_csv(r"C:\Users\sanga\Documents\COVID19_EDA_Project\data\processed\cleaned_confirmed_cases.csv")
deaths.to_csv(r"C:\Users\sanga\Documents\COVID19_EDA_Project\data\processed\cleaned_deaths.csv")
recovered.to_csv(r"C:\Users\sanga\Documents\COVID19_EDA_Project\data\processed\cleaned_recovered.csv")
confirmed_cases.to_csv(r"C:\Users\sanga\Documents\COVID19_EDA_Project\data\processed\cleaned_vaccinations.csv")
