In [1]:
# importing the necessary libraries
import numpy as np  # For array handling and numerical operations
import pandas as pd  # For creating DataFrames and data manipulation
import re  # For cleaning raw or scraped data

In [2]:
# Load the scraped data set that was created and saved by the b_data_collection file.
df = pd.read_csv(filepath_or_buffer="scraped_data.csv")
# Preview the first rows to confirm the data loaded successfully.
print(df.head(n=5))

           country          cases        death           region
0   United States    111,820,082    1,219,487    North America 
1           India     45,035,393      533,570             Asia 
2          France     40,138,560      167,642           Europe 
3         Germany     38,828,995      183,027           Europe 
4          Brazil     38,743,918      711,380    South America 


# Data Cleaning

In [4]:
# Data exploration found a single "hidden" (empty / whitespace-only) string in
# the region column. Turn any such value into NaN first, then give every
# missing region the placeholder label "no region".
df["region"] = (
    df["region"]
    .replace(to_replace=r'^\s*$', value=np.nan, regex=True)
    .fillna("no region")
)

In [5]:
# Remove the thousands separator "," from the count columns and cast every
# column from object to its actual type.
df["country"] = df["country"].astype("string")
for count_col in ("cases", "death"):
    # Cast to str before using the .str accessor so this cell can be re-run:
    # after the first execution the column is already integer, and calling
    # .str on it directly would raise AttributeError.
    # regex=False treats "," as a literal on every pandas version.
    df[count_col] = (
        df[count_col].astype(str).str.replace(",", "", regex=False).astype(int)
    )
df["region"] = df["region"].astype("string")

In [10]:
# Add the COVID-19 mortality rate: deaths as a percentage of confirmed cases.
# The column keeps its existing name "moralityrate" (sic) so that later cells
# and the exported CSV stay compatible.
# A row with zero cases divides by zero and yields inf (or NaN when deaths
# are zero as well).
df["moralityrate"] = (df["death"] / df["cases"]) * 100

# Sort the frame by number of cases, largest first.
df = df.sort_values(by="cases", ascending=False)
# Verify that the cleaning steps took effect. DataFrame.info() prints its
# report itself and returns None, so it is not wrapped in print().
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 230 entries, 0 to 229
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   country       230 non-null    string 
 1   cases         230 non-null    int64  
 2   death         230 non-null    int64  
 3   region        230 non-null    string 
 4   moralityrate  230 non-null    float64
dtypes: float64(1), int64(2), string(2)
memory usage: 9.1 KB
None


In [11]:
# Division by zero in the rate computation can leave +inf in "moralityrate";
# overwrite those entries with 0. Only infinite values are replaced here;
# finite values (however large) are left unchanged.
df["moralityrate"] = df["moralityrate"].replace(to_replace=[np.inf], value=0)

In [12]:
# Count the infinite values still present in "moralityrate".
remaining_inf = df["moralityrate"].apply(np.isinf)
print(remaining_inf.sum())

0


In [13]:
def _has_e_plus(value):
    """Return True when the string form of ``value`` contains 'e+'."""
    return 'e+' in str(value)


# Check whether any "moralityrate" value is written with e+ notation.
e_plus_present = df["moralityrate"].apply(_has_e_plus).any()
print(e_plus_present)

False


In [14]:
# Round "moralityrate" to two decimal places.
df["moralityrate"] = df["moralityrate"].round(decimals=2)

In [15]:
# Save the cleaned data set to a CSV file (without the index column) for
# future analysis.
df.to_csv(path_or_buf="Covid-19 countrywise cleand dataset.csv", index=False)