# 00. Importing Libraries

# This script includes

# 01. df_income_group data wrangling and cleaning steps

# 02. df_country_region data wrangling and cleaning steps

# 03. Exporting cleaned datasets


In [1]:
# 00. Import libraries

import pandas as pd
import numpy as np
import os

In [2]:
# creating path folder 
path = r'C:\Users\cpaul\Documents\Career Foundry\Causes of deaths other than diseases'

In [7]:
# 01. Importing dataset, income_group.csv to create df_income_group

df_income_group = pd.read_csv(os.path.join(path, 'Data', 'Original Data', 'income_group.csv'), encoding = "ISO-8859-1", index_col = False)

In [8]:
df_income_group.head(20)

Unnamed: 0,country_code,country,year,income_group
0,AFG,Afghanistan,1987,L
1,AFG,Afghanistan,1988,L
2,AFG,Afghanistan,1989,L
3,AFG,Afghanistan,1990,L
4,AFG,Afghanistan,1991,L
5,AFG,Afghanistan,1992,L
6,AFG,Afghanistan,1993,L
7,AFG,Afghanistan,1994,L
8,AFG,Afghanistan,1995,L
9,AFG,Afghanistan,1996,L


In [9]:
df_income_group.shape

(7845, 4)

In [10]:
df_income_group.describe()

Unnamed: 0,year
count,7845.0
mean,2004.49369
std,10.385917
min,1987.0
25%,1995.0
50%,2004.0
75%,2013.0
max,2022.0


In [None]:
# Not renaming any columns

In [11]:
df_income_group.dtypes

country_code    object
country         object
year             int64
income_group    object
dtype: object

In [12]:
#check for mixed types
for col in df_income_group.columns.tolist():
  weird = (df_income_group[[col]].applymap(type) != df_income_group[[col]].iloc[0].apply(type)).any(axis = 1)
  if len (df_income_group[weird]) > 0:
    print (col)

In [None]:
# No mixed type columns found

In [16]:
#Changing the datatype of the variable country, country_code

df_income_group['country'] = df_income_group['country'].astype('str')
df_income_group['country_code'] = df_income_group['country_code'].astype('str')
df_income_group['income_group'] = df_income_group['income_group'].astype('str')

In [13]:
#check for missing values
df_income_group.isnull().sum()

country_code    0
country         0
year            0
income_group    0
dtype: int64

In [None]:
# No missing values

In [14]:
# Checking for duplicates
df_income_group_dup = df_income_group[df_income_group.duplicated()]

In [15]:
df_income_group_dup

Unnamed: 0,country_code,country,year,income_group


In [None]:
# No duplicates found

In [18]:
# 02. Importing dataset, UNSD_derived_country_region_data.csv to create df_country_region

df_country_region = pd.read_csv(os.path.join(path, 'Data', 'Original Data', 'UNSD_derived_country_region_data.csv'), encoding = "ISO-8859-1", index_col = False)

In [19]:
df_country_region.shape

(248, 4)

In [20]:
df_country_region.head()

Unnamed: 0,ISO-alpha3 Code,Country or Area,Region Name,Sub-region Name
0,DZA,Algeria,Africa,Northern Africa
1,EGY,Egypt,Africa,Northern Africa
2,LBY,Libya,Africa,Northern Africa
3,MAR,Morocco,Africa,Northern Africa
4,SDN,Sudan,Africa,Northern Africa


In [21]:
# Renaming columns

df_country_region.rename(columns={'ISO-alpha3 Code':'iso_code', 'Country or Area':'country', 'Region Name':'region', 'Sub-region Name':'sub_region'}, inplace=True)

In [22]:
df_country_region.head()

Unnamed: 0,iso_code,country,region,sub_region
0,DZA,Algeria,Africa,Northern Africa
1,EGY,Egypt,Africa,Northern Africa
2,LBY,Libya,Africa,Northern Africa
3,MAR,Morocco,Africa,Northern Africa
4,SDN,Sudan,Africa,Northern Africa


In [23]:
df_country_region.dtypes

iso_code      object
country       object
region        object
sub_region    object
dtype: object

In [24]:
#check for mixed types
for col in df_country_region.columns.tolist():
  weird = (df_country_region[[col]].applymap(type) != df_country_region[[col]].iloc[0].apply(type)).any(axis = 1)
  if len (df_country_region[weird]) > 0:
    print (col)

region
sub_region


In [25]:
#Changing the datatype of the variable region, sub_region , also for country, iso_code

df_country_region['region'] = df_country_region['region'].astype('str')
df_country_region['sub_region'] = df_country_region['sub_region'].astype('str')
df_country_region['country'] = df_country_region['country'].astype('str')
df_country_region['iso_code'] = df_country_region['iso_code'].astype('str')


In [26]:
#check for missing values
df_country_region.isnull().sum()

iso_code      0
country       0
region        0
sub_region    0
dtype: int64

In [None]:
# No missing values found

In [27]:
# Checking for duplicates
df_country_region_dup = df_country_region[df_country_region.duplicated()]

In [28]:
df_country_region_dup

Unnamed: 0,iso_code,country,region,sub_region


In [None]:
# No duplicates found

In [30]:
# Exporting df_causes_of_deaths, df_gdp dataframes

df_income_group.to_csv(os.path.join(path, 'Data', 'Prepared Data', 'income_group_data_checked.csv'))

df_country_region.to_csv(os.path.join(path, 'Data', 'Prepared Data', 'country_region_data_checked.csv'))