In [None]:
# Let's import the data into a dataframe so that we can manipulate it
import numpy as np
import pandas as pd

# Location of the raw CSV (relative path)
RAW_CSV_PATH = '../input/infant-mortality/InfantMortalityRate.csv'

# Read the file into the raw dataframe and preview its first rows as the cell output
df_raw = pd.read_csv(RAW_CSV_PATH)
df_raw.head()

# Initial analysis

In [None]:
# As a first quick analysis, let's run pandas_profiling on the raw dataset to show what is in it
# (the pandas_visual_analysis view of the same data follows in the next cell)
import pandas_profiling
# Build the profiling report straight from the untouched raw dataframe, with the progress bar switched off
pandas_profile = pandas_profiling.ProfileReport(df_raw, progress_bar=False)
# Render the report as interactive widgets in the cell output
pandas_profile.to_widgets()

In [None]:
!pip install pandas_visual_analysis --quiet
from pandas_visual_analysis import VisualAnalysis
VisualAnalysis(df_raw, categorical_columns=["Country", "Gender"])

# Cleaning

In [None]:
# Columns that together identify one observation
key_columns = ["Country", "Gender", "Year"]

# Step 1 - basic clean-up, always starting again from the raw dataframe:
#   * pandas_profiling warns about duplicate rows. Looking at them (see the relevant tab in the
#     profiling report) we suspect they are plain data errors, so we drop the duplicates and keep
#     the first occurrence of each
#   * the last row has missing values, most likely caused by a "\n" before the EOF, so we drop it
#   * Year was loaded as a real value although all values are integers, so we convert it to int;
#     similarly Gender is converted to a category
df_clean = (
    df_raw
    .drop_duplicates()
    .dropna()
    .astype({"Year": "int", "Gender": "category"})
)

# Step 2 - on further verification there are additional duplicates that were not reported before
# because their mortality rate differs
print("Showing row duplicates with different mortality rates")
conflicting_rows = df_clean.duplicated(key_columns, keep=False)
print(df_clean[conflicting_rows])
# They are still for the same country, same year and same gender and thus clearly overlap.
# Without additional details on the dataset, we just keep the first occurrence for now
# (other alternatives could be min, max, mean, last, ...)
df_clean = df_clean.drop_duplicates(subset=key_columns)

# Step 3 - index by the dimensions we have and sort. Note that only 1 value column remains
df_clean = df_clean.set_index(key_columns).sort_index()
df_clean.head()

# Analysis

In [None]:
import plotly.express as px
import plotly.graph_objects as go

# Let's first complete the tasks set forward in the dataset
# Top 5 Countries with lowest mortality rate both male and female
# Top 5 Countries with highest mortality rate both male and female

# Keep only the Male/Female rows (the "Total" rows are excluded) and group the rates per year and gender.
# The grouping is built once and reused for both the highest and the lowest ranking.
rate_by_gender = df_clean[df_clean.index.get_level_values("Gender") != "Total"]['Infant Mortality Rate']
rate_groups = rate_by_gender.groupby(['Year', 'Gender'])

# Lift the row limit only while printing so that all rows are shown. option_context restores the
# previously active display.max_rows on exit (also when printing raises), instead of overwriting
# the setting with a hard-coded value that may differ from what was configured before.
with pd.option_context('display.max_rows', None):
    print("----- Top 5 Countries with highest mortality rate both male and female -----")
    print(rate_groups.nlargest(5))

    # Now we do the same for the bottom 5
    print("")
    print("----- Top 5 Countries with lowest mortality rate both male and female -----")
    print(rate_groups.nsmallest(5))

In [None]:
# Visualize the trend in mortality rate over the years for various countries

# Let's start by plotting just the "Total" gender across time, one line per country
is_total = df_clean.index.get_level_values("Gender") == "Total"
df_total = df_clean[is_total]
fig = px.line(
    df_total,
    x=df_total.index.get_level_values('Year'),
    y='Infant Mortality Rate',
    color=df_total.index.get_level_values('Country'),
)
fig.show()
# As you can see, there is not a lot we can learn from this graph due to the sheer amount of lines.
# However, if you filter the countries (by double-clicking them on the legend) you can still investigate how specific countries trend across the years

# Let's use a violin plot and add max-mean-min traces to try and make the graph more insightful. This already gives us a slightly better overview
fig3 = px.violin(
    df_clean.reset_index(),
    y="Infant Mortality Rate",
    x="Year",
    color="Gender",
    box=True,
    points="all",
    hover_data=df_clean.columns,
)

# Per-year maximum, minimum and mean over all rows of df_clean (every country and gender value).
# They are computed in a single pass on the value column only, rather than re-running a
# whole-frame groupby twice for every trace.
yearly_stats = df_clean['Infant Mortality Rate'].groupby('Year').agg(['max', 'min', 'mean'])

# One thick overlay line per statistic: (column in yearly_stats, legend name, line colour)
overlay_traces = (
    ('max', 'Maximum', 'black'),
    ('min', 'Minimum', 'green'),
    ('mean', 'Mean', 'blue'),
)
for stat_column, trace_name, trace_color in overlay_traces:
    fig3.add_trace(go.Scatter(
        x=yearly_stats.index,
        y=yearly_stats[stat_column],
        mode='lines',
        line={'width': 4, 'color': trace_color},
        name=trace_name,
    ))
fig3.show()

In [None]:
# Comparison of trend of mortality rate for a year for both male and female.

# Turn the earlier violin graph into an animation over "Year" so that the genders can be compared in detail for one specific year at a time
df_long = df_clean.reset_index()
fig = px.violin(
    df_long,
    y="Infant Mortality Rate",
    animation_frame="Year",
    color="Gender",
    box=True,
    points="all",
    hover_data=df_clean.columns,
)
fig.show()

# Additionally, let's analyze the delta between Male/Female and Total so that we can zoom in on the difference between the genders
# Pivot to one row per (Year, Country) with one column per gender value; going through to_records()
# turns the index levels and the gender labels into ordinary columns of a fresh dataframe.
# NOTE(review): presumably chosen over reset_index() because Gender is categorical - confirm before simplifying
gender_wide = pd.DataFrame(
    df_long
    .pivot(index=["Year", "Country"], columns=["Gender"], values="Infant Mortality Rate")
    .to_records()
)
df_gendercompare = (
    gender_wide
    .assign(
        delta_female=gender_wide["Female"] - gender_wide["Total"],
        delta_male=gender_wide["Male"] - gender_wide["Total"],
    )
    # Back to long format: one row per (Year, Country, delta kind)
    .melt(
        id_vars=["Year", "Country"],
        value_vars=["delta_female", "delta_male"],
        var_name='Gender',
        value_name='delta',
    )
)
fig2 = px.violin(
    df_gendercompare,
    y="delta",
    x="Year",
    color="Gender",
    box=True,
    points="all",
    hover_data=df_gendercompare.columns,
)
fig2.show()
# We can see that although females have generally had a lower mortality rate, the difference has been declining over the years