In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
# Apply seaborn's default theme so the matplotlib figures drawn below use its styling
sns.set()

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found in it
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _subfolders, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for file_path in input_file_paths:
    print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the CSV from the Kaggle input directory into a DataFrame and preview its first rows
LIFE_EXPECTANCY_CSV = '/kaggle/input/countries-life-expectancy/Life expectancy.csv'
raw_data = pd.read_csv(LIFE_EXPECTANCY_CSV)
raw_data.head()

## Here's the problem

This dataset lists the countries as a variable (Entity), so each country name is a value in that column. A more appropriate way to organize the dataset is to have the countries as columns and the year as the index (or a pseudo-index). Let me show what I mean.

In [None]:
# Collect every distinct value of the 'Entity' column (the countries), in order of first appearance
entity_names = raw_data['Entity'].unique()
entity_names

In [None]:
# Let's create another DataFrame: one row per year and one column per country

# Countries in order of first appearance in the raw data; this fixes the column order of the result
countries = raw_data['Entity'].unique()

# Defining the columns of the DataFrame: the year followed by the countries' names
columns = np.concatenate((np.array(['Year']), countries))

# Whatever column is left besides 'Entity' and 'Year' holds the values to spread across the country columns
value_columns = raw_data.columns.drop(['Entity', 'Year'])
if len(value_columns) != 1:
    raise ValueError(
        f"Expected exactly one value column besides 'Entity' and 'Year', found {list(value_columns)}"
    )

# Reshaping from long to wide format with a single pivot instead of filtering raw_data once per country.
# The years become the sorted index during the pivot, so a year that is missing for some country
# appears as NaN in that country's column.
df = (
    raw_data
    .pivot(index='Year', columns='Entity', values=value_columns[0])
    .round(decimals=1)            # rounding the values to one decimal
    .rename_axis(columns=None)    # removing the 'Entity' label that pivot leaves on the column axis
    .reset_index()                # turning the 'Year' index back into a regular column
    .reindex(columns=columns)     # 'Year' first, then the countries in order of first appearance
)
df.head()

## Now we're talking

The reshaped dataset now has one column per country, holding that country's life expectancy for each year. Now we can explore and plot the data.

In [None]:
# Compute the descriptive statistics of the reshaped dataset and display them
summary_stats = df.describe()
summary_stats

In [None]:
# Australia is the only one with two missing values, for the years of 1800 and 1801. Let's backfill those values.
# The filled column is assigned back to df rather than calling an in-place method on the
# df['Australia'] selection: with pandas Copy-on-Write that selection is a temporary object,
# so an in-place fill on it would leave df unchanged. bfill() is the supported name for the
# deprecated backfill() alias.
df['Australia'] = df['Australia'].bfill()
df.head()

## What can we do now?

* We can plot the data and see how life expectancy evolved over the last 200 years.
* Fitting a line doesn't make much sense because we would be predicting the future.
* Clustering the data could give us some insights, such as the differences between rich and poor countries, peaceful vs aggressive, northern hemisphere vs southern hemisphere, etc.

In [None]:
# Draw one line per country (every column except 'Year') against the year, all on a single figure
country_names = df.columns.drop(['Year'])
years = df['Year']

fig, ax = plt.subplots(figsize=(16,8))
for name in country_names:
    ax.plot(years, df[name])
ax.set_title("Life expectancy evolution since 1800", fontsize=16, fontweight="bold")
ax.set_ylabel("Age", fontsize=14)
ax.set_xlabel("Year", fontsize=14)
# Legend entries follow the same column order used for plotting
ax.legend(country_names)
plt.show()

## What can be observed:

* Up until around 1875 there are straight lines for almost all countries, which probably means this data was generated rather than measured.
* In general, life expectancy has pretty much doubled, from a mean of 35 to a mean above 70.
* After 1900, some countries suffered huge drops in life expectancy in some periods. We'll try to understand why.

In [None]:
# Plotting the main countries that participated in World War I and World War II.
# The country list is defined once and the legend is built from the line labels,
# so the plotted lines and the legend entries cannot drift apart.
war_countries = ['France', 'Germany', 'Italy', 'Japan', 'Russia', 'United Kingdom', 'United States']

fig, ax = plt.subplots(figsize=(16,8))
for country in war_countries:
    ax.plot(df['Year'], df[country], label=country)
# The x-axis is limited to 1900-2020 below, so the title names that range
ax.set_title("Life expectancy evolution since 1900", fontsize=16, fontweight="bold")
ax.set_ylabel("Age", fontsize=14)
ax.set_xlabel("Year", fontsize=14)
ax.set_xlim([1900, 2020])
ax.legend()
plt.show()

## As expected:

There are big drops around the periods of war: 1914 - 1918 and 1939 - 1945.

* For the period of World War II, the US and UK don't show significant drops, which seems strange to me.
* Russia is the one that suffered the most, counting more than 20 million deaths.
* Following the war, the Cold War period had conflicts that didn't kill nearly as many people as World War II, so we can observe a steady growth in life expectancy.
