<a href="https://colab.research.google.com/github/ssoma2mc/Data110/blob/main/Copy_of_ShinkoSoma_Project1_Task3_DataVisualization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Data 110 Data Visualization
# Project 1: Task 3 Data Visualization
- Create a minimum of three visualizations that highlight your findings. These can include, but are not limited to, scatter plots, line charts, histograms, bar charts, and others that we covered in class.
- Each visualization should be accompanied by a caption explaining the insight or trend it is meant to convey.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from scipy import stats
from scipy.optimize import curve_fit

In [None]:
# Use seaborn's white-grid theme for the figures in this notebook.
sns.set_style(style="whitegrid")
# Bare last expression: the notebook displays the 10-colour viridis palette as a preview.
sns.color_palette(palette="viridis", n_colors=10)

In [None]:
# Ten viridis colours; later cells pick individual entries by index.
colors = sns.color_palette(palette="viridis", n_colors=10)

In [None]:
# Location of the raw life-expectancy CSV (hosted on GitHub).
file_path = (
    'https://raw.githubusercontent.com/ssoma2mc/Data110/main/'
    'Project_1/life_exp_kaggle_full.csv'
)
# Read the whole file into the frame that every later cell works from.
df = pd.read_csv(filepath_or_buffer=file_path)

In [None]:
# Original CSV header -> snake_case column name used throughout the notebook.
# Headers that are absent from the frame are simply left alone by `rename`.
column_names = {
    'Country Name': 'country',
    'Country Code': 'country_code',
    'Region': 'region',
    'IncomeGroup': 'income',
    'Year': 'year',
    'Life Expectancy World Bank': 'life_expectancy',
    'Prevelance of Undernourishment': 'under_nourishment',
    'CO2': 'co2',
    'Health Expenditure %': 'health_expenditure',
    'Education Expenditure %': 'education_expenditure',
    'Unemployment': 'unemployment',
    'Corruption': 'corruption',
    'Sanitation': 'sanitation',
    'Injuries': 'injuries',
    'Communicable': 'communicable',
    'NonCommunicable': 'non_communicable',
}
df = df.rename(columns=column_names)

I worked on the task by answering the following questions:

- A. What is the current life expectancy?
- B. How does life expectancy vary by country and region?
- C. What factors influence life expectancy?
- D. Can we predict the life expectancy from this data?

### A. What is the current Life Expectancy?
**Figure A-1. Global Average Life Expectancy (2001-2019)**

The global average life expectancy increased from 66.67 years in 2001 to 72.59 years in 2019

In [None]:
# A. What is the current life expectancy?
# Figure A-1 plots the mean life expectancy of all rows per year and annotates
# the 2001 and 2019 values (reported finding: 66.67 years in 2001, 72.59 years in 2019).
colors = sns.color_palette("viridis", n_colors=10)

# One row per year: the mean of life_expectancy over every row of that year.
df_global_avg = df.groupby('year')['life_expectancy'].mean().reset_index()

plt.figure(figsize=(10, 6))
sns.lineplot(x='year', y='life_expectancy', data=df_global_avg, color=colors[5], lw=5, label='Global Average')
plt.title('Figure A-1. Global Average Life Expectancy (2001-2019)', fontsize=16)
plt.xlabel('Year', fontsize=12)
plt.ylabel('Average Life Expectancy (years)', fontsize=20)
plt.xticks(range(1998, 2024, 5))
# The only series drawn is the global average, so the legend carries no 'Region' title.
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left', fontsize=18)
plt.grid(True, linestyle='--', alpha=0.6)

# Endpoint values used for the on-chart annotations below.
life_expectancy_2001 = df_global_avg[df_global_avg['year'] == 2001]['life_expectancy'].values[0]
life_expectancy_2019 = df_global_avg[df_global_avg['year'] == 2019]['life_expectancy'].values[0]

# Large numeric call-outs: the 2001 value above the line, the 2019 value below it.
plt.text(2005, life_expectancy_2001 + 1, f'{life_expectancy_2001:.2f}', color=colors[0], fontsize=46, ha='right', va='bottom', fontweight='bold')
plt.text(2016, life_expectancy_2019 - 1, f'{life_expectancy_2019:.2f}', color=colors[0], fontsize=46, ha='left', va='top', fontweight='bold')

plt.show()






**Figure A-2. Distribution of Life Expectancy in 2019**

Life expectancy varies from a minimum of 53.28 years (Central African Republic) to a maximum of 84.36 years (Japan).

In [None]:
# Figure A-2: Distribution of Life Expectancy in 2019
# Reported range: minimum 53.28 years (Central African Republic), maximum 84.36 years (Japan).
stats_2019 = df.loc[df['year'] == 2019, 'life_expectancy']

fig, ax = plt.subplots()
ax.hist(stats_2019, bins=20, edgecolor='white', alpha=0.8, color=colors[2])

ax.set_title('Figure A-2. Distribution of Life Expectancy in 2019')
ax.set_xlabel('Life Expectancy')
ax.set_ylabel('Frequency')

# Solid horizontal guide lines; the vertical ones get linestyle 'none', i.e. are not visible.
ax.grid(axis='y', linestyle='-', alpha=0.7)
ax.grid(axis='x', linestyle='none')

plt.show()


In [None]:
# Descriptive statistics of the 2019 life-expectancy values.
stats_2019 = df.loc[df['year'] == 2019, 'life_expectancy']
summary_2019 = stats_2019.describe()
print(summary_2019)

In [None]:
# Rank the 2019 rows by life expectancy and list the five rows at each extreme.
stats_2019 = df.loc[df['year'] == 2019]
display_columns = ['country', 'life_expectancy']

top_5_countries = stats_2019.sort_values(by='life_expectancy', ascending=False).iloc[:5]
bottom_5_countries = stats_2019.sort_values(by='life_expectancy', ascending=True).iloc[:5]

for heading, ranking in (
    ("Top 5 countries with highest life expectancy:", top_5_countries),
    ("\nBottom 5 countries with lowest life expectancy:", bottom_5_countries),
):
    print(heading)
    print(ranking[display_columns])

**Figure A-3. Life Expectancy by Country in 2019**

There are significant regional differences in life expectancy.

In [None]:
# World map of 2019 life expectancy.
# Rows missing either the country code or the value are dropped before plotting.
is_2019 = df['year'] == 2019
df_2019 = df.loc[is_2019].dropna(subset=['country_code', 'life_expectancy'])

fig = px.choropleth(
    data_frame=df_2019,
    locations='country_code',
    color='life_expectancy',
    hover_name='country',
    color_continuous_scale='RdYlBu',
    title='Life Expectancy by Country in 2019',
)

# Plain map: no frame, no coastlines, equirectangular projection.
geo_options = dict(showframe=False, showcoastlines=False, projection_type='equirectangular')
fig.update_layout(geo=geo_options)
fig.show()


**Figure A-4. Average Life Expectancy by Region in 2019**

North America has the highest life expectancy at 80.90 years, while Sub-Saharan Africa has the lowest life expectancy at 62.72 years.

In [None]:
# Figure A-4: mean life expectancy per region for the year 2019.
# The frame is restricted to 2019 first so the bars match the "in 2019" title;
# grouping the unfiltered frame would average over every year in the data.
df_grouped = (
    df[df['year'] == 2019]
    .groupby('region')['life_expectancy']
    .mean()
    .reset_index()
)

plt.figure(figsize=(12, 6))
ax = sns.barplot(x='life_expectancy', y='region', data=df_grouped, palette="viridis", hue='region', legend=False)

# Write each mean inside its bar, slightly left of the bar's end.
# After reset_index() the row index i is 0..n-1 and doubles as the bar's y position.
for i, row in df_grouped.iterrows():
    ax.text(row['life_expectancy'] - 3.5, i, f'{row["life_expectancy"]:.2f}',
            color='white', ha='center', va='center', fontsize=16)

plt.title('Average Life Expectancy by Region in 2019', fontsize=16)
plt.xlabel('Average Life Expectancy (years)', fontsize=12)
plt.ylabel('Region', fontsize=12)

plt.show()




**Figure A-5. Average Life Expectancy by Region (2001-2019)**

Although there are regional differences, life expectancy shows an upward trend in all regions.

In [None]:
# Figure A-5. Average Life Expectancy by Region (2001-2019)
# One line per region; the 2001 and 2019 means are printed at the ends of each line.
# Reported finding: regions differ, but every region trends upward.

df_region_avg = df.groupby(['year', 'region'])['life_expectancy'].mean().reset_index()
region_names = df_region_avg['region'].unique()
palette = sns.color_palette("viridis", n_colors=len(region_names))

fig, ax = plt.subplots(figsize=(10, 6))
sns.lineplot(data=df_region_avg, x='year', y='life_expectancy', hue='region',
             lw=2, palette=palette, ax=ax)
ax.set_title('Figure A-5. Average Life Expectancy by Region (2001-2019)', fontsize=20)
ax.set_xlabel('Year', fontsize=12)
ax.set_xticks(range(1998, 2024, 5))
ax.legend(title='Region', bbox_to_anchor=(1.05, 1), loc='upper left', fontsize=18)
ax.grid(True, linestyle='--', alpha=0.6)

for region in region_names:
    region_rows = df_region_avg[df_region_avg['region'] == region]

    # First matching row for each endpoint year (raises IndexError if the year is absent).
    life_expectancy_2001 = region_rows.loc[region_rows['year'] == 2001, 'life_expectancy'].values[0]
    life_expectancy_2019 = region_rows.loc[region_rows['year'] == 2019, 'life_expectancy'].values[0]

    # Labels sit just outside the plotted year range, on either side.
    ax.text(2000.5, life_expectancy_2001, f'{life_expectancy_2001:.2f}',
            color='black', fontsize=12, ha='right', va='center')
    ax.text(2019.5, life_expectancy_2019, f'{life_expectancy_2019:.2f}',
            color='black', fontsize=12, ha='left', va='center')

plt.show()


# 3. Data Visualization
## C. What factors influence life expectancy?

I conducted the analysis in the following order.
- I. Pair plots to observe the overview.
- II. Close examination of the relationship between each predictor variable and life expectancy.
-i) categorical variables
      a) Income group
      b) Region
-ii) numerical variables
      c) Under Nourishment
      (d) CO2)
      e) Health Expenditure
      f) Education Expenditure
      g) Unemployment
      h) Corruption
      i) Sanitation
      j) Injuries
      k) Communicable
      l) Non-Communicable
- iii) d) CO2

**Figure C-1. Pair plots to observe the overview**

By using the pair plot, we were able to roughly capture the relationships among our 10 predictor variables (prevalence of undernutrition, CO2 emissions, health expenditure %, education expenditure %, unemployment, corruption*, sanitation, injuries, communicable diseases, and non-communicable diseases), all of which influence life expectancy.

*'corruption' was excluded from the pair plot because it contains too many NaN values.

In [None]:
# Figure C-1. Pair plot for overview
# Gives a rough picture of how life expectancy relates to nine numeric predictors:
# undernourishment, CO2, health expenditure, education expenditure, unemployment,
# sanitation, injuries, communicable and non-communicable diseases.
# 'corruption' is left out here because it contains too many NaN values.

pairplot_columns = [
    'region', 'income', 'life_expectancy',
    'under_nourishment', 'co2', 'health_expenditure', 'education_expenditure',
    'unemployment', 'sanitation', 'injuries',
    'communicable', 'non_communicable',
]

# Keep only rows that are complete in the country code and in every plotted column.
df_cleaned = df.dropna(subset=['country_code'] + pairplot_columns).copy()

# NOTE(review): 'region' and 'income' are passed along but are presumably skipped
# by pairplot as non-numeric columns — confirm.
sns.pairplot(data=df_cleaned[pairplot_columns])
plt.show()


**Figure C-2. Pair plot by income group**

Additionally, when examining pair plots by each income group, we observed distinct clusters, indicating groupings within the data.

In [None]:
# Figure C-2. Pair plot by income group
# Same variables as the overview pair plot, coloured by income group with KDE diagonals.
# Reported finding: the income groups form visibly distinct clusters.
income_pairplot_columns = [
    'region', 'income', 'life_expectancy',
    'under_nourishment', 'co2', 'health_expenditure', 'education_expenditure',
    'unemployment', 'sanitation', 'injuries',
    'communicable', 'non_communicable',
]
sns.pairplot(data=df_cleaned[income_pairplot_columns], hue="income", diag_kind="kde")
plt.show()

**Figure C-3. Pair plot by region**

Additionally, when examining pair plots by each region, we observed distinct clusters, indicating groupings within the data.

In general, region (and race) can serve as predictors, but they are secondary. These factors influence life expectancy due to social determinants such as access to healthcare, socioeconomic status, and education. Therefore, in this analysis, I focused on examining income groups.

In [None]:
# Figure C-3. Pair plot by region
# Same variables as the overview pair plot, coloured by region with KDE diagonals.
region_pairplot_columns = [
    'region', 'income', 'life_expectancy',
    'under_nourishment', 'co2', 'health_expenditure', 'education_expenditure',
    'unemployment', 'sanitation', 'injuries',
    'communicable', 'non_communicable',
]
sns.pairplot(data=df_cleaned[region_pairplot_columns], hue="region", diag_kind="kde")

plt.show()

I conducted the analysis in the following order.
- I. Pair plots to observe the overview.
- II. Close examination of the relationship between each predictor variable and life expectancy.
-i) categorical variables
      a) Income group

**Figure C-4. The impact of income group on life expectancy**
In the low-income group, life expectancy is 56.56 years. As income increases, life expectancy rises, reaching 78.85 years in the high-income group.

In [None]:
# Figure C-4. The impact of income group on life expectancy
# Box plot of life expectancy per income group, boxes ordered by ascending median,
# with the median and quartiles printed next to each box.
# Reported finding: 56.56 years in the low-income group, rising to 78.85 years
# in the high-income group.

df_cleaned = df.dropna(subset=['income', 'life_expectancy']).copy()

# Box order: income groups sorted by their median life expectancy.
income_order = df_cleaned.groupby('income')['life_expectancy'].median().sort_values().index

plt.figure(figsize=(10, 6))

sns.boxplot(data=df_cleaned, x='income', y='life_expectancy', order=income_order, palette='viridis', hue='income', legend=False)

plt.xlabel('Income Group', labelpad=30)
plt.ylabel('Life Expectancy')
plt.title('Figure C-4.The impact of income group on life expectancy', fontsize=20)

# Build the short tick labels from the order actually plotted, so a label can never
# end up under the wrong box. Assumes group names of the form '<Level> income'
# (e.g. 'Lower middle income' -> 'Lower Middle').
income_labels = [str(group).replace(' income', '').title() for group in income_order]
plt.xticks(range(len(income_labels)), income_labels)

# Annotate each box with its median and quartiles, just right of the box edge.
for i, income_group in enumerate(income_order):
    group_data = df_cleaned[df_cleaned['income'] == income_group]['life_expectancy']

    median = np.median(group_data)
    q1 = np.percentile(group_data, 25)
    q3 = np.percentile(group_data, 75)

    plt.text(i + 0.405, median, f'{median:.2f}', ha='left', va='center', fontsize=10, color='black')
    plt.text(i + 0.405, q1, f'{q1:.2f}', ha='left', va='center', fontsize=10, color='black')
    plt.text(i + 0.405, q3, f'{q3:.2f}', ha='left', va='center', fontsize=10, color='black')

plt.tight_layout()
plt.show()

    b) Region

**Figure C-5. The impact of region on life expectancy**

There are regional differences in life expectancy. North America has the highest life expectancy at 80.90 years, while Sub-Saharan Africa has the lowest at 62.72 years.


In [None]:
# Figure C-5. The impact of region on life expectancy
# Box plot of life expectancy per region, boxes ordered by ascending median.
# Reported finding: North America is highest (80.90 years), Sub-Saharan Africa lowest (62.72 years).

df_cleaned = df.dropna(subset=['region', 'life_expectancy']).copy()

median_by_region = df_cleaned.groupby('region')['life_expectancy'].median()
region_order = median_by_region.sort_values().index

plt.figure(figsize=(9, 8))

sns.boxplot(data=df_cleaned, x='region', y='life_expectancy', order=region_order,
            palette='viridis', hue='region', legend=False)

plt.xlabel('Region', labelpad=30)
plt.ylabel('Life Expectancy')
plt.title('Figure C-5. The Impact of Region on Life Expectancy', fontsize=20)
plt.xticks(rotation=50)

for i, region in enumerate(region_order):
    group_data = df_cleaned.loc[df_cleaned['region'] == region, 'life_expectancy']

    median = np.median(group_data)
    q1, q3 = np.percentile(group_data, [25, 75])

    # (value, vertical offset): median slightly above, Q1 below, Q3 above,
    # so the three labels do not sit on top of the box lines.
    for value, offset in ((median, 1), (q1, -2), (q3, 2)):
        plt.text(i + 0.05, value + offset, f'{value:.2f}',
                 ha='left', va='center', fontsize=10, color='black')

plt.tight_layout()
plt.show()



- ii) Numerical Variables

    c) Under Nourishment
    
**Figure C-6. The impact of undernutrition on life expectancy**

As undernutrition increases, life expectancy decreases in all income groups.

In [None]:
# Figure C-6. The impact of undernutrition on life expectancy
# As undernutrition increases, life expectancy decreases for all of the income group.

# Rebuild the working frame for this figure: rows missing either variable would
# otherwise reach linregress and the min()/max() calls below and corrupt the
# fitted equation and the label position.
df_cleaned = df.dropna(subset=['under_nourishment', 'life_expectancy']).copy()

# Points coloured by income group, with one overall regression line on top.
sns.scatterplot(data=df_cleaned, x='under_nourishment', y='life_expectancy', hue='income', palette='viridis', alpha=0.6, edgecolor="black")

sns.regplot(data=df_cleaned, x="under_nourishment", y="life_expectancy", scatter=False, color="gray", label="Regression Line")

# Ordinary least-squares fit, used only for the equation / R-squared annotation.
slope, intercept, r_value, p_value, std_err = stats.linregress(df_cleaned['under_nourishment'], df_cleaned['life_expectancy'])
equation_text = f'Life Expectancy = {slope:.2f} * Undernutrition + {intercept:.2f}\nR-squared = {r_value**2:.3f}'
plt.text(min(df_cleaned['under_nourishment']) + 20, max(df_cleaned['life_expectancy']) - 5, equation_text, fontsize=8, color="black")

plt.xlabel('Undernutrition')
plt.ylabel("Life Expectancy")
plt.title("Figure C-6.The impact of undernutrition on life expectancy")
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
# No explicit on/off flag: per matplotlib's docs this toggles the x-grid state.
plt.grid(axis='x')

plt.show()



In [None]:
# One panel per income group: undernutrition vs. life expectancy, with shared axis limits.
income_groups = ['Low income', 'Lower middle income', 'Upper middle income', 'High income']

# Each income group gets a fixed entry of the notebook-wide viridis palette.
palette_positions = (0, 2, 5, 7)
color_map = {group: colors[position] for group, position in zip(income_groups, palette_positions)}

fig, axes = plt.subplots(nrows=1, ncols=4, figsize=(20, 5))

for ax, income_group in zip(axes, income_groups):
    data_income_group = df_cleaned.loc[df_cleaned['income'] == income_group]

    sns.scatterplot(data=data_income_group, x='under_nourishment', y='life_expectancy',
                    color=color_map[income_group], alpha=0.6, edgecolor="black", ax=ax)
    sns.regplot(data=data_income_group, x="under_nourishment", y="life_expectancy",
                scatter=False, color="gray", ax=ax, label="Regression Line")

    # Per-group fit; the equation text is built but not drawn on the panel.
    fit = stats.linregress(data_income_group['under_nourishment'], data_income_group['life_expectancy'])
    equation_text = (
        f'Life Expectancy = {fit.slope:.2f} * Undernutrition + {fit.intercept:.2f}\n'
        f'R-squared = {fit.rvalue**2:.3f}'
    )

    ax.set_xlabel('Undernutrition')
    ax.set_ylabel("Life Expectancy")
    ax.set_title(f"Income: {income_group}")
    # Identical limits on every panel so the groups can be compared by eye.
    ax.set_xlim(0, 70)
    ax.set_ylim(40, 85)
    ax.grid(axis='x')

plt.tight_layout()
plt.show()


    e) Health Expenditure
    
**Figure C-7. The Impact of Health Expenditure on Life Expectancy**

As health expenditure increases, life expectancy also rises. Specifically, in the high-income group, an increase in health expenditure leads to a significant rise in life expectancy, while in the low-income group, the increase is minimal.

In [None]:
# Figure C-7. The Impact of Health Expenditure on Life Expectancy
# Scatter coloured by income group plus one overall regression line and its equation.
# Reported finding: life expectancy rises with health expenditure, strongly in the
# high-income group and only minimally in the low-income group.

df_cleaned = df.dropna(subset=['health_expenditure', 'life_expectancy']).copy()
x_values = df_cleaned['health_expenditure']
y_values = df_cleaned['life_expectancy']

ax = sns.scatterplot(data=df_cleaned, x='health_expenditure', y='life_expectancy',
                     hue='income', palette='viridis', alpha=0.6, edgecolor="black")
sns.regplot(data=df_cleaned, x='health_expenditure', y="life_expectancy",
            scatter=False, color="gray", label="Regression Line", ax=ax)

# Least-squares fit used only for the annotation text.
fit = stats.linregress(x_values, y_values)
equation_text = (
    f'Life Expectancy = {fit.slope:.2f} * Health Expenditure + {fit.intercept:.2f}\n'
    f'R-squared = {fit.rvalue**2:.3f}'
)
ax.text(x_values.min() + 22, y_values.max() - 20, equation_text, fontsize=8, color="black")

ax.set_xlabel('Health Expenditure')
ax.set_ylabel("Life Expectancy")
ax.set_title("Figure C-7. The Impact of Health Expenditure on Life Expectancy")
ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
ax.grid(axis='x')

plt.show()

In [None]:
# One panel per income group: health expenditure vs. life expectancy, with shared axis limits.
income_groups = ['Low income', 'Lower middle income', 'Upper middle income', 'High income']

# Each income group gets a fixed entry of the notebook-wide viridis palette.
palette_positions = (0, 2, 5, 7)
color_map = {group: colors[position] for group, position in zip(income_groups, palette_positions)}

fig, axes = plt.subplots(nrows=1, ncols=4, figsize=(20, 5))

# Common upper x limit: the largest health expenditure in the whole working frame.
x_upper = df_cleaned['health_expenditure'].max()

for ax, income_group in zip(axes, income_groups):
    data_income_group = df_cleaned.loc[df_cleaned['income'] == income_group]

    sns.scatterplot(data=data_income_group, x='health_expenditure', y='life_expectancy',
                    color=color_map[income_group], alpha=0.6, edgecolor="black", ax=ax)
    sns.regplot(data=data_income_group, x="health_expenditure", y="life_expectancy",
                scatter=False, color="gray", ax=ax, label="Regression Line")

    # Per-group fit; the equation text is built but not drawn on the panel.
    fit = stats.linregress(data_income_group['health_expenditure'], data_income_group['life_expectancy'])
    equation_text = (
        f'Life Expectancy = {fit.slope:.2f} * Health Expenditure + {fit.intercept:.2f}\n'
        f'R-squared = {fit.rvalue**2:.3f}'
    )

    ax.set_xlabel('Health Expenditure')
    ax.set_ylabel("Life Expectancy")
    ax.set_title(f"Income: {income_group}")
    ax.set_xlim(0, x_upper)
    ax.set_ylim(40, 85)
    ax.grid(axis='x')

plt.tight_layout()
plt.show()


    f) Education Expenditure

**Figure C-8. The impact of Education Expenditure on Life Expectancy**

As education expenditure increases, life expectancy also rises. Specifically, in the low-income group, an increase in education expenditure leads to a significant rise in life expectancy,
while in the high-income group, the increase is minimal.


In [None]:
# Figure C-8. The impact of Education Expenditure on Life Expectancy
# Scatter coloured by income group plus one overall regression line and its equation.
# Reported finding: life expectancy rises with education expenditure, strongly in the
# low-income group and only minimally in the high-income group.

df_cleaned = df.dropna(subset=['education_expenditure', 'life_expectancy']).copy()
x_values = df_cleaned['education_expenditure']
y_values = df_cleaned['life_expectancy']

ax = sns.scatterplot(data=df_cleaned, x='education_expenditure', y='life_expectancy',
                     hue='income', palette='viridis', alpha=0.6, edgecolor="black")
sns.regplot(data=df_cleaned, x='education_expenditure', y="life_expectancy",
            scatter=False, color="gray", label="Regression Line", ax=ax)

# Least-squares fit used only for the annotation text.
fit = stats.linregress(x_values, y_values)
equation_text = (
    f'Life Expectancy = {fit.slope:.2f} * Education Expenditure + {fit.intercept:.2f}\n'
    f'R-squared = {fit.rvalue**2:.3f}'
)
ax.text(x_values.min() + 15, y_values.max() - 20, equation_text, fontsize=8, color="black")

ax.set_xlabel('Education Expenditure')
ax.set_ylabel("Life Expectancy")
ax.set_title("Figure C-8. The impact of Education Expenditure on Life Expectancy")
ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
ax.grid(axis='x')

plt.show()

In [None]:
# One panel per income group: education expenditure vs. life expectancy, with shared axis limits.
income_groups = ['Low income', 'Lower middle income', 'Upper middle income', 'High income']

# Each income group gets a fixed entry of the notebook-wide viridis palette.
palette_positions = (0, 2, 5, 7)
color_map = {group: colors[position] for group, position in zip(income_groups, palette_positions)}

fig, axes = plt.subplots(nrows=1, ncols=4, figsize=(20, 5))

# Common upper x limit: the largest education expenditure in the whole working frame.
x_upper = df_cleaned['education_expenditure'].max()

for ax, income_group in zip(axes, income_groups):
    data_income_group = df_cleaned.loc[df_cleaned['income'] == income_group]

    sns.scatterplot(data=data_income_group, x='education_expenditure', y='life_expectancy',
                    color=color_map[income_group], alpha=0.6, edgecolor="black", ax=ax)
    sns.regplot(data=data_income_group, x="education_expenditure", y="life_expectancy",
                scatter=False, color="gray", ax=ax, label="Regression Line")

    # Per-group fit; the equation text is built but not drawn on the panel.
    fit = stats.linregress(data_income_group['education_expenditure'], data_income_group['life_expectancy'])
    equation_text = (
        f'Life Expectancy = {fit.slope:.2f} * Education Expenditure + {fit.intercept:.2f}\n'
        f'R-squared = {fit.rvalue**2:.3f}'
    )

    ax.set_xlabel('Education Expenditure')
    ax.set_ylabel("Life Expectancy")
    ax.set_title(f"Income: {income_group}")
    ax.set_xlim(0, x_upper)
    ax.set_ylim(40, 85)
    ax.grid(axis='x')

plt.tight_layout()
plt.show()


    g) Unemployment

**Figure C-9. The impact of Unemployment on Life Expectancy**

As unemployment increases, life expectancy decreases in all income groups.

In [None]:
# Figure C-9. The impact of Unemployment on Life Expectancy
# Scatter coloured by income group plus one overall regression line and its equation.
# Reported finding: life expectancy falls as unemployment rises, in every income group.

df_cleaned = df.dropna(subset=['unemployment', 'life_expectancy']).copy()
x_values = df_cleaned['unemployment']
y_values = df_cleaned['life_expectancy']

ax = sns.scatterplot(data=df_cleaned, x='unemployment', y='life_expectancy',
                     hue='income', palette='viridis', alpha=0.6, edgecolor="black")
sns.regplot(data=df_cleaned, x='unemployment', y="life_expectancy",
            scatter=False, color="gray", label="Regression Line", ax=ax)

# Least-squares fit used only for the annotation text.
fit = stats.linregress(x_values, y_values)
equation_text = (
    f'Life Expectancy = {fit.slope:.2f} * Unemployment + {fit.intercept:.2f}\n'
    f'R-squared = {fit.rvalue**2:.3f}'
)
ax.text(x_values.min() + 5, y_values.max() - 45, equation_text, fontsize=8, color="black")

ax.set_xlabel('Unemployment')
ax.set_ylabel("Life Expectancy")
ax.set_title("Figure C-9. The impact of Unemployment on Life Expectancy")
ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
ax.grid(axis='x')

plt.show()

In [None]:
# One panel per income group: unemployment vs. life expectancy, each annotated
# with its own regression equation, with shared axis limits.
income_groups = ['Low income', 'Lower middle income', 'Upper middle income', 'High income']

# Each income group gets a fixed entry of the notebook-wide viridis palette.
palette_positions = (0, 2, 5, 7)
color_map = {group: colors[position] for group, position in zip(income_groups, palette_positions)}

fig, axes = plt.subplots(nrows=1, ncols=4, figsize=(20, 5))

# Common x range: the unemployment extremes of the whole working frame.
x_limits = (min(df_cleaned['unemployment']), df_cleaned['unemployment'].max())

for ax, income_group in zip(axes, income_groups):
    data_income_group = df_cleaned.loc[df_cleaned['income'] == income_group]

    sns.scatterplot(data=data_income_group, x='unemployment', y='life_expectancy',
                    color=color_map[income_group], alpha=0.6, edgecolor="black", ax=ax)
    sns.regplot(data=data_income_group, x="unemployment", y="life_expectancy",
                scatter=False, color="gray", ax=ax, label="Regression Line")

    fit = stats.linregress(data_income_group['unemployment'], data_income_group['life_expectancy'])
    equation_text = (
        f'Life Expectancy = {fit.slope:.2f} * Unemployment + {fit.intercept:.2f}\n'
        f'R-squared = {fit.rvalue**2:.3f}'
    )
    # Place the equation relative to this group's own data extremes.
    text_x = min(data_income_group['unemployment']) + 10
    text_y = max(data_income_group['life_expectancy']) - 25
    ax.text(text_x, text_y, equation_text, fontsize=8, color="black")

    ax.set_xlabel('Unemployment')
    ax.set_ylabel("Life Expectancy")
    ax.set_title(f"Income: {income_group}")
    ax.set_xlim(*x_limits)
    ax.set_ylim(40, 85)
    ax.grid(axis='x')

plt.tight_layout()
plt.show()



    h) Corruption

**Figure C-10. The impact of Corruption on Life Expectancy**

When the corruption scale is low (meaning corruption is severe), life expectancy decreases.

- 'corruption' is numerical data, but it is a scale. It has an order. I think that it should be treated as a categorical variable.

- The World Bank Governance Indicators assess governance and corruption on a scale from -2.5 to +2.5. The Corruption Perceptions Index (CPI) specifically focuses on 'perceptions of corruption' and is evaluated on a scale from 0 to 100. It is unclear whether the Kaggle data is based on the World Bank Governance Indicators or CPI, but if the scale converges from 1 to 5, it is highly likely that this data is an indicator of the level of corruption. It can be interpreted as follows:
-- 1: High level of corruption
-- 3: Slightly fewer corruption issues  
-- 5: Nearly no corruption

In [None]:
# Figure C-10. The impact of Corruption on Life Expectancy
# Violin plot of life expectancy for each value of the corruption scale.
# Reported finding: where the scale is low (severe corruption), life expectancy is lower.

# Build a dedicated subset from the full frame so the plot depends only on the two
# columns it shows, not on whichever filter an earlier cell last applied to df_cleaned.
df_corruption = df.dropna(subset=['corruption', 'life_expectancy']).copy()

sns.violinplot(data=df_corruption, x='corruption', y='life_expectancy', hue='corruption', palette='viridis', legend=False)

plt.xlabel('Corruption')
plt.ylabel("Life Expectancy")
plt.title("Figure C-10. The impact of Corruption on Life Expectancy")

plt.show()


**Figure C-11. The impact of Sanitation on Life Expectancy**

Overall, as sanitation improves, life expectancy increases. This is particularly noticeable in the lower middle-income group, while life expectancy remains stable in the upper middle-income group and decreases in the low-income group.

In [None]:
# Figure C-11. The impact of Sanitation on Life Expectancy
# Overall, life expectancy rises as sanitation improves. The effect is strongest
# in the lower middle-income group, flat in the upper middle-income group, and
# negative in the low-income group.

# Keep only rows where both plotted variables are present.
df_cleaned = df.dropna(subset=['sanitation', 'life_expectancy']).copy()

ax = plt.gca()
sns.scatterplot(data=df_cleaned, x='sanitation', y='life_expectancy', hue='income',
                palette='viridis', alpha=0.6, edgecolor="black", ax=ax)
sns.regplot(data=df_cleaned, x='sanitation', y="life_expectancy", scatter=False,
            color="gray", label="Regression Line", ax=ax)

# Ordinary least-squares fit, used only for the on-plot equation and R-squared.
fit = stats.linregress(df_cleaned['sanitation'], df_cleaned['life_expectancy'])
equation_text = f'Life Expectancy = {fit.slope:.2f} * Sanitation + {fit.intercept:.2f}\nR-squared = {fit.rvalue**2:.3f}'
ax.text(df_cleaned['sanitation'].min() + 40,
        df_cleaned['life_expectancy'].max() - 40,
        equation_text, fontsize=8, color="black")

ax.set_xlabel('Sanitation')
ax.set_ylabel("Life Expectancy")
ax.set_title("Figure C-11. The impact of Sanitation on Life Expectancy")
# Legend sits outside the axes so it does not cover the points.
ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
ax.grid(axis='x')

plt.show()

In [None]:
# Sanitation vs. life expectancy, one panel per income group.
income_groups = ['Low income', 'Lower middle income', 'Upper middle income', 'High income']
# One fixed viridis shade per income group.
color_map = dict(zip(income_groups, (colors[0], colors[2], colors[5], colors[7])))

fig, axes = plt.subplots(1, 4, figsize=(20, 5))

# All panels share the x-range of the full cleaned frame so they are comparable.
sanitation_range = (df_cleaned['sanitation'].min(), df_cleaned['sanitation'].max())

for ax, group in zip(axes, income_groups):
    group_rows = df_cleaned.loc[df_cleaned['income'] == group]

    sns.scatterplot(data=group_rows, x='sanitation', y='life_expectancy',
                    color=color_map[group], alpha=0.6, edgecolor="black", ax=ax)
    sns.regplot(data=group_rows, x="sanitation", y="life_expectancy", scatter=False,
                color="gray", ax=ax, label="Regression Line")

    # Per-group least-squares fit for the annotation.
    fit = stats.linregress(group_rows['sanitation'], group_rows['life_expectancy'])
    equation_text = f'Life Expectancy = {fit.slope:.2f} * Sanitation + {fit.intercept:.2f}\nR-squared = {fit.rvalue**2:.3f}'
    ax.text(group_rows['sanitation'].min() + 10,
            group_rows['life_expectancy'].max() - 25,
            equation_text, fontsize=8, color="black")

    ax.set(xlabel='Sanitation', ylabel="Life Expectancy", title=f"Income: {group}",
           xlim=sanitation_range, ylim=(40, 85))
    ax.grid(axis='x')

fig.tight_layout()
plt.show()





**Figure C-12. The impact of Injuries on Life Expectancy**

Excluding countries that are believed to be in war zones, where injuries are extremely high, an increase in injuries leads to a slight rise in life expectancy.

In [None]:
# Figure C-12. The impact of Injuries on Life Expectancy
# Excluding countries that are believed to be in war zones, where injuries are extremely high,
# an increase in injuries leads to a slight rise in life expectancy.

# Keep only rows where both plotted variables are present.
df_cleaned = df.dropna(subset=['injuries','life_expectancy']).copy()
sns.scatterplot(data=df_cleaned, x='injuries', y='life_expectancy', hue='income', palette='viridis', alpha=0.6, edgecolor="black")

sns.regplot(data=df_cleaned, x='injuries', y="life_expectancy", scatter=False, color="gray", label="Regression Line")
# Fit against the predictor that is actually plotted. The fit was previously run
# on 'sanitation', so the annotated equation and R-squared did not describe this
# figure (and 'sanitation' is not NaN-filtered in this frame).
slope, intercept, r_value, p_value, std_err = stats.linregress(df_cleaned['injuries'], df_cleaned['life_expectancy'])
equation_text = f'Life Expectancy = {slope:.2f} * Injuries + {intercept:.2f}\nR-squared = {r_value**2:.3f}'
plt.text(min(df_cleaned['injuries']) +20000000, max(df_cleaned['life_expectancy']) - 40, equation_text, fontsize=8, color="black")

plt.xlabel('Injuries')
plt.ylabel("Life Expectancy")
plt.title("Figure C-12. The impact of Injuries on Life Expectancy")
# Legend sits outside the axes so it does not cover the points.
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
plt.grid(axis='x')

plt.show()

In [None]:
# Exclude countries where injuries are extremely high (10,000,000 or more).
# Relies on `df_cleaned` from the preceding Figure C-12 cell (NaN-filtered on injuries).

df_filtered = df_cleaned[(df_cleaned['injuries'] < 10000000) ].copy()

sns.scatterplot(data=df_filtered, x='injuries', y='life_expectancy', hue='income', palette='viridis', alpha=0.6, edgecolor="black")

sns.regplot(data=df_filtered, x='injuries', y="life_expectancy", scatter=False, color="gray", label="Regression Line")
# Fit against the predictor that is actually plotted (was 'sanitation', which
# made the annotated equation describe a different variable).
slope, intercept, r_value, p_value, std_err = stats.linregress(df_filtered['injuries'], df_filtered['life_expectancy'])
equation_text = f'Life Expectancy = {slope:.2f} * Injuries + {intercept:.2f}\nR-squared = {r_value**2:.3f}'
# Position the annotation in axes-fraction coordinates. The old data-coordinate
# offset (min + 20,000,000) lies beyond the < 10,000,000 filter applied above,
# which put the text outside the plotted range.
plt.text(0.45, 0.08, equation_text, fontsize=8, color="black", transform=plt.gca().transAxes)

plt.xlabel('Injuries')
plt.ylabel("Life Expectancy")
plt.title("The impact of Injuries on life expectancy")
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
plt.grid(axis='x')

plt.show()

In [None]:
# Injuries vs. life expectancy (extreme-injury rows already removed), one panel
# per income group.
income_groups = ['Low income', 'Lower middle income', 'Upper middle income', 'High income']
# One fixed viridis shade per income group.
color_map = dict(zip(income_groups, (colors[0], colors[2], colors[5], colors[7])))

fig, axes = plt.subplots(1, 4, figsize=(20, 5))

# All panels share the x-range of the filtered frame so they are comparable.
injuries_range = (df_filtered['injuries'].min(), df_filtered['injuries'].max())

for ax, group in zip(axes, income_groups):
    group_rows = df_filtered.loc[df_filtered['income'] == group]

    sns.scatterplot(data=group_rows, x='injuries', y='life_expectancy',
                    color=color_map[group], alpha=0.6, edgecolor="black", ax=ax)
    sns.regplot(data=group_rows, x="injuries", y="life_expectancy", scatter=False,
                color="gray", ax=ax, label="Regression Line")

    # The fit is computed, but its equation is deliberately not drawn on these panels.
    fit = stats.linregress(group_rows['injuries'], group_rows['life_expectancy'])
    equation_text = f'Life Expectancy = {fit.slope:.2f} * Injuries + {fit.intercept:.2f}\nR-squared = {fit.rvalue**2:.3f}'

    ax.set(xlabel='Injuries', ylabel="Life Expectancy", title=f"Income: {group}",
           xlim=injuries_range, ylim=(40, 85))
    ax.grid(axis='x')

fig.tight_layout()
plt.show()


**Figure C-13. Communicable Diseases**

As communicable diseases increase, life expectancy decreases among all of the income groups.

In [None]:
# Figure C-13. Communicable Diseases
# As communicable diseases increase, life expectancy decreases among all of the income groups.

# Keep only rows where both plotted variables are present.
df_cleaned = df.dropna(subset=['communicable','life_expectancy']).copy()
sns.scatterplot(data=df_cleaned, x='communicable', y='life_expectancy', hue='income', palette='viridis', alpha=0.6, edgecolor="black")

sns.regplot(data=df_cleaned, x='communicable', y="life_expectancy", scatter=False, color="gray", label="Regression Line")
# Fit against the predictor that is actually plotted. The fit was previously run
# on 'sanitation', so the annotated equation and R-squared did not describe this
# figure (and 'sanitation' is not NaN-filtered in this frame).
slope, intercept, r_value, p_value, std_err = stats.linregress(df_cleaned['communicable'], df_cleaned['life_expectancy'])
equation_text = f'Life Expectancy = {slope:.2f} * Communicable Diseases + {intercept:.2f}\nR-squared = {r_value**2:.3f}'
plt.text(min(df_cleaned['communicable']) +20000000, max(df_cleaned['life_expectancy']) - 50, equation_text, fontsize=8, color="black")

plt.xlabel('Communicable Diseases')
plt.ylabel("Life Expectancy")
plt.title("Figure C-13.The impact of Communicable Diseases on life expectancy")
# Legend sits outside the axes so it does not cover the points.
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
plt.grid(axis='x')

plt.show()

In [None]:
# Exclude countries where 'communicable' is extremely high (100,000,000 or more).
# Relies on `df_cleaned` from the preceding Figure C-13 cell (NaN-filtered on communicable).

df_filtered = df_cleaned[(df_cleaned['communicable'] < 100000000) ].copy()

sns.scatterplot(data=df_filtered, x='communicable', y='life_expectancy', hue='income', palette='viridis', alpha=0.6, edgecolor="black")

sns.regplot(data=df_filtered, x='communicable', y="life_expectancy", scatter=False, color="gray", label="Regression Line")
# Fit against the predictor that is actually plotted (was 'sanitation', which
# made the annotated equation describe a different variable).
slope, intercept, r_value, p_value, std_err = stats.linregress(df_filtered['communicable'], df_filtered['life_expectancy'])
equation_text = f'Life Expectancy = {slope:.2f} * Communicable Diseases + {intercept:.2f}\nR-squared = {r_value**2:.3f}'
plt.text(min(df_filtered['communicable']) +20000000, max(df_filtered['life_expectancy']) - 55, equation_text, fontsize=8, color="black")

plt.xlabel('Communicable Diseases')
plt.ylabel("Life Expectancy")
plt.title("The impact of Communicable on life expectancy")
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
plt.grid(axis='x')

plt.show()

In [None]:
# Communicable diseases vs. life expectancy (extreme rows already removed), one
# panel per income group.
income_groups = ['Low income', 'Lower middle income', 'Upper middle income', 'High income']
# One fixed viridis shade per income group.
color_map = dict(zip(income_groups, (colors[0], colors[2], colors[5], colors[7])))

fig, axes = plt.subplots(1, 4, figsize=(20, 5))

# All panels share the x-range of the filtered frame so they are comparable.
communicable_range = (df_filtered['communicable'].min(), df_filtered['communicable'].max())

for ax, group in zip(axes, income_groups):
    group_rows = df_filtered.loc[df_filtered['income'] == group]

    sns.scatterplot(data=group_rows, x='communicable', y='life_expectancy',
                    color=color_map[group], alpha=0.6, edgecolor="black", ax=ax)
    sns.regplot(data=group_rows, x="communicable", y="life_expectancy", scatter=False,
                color="gray", ax=ax, label="Regression Line")

    # The fit is computed, but its equation is deliberately not drawn on these panels.
    fit = stats.linregress(group_rows['communicable'], group_rows['life_expectancy'])
    equation_text = f'Life Expectancy = {fit.slope:.2f} * Communicable Diseases + {fit.intercept:.2f}\nR-squared = {fit.rvalue**2:.3f}'

    ax.set(xlabel='Communicable Diseases', ylabel="Life Expectancy", title=f"Income: {group}",
           xlim=communicable_range, ylim=(40, 85))
    ax.grid(axis='x')

fig.tight_layout()
plt.show()


**Figure C-14. The impact of Noncommunicable Diseases on Life Expectancy**

As non-communicable diseases increase, life expectancy decreases among all of the income groups.

In [None]:
# Figure C-14. The impact of Noncommunicable Diseases on Life Expectancy
# As non-communicable diseases increase, life expectancy decreases among all of the income groups.

# Keep only rows where both plotted variables are present.
df_cleaned = df.dropna(subset=['non_communicable','life_expectancy']).copy()
sns.scatterplot(data=df_cleaned, x='non_communicable', y='life_expectancy', hue='income', palette='viridis', alpha=0.6, edgecolor="black")

sns.regplot(data=df_cleaned, x='non_communicable', y="life_expectancy", scatter=False, color="gray", label="Regression Line")
# Fit against the predictor that is actually plotted. The fit was previously run
# on 'sanitation', so the annotated equation and R-squared did not describe this
# figure (and 'sanitation' is not NaN-filtered in this frame).
slope, intercept, r_value, p_value, std_err = stats.linregress(df_cleaned['non_communicable'], df_cleaned['life_expectancy'])
equation_text = f'Life Expectancy = {slope:.2f} * Noncommunicable Diseases + {intercept:.2f}\nR-squared = {r_value**2:.3f}'
plt.text(min(df_cleaned['non_communicable']) +50000000, max(df_cleaned['life_expectancy']) - 35, equation_text, fontsize=8, color="black")

plt.xlabel('Noncommunicable Diseases')
plt.ylabel("Life Expectancy")
plt.title("Figure C-14. The impact of Noncommunicable Diseases on Life Expectancy")
# Legend sits outside the axes so it does not cover the points.
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
plt.grid(axis='x')

plt.show()

In [None]:
# Exclude countries where 'non_communicable' is extremely high (100,000,000 or more).
# Relies on `df_cleaned` from the preceding Figure C-14 cell (NaN-filtered on non_communicable).

df_filtered = df_cleaned[(df_cleaned['non_communicable'] < 100000000) ].copy()

sns.scatterplot(data=df_filtered, x='non_communicable', y='life_expectancy', hue='income', palette='viridis', alpha=0.6, edgecolor="black")

sns.regplot(data=df_filtered, x='non_communicable', y="life_expectancy", scatter=False, color="gray", label="Regression Line")
# Fit against the predictor that is actually plotted (was 'sanitation', which
# made the annotated equation describe a different variable).
slope, intercept, r_value, p_value, std_err = stats.linregress(df_filtered['non_communicable'], df_filtered['life_expectancy'])
equation_text = f'Life Expectancy = {slope:.2f} * Noncommunicable Diseases + {intercept:.2f}\nR-squared = {r_value**2:.3f}'
plt.text(min(df_filtered['non_communicable']) +22000000, max(df_filtered['life_expectancy']) - 27, equation_text, fontsize=8, color="black")

plt.xlabel('Noncommunicable Diseases')
plt.ylabel("Life Expectancy")
plt.title("The impact of Noncommunicable on life expectancy")
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
plt.grid(axis='x')

plt.show()

In [None]:
# Noncommunicable diseases vs. life expectancy (extreme rows already removed),
# one panel per income group.
income_groups = ['Low income', 'Lower middle income', 'Upper middle income', 'High income']
# One fixed viridis shade per income group.
color_map = dict(zip(income_groups, (colors[0], colors[2], colors[5], colors[7])))

fig, axes = plt.subplots(1, 4, figsize=(20, 5))

# All panels share the x-range of the filtered frame so they are comparable.
ncd_range = (df_filtered['non_communicable'].min(), df_filtered['non_communicable'].max())

for ax, group in zip(axes, income_groups):
    group_rows = df_filtered.loc[df_filtered['income'] == group]

    sns.scatterplot(data=group_rows, x='non_communicable', y='life_expectancy',
                    color=color_map[group], alpha=0.6, edgecolor="black", ax=ax)
    sns.regplot(data=group_rows, x="non_communicable", y="life_expectancy", scatter=False,
                color="gray", ax=ax, label="Regression Line")

    # Per-group least-squares fit for the annotation.
    fit = stats.linregress(group_rows['non_communicable'], group_rows['life_expectancy'])
    equation_text = f'Life Expectancy = {fit.slope:.2f} * Noncommunicable Diseases + {fit.intercept:.2f}\nR-squared = {fit.rvalue**2:.3f}'
    ax.text(group_rows['non_communicable'].min() + 1100000,
            group_rows['life_expectancy'].max() - 27,
            equation_text, fontsize=8, color="black")

    ax.set(xlabel='Noncommunicable Diseases', ylabel="Life Expectancy", title=f"Income: {group}",
           xlim=ncd_range, ylim=(40, 85))
    ax.grid(axis='x')

fig.tight_layout()
plt.show()



d) CO2
Recently, the impact of CO2 on life expectancy has been attracting attention. While its involvement in respiratory diseases has been known for some time, it is only recently that CO2 (CO2 observation value) has been reported as the leading risk factor for coronary diseases, surpassing tobacco and hypertension. In this study, we focused on CO2 and conducted an analysis.

- CO2 Emissions: This refers to the amount of carbon dioxide (CO2) released into the atmosphere, often measured in terms of tons per year. These are typically related to industrial activities, energy consumption, transportation, and other human activities.

- CO2 Observed Values: This refers to the actual measurements or recorded data of CO2 levels in the atmosphere, typically in terms of concentration (e.g., parts per million, ppm), which represents the amount of CO2 present in the atmosphere at a given time.

**C-13a. The impact of CO2 on life expectancy (all)**

I examine the impact of CO2 emissions on life expectancy. When visualizing the effect of CO2 on life expectancy using a scatter plot, the data is right-skewed (most points are clustered at the left of the x-axis), which complicates the interpretation. There are a few extreme outliers on the right side that distort the overall shape of the graph.

In [None]:
# C-13a. The impact of CO2 on life expectancy (all)
# Scatter of CO2 against life expectancy over every available row.
# Most points are packed at the left of the x-axis and a few extreme values on
# the right stretch the axis, which makes the overall shape hard to read.

# Keep only rows where both plotted variables are present.
df_cleaned = df.dropna(subset=['co2', 'life_expectancy']).copy()

ax = plt.gca()
sns.scatterplot(data=df_cleaned, x='co2', y='life_expectancy', hue='income',
                palette='viridis', alpha=0.6, edgecolor="black", ax=ax)
sns.regplot(data=df_cleaned, x='co2', y="life_expectancy", scatter=False,
            color="gray", label="Regression Line", ax=ax)

# Ordinary least-squares fit, used only for the on-plot equation and R-squared.
fit = stats.linregress(df_cleaned['co2'], df_cleaned['life_expectancy'])
equation_text = f'Life Expectancy = {fit.slope:.2f} * CO2 + {fit.intercept:.2f}\nR-squared = {fit.rvalue**2:.3f}'
ax.text(df_cleaned['co2'].min() + 2000000,
        df_cleaned['life_expectancy'].max() - 30,
        equation_text, fontsize=8, color="black")

ax.set_xlabel('CO2')
ax.set_ylabel("Life Expectancy")
ax.set_title("C-13a. The impact of CO2 on life expectancy (all)")
# Legend sits outside the axes so it does not cover the points.
ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
ax.grid(axis='x')

plt.show()

In [None]:
# CO2 vs. life expectancy over all rows, one panel per income group.
income_groups = ['Low income', 'Lower middle income', 'Upper middle income', 'High income']
# One fixed viridis shade per income group.
color_map = dict(zip(income_groups, (colors[0], colors[2], colors[5], colors[7])))

fig, axes = plt.subplots(1, 4, figsize=(20, 5))

for ax, group in zip(axes, income_groups):
    # Rows belonging to this income group only.
    group_rows = df_cleaned.loc[df_cleaned['income'] == group]

    sns.scatterplot(data=group_rows, x='co2', y='life_expectancy',
                    color=color_map[group], alpha=0.6, edgecolor="black", ax=ax)
    sns.regplot(data=group_rows, x="co2", y="life_expectancy", scatter=False,
                color="gray", ax=ax, label="Regression Line")

    # The fit is computed, but neither its equation nor a legend is drawn on these panels.
    fit = stats.linregress(group_rows['co2'], group_rows['life_expectancy'])
    equation_text = f'Life Expectancy = {fit.slope:.2f} * CO2 + {fit.intercept:.2f}\nR-squared = {fit.rvalue**2:.3f}'

    ax.set_xlabel('CO2 Emissions')
    ax.set_ylabel("Life Expectancy")
    ax.set_title(f"Income: {group}")
    ax.grid(axis='x')

    # Identical limits on every panel: x from 0 to 10,000,000, y from 40 to 85.
    ax.set_xlim(0, 10000000)
    ax.set_ylim(40, 85)

fig.tight_layout()
plt.show()

# The values are unusually large, and it is not possible to align the x-axis and y-axis.

In [None]:
# Quick look at the CO2 column: largest value, smallest value, first ten rows.
co2_values = df_cleaned['co2']
for summary in (co2_values.max(), co2_values.min(), co2_values.head(10)):
    print(summary)


In [None]:
# Histogram of the CO2 column (50 bins) to show how its values are distributed.
ax = df_cleaned['co2'].hist(bins=50, color=colors[2], alpha=0.7, edgecolor = "black")
ax.set_title('Distribution of CO2')
ax.set_xlabel('CO2')
ax.set_ylabel('Frequency')
# Grid lines for the x-axis only, drawn faintly.
ax.grid(axis='x', alpha=0.7)
plt.show()


**Figure C-13b. The impact of CO2 on Life Expectancy (CO2 <= 100,000)**

Initially, removing the extreme outliers did not fully address the issue. Therefore, we focused on the data where CO2 emissions are below 100,000, as a substantial amount of data falls within this range.

As a result, life expectancy exhibits an upward trend, particularly evident in the low-income group.



In [None]:
# Figure C-13b. The impact of CO2 on Life Expectancy (CO2 <= 100,000)
# Dropping only the extreme outliers was not enough, so this view is limited to
# rows with CO2 of at most 100,000, where a substantial share of the data lies.

df_filtered = df_cleaned.loc[df_cleaned['co2'] <= 100000].copy()

ax = plt.gca()
sns.scatterplot(data=df_filtered, x='co2', y='life_expectancy', hue='income',
                palette='viridis', alpha=0.6, edgecolor="black", ax=ax)
sns.regplot(data=df_filtered, x='co2', y="life_expectancy", scatter=False,
            color="gray", label="Regression Line", ax=ax)

# Ordinary least-squares fit, used only for the on-plot equation and R-squared.
fit = stats.linregress(df_filtered['co2'], df_filtered['life_expectancy'])
equation_text = f'Life Expectancy = {fit.slope:.2f} * CO2 + {fit.intercept:.2f}\nR-squared = {fit.rvalue**2:.3f}'
ax.text(df_filtered['co2'].min() + 50000,
        df_filtered['life_expectancy'].max() - 40,
        equation_text, fontsize=8, color="black")

ax.set_xlabel('CO2')
ax.set_ylabel("Life Expectancy")
ax.set_title("The impact of CO2 on Life Expectancy (CO2 <= 100,000)")
# Legend sits outside the axes so it does not cover the points.
ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
ax.grid(axis='x')

plt.show()

As a result, life expectancy exhibits an upward trend, particularly evident in the low-income group.



In [None]:
# CO2 <= 100,000, one panel per income group.
# Life expectancy trends upward here, most visibly in the low-income group.

income_groups = ['Low income', 'Lower middle income', 'Upper middle income', 'High income']
# One fixed viridis shade per income group.
color_map = dict(zip(income_groups, (colors[0], colors[2], colors[5], colors[7])))

fig, axes = plt.subplots(1, 4, figsize=(20, 5))

df_filtered = df_cleaned.loc[df_cleaned['co2'] <= 100000]

for ax, group in zip(axes, income_groups):
    group_rows = df_filtered.loc[df_filtered['income'] == group]

    sns.scatterplot(data=group_rows, x='co2', y='life_expectancy',
                    color=color_map[group], alpha=0.6, edgecolor="black", ax=ax)
    sns.regplot(data=group_rows, x="co2", y="life_expectancy", scatter=False,
                color="gray", ax=ax, label="Regression Line")

    # Per-group least-squares fit for the annotation.
    fit = stats.linregress(group_rows['co2'], group_rows['life_expectancy'])
    equation_text = f'Life Expectancy = {fit.slope:.2f} * CO2 + {fit.intercept:.2f}\nR-squared = {fit.rvalue**2:.3f}'

    # Annotation is anchored by its top-right corner at (80000, 45) in data coordinates.
    ax.text(80000, 45, equation_text, fontsize=8, color="black", ha='right', va='top')
    ax.set_xlabel('CO2 Emissions')
    ax.set_ylabel("Life Expectancy")
    ax.set_title(f"Income: {group}")
    ax.grid(axis='x')

    # Identical limits on every panel (no legend is drawn).
    ax.set_xlim(0, 100000)
    ax.set_ylim(40, 85)

fig.tight_layout()
plt.show()

**Figure C-13c. The impact of CO2 on Life Expectancy (100,000 < CO2 <= 400,000)**

Subsequently, we analyzed the range of CO2 emissions from 100,000 to 400,000. We observed that, within the upper-middle-income group, as CO2 emissions increase, life expectancy tends to decrease.

Based on these findings, we can conclude the following:

For CO2 emissions less than or equal to 100,000, life expectancy increases.
For CO2 emissions greater than 100,000, life expectancy tends to decrease.

In [None]:
# Figure C-13c. The impact of CO2 on Life Expectancy (100,000 < CO2 <= 400,000)
# Next band of CO2 values: above 100,000 and up to 400,000.
# Within the upper-middle-income group, life expectancy tends to fall as CO2 rises.

in_mid_band = (df_cleaned['co2'] > 100000) & (df_cleaned['co2'] <= 400000)
df_filtered = df_cleaned.loc[in_mid_band].copy()

ax = plt.gca()
sns.scatterplot(data=df_filtered, x='co2', y='life_expectancy', hue='income',
                palette='viridis', alpha=0.6, edgecolor="black", ax=ax)
sns.regplot(data=df_filtered, x='co2', y="life_expectancy", scatter=False,
            color="gray", label="Regression Line", ax=ax)

# Ordinary least-squares fit, used only for the on-plot equation and R-squared.
fit = stats.linregress(df_filtered['co2'], df_filtered['life_expectancy'])
equation_text = f'Life Expectancy = {fit.slope:.2f} * CO2 + {fit.intercept:.2f}\nR-squared = {fit.rvalue**2:.3f}'
ax.text(df_filtered['co2'].min() + 50000,
        df_filtered['life_expectancy'].max() - 30,
        equation_text, fontsize=8, color="black")

ax.set_xlabel('CO2')
ax.set_ylabel("Life Expectancy")
ax.set_title("Figure C-13c. The impact of CO2 on Life Expectancy (100,000 < CO2 <= 400,000)")
# Legend sits outside the axes so it does not cover the points.
ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
ax.grid(axis='x')

plt.show()

In [None]:
# CO2 in (100,000, 400,000], one panel per income group.
income_groups = ['Low income', 'Lower middle income', 'Upper middle income', 'High income']

# One fixed viridis shade per income group.
color_map = {
    'Low income': colors[0],
    'Lower middle income': colors[2],
    'Upper middle income': colors[5],
    'High income': colors[7]
}

df_filtered = df_cleaned[(df_cleaned['co2'] > 100000) & (df_cleaned['co2'] <= 400000)]

fig, axes = plt.subplots(1, 4, figsize=(20, 5))

for i, income_group in enumerate(income_groups):
    ax = axes[i]
    data_income_group = df_filtered[df_filtered['income'] == income_group]

    if data_income_group.empty:
        # Label the empty panel as such. This title used to be overwritten by an
        # unconditional set_title() further down, so the "No Data" note never showed.
        ax.set_title(f"No Data for {income_group}")
    else:
        ax.set_title(f"Income: {income_group}")
        sns.scatterplot(data=data_income_group, x='co2', y='life_expectancy',
                        color=color_map[income_group], alpha=0.6, edgecolor="black", ax=ax)

        # A line fit needs at least two distinct x values; sparse groups in this
        # band get the scatter only instead of a failed or meaningless regression.
        if data_income_group['co2'].nunique() > 1:
            sns.regplot(data=data_income_group, x="co2", y="life_expectancy", scatter=False,
                        color="gray", ax=ax, label="Regression Line")
            slope, intercept, r_value, p_value, std_err = stats.linregress(data_income_group['co2'], data_income_group['life_expectancy'])
            equation_text = f'Life Expectancy = {slope:.2f} * CO2 + {intercept:.2f}\nR-squared = {r_value**2:.3f}'
            # Anchor the annotation in axes-fraction coordinates (lower-left corner).
            # The previous data-coordinate position (max life expectancy - 30) drops
            # below the fixed y-limit of 40 for any group whose maximum is under 70.
            ax.text(0.03, 0.03, equation_text, fontsize=8, color="black",
                    ha='left', va='bottom', transform=ax.transAxes)

    ax.set_xlabel('CO2 Emissions')
    ax.set_ylabel("Life Expectancy")
    ax.grid(axis='x')

    # Identical limits on every panel so the four groups are comparable.
    ax.set_xlim(100000, 400000)
    ax.set_ylim(40, 85)

plt.tight_layout()
plt.show()



**Figure C-13d. The impact of CO2 on Life Expectancy (CO2 > 400,000)**

Above 400,000, the data points become sparse, making it difficult to draw definitive conclusions. However, it appears that as CO2 emissions rise, life expectancy tends to decrease or, at best, shows no significant increase.

In [None]:
# Figure C-13d. The impact of CO2 on Life Expectancy (CO2 > 400,000)
# Points are sparse above 400,000, so no firm conclusion can be drawn; life
# expectancy appears to fall, or at best not rise, as CO2 increases.

df_filtered = df_cleaned.loc[df_cleaned['co2'] > 400000].copy()

ax = plt.gca()
sns.scatterplot(data=df_filtered, x='co2', y='life_expectancy', hue='income',
                palette='viridis', alpha=0.6, edgecolor="black", ax=ax)
sns.regplot(data=df_filtered, x='co2', y="life_expectancy", scatter=False,
            color="gray", label="Regression Line", ax=ax)

# Ordinary least-squares fit, used only for the on-plot equation and R-squared.
fit = stats.linregress(df_filtered['co2'], df_filtered['life_expectancy'])
equation_text = f'Life Expectancy = {fit.slope:.2f} * CO2 + {fit.intercept:.2f}\nR-squared = {fit.rvalue**2:.3f}'
ax.text(df_filtered['co2'].min() + 5000000,
        df_filtered['life_expectancy'].max() - 20,
        equation_text, fontsize=8, color="black")

ax.set_xlabel('CO2')
ax.set_ylabel("Life Expectancy")
ax.set_title("Figure C-13d. The impact of CO2 on Life Expectancy (CO2 > 400,000)")

# Legend sits outside the axes so it does not cover the points.
ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
ax.grid(axis='x')

# Ticks every 2,000,000 starting at 400,000, labelled with thousands separators
# and rotated so the long numbers do not overlap.
tick_positions = range(400000, int(df_filtered['co2'].max()) + 1, 2000000)
plt.xticks(ticks=tick_positions, labels=[f'{x:,}' for x in tick_positions], rotation=45)

plt.show()




In [None]:
# CO2 > 400,000, one panel per income group.
# Defined here so the cell does not depend on `income_groups` leaking from an earlier cell.
income_groups = ['Low income', 'Lower middle income', 'Upper middle income', 'High income']

# One fixed viridis shade per income group.
color_map = {
    'Low income': colors[0],
    'Lower middle income': colors[2],
    'Upper middle income': colors[5],
    'High income': colors[7]
}

df_filtered = df_cleaned[(df_cleaned['co2'] > 400000)].copy()

# Ticks every 2,000,000 starting at 400,000; computed once and shared by all panels.
tick_positions = range(400000, int(df_filtered['co2'].max()) + 1, 2000000)
tick_labels = [f'{x:,}' for x in tick_positions]

fig, axes = plt.subplots(1, 4, figsize=(20, 5))

for i, income_group in enumerate(income_groups):
    ax = axes[i]
    data_income_group = df_filtered[df_filtered['income'] == income_group]

    if data_income_group.empty:
        # Label the empty panel as such. This title used to be overwritten by an
        # unconditional set_title() further down, so the "No Data" note never showed.
        ax.set_title(f"No Data for {income_group}")
    else:
        ax.set_title(f"Income: {income_group}")
        sns.scatterplot(data=data_income_group, x='co2', y='life_expectancy',
                        color=color_map[income_group], alpha=0.6, edgecolor="black", ax=ax)

        # A line fit needs at least two distinct x values; the data is sparse in
        # this band, so tiny groups get the scatter only.
        if data_income_group['co2'].nunique() > 1:
            sns.regplot(data=data_income_group, x="co2", y="life_expectancy", scatter=False,
                        color="gray", ax=ax, label="Regression Line")

            slope, intercept, r_value, p_value, std_err = stats.linregress(data_income_group['co2'], data_income_group['life_expectancy'])
            equation_text = f'Life Expectancy = {slope:.2f} * CO2 + {intercept:.2f}\nR-squared = {r_value**2:.3f}'
            # Anchored by its bottom-left corner at the first tick / lower y-limit.
            ax.text(400000, 40, equation_text, fontsize=8, color="black", ha='left', va='bottom')

    ax.set_xlabel('CO2 Emissions')
    ax.set_ylabel("Life Expectancy")
    ax.set_ylim(40, 85)
    ax.set_xticks(tick_positions)
    ax.set_xticklabels(tick_labels)
    ax.tick_params(axis='x', rotation=45)
    ax.grid(axis='x')

plt.tight_layout()
plt.show()


In [None]:
# List country, income group and CO2 for every row with CO2 of at least 400,000.
high_co2_mask = df_cleaned['co2'] >= 400000
filtered_data_highco2 = df_cleaned.loc[high_co2_mask, ['country', 'income', 'co2']]
print(filtered_data_highco2)

In [None]:
# Same listing as above, restricted to the 'High income' group (CO2 >= 400,000).
high_co2_high_income_mask = (df_cleaned['co2'] >= 400000) & (df_cleaned['income'] == 'High income')
filtered_data_highco2_highincome = df_cleaned.loc[high_co2_high_income_mask, ['country', 'income', 'co2']]
print(filtered_data_highco2_highincome)

In [None]:
# Raise the threshold tenfold: rows with CO2 of at least 4,000,000.
very_high_co2_mask = df_cleaned['co2'] >= 4000000
filtered_data_highco2 = df_cleaned.loc[very_high_co2_mask, ['country', 'income', 'co2']]
print(filtered_data_highco2)

In [None]:
# 'High income' rows only, at the 4,000,000 CO2 threshold.
very_high_co2_high_income_mask = (df_cleaned['co2'] >= 4000000) & (df_cleaned['income'] == 'High income')
filtered_data_highco2_highincome = df_cleaned.loc[very_high_co2_high_income_mask, ['country', 'income', 'co2']]
print(filtered_data_highco2_highincome)