In [None]:
pip install numpy pandas matplotlib seaborn 


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [None]:
# Read the raw life-expectancy / health-expenditure CSV into a DataFrame.
file_path = "../data/life-expectancy-vs-health-expenditure.csv"  # relative path; edit if the data lives elsewhere
data = pd.read_csv(filepath_or_buffer=file_path)

# Print a heading followed by the first rows as a quick sanity check.
print("Dataset preview:", data.head(), sep="\n")


In [None]:
# Show the column labels exactly as they were read from the CSV.
print("\nColumns in the dataset:", data.columns, sep="\n")

# Normalise the labels in place: trim surrounding whitespace, lower-case,
# turn spaces into underscores and drop commas. Parentheses are left untouched.
data.columns = data.columns.map(
    lambda label: label.strip().lower().replace(' ', '_').replace(',', '')
)

# Show the labels again so the effect of the normalisation is visible.
print("\nUpdated column names:", data.columns, sep="\n")


In [None]:
# Display basic dataset information.
# DataFrame.info() writes its report straight to stdout and returns None, so it
# is called on its own; wrapping it in print() would append a stray "None" line.
print("\nDataset information:")
data.info()

# Count the missing values in every column.
print("\nMissing values in each column:")
print(data.isnull().sum())


In [None]:
# Group by 'entity' and report the min/max of the life-expectancy column.
# The name is held in one variable so the membership check, the aggregation and
# the "not found" message can never disagree about which column is meant.
life_expectancy_col = 'life_expectancy_at_birth_total_(years)'

if life_expectancy_col in data.columns:
    print("\nMin and Max values of 'Life Expectancy at Birth (Years)' by Entity:")
    print(data.groupby('entity')[life_expectancy_col].agg(['min', 'max']))
else:
    # Report the exact column name that was looked up.
    print(f"\nColumn '{life_expectancy_col}' not found in the dataset.")


In [None]:
# Print a heading followed by the DataFrame.describe() summary table.
print("\nStatistical summary of numerical columns:", data.describe(), sep="\n")


In [None]:
# Keep every present 'continent' value and put the placeholder 'Unknown'
# wherever the value is missing.
data['continent'] = data['continent'].where(data['continent'].notna(), 'Unknown')
print("\nFilled missing values in 'continent' column with 'Unknown'.")


In [None]:
# Write the cleaned frame to CSV, leaving the row index out of the file.
output_file_path = "../data/Cleaned_LifeExpectancy.csv"
data.to_csv(path_or_buf=output_file_path, index=False)
print("\nCleaned dataset saved to: " + output_file_path)


In [None]:
# Visualization: life expectancy trend over the years, one figure per entity.
# The same plot used to be copy-pasted once per country; it is now a single
# helper that is called in a loop.
def plot_life_expectancy_trend(entity_data, entity_name, output_dir="../visuals"):
    """Plot life expectancy against year for one entity, save the figure and show it.

    Parameters
    ----------
    entity_data : pandas.DataFrame
        Rows of a single entity. The 'year' and
        'life_expectancy_at_birth_total_(years)' columns are read.
    entity_name : str
        Used for the legend label, the title and the output file name.
    output_dir : str, default "../visuals"
        Directory the PNG is written to. It is not created here.

    Returns
    -------
    str or None
        Path of the saved PNG, or None when ``entity_data`` is empty (in which
        case only a message is printed).
    """
    if entity_data.empty:
        print(f"\nNo data found for the entity: {entity_name}")
        return None

    # Plot in chronological order so the line never doubles back on itself
    # when the source rows are not already sorted by year.
    trend = entity_data.sort_values('year')
    output_path = f"{output_dir}/LifeExpectancy_Trend_{entity_name}.png"

    plt.figure(figsize=(10, 6))
    plt.plot(trend['year'], trend['life_expectancy_at_birth_total_(years)'], marker='o', label=entity_name)
    plt.title(f"Life Expectancy Trend for {entity_name}")
    plt.xlabel("Year")
    plt.ylabel("Life Expectancy at Birth (Years)")
    plt.legend()
    plt.grid(True)
    # Save before show(): the figure is written while it is still current.
    plt.savefig(output_path)
    print(f"Visualization saved as '{output_path}'")
    plt.show()
    return output_path


# `entity_name` / `entity_data` stay notebook-level variables, as they were
# when each country had its own cell.
for entity_name in ("France", "Australia", "Austria"):
    entity_data = data[data['entity'] == entity_name]
    plot_life_expectancy_trend(entity_data, entity_name)


In [None]:
# List each distinct value of the 'entity' column once.
print(data.loc[:, 'entity'].unique())

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Columns that pandas parsed as numeric. `numeric_columns` is not assigned in
# any earlier cell, so it is defined here; without it this cell stops with a
# NameError on a fresh kernel.
numeric_columns = data.select_dtypes(include='number').columns.tolist()

# These columns are numeric by construction, so no coercion with pd.to_numeric
# is needed. Missing values are left as NaN rather than filled.
# Compute the pairwise correlation matrix.
correlation_matrix = data[numeric_columns].corr()

# Plot the heatmap, annotating each cell with the coefficient to two decimals.
plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f', linewidths=0.5)
plt.title("Correlation Heatmap")
plt.show()


In [None]:
# Scatter plot: life expectancy on x, per-capita health expenditure on y,
# one colour per continent.
plt.figure(figsize=(10, 6))
sns.scatterplot(
    x='life_expectancy_at_birth_total_(years)',
    y='health_expenditure_and_financing_(per_capita)_(oecdstat_(2017))',
    hue='continent',
    palette='Set1',
    data=data,
)
# Human-readable title and axis labels on the current axes.
plt.gca().set(
    title="Life Expectancy vs Health Expenditure (Per Capita)",
    xlabel="Life Expectancy at Birth (Years)",
    ylabel="Health Expenditure and Financing (OECDstat)",
)
plt.legend(title='Continent')
plt.show()


In [None]:
# Box plot: distribution of life expectancy within each continent.
# NOTE(review): recent seaborn releases appear to deprecate `palette` without
# `hue` - confirm against the installed version before changing the call.
plt.figure(figsize=(12, 6))
sns.boxplot(
    x='continent',
    y='life_expectancy_at_birth_total_(years)',
    palette='Set2',
    data=data,
)
plt.gca().set(
    title="Life Expectancy by Continent",
    xlabel="Continent",
    ylabel="Life Expectancy at Birth (Years)",
)
# Tilt the continent names by 45 degrees.
plt.xticks(rotation=45)
plt.show()


In [None]:
# Histogram of life expectancy at birth: 20 bins with a KDE curve on top.
plt.figure(figsize=(10, 6))
sns.histplot(
    data['life_expectancy_at_birth_total_(years)'],
    bins=20,
    kde=True,
    color='skyblue',
)
plt.gca().set(
    title="Distribution of Life Expectancy at Birth",
    xlabel="Life Expectancy at Birth (Years)",
    ylabel="Frequency",
)
plt.show()


In [None]:
# Compute the describe() table once, print it, then write the same table to CSV.
summary_statistics = data.describe()
print("Statistical Summary of Numeric Columns:", summary_statistics, sep="\n")

output_file_path = "../output/summary_statistics.csv"
summary_statistics.to_csv(path_or_buf=output_file_path)

# Bare last expression: the notebook echoes the path that was just written.
output_file_path


In [None]:
# Per-continent mean of life expectancy and per-capita health expenditure.
continent_summary = (
    data
    .groupby('continent')[
        [
            'life_expectancy_at_birth_total_(years)',
            'health_expenditure_and_financing_(per_capita)_(oecdstat_(2017))',
        ]
    ]
    .mean()
)

print("\nMean Life Expectancy and Health Expenditure by Continent:", continent_summary, sep="\n")

# Write the table to CSV; the continent index is kept as the first column.
output_file_path = "../output/continent_summary.csv"
continent_summary.to_csv(path_or_buf=output_file_path)

# Bare last expression: the notebook echoes the path that was just written.
output_file_path


In [None]:
# Life expectancy against year, one line (with point markers) per continent.
plt.figure(figsize=(12, 6))
sns.lineplot(
    x='year',
    y='life_expectancy_at_birth_total_(years)',
    hue='continent',
    marker='o',
    data=data,
)
plt.gca().set(
    title="Life Expectancy Trend Over Time by Continent",
    xlabel="Year",
    ylabel="Life Expectancy at Birth (Years)",
)
plt.legend(title='Continent')
plt.grid(True)
plt.show()


In [None]:
# Percentage change in life expectancy from one observation to the next,
# computed separately for every entity and in chronological order.
# A plain column-wide pct_change() would compare the first row of each entity
# with the last row of the previous one and produce meaningless jumps.
life_expectancy_col = 'life_expectancy_at_birth_total_(years)'
data['life_expectancy_pct_change'] = (
    data.sort_values(['entity', 'year'])
        .groupby('entity')[life_expectancy_col]
        # fill_method=None: gaps stay NaN instead of being forward-filled,
        # which would otherwise show up as artificial 0 % changes.
        .pct_change(fill_method=None)
    * 100
)  # the result is aligned back to `data` through the row index

# Plot the percentage change in life expectancy over time, one line per continent.
plt.figure(figsize=(10, 6))
sns.lineplot(data=data, x='year', y='life_expectancy_pct_change', hue='continent', marker='o')
plt.title("Percentage Change in Life Expectancy Over Time by Continent")
plt.xlabel("Year")
plt.ylabel("Percentage Change in Life Expectancy")
plt.grid(True)
plt.show()
