In [None]:
# Import required libraries
import pandas as pd
import pygwalker as pyg

# Step 1: Load the Dataset
# Both paths are relative, so the CSV files are looked up in the current
# working directory.
data_path = 'owid-co2-data.csv'
codebook_path = 'owid-co2-codebook.csv'

# Read the emissions table first, then its accompanying codebook.
co2_data = pd.read_csv(filepath_or_buffer=data_path)
codebook = pd.read_csv(filepath_or_buffer=codebook_path)

# Step 2: Data Cleaning and Preprocessing
# Keep only rows that report both total CO2 and primary energy consumption.
co2_data_cleaned = co2_data.dropna(subset=['co2', 'primary_energy_consumption'])

# Exclude aggregate regions like 'World', 'Asia', etc.
excluded_regions = ['World', 'Africa', 'Asia', 'Europe', 'North America', 'South America', 'European Union']
co2_data_cleaned = co2_data_cleaned[~co2_data_cleaned['country'].isin(excluded_regions)]

# The name list above cannot enumerate every aggregate entity: a previous run
# of this cell still reported a maximum population of about 3.25e9, which is
# larger than any single country. Dropping rows without an ISO code removes
# the remaining aggregates.
# NOTE(review): this relies on OWID leaving `iso_code` empty for aggregate
# entities (continents, income groups, ...) - confirm against the codebook.
# The filter is skipped when the column is absent so older files still load.
if 'iso_code' in co2_data_cleaned.columns:
    co2_data_cleaned = co2_data_cleaned[co2_data_cleaned['iso_code'].notna()]

# Select relevant columns for analysis
columns_to_keep = [
    'country', 'year', 'population', 'gdp', 'co2', 'co2_per_capita',
    'primary_energy_consumption', 'co2_per_unit_energy'
]
# .copy() makes the result an independent frame, so the column assignment
# below does not trigger pandas' SettingWithCopyWarning.
co2_data_cleaned = co2_data_cleaned[columns_to_keep].copy()

# Ensure year is treated as an integer
co2_data_cleaned['year'] = co2_data_cleaned['year'].astype(int)

# Step 3: Exploratory Data Analysis with PyGWalker
# Print a status line, then pass the cleaned DataFrame to pyg.walk to open the
# interactive explorer. The value returned by pyg.walk is not kept.
print("Launching PyGWalker...")
pyg.walk(co2_data_cleaned)

# Step 4: Save the Cleaned Dataset
# Write the cleaned frame to a CSV file in the working directory so the result
# can be reloaded later; the row index is left out of the file.
cleaned_data_path = 'cleaned_owid_co2_data.csv'
co2_data_cleaned.to_csv(path_or_buf=cleaned_data_path, index=False)

# Step 5: Generate Summary Reports
# Descriptive statistics of the cleaned frame, kept in `summary` for reuse.
summary = co2_data_cleaned.describe()

# Report completion, the output location, and the statistics table.
# One print call with a newline separator emits the same text as printing
# each piece on its own.
print(
    "\nData Cleaning Completed.",
    f"Cleaned data saved to: {cleaned_data_path}",
    "\nSummary Statistics:",
    summary,
    sep="\n",
)

# Step 6 (optional): Define Key Questions for Analysis
# Print a header followed by the five suggested questions, one per line.
# NOTE(review): question 4 refers to the carbon intensity of electricity, but
# the cleaned frame only keeps `co2_per_unit_energy` - confirm which metric is
# intended.
print("\nSuggested Questions for Analysis:")
print("\n".join((
    "1. How have global CO2 emissions changed over time?",
    "2. Which countries have the highest CO2 emissions per capita?",
    "3. Is there a relationship between energy consumption and total CO2 emissions?",
    "4. How has the carbon intensity of electricity changed over the years?",
    "5. Which countries or regions have improved their carbon efficiency the most?",
)))


Launching PyGWalker...


Box(children=(HTML(value='\n<div id="ifr-pyg-00062c3be5fdee43udkJqwDNRrWtHsUV" style="height: auto">\n    <hea…


Data Cleaning Completed.
Cleaned data saved to: cleaned_owid_co2_data.csv

Summary Statistics:
              year    population           gdp           co2  co2_per_capita  \
count  9746.000000  9.711000e+03  7.672000e+03   9746.000000     9711.000000   
mean   1998.294685  6.947513e+07  4.292114e+11    307.645348        5.213641   
std      14.812312  2.815536e+08  1.495612e+12   1477.570732        8.083260   
min    1965.000000  1.776000e+03  1.642060e+08      0.000000        0.012000   
25%    1987.000000  1.499924e+06  1.787951e+10      1.487750        0.656000   
50%    1999.000000  7.569397e+06  6.466988e+10     12.322500        2.713000   
75%    2011.000000  2.547886e+07  2.573387e+11     76.404250        7.467000   
max    2023.000000  3.253459e+09  2.696602e+13  17581.070000      364.688000   

       primary_energy_consumption  co2_per_unit_energy  
count                 9746.000000          9701.000000  
mean                  1333.895184             0.230148  
std         