<a href="https://colab.research.google.com/github/rjhanjee/datascience.github.io/blob/main/hw5_v2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import requests as rq
import bs4
import pandas as pd
from io import StringIO
import plotly.express as px

# Send a User-Agent string copied from a standard desktop browser.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36"
}

url = "https://en.wikipedia.org/wiki/List_of_countries_by_GDP_(nominal)"

# The timeout stops the cell from hanging indefinitely on a stalled
# connection; raise_for_status() aborts on an HTTP error response so an
# error page is never parsed as if it were the article.
page = rq.get(url, headers=headers, timeout=30)
page.raise_for_status()

bs4page = bs4.BeautifulSoup(page.text, 'html.parser')

# Keep only the tables that carry the "wikitable" class.
tables = bs4page.find_all('table', {'class': "wikitable"})
if not tables:
    # Fail with a message that names the cause instead of a bare IndexError
    # from tables[0] below.
    raise ValueError(f"No table with class 'wikitable' found at {url}")

# Parse the first matching table. The markup is wrapped in StringIO so that
# pandas receives it as a file-like object.
imf = pd.read_html(StringIO(str(tables[0])))[0]
imf = imf.dropna()

print(" Original table shows data by country and IMF, world bank and UN data")
print(" ")
print(imf.head())
print(" ")

# plot a stacked interactive bar plot using plotly. Stack countries within regions using the IMF numbers

# The scraped table has two header rows, so every column is a (top, sub) tuple.
# The IMF top-level header embeds footnote markers (it was 'IMF[1][6]' when this
# notebook was last run); those markers are part of the scraped text, so the
# column is located by its 'IMF' prefix and 'Forecast' sub-header rather than
# by the exact string.
country_col = ('Country/Territory', 'Country/Territory')
imf_forecast_cols = [
    col for col in imf.columns
    if isinstance(col, tuple) and str(col[0]).startswith('IMF') and col[1] == 'Forecast'
]
if not imf_forecast_cols:
    raise KeyError("No IMF 'Forecast' column found in the scraped table")
imf_col = imf_forecast_cols[0]

imf_cleaned = imf[[country_col, imf_col]].copy()

# Flatten the multi-level columns for easier access
imf_cleaned.columns = ['_'.join(col).strip() for col in imf_cleaned.columns.values]

print("\nSelected columns after flattening:", imf_cleaned.columns)
print(' Data after cleaning it')
print(" ")
print(imf_cleaned.head())

# Rename to short, stable names. The IMF key is derived from the column found
# above so the rename still applies if the footnote markers differ.
imf_cleaned = imf_cleaned.rename(columns={
    '_'.join(country_col).strip(): 'Country',
    '_'.join(imf_col).strip(): 'IMF_GDP',
})

# Keep only rows whose IMF_GDP is made up purely of digits; this removes
# placeholders such as '—' and values carrying footnote text like '[n 1]'.
is_all_digits = imf_cleaned['IMF_GDP'].astype(str).str.match(r'^\d+$')
imf_cleaned = imf_cleaned[is_all_digits].copy()

# Convert 'IMF_GDP' column to numeric
imf_cleaned['IMF_GDP'] = pd.to_numeric(imf_cleaned['IMF_GDP'])

# Display the first few rows and data types
print("\nCleaned DataFrame head:")
print(imf_cleaned.head())

print("\nCleaned DataFrame dtypes:")
print(imf_cleaned.dtypes)

## CREATE A STACKED BAR PLOT

# 1. Create a pandas DataFrame named country_region_df with two columns: 'Country' and 'Region'.
# It maps the 'World' aggregate row plus 14 individual countries to a region.
# NOTE: 'World' is mapped to its own 'World' region, so the aggregate is drawn
# as a separate bar next to the regional stacks.
data = {'Country': ['World', 'United States', 'China', 'Germany', 'India',
                    'United Kingdom', 'France', 'Italy', 'Brazil', 'Canada',
                    'Japan', 'Russia', 'South Korea', 'Australia', 'Spain'],
        'Region': ['World', 'North America', 'Asia', 'Europe', 'Asia',
                   'Europe', 'Europe', 'Europe', 'South America', 'North America',
                   'Asia', 'Europe', 'Asia', 'Australia', 'Europe']}
country_region_df = pd.DataFrame(data)

print("Created Country-Region DataFrame:")
display(country_region_df)

# 2. Merge the imf_cleaned DataFrame with the country_region_df DataFrame.
# A left merge keeps every scraped country; those absent from the mapping
# receive NaN in 'Region'.
merged_df = pd.merge(imf_cleaned, country_region_df, on='Country', how='left')

print("\nMerged DataFrame head (before handling missing regions):")
display(merged_df.head())

# 3. Handle missing regions: drop the countries that have no region mapping.
# Each of them would otherwise become its own colour group (and legend entry)
# in the plot while having NaN as its x value.
merged_df = merged_df.dropna(subset=['Region'])

# 4. Create a stacked bar plot using plotly.express.bar.
# 5. Add a title to the plot.
fig = px.bar(merged_df,
             x='Region',
             y='IMF_GDP',
             color='Country',
             title='Stacked GDP by Country within Region (IMF Forecast)')

# 6. Display the created Plotly figure.
fig.show()

## SAVE THE PLOT AS HTML FILE

import plotly.offline as pyo
import os

# Write into /content when that directory exists (the location this notebook
# used on Colab); otherwise fall back to the current working directory so the
# cell also runs elsewhere.
output_dir = '/content' if os.path.isdir('/content') else os.getcwd()
file_path = os.path.join(output_dir, 'stacked_bar.html')

# Save the figure as a standalone HTML file without opening a browser tab.
pyo.plot(fig, filename=file_path, auto_open=False)

# Report success only after the file has actually been written, and report the
# path that was really used.
print(f"Plot saved as {file_path}")

# Offer the file as a browser download when the google.colab helper is
# importable; skip that step in any other environment.
try:
    from google.colab import files
except ImportError:
    print("google.colab is not available; skipping the browser download")
else:
    files.download(file_path)



 Original table shows data by country and IMF, world bank and UN data
 
  Country/Territory  IMF[1][6]            World Bank[7]             \
  Country/Territory   Forecast       Year      Estimate       Year   
0             World  113795678       2025     111326370       2024   
1     United States   30507217       2025      29184890       2024   
2             China   19231705  [n 1]2025      18743803  [n 3]2024   
3           Germany    4744804       2025       4659929       2024   
4             India    4187017       2025       3912686       2024   

  United Nations[8]             
           Estimate       Year  
0         100834796       2022  
1          27720700       2023  
2          17794782  [n 1]2023  
3           4525704       2023  
4           3575778       2023  
 

Selected columns after flattening: Index(['Country/Territory_Country/Territory', 'IMF[1][6]_Forecast'], dtype='object')
 Data after cleaning it
 
  Country/Territory_Country/Territory IMF[1][6]_Forecast


Unnamed: 0,Country,Region
0,World,World
1,United States,North America
2,China,Asia
3,Germany,Europe
4,India,Asia
5,United Kingdom,Europe
6,France,Europe
7,Italy,Europe
8,Brazil,South America
9,Canada,North America



Merged DataFrame head (before handling missing regions):


Unnamed: 0,Country,IMF_GDP,Region
0,World,113795678,World
1,United States,30507217,North America
2,China,19231705,Asia
3,Germany,4744804,Europe
4,India,4187017,Asia


Plot saved as stacked_bar.html
Plot saved as /content/stacked_bar.html


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [21]:
import pandas as pd
import plotly.graph_objects as go
import numpy as np

# Hierarchy lookup: read the tab-separated table, discard 'Level5', give the
# remaining 'modify*' columns readable names and keep them ordered from the
# finest level ('roi') to the coarsest ('level1').
url = "https://raw.githubusercontent.com/bcaffo/MRIcloudT1volumetrics/master/inst/extdata/multilevel_lookup_table.txt"
multilevel_lookup = (
    pd.read_csv(url, sep="\t")
    .drop(columns=['Level5'])
    .rename(columns={
        "modify": "roi",
        "modify.1": "level4",
        "modify.2": "level3",
        "modify.3": "level2",
        "modify.4": "level1",
    })
    .loc[:, ['roi', 'level4', 'level3', 'level2', 'level1']]
)

# Subject whose volumes are plotted.
# NOTE: this name shadows the builtin id(); it is left unchanged here because
# the plot title further down reads it.
id = 127

# Subject data: keep the type-1, level-5 rows of the chosen subject, attach the
# hierarchy columns by 'roi', add a constant 'icv' root column and truncate
# the result to the first 100 rows.
subjectData = (
    pd.read_csv("https://raw.githubusercontent.com/smart-stats/ds4bio_book/main/book/assetts/kirby21AllLevels.csv")
    .loc[lambda d: (d.type == 1) & (d.level == 5) & (d.id == id), ['roi', 'volume']]
    .merge(multilevel_lookup, on="roi")
    .assign(icv="ICV")
    .head(100)
)

# Hierarchy columns ordered from the root ('icv') down to the leaves ('roi').
levels = ['icv', 'level1', 'level2', 'level3', 'level4', 'roi']

# A node is identified by (level, label) rather than by the label text alone.
# If the same name occurs at two different levels, a label-only key would fold
# both occurrences into a single node and turn the link between them into a
# self-link. When every name is unique across levels, the node order and the
# indices are exactly what a label-only key would produce.
node_keys = [
    (level, label)
    for level in levels
    for label in subjectData[level].unique()
]

# Text shown on each node, index-aligned with node_keys.
labels = [label for _, label in node_keys]

# Map each (level, label) node to its numerical index.
label_map = {key: i for i, key in enumerate(node_keys)}

# Parallel lists describing the links: source node, target node, link weight.
source = []
target = []
value = []

# Generate links between each pair of adjacent levels.
for source_level, target_level in zip(levels, levels[1:]):
    # Total volume flowing from every parent label to every child label.
    link_df = subjectData.groupby([source_level, target_level])['volume'].sum().reset_index()

    source.extend(label_map[(source_level, name)] for name in link_df[source_level])
    target.extend(label_map[(target_level, name)] for name in link_df[target_level])
    value.extend(link_df['volume'].tolist())


# Node appearance and text; `labels` supplies one caption per node.
sankey_nodes = {
    "pad": 15,
    "thickness": 20,
    "line": {"color": "black", "width": 0.5},
    "label": labels,
}

# Link definition built from the three parallel lists prepared earlier.
sankey_links = {
    "source": source,
    "target": target,
    "value": value,
}

# Assemble the Sankey figure from the node and link specifications.
fig = go.Figure(data=[go.Sankey(node=sankey_nodes, link=sankey_links)])

# Title, font size and a fixed 800x800 canvas.
fig.update_layout(
    title_text=f"Sankey Diagram for Subject {id} (Type 1) for first 100 rows of data",
    font_size=10,
    width=800,
    height=800,
)
fig.show()

# Colab helper used to send files to the browser.
from google.colab import files

# Write the figure as a standalone HTML file under each name in turn and
# download it right after it is written: first sankey.html, then index.html.
for html_name in ("sankey.html", "index.html"):
    fig.write_html(html_name)
    files.download(html_name)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Website containing my Sankey diagram:
https://rjhanjee.github.io/datascience.github.io/