<a href="https://colab.research.google.com/github/rjhanjee/my-first-repo/blob/main/hw5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [8]:
import requests
import bs4
import pandas as pd
from io import StringIO

# Set a common user agent from a standard browser
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36"
}

url = "https://en.wikipedia.org/wiki/List_of_countries_by_GDP_(nominal)"
response = requests.get(url, headers=headers)

# Check if the request was successful
if response.status_code == 200:
    print("Request successful! Content preview:")
    print(response.text[:500]) # Prints the first 500 characters of the page
else:
    print(f"Request failed with status code: {response.status_code}")

bs4page = bs4.BeautifulSoup(response.text, 'html.parser')
# Find all tables with the class 'wikitable' - let's try a broader search first
tables = bs4page.find_all('table',{'class':"wikitable"})

print(f"Number of tables found with class 'wikitable': {len(tables)}")
if len(tables) > 0:
    print("Preview of the first table found:")
    print(tables[0].prettify()[:1000]) # Print the first 1000 characters of the first table

# Read the table directly using pandas read_html on the table element
# Based on inspection, the relevant tables might be the first few 'wikitable' classes.
# Let's try reading all tables with class 'wikitable' and then inspect them.
all_wikitables_dfs = pd.read_html(str(tables))

print(f"Number of dataframes extracted from 'wikitable' class: {len(all_wikitables_dfs)}")

# Assuming the GDP table is one of the first few, let's display the head of the first few to identify
if len(all_wikitables_dfs) > 0:
    print("Preview of the first extracted dataframe:")
    display(all_wikitables_dfs[0].head())
if len(all_wikitables_dfs) > 1:
    print("Preview of the second extracted dataframe:")
    display(all_wikitables_dfs[1].head())
if len(all_wikitables_dfs) > 2:
    print("Preview of the third extracted dataframe:")
    display(all_wikitables_dfs[2].head())


# Once the correct table index is identified, we can select it.
# For now, let's assume the first table is the one we want (this will need adjustment based on inspection)
gdp_df = all_wikitables_dfs[0].dropna()
display(gdp_df.head())

## INSPECT AND PREPARE DATA
# Inspect column names
print("Original columns:", gdp_df.columns)

# Select 'Country/Territory' and the IMF forecast column.
# Based on the display output, the relevant tables might be the first few 'wikitable' classes.
# Let's try reading all tables with class 'wikitable' and then inspect them.
all_wikitables_dfs = pd.read_html(str(tables))

print(f"Number of dataframes extracted from 'wikitable' class: {len(all_wikitables_dfs)}")

# Assuming the GDP table is one of the first few, let's display the head of the first few to identify
if len(all_wikitables_dfs) > 0:
    print("Preview of the first extracted dataframe:")
    display(all_wikitables_dfs[0].head())
if len(all_wikitables_dfs) > 1:
    print("Preview of the second extracted dataframe:")
    display(all_wikitables_dfs[1].head())
if len(all_wikitables_dfs) > 2:
    print("Preview of the third extracted dataframe:")
    display(all_wikitables_dfs[2].head())


# Once the correct table index is identified, we can select it.
# For now, let's assume the first table is the one we want (this will need adjustment based on inspection)
gdp_df = all_wikitables_dfs[0].dropna()
display(gdp_df.head())

## INSPECT AND PREPARE DATA
# Inspect column names
print("Original columns:", gdp_df.columns)

# Select 'Country/Territory' and the IMF forecast column.
# Based on the display output, the IMF forecast is under the first level 'IMF[1][6]' and second level 'Forecast'.
gdp_cleaned = gdp_df[[('Country/Territory', 'Country/Territory'), ('IMF[1][6]', 'Forecast')]].copy()

# Flatten the multi-level columns for easier access
gdp_cleaned.columns = ['_'.join(col).strip() for col in gdp_cleaned.columns.values]

print("\nSelected columns after flattening:", gdp_cleaned.columns)
display(gdp_cleaned.head())

# Rename columns
gdp_cleaned.rename(columns={'Country/Territory_Country/Territory': 'Country', 'IMF[1][6]_Forecast': 'IMF_GDP'}, inplace=True)

# Remove rows where 'IMF_GDP' is not a valid number (contains non-digit characters like '—' or '[n 1]')
# Use a regex to keep only rows where the IMF_GDP column contains only digits.
gdp_cleaned = gdp_cleaned[gdp_cleaned['IMF_GDP'].astype(str).str.match(r'^\d+$')].copy()

# Convert 'IMF_GDP' column to numeric
gdp_cleaned['IMF_GDP'] = pd.to_numeric(gdp_cleaned['IMF_GDP'])

# Display the first few rows and data types
print("\nCleaned DataFrame head:")
display(gdp_cleaned.head())

print("\nCleaned DataFrame dtypes:")
print(gdp_cleaned.dtypes)

## CREATE A STACKED BAR PLOT

import pandas as pd
import plotly.express as px

# 1. Create a pandas DataFrame named country_region_df with two columns: 'Country' and 'Region'.
# Populate this DataFrame with country-to-region mappings.
data = {'Country': ['World', 'United States', 'China', 'Germany', 'India',
                    'United Kingdom', 'France', 'Italy', 'Brazil', 'Canada',
                    'Japan', 'Russia', 'South Korea', 'Australia', 'Spain'],
        'Region': ['World', 'North America', 'Asia', 'Europe', 'Asia',
                   'Europe', 'Europe', 'Europe', 'South America', 'North America',
                   'Asia', 'Europe', 'Asia', 'Oceania', 'Europe']}
country_region_df = pd.DataFrame(data)

print("Created Country-Region DataFrame:")
display(country_region_df.head())

# 2. Merge the gdp_cleaned DataFrame with the country_region_df DataFrame.
# Assuming gdp_cleaned is available from previous successful steps.
merged_df = pd.merge(gdp_cleaned, country_region_df, on='Country', how='left')

print("\nMerged DataFrame head (before handling missing regions):")
display(merged_df.head())

# 3. Handle any potential missing region values in the merged_df.
# Assign 'Unknown' to missing regions.
merged_df['Region'].fillna('Unknown', inplace=True)

print("\nMerged DataFrame head (after handling missing regions):")
display(merged_df.head())

# 4. Create a stacked bar plot using plotly.express.bar.
# 5. Add a title to the plot.
fig = px.bar(merged_df,
             x='Region',
             y='IMF_GDP',
             color='Country',
             title='Stacked GDP by Country within Region (IMF Forecast)')

# 6. Display the created Plotly figure.
fig.show()

## CUSTOMIZE THE PLOT

# 1. Modify the px.bar function call to include hover_name='Country' and hover_data={'Country': False, 'IMF_GDP': ':,2s'}
fig = px.bar(merged_df,
             x='Region',
             y='IMF_GDP',
             color='Country',
             title='Stacked GDP by Country within Region (IMF Forecast)',
             hover_name='Country',
             hover_data={'Country': False, 'IMF_GDP': ':,2s'}) # Format GDP as millions USD

# 2. Update the layout of the figure using fig.update_layout()
fig.update_layout(
    xaxis_title='Region',
    yaxis_title='IMF GDP (Millions USD)',
    title={'x':0.5, 'xanchor': 'center'} # Center the title
)

# 3. Display the updated figure using fig.show()
fig.show()

## SAVE THE PLOT AS HTML FILE

import plotly.offline as pyo
import os

# Save the figure as an HTML file
# pyo.plot(fig, filename='gdp_stacked_bar_plot.html', include_plotlyjs='cdn', auto_open=False)

print("Plot saved as gdp_stacked_bar_plot.html")

# Construct the full file path to save in the Colab /content directory
file_path = '/content/stacked_bar.html'

# # Save the plot to the specified path
pyo.plot(fig, filename=file_path, auto_open=False)

print(f"Plot saved as {file_path}")
from google.colab import files
files.download('/content/stacked_bar.html')

Request successful! Content preview:
<!DOCTYPE html>
<html class="client-nojs vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-page-tools-pinned-disabled vector-feature-toc-pinned-clientpref-1 vector-feature-main-menu-pinned-disabled vector-feature-limited-width-clientpref-1 vector-feature-limited-width-content-enabled vector-feature-custom-font-size-clientpref-1 vector-feature-appearance-pinned-clientpref-1 vector-feature-night-mode-enabled skin-theme-clientpref-day vect
Number of tables found with class 'wikitable': 1
Preview of the first table found:
<table class="wikitable sortable sticky-header-multi static-row-numbers" style="text-align:right">
 <caption>
  GDP forecast or estimate (million US$) by country
 </caption>
 <tbody>
  <tr class="static-row-header" style="text-align:center;vertical-align:bottom;">
   <th rowspan="2">
    Country/Territory
   </th>
   <th colspan="2">
    <a href="/wiki/International_Monetary_F


Passing literal html to 'read_html' is deprecated and will be removed in a future version. To read from a literal string, wrap it in a 'StringIO' object.



Unnamed: 0_level_0,Country/Territory,IMF[1][6],IMF[1][6],World Bank[7],World Bank[7],United Nations[8],United Nations[8]
Unnamed: 0_level_1,Country/Territory,Forecast,Year,Estimate,Year,Estimate,Year
0,World,113795678,2025,111326370,2024,100834796,2022
1,United States,30507217,2025,29184890,2024,27720700,2023
2,China,19231705,[n 1]2025,18743803,[n 3]2024,17794782,[n 1]2023
3,Germany,4744804,2025,4659929,2024,4525704,2023
4,India,4187017,2025,3912686,2024,3575778,2023


Unnamed: 0_level_0,Country/Territory,IMF[1][6],IMF[1][6],World Bank[7],World Bank[7],United Nations[8],United Nations[8]
Unnamed: 0_level_1,Country/Territory,Forecast,Year,Estimate,Year,Estimate,Year
0,World,113795678,2025,111326370,2024,100834796,2022
1,United States,30507217,2025,29184890,2024,27720700,2023
2,China,19231705,[n 1]2025,18743803,[n 3]2024,17794782,[n 1]2023
3,Germany,4744804,2025,4659929,2024,4525704,2023
4,India,4187017,2025,3912686,2024,3575778,2023


Original columns: MultiIndex([('Country/Territory', 'Country/Territory'),
            (        'IMF[1][6]',          'Forecast'),
            (        'IMF[1][6]',              'Year'),
            (    'World Bank[7]',          'Estimate'),
            (    'World Bank[7]',              'Year'),
            ('United Nations[8]',          'Estimate'),
            ('United Nations[8]',              'Year')],
           )
Number of dataframes extracted from 'wikitable' class: 1
Preview of the first extracted dataframe:



Passing literal html to 'read_html' is deprecated and will be removed in a future version. To read from a literal string, wrap it in a 'StringIO' object.



Unnamed: 0_level_0,Country/Territory,IMF[1][6],IMF[1][6],World Bank[7],World Bank[7],United Nations[8],United Nations[8]
Unnamed: 0_level_1,Country/Territory,Forecast,Year,Estimate,Year,Estimate,Year
0,World,113795678,2025,111326370,2024,100834796,2022
1,United States,30507217,2025,29184890,2024,27720700,2023
2,China,19231705,[n 1]2025,18743803,[n 3]2024,17794782,[n 1]2023
3,Germany,4744804,2025,4659929,2024,4525704,2023
4,India,4187017,2025,3912686,2024,3575778,2023


Unnamed: 0_level_0,Country/Territory,IMF[1][6],IMF[1][6],World Bank[7],World Bank[7],United Nations[8],United Nations[8]
Unnamed: 0_level_1,Country/Territory,Forecast,Year,Estimate,Year,Estimate,Year
0,World,113795678,2025,111326370,2024,100834796,2022
1,United States,30507217,2025,29184890,2024,27720700,2023
2,China,19231705,[n 1]2025,18743803,[n 3]2024,17794782,[n 1]2023
3,Germany,4744804,2025,4659929,2024,4525704,2023
4,India,4187017,2025,3912686,2024,3575778,2023


Original columns: MultiIndex([('Country/Territory', 'Country/Territory'),
            (        'IMF[1][6]',          'Forecast'),
            (        'IMF[1][6]',              'Year'),
            (    'World Bank[7]',          'Estimate'),
            (    'World Bank[7]',              'Year'),
            ('United Nations[8]',          'Estimate'),
            ('United Nations[8]',              'Year')],
           )

Selected columns after flattening: Index(['Country/Territory_Country/Territory', 'IMF[1][6]_Forecast'], dtype='object')


Unnamed: 0,Country/Territory_Country/Territory,IMF[1][6]_Forecast
0,World,113795678
1,United States,30507217
2,China,19231705
3,Germany,4744804
4,India,4187017



Cleaned DataFrame head:


Unnamed: 0,Country,IMF_GDP
0,World,113795678
1,United States,30507217
2,China,19231705
3,Germany,4744804
4,India,4187017



Cleaned DataFrame dtypes:
Country    object
IMF_GDP     int64
dtype: object
Created Country-Region DataFrame:


Unnamed: 0,Country,Region
0,World,World
1,United States,North America
2,China,Asia
3,Germany,Europe
4,India,Asia



Merged DataFrame head (before handling missing regions):


Unnamed: 0,Country,IMF_GDP,Region
0,World,113795678,World
1,United States,30507217,North America
2,China,19231705,Asia
3,Germany,4744804,Europe
4,India,4187017,Asia



Merged DataFrame head (after handling missing regions):



A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.





Unnamed: 0,Country,IMF_GDP,Region
0,World,113795678,World
1,United States,30507217,North America
2,China,19231705,Asia
3,Germany,4744804,Europe
4,India,4187017,Asia


Plot saved as gdp_stacked_bar_plot.html
Plot saved as /content/stacked_bar.html


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# code to display a subject's MRICloud data as a sunburst plot.
#Do the following. Display this subject's data as a Sankey diagramLinks to an external site..
# Display as many levels as you can (at least 3) for Type = 1, starting from the intracranial volume.

import pandas as pd
import plotly.express as px
import numpy as np