# US Population Data

## Load data

In [6]:
import pandas as pd

# قراءة الملف
df = pd.read_csv(r"C:\Users\shahe\OneDrive\Desktop\airquality\datasets\us-population-2010-2019.csv")

# عرض أسماء الأعمدة
print("📌 أسماء الأعمدة:")
print(df.columns)

# عرض أول 5 صفوف
print("\n📊 أول 5 صفوف من البيانات:")
print(df.head())



📌 أسماء الأعمدة:
Index(['states', 'states_code', 'id', 'year', 'population'], dtype='object')

📊 أول 5 صفوف من البيانات:
       states states_code  id  year  population
0     Alabama          AL   1  2010     4785437
1      Alaska          AK   2  2010      713910
2     Arizona          AZ   4  2010     6407172
3    Arkansas          AR   5  2010     2921964
4  California          CA   6  2010    37319502


In [2]:
df.columns

Index(['states', 'states_code', 'id', 'year', 'population'], dtype='object')

In [3]:
new_columns = ['states', 'states_code', 'id', '2010', '2011', '2012', '2013', '2014', '2015', '2016',
       '2017', '2018', '2019']
df = df.reindex(columns=new_columns)
df

Unnamed: 0,states,states_code,id,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019
0,Alabama,AL,1,,,,,,,,,,
1,Alaska,AK,2,,,,,,,,,,
2,Arizona,AZ,4,,,,,,,,,,
3,Arkansas,AR,5,,,,,,,,,,
4,California,CA,6,,,,,,,,,,
5,Texas,TX,48,,,,,,,,,,
6,Florida,FL,12,,,,,,,,,,
7,New York,NY,36,,,,,,,,,,
8,Pennsylvania,PA,42,,,,,,,,,,
9,Illinois,IL,17,,,,,,,,,,


In [4]:
# Save data to CSV
df.to_csv('us-population-2010-2019-states-code.csv', index=False)

## Data pre-processing

In [7]:
# Reshape the DataFrame
df_reshaped = pd.melt(df, id_vars=['states', 'states_code', 'id'], var_name='year', value_name='population')

# Convert 'year' column values to integers
df_reshaped['states'] = df_reshaped['states'].astype(str)
df_reshaped['year'] = df_reshaped['year'].astype(int)
df_reshaped['population'] = df_reshaped['population'].str.replace(',', '').astype(int)

df_reshaped

ValueError: value_name (population) cannot match an element in the DataFrame columns.

In [None]:
# Save reshaped data to CSV
df_reshaped.to_csv('us-population-2010-2019-reshaped.csv')

In [None]:
# Subset dataframe by year
selected_year = 2019
df_selected_year = df_reshaped[df_reshaped.year == selected_year]
df_selected_year

Unnamed: 0,states,states_code,id,year,population
468,Alabama,,1,2019,4903185
469,Alaska,,2,2019,731545
470,Arizona,,4,2019,7278717
471,Arkansas,,5,2019,3017804
472,California,,6,2019,39512223
473,Colorado,,8,2019,5758736
474,Connecticut,,9,2019,3565287
475,Delaware,,10,2019,973764
476,District of Columbia,,11,2019,705749
477,Florida,,12,2019,21477737


In [None]:
# Sort by year
df_selected_year_sorted = df_selected_year.sort_values(by="population", ascending=False)
df_selected_year_sorted

Unnamed: 0,states,states_code,id,year,population
472,California,,6,2019,39512223
511,Texas,,48,2019,28995881
477,Florida,,12,2019,21477737
500,New York,,36,2019,19453561
506,Pennsylvania,,42,2019,12801989
481,Illinois,,17,2019,12671821
503,Ohio,,39,2019,11689100
478,Georgia,,13,2019,10617423
501,North Carolina,,37,2019,10488084
490,Michigan,,26,2019,9986857


In [None]:
# Calculate population difference between selected and previous year
def calculate_population_difference(input_df, input_year):
  selected_year_data = input_df[input_df['year'] == input_year].reset_index()
  previous_year_data = input_df[input_df['year'] == input_year - 1].reset_index()
  selected_year_data['population_difference'] = selected_year_data.population.sub(previous_year_data.population, fill_value=0)
  selected_year_data['population_difference_absolute'] = abs(selected_year_data['population_difference'])
  return pd.concat([selected_year_data.states, selected_year_data.id, selected_year_data.population, selected_year_data.population_difference, selected_year_data.population_difference_absolute], axis=1).sort_values(by="population_difference", ascending=False)

df_population_difference_sorted = calculate_population_difference(df_reshaped, selected_year)
df_population_difference_sorted

Unnamed: 0,states,id,population,population_difference,population_difference_absolute
43,Texas,48,28995881,367215,367215
9,Florida,12,21477737,233420,233420
2,Arizona,4,7278717,120693,120693
33,North Carolina,37,10488084,106469,106469
10,Georgia,13,10617423,106292,106292
47,Washington,53,7614893,91024,91024
5,Colorado,8,5758736,67449,67449
40,South Carolina,45,5148714,64558,64558
42,Tennessee,47,6829174,57543,57543
28,Nevada,32,3080156,52815,52815


In [None]:
# Filter states with population difference > 50000
df_greater_50000 = df_population_difference_sorted[df_population_difference_sorted.population_difference_absolute > 50000]
df_greater_50000

Unnamed: 0,states,id,population,population_difference,population_difference_absolute
43,Texas,48,28995881,367215,367215
9,Florida,12,21477737,233420,233420
2,Arizona,4,7278717,120693,120693
33,North Carolina,37,10488084,106469,106469
10,Georgia,13,10617423,106292,106292
47,Washington,53,7614893,91024,91024
5,Colorado,8,5758736,67449,67449
40,South Carolina,45,5148714,64558,64558
42,Tennessee,47,6829174,57543,57543
28,Nevada,32,3080156,52815,52815


In [None]:
# % of States with population difference > 50000
int((len(df_greater_50000)/df_population_difference_sorted.states.nunique())*100)

26

## Plots

### Heatmap

In [None]:
import subprocess
import sys

try:
    import altair as alt
    print("✅ Altair is already installed!")
except ModuleNotFoundError:
    print("⚡ Altair not found. Installing...")
    subprocess.check_call([sys.executable, "-m", "pip", "install", "altair", "vega_datasets"])
    import altair as alt
    print("✅ Altair installed and imported successfully!")


⚡ Altair not found. Installing...
✅ Altair installed and imported successfully!


In [None]:
import altair as alt

alt.themes.enable("dark")

heatmap = alt.Chart(df_reshaped).mark_rect().encode(
        y=alt.Y('year:O', axis=alt.Axis(title="Year", titleFontSize=16, titlePadding=15, titleFontWeight=900, labelAngle=0)),
        x=alt.X('states:O', axis=alt.Axis(title="States", titleFontSize=16, titlePadding=15, titleFontWeight=900)),
        color=alt.Color('max(population):Q',
                         legend=alt.Legend(title=" "),
                         scale=alt.Scale(scheme="blueorange")),
        stroke=alt.value('black'),
        strokeWidth=alt.value(0.25),
        #tooltip=[
        #    alt.Tooltip('year:O', title='Year'),
        #    alt.Tooltip('population:Q', title='Population')
        #]
    ).properties(width=900
    #).configure_legend(orient='bottom', titleFontSize=16, labelFontSize=14, titlePadding=0
    #).configure_axisX(labelFontSize=14)
    ).configure_axis(
    labelFontSize=12,
    titleFontSize=12
    )

heatmap

Deprecated since `altair=5.5.0`. Use altair.theme instead.
Most cases require only the following change:

    # Deprecated
    alt.themes.enable('quartz')

    # Updated
    alt.theme.enable('quartz')

If your code registers a theme, make the following change:

    # Deprecated
    def custom_theme():
        return {'height': 400, 'width': 700}
    alt.themes.register('theme_name', custom_theme)
    alt.themes.enable('theme_name')

    # Updated
    @alt.theme.register('theme_name', enable=True)
    def custom_theme():
        return alt.theme.ThemeConfig(
            {'height': 400, 'width': 700}
        )

See the updated User Guide for further details:
    https://altair-viz.github.io/user_guide/api.html#theme
    https://altair-viz.github.io/user_guide/customization.html#chart-themes
  alt.themes.enable("dark")


### Choropleth

In [None]:
# Choropleth via Altair
import altair as alt
from vega_datasets import data

alt.themes.enable("dark")

states = alt.topo_feature(data.us_10m.url, 'states')

alt.Chart(states).mark_geoshape().encode(
    color=alt.Color('population:Q', scale=alt.Scale(scheme='blues')),   # scale=color_scale
    stroke=alt.value('#154360')
).transform_lookup(
    lookup='id',
    from_=alt.LookupData(df_selected_year, 'id', list(df_selected_year.columns))
).properties(
    width=500,
    height=300
).project(
    type='albersUsa'
)



In [None]:
import subprocess
import sys

try:
    import plotly.express as px
    print("✅ Plotly is already installed!")
except ModuleNotFoundError:
    print("⚡ Plotly not found. Installing...")
    subprocess.check_call([sys.executable, "-m", "pip", "install", "plotly"])
    import plotly.express as px
    print("✅ Plotly installed and imported successfully!")


⚡ Plotly not found. Installing...
✅ Plotly installed and imported successfully!


In [None]:
# Choropleth via Plotly
import plotly.express as px

choropleth = px.choropleth(df_selected_year, locations='states_code', color='population', locationmode="USA-states",
                               color_continuous_scale='blues',
                               range_color=(0, max(df_selected_year.population)),
                               scope="usa",
                               labels={'population':'Population'}
                              )
choropleth.update_layout(
        template='plotly_dark',
        plot_bgcolor='rgba(0, 0, 0, 0)',
        paper_bgcolor='rgba(0, 0, 0, 0)',
        margin=dict(l=0, r=0, t=0, b=0),
        height=350
    )

choropleth

ValueError: Mime type rendering requires nbformat>=4.2.0 but it is not installed