
# PEA15 - Annual Population Change
[Central Statistics Office - Population Estimates](https://data.cso.ie/table/PEA15)





[CA1](https://moodle.cct.ie/mod/assign/view.php?id=143374)



### Vital Events
- Annual deaths have been rising since 2017, while there's been a decline in annual births since a peak in 2010.

### Migration Patterns
- Immigration flows show a drop after 2020, with substantial increases thereafter, especially from the rest of the world.



## Loading Libraries and Data
The dataset is loaded from the `../raw/PEA15.csv` file.


In [None]:
import pandas as pd
import altair as alt
# Enable the VegaFusion data transformer
alt.data_transformers.enable('vegafusion')
import matplotlib.pyplot as plt
import seaborn as sns
import os
import warnings
# Suppress all warnings
warnings.filterwarnings('ignore')

%matplotlib inline


In [None]:
# Load the raw PEA15 extract into a DataFrame.
# The path is relative, so it resolves against the notebook's working directory.
file_path = '../raw/PEA15.csv'
df = pd.read_csv(file_path)


## Exploratory Data Analysis
Exploring the basic structure, descriptive statistics, and identifying duplication and missing values.


In [None]:
# Preview the first 24 rows; the author notes extensive missing data at the head of the dataset
df.head(24)

In [None]:
# List the column labels exactly as they were read from the CSV
df.columns

In [None]:
# For every object-dtype (text) column, collect the distinct values it contains
{
    name: series.unique()
    for name, series in df.select_dtypes(include='object').items()
}

In [None]:
# Remove 'STATISTIC Label' and 'UNIT' (described by the author as uniform, so
# they carry no information), then give the remaining columns short
# lowercase names in a single chained step.
df = (
    df.drop(columns=['STATISTIC Label', 'UNIT'])
      .rename(columns={'Year': 'year', 'Component': 'component', 'VALUE': 'kvalue'})
)

In [None]:
# Spot-check five randomly chosen rows (no seed is set, so the rows differ between runs)
df.sample(5)

### Cleaning and Deduplication

In [None]:
# Boolean Series: True for each row that exactly repeats an earlier row
duplicates = df.duplicated()

# Collapse to a single flag: True if at least one duplicate row exists
any_duplicates = duplicates.any()

# Report the flag
print("DataFrame  contains duplicates is a ", any_duplicates, "statement.")


In [None]:
# Count the missing entries in each column
df.isna().sum()

In [None]:


# Cell-by-cell flag of missing entries
missing_values = df.isna()

# Number of missing entries per field, totalled within each year
missing_summary = missing_values.groupby(df['year']).sum()

# Draw the counts as an annotated heatmap (years down the side, fields across)
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(missing_summary, annot=True, fmt='d', cmap='viridis', ax=ax)
ax.set_title('Heatmap of Missing Values Across Fields and Years')
ax.set_ylabel('year')
ax.set_xlabel('Field')
plt.show()


In [None]:
# Boolean mask marking the rows whose 'kvalue' is missing
missing_value = df['kvalue'].isna()

# Subset of the DataFrame selected by that mask
df_missing_value = df.loc[missing_value]

# Most recent year that still has a missing value
last_year_missing = df_missing_value['year'].max()

print(f"The last year with missing data in the 'VALUE' field is: {last_year_missing}")

In [None]:
# Preview the first five rows of the renamed DataFrame
df.head()

In [None]:
# Which (year, component) pairs have no 'kvalue'?

# Rows with a missing 'kvalue'
nan_data = df.loc[df['kvalue'].isnull()]

# Keep only the two identifying columns
nan_year_component_combinations = nan_data.loc[:, ['year', 'component']]

# Show each distinct pair once
nan_year_component_combinations.drop_duplicates()


In [None]:
# Wide table of the natural-change components: one row per year, one column
# per component.
natural_rows = df[df['component'].isin(['Annual births', 'Annual deaths', 'Natural increase'])]
df_natural = natural_rows.pivot(index='year', columns='component', values='kvalue')

# Earliest year in which all three components are present
first_complete_year = df_natural.dropna().index.min()
earliest_year = df_natural.index.min()

# Walk backwards from the year before the first complete one, deriving births
# from the following year's births and the change in natural increase, then
# deaths as births minus natural increase.
# NOTE(review): holding deaths constant would give
#   births[y] = births[y+1] + (NI[y] - NI[y+1]);
# the sign applied below is the opposite. Confirm the intended rule.
for yr in range(first_complete_year - 1, earliest_year - 1, -1):
    following = yr + 1
    increase_change = (
        df_natural.loc[yr, 'Natural increase'] - df_natural.loc[following, 'Natural increase']
    )
    df_natural.loc[yr, 'Annual births'] = df_natural.loc[following, 'Annual births'] - increase_change
    df_natural.loc[yr, 'Annual deaths'] = (
        df_natural.loc[yr, 'Annual births'] - df_natural.loc[yr, 'Natural increase']
    )

# Round every column to whole numbers
df_natural = df_natural.round()

# Persist the result and display the first 22 rows to check the imputation
df_natural.to_csv('../data/11_vit_natural.csv')
df_natural.head(22)


In [None]:


# Restrict to the three migration components
df_not_natural = df.loc[df['component'].isin(['Immigrants', 'Emigrants', 'Net migration'])]

# Wide table: one row per year, one column per component
df_migration = df_not_natural.pivot(index='year', columns='component', values='kvalue')

# Earliest year in which all three components are present
first_complete_year_migration = df_migration.dropna().index.min()
earliest_migration_year = df_migration.index.min()

# Walk backwards from the year before the first complete one, deriving
# immigrants from the following year's immigrants and the change in net
# migration, then emigrants as immigrants minus net migration.
# NOTE(review): holding emigration constant would give
#   immigrants[y] = immigrants[y+1] + (net[y] - net[y+1]);
# the sign applied below is the opposite. Confirm the intended rule.
for yr in range(first_complete_year_migration - 1, earliest_migration_year - 1, -1):
    following = yr + 1
    net_change = (
        df_migration.loc[yr, 'Net migration'] - df_migration.loc[following, 'Net migration']
    )
    df_migration.loc[yr, 'Immigrants'] = df_migration.loc[following, 'Immigrants'] - net_change
    df_migration.loc[yr, 'Emigrants'] = (
        df_migration.loc[yr, 'Immigrants'] - df_migration.loc[yr, 'Net migration']
    )

# Round every column to whole numbers
df_migration = df_migration.round()

# Persist the result and display up to 68 rows for inspection
df_migration.to_csv('../data/12_vital_migration.csv')
df_migration.head(68)


In [None]:
# Combine the natural-change table (df_natural) and the migration table
# (df_migration) on their shared year index, keeping years found in either.
change_df = df_natural.merge(
    df_migration,
    how='outer',
    left_index=True,
    right_index=True,
)

# Write the merged table (year index included), then display up to 68 rows
change_df.to_csv('../data/13_vit_change.csv', index=True)
change_df.head(68)


In [None]:
# Round every column of change_df to whole numbers
change_df = change_df.round()

# Show the first few rows after rounding
print(change_df.head())

# Save the cleaned, merged table. This must be change_df (not the raw long-format
# df), and the index must be written because it is the only place the year is held.
change_df.to_csv('../data/14_vit_change_clean.csv', index=True)

In [None]:
# Count the missing entries left in each column of the merged table
change_df.isna().sum()

The strategy above was to estimate the missing 'Annual births' and 'Annual deaths' values from 'Natural increase' (which is births minus deaths), working backwards from the earliest year in which all three values are known. The same approach was applied to 'Immigrants' and 'Emigrants' using 'Net migration'.

In [None]:
# List the component columns available in the merged table
change_df.columns

### Vital Events
- Annual deaths have been, in the main, rising since 2005, while there's been a sharp decline in annual births since a peak in 2010.

In [None]:

# One figure with births, deaths and natural increase against year
plt.figure(figsize=(12, 12))
for series_name in ('Annual births', 'Annual deaths', 'Natural increase'):
    plt.plot(change_df.index, change_df[series_name], marker='o', label=series_name)

plt.title('Vital Events Over Time')
plt.xlabel('Year')
plt.ylabel('Values (000s)')
plt.legend()
plt.grid(True)

# Write the image to disk first, then render it in the notebook
plt.savefig('../images/12_vit_vital_events.png')
plt.show()

In [None]:
# Series to plot and the first year to include
string = 'Annual births'
input_date = 2004

# Rows from input_date onwards
filtered_df = change_df.loc[change_df.index >= input_date]

# Text used for the title and for the caption under the plot
title = f'This plot illustrates the trend of {string} Over Time since {input_date}'
caption = f"This plot illustrates the trend of {string} over time since {input_date}."

# Single line chart with point markers
fig, ax = plt.subplots(figsize=(12, 9))
ax.plot(filtered_df.index, filtered_df[string], marker='o', label=string)
ax.set_title(title)
ax.set_xlabel('Year')
ax.set_ylabel(string)
ax.legend([string])
ax.grid(True)

# Caption centred along the bottom edge of the figure
fig.text(0.5, 0.01, caption, wrap=True, horizontalalignment='center', fontsize=10)

# Echo the title as plain text so it can be pasted into the report
print(title)

# Write the image to disk first, then render it in the notebook
fig.savefig(f'../images/13_vit_{string}_since_{input_date}.png')
plt.show()

In [None]:
# Series to plot and the first year to include
string = 'Annual deaths'
input_date = 2004

# Rows from input_date onwards
filtered_df = change_df.loc[change_df.index >= input_date]

# Text used for the title and for the caption under the plot
title = f'This plot illustrates the trend of {string} Over Time since {input_date}'
caption = f"This plot illustrates the trend of {string} over time since {input_date}."

# Single orange line chart with point markers
fig, ax = plt.subplots(figsize=(12, 9))
ax.plot(filtered_df.index, filtered_df[string], marker='o', color='orange', label=string)
ax.set_title(title)
ax.set_xlabel('Year')
ax.set_ylabel(string)
ax.legend([string])
ax.grid(True)

# Caption centred along the bottom edge of the figure
fig.text(0.5, 0.01, caption, wrap=True, horizontalalignment='center', fontsize=10)

# Echo the title as plain text so it can be pasted into the report
print(title)

# Write the image to disk first, then render it in the notebook
fig.savefig(f'../images/14_vit_{string}_since_{input_date}.png')
plt.show()

In [None]:

# Series to plot and the first year to include
string = 'Natural increase'
input_date = 2000

# Rows from input_date onwards
filtered_df = change_df.loc[change_df.index >= input_date]

# Text used for the title and for the caption under the plot
title = f'This plot illustrates the trend of {string} Over Time since {input_date}'
caption = f"This plot illustrates the trend of {string} over time since {input_date}."

# Single green line chart with point markers
fig, ax = plt.subplots(figsize=(12, 9))
ax.plot(filtered_df.index, filtered_df[string], marker='o', label=string, color='green')
ax.set_title(title)
ax.set_xlabel('Year')
ax.set_ylabel(string)
ax.legend([string])
ax.grid(True)

# Caption centred along the bottom edge of the figure
fig.text(0.5, 0.01, caption, wrap=True, horizontalalignment='center', fontsize=10)

# Echo the title as plain text so it can be pasted into the report
print(title)

# Write the image to disk first, then render it in the notebook
fig.savefig(f'../images/15_vit_{string}_since_{input_date}.png')
plt.show()


### Migration Patterns
- Immigration dropped in 2020, but has shown substantial increases since, especially from the rest of the world.


In [None]:
import matplotlib.pyplot as plt

# Each migration series paired with the colour it is drawn in
flow_colours = (
    ('Emigrants', 'red'),
    ('Immigrants', 'purple'),
    ('Net migration', 'brown'),
)

# One figure with all three flows against year
plt.figure(figsize=(12, 9))
for flow, colour in flow_colours:
    plt.plot(change_df.index, change_df[flow], marker='o', label=flow, color=colour)

plt.title('Emigrants, Immigrants, Net migration since 1950')
plt.xlabel('Year')
plt.ylabel('Values')
plt.legend()
plt.grid(True)

# Write the image to disk first, then render it in the notebook
plt.savefig('../images/14_flow_since_1950.png')
plt.show()


In [None]:
# strings = ['Annual births', 'Annual deaths', 'Natural increase', 'Emigrants', 'Immigrants', 'Net migration']

In [None]:

# Series to plot and the first year to include
string = 'Emigrants'
input_date = 2000

# Rows from input_date onwards
filtered_df = change_df.loc[change_df.index >= input_date]

# Text used for the title and for the caption under the plot
title = f'This plot illustrates the trend of {string} Over Time since {input_date}'
caption = f"This plot illustrates the trend of {string} over time since {input_date}."

# Single red line chart with point markers
fig, ax = plt.subplots(figsize=(12, 9))
ax.plot(filtered_df.index, filtered_df[string], marker='o', label=string, color='red')
ax.set_title(title)
ax.set_xlabel('Year')
ax.set_ylabel(string)
ax.legend([string])
ax.grid(True)

# Caption centred along the bottom edge of the figure
fig.text(0.5, 0.01, caption, wrap=True, horizontalalignment='center', fontsize=10)

# Echo the title as plain text so it can be pasted into the report
print(title)

# Write the image to disk first, then render it in the notebook
fig.savefig(f'../images/16_vit_{string}_since_{input_date}.png')
plt.show()


In [None]:


# Series to plot and the first year to include
string = 'Immigrants'
input_date = 2000

# Rows from input_date onwards
filtered_df = change_df.loc[change_df.index >= input_date]

# Text used for the title and for the caption under the plot
title = f'This plot illustrates the trend of {string} Over Time since {input_date}'
caption = f"This plot illustrates the trend of {string} over time since {input_date}."

# Single purple line chart with point markers
fig, ax = plt.subplots(figsize=(12, 9))
ax.plot(filtered_df.index, filtered_df[string], marker='o', label=string, color='purple')
ax.set_title(title)
ax.set_xlabel('Year')
ax.set_ylabel(string)
ax.legend([string])
ax.grid(True)

# Caption centred along the bottom edge of the figure
fig.text(0.5, 0.01, caption, wrap=True, horizontalalignment='center', fontsize=10)

# Echo the title as plain text so it can be pasted into the report
print(title)

# Write the image to disk first, then render it in the notebook
fig.savefig(f'../images/17_vit_{string}_since_{input_date}.png')
plt.show()


In [None]:

# Series to plot and the first year to include
string = 'Net migration'
input_date = 2000

# Rows from input_date onwards
filtered_df = change_df.loc[change_df.index >= input_date]

# Text used for the title and for the caption under the plot
title = f'This plot illustrates the trend of {string} Over Time since {input_date}'
caption = f"This plot illustrates the trend of {string} over time since {input_date}."

# Single brown line chart with point markers
fig, ax = plt.subplots(figsize=(12, 9))
ax.plot(filtered_df.index, filtered_df[string], marker='o', label=string, color='brown')
ax.set_title(title)
ax.set_xlabel('Year')
ax.set_ylabel(string)
ax.legend([string])
ax.grid(True)

# Caption centred along the bottom edge of the figure
fig.text(0.5, 0.01, caption, wrap=True, horizontalalignment='center', fontsize=10)

# Echo the title as plain text so it can be pasted into the report
print(title)

# Write the image to disk first, then render it in the notebook
fig.savefig(f'../images/18_vit_{string}_since_{input_date}.png')
plt.show()


### Ongoing research
These mortality, fertility, and migration figures will be merged with population and other data from the [Central Statistics Office population of Ireland series](https://data.cso.ie/product/pme).