# Integrated Continuous Assessment 1
### Kalina Cmuchalska
#### SBS23080
---
#### Table of Contents
1. [Data Preparation](#Data-Preparation)
    1. [Estimated population dataset](#Estimated-population-dataset)
    2. [Estimated immigration dataset](#Estimated-immigration-dataset)
    3. [Estimated emigration dataset](#Estimated-emigration-dataset)
2. [Exploratory Data Analysis (EDA)](#Exploratory-Data-Analysis)
3. [Statistical Analysis](#Statistical-Analysis)
    1. [Descriptive Statistics](#Descriptive-statistics)
    2. [Binomial Distribution](#Binomal-Distribution)
    3. [Poisson Distribution](#Poisson-Distribution)
    4. [Normal Distribution](#Normal-Distribution)
4. [Machine Learning Analysis](#Machine-Learning-Analysis)
---

# Data Preparation
---

In [2]:
# importing required libraries
import pandas as pd
import numpy as np
import seaborn as sns
import os

In [91]:
# walk the 'datasets' folder and print the path of every file found in it
for folder, _subdirs, files in os.walk('datasets'):
    for fname in files:
        print(os.path.join(folder, fname))

datasets\estimated_emigration_2006_2023\PEA23.20231011T231143.csv
datasets\estimated_immigration_1996_2023\PEA24.20231011T231141.csv
datasets\estimated_population_2011_2023\PEA04.20231011T231134.csv


In [93]:
#read datasets
#the paths are assembled with os.path.join instead of hard-coded backslash strings:
#'\e' and '\P' are invalid escape sequences in a normal string literal (Python warns
#about them), and a backslash is only a path separator on Windows
df_est_population = pd.read_csv(
    os.path.join('datasets', 'estimated_population_2011_2023', 'PEA04.20231011T231134.csv')
)
df_est_immigration = pd.read_csv(
    os.path.join('datasets', 'estimated_immigration_1996_2023', 'PEA24.20231011T231141.csv')
)
df_est_emigration = pd.read_csv(
    os.path.join('datasets', 'estimated_emigration_2006_2023', 'PEA23.20231011T231143.csv')
)

> ## Estimated population dataset

In [94]:
#dimensions of the freshly loaded population data as (rows, columns)
df_est_population.shape

(6669, 7)

In [95]:
#print a summary of the frame: index range, column names, non-null counts and dtypes
df_est_population.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6669 entries, 0 to 6668
Data columns (total 7 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   STATISTIC Label  6669 non-null   object 
 1   Year             6669 non-null   int64  
 2   Age Group        6669 non-null   object 
 3   Sex              6669 non-null   object 
 4   Region           6669 non-null   object 
 5   UNIT             6669 non-null   object 
 6   VALUE            6669 non-null   float64
dtypes: float64(1), int64(1), object(5)
memory usage: 364.8+ KB


In [96]:
#shape of the data as a (number of rows, number of columns) tuple
df_est_population.shape

(6669, 7)

In [97]:
#list the column labels of the population dataset
df_est_population.columns

Index(['STATISTIC Label', 'Year', 'Age Group', 'Sex', 'Region', 'UNIT',
       'VALUE'],
      dtype='object')

In [98]:
#preview the first five rows of the population data
df_est_population.head(n=5)

Unnamed: 0,STATISTIC Label,Year,Age Group,Sex,Region,UNIT,VALUE
0,Estimated Population (Persons in April),2011,0 - 4 years,Both sexes,State,Thousand,356.0
1,Estimated Population (Persons in April),2011,0 - 4 years,Both sexes,Border,Thousand,30.7
2,Estimated Population (Persons in April),2011,0 - 4 years,Both sexes,West,Thousand,32.6
3,Estimated Population (Persons in April),2011,0 - 4 years,Both sexes,Mid-West,Thousand,35.0
4,Estimated Population (Persons in April),2011,0 - 4 years,Both sexes,South-East,Thousand,32.0


In [99]:
#preview the final five rows of the population data
df_est_population.tail(n=5)

Unnamed: 0,STATISTIC Label,Year,Age Group,Sex,Region,UNIT,VALUE
6664,Estimated Population (Persons in April),2023,All ages,Female,South-East,Thousand,237.5
6665,Estimated Population (Persons in April),2023,All ages,Female,South-West,Thousand,383.8
6666,Estimated Population (Persons in April),2023,All ages,Female,Dublin,Thousand,765.6
6667,Estimated Population (Persons in April),2023,All ages,Female,Mid-East,Thousand,391.3
6668,Estimated Population (Persons in April),2023,All ages,Female,Midland,Thousand,163.2


In [100]:
#check whether any UNIT value other than 'Thousand' is present (True = yes)
#the mask is negated BEFORE calling pandas.Series.any: ~(mask).any() would negate
#the result of .any() instead, which only tells whether 'Thousand' never occurs
#and therefore misses a column that mixes 'Thousand' with other units
(~df_est_population['UNIT'].isin(['Thousand'])).any()

False

In [101]:
#check whether any STATISTIC Label value other than the expected one is present (True = yes)
#the mask is negated BEFORE calling pandas.Series.any: ~(mask).any() would negate
#the result of .any() instead and miss a column containing a mix of labels
(~df_est_population['STATISTIC Label'].isin(['Estimated Population (Persons in April)'])).any()

False

In [102]:
#remove the columns that are not important or useful for the analysis
drop_cols = ['STATISTIC Label', 'UNIT']
df_est_population = df_est_population.drop(columns=drop_cols)

In [103]:
#confirm the shape after the two columns were dropped
df_est_population.shape

(6669, 5)

In [104]:
#count the missing values in each column
df_est_population.isna().sum()

Year         0
Age Group    0
Sex          0
Region       0
VALUE        0
dtype: int64

In [105]:
#remove fully duplicated rows, if any; the first occurrence is the one kept (pandas default)
df_est_population = df_est_population.drop_duplicates()

In [106]:
#check the data type of each column
df_est_population.dtypes

Year           int64
Age Group     object
Sex           object
Region        object
VALUE        float64
dtype: object

In [107]:
#rename VALUE column to a more specific name
#rename() returns a new frame that is assigned back to the same name, instead of
#mutating the existing object with inplace=True
df_est_population = df_est_population.rename(columns={'VALUE': 'Population (t)'})

In [108]:
#confirm the column labels after the rename
df_est_population.columns

Index(['Year', 'Age Group', 'Sex', 'Region', 'Population (t)'], dtype='object')

> ## Estimated immigration dataset

In [109]:
#dimensions of the freshly loaded immigration data as (rows, columns)
df_est_immigration.shape

(504, 6)

In [110]:
#print a summary of the frame: index range, column names, non-null counts and dtypes
df_est_immigration.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 504 entries, 0 to 503
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Statistic Label  504 non-null    object 
 1   Year             504 non-null    int64  
 2   Sex              504 non-null    object 
 3   Citizenship      504 non-null    object 
 4   UNIT             504 non-null    object 
 5   VALUE            477 non-null    float64
dtypes: float64(1), int64(1), object(4)
memory usage: 23.8+ KB


In [111]:
#list the column labels of the immigration dataset
df_est_immigration.columns

Index(['Statistic Label', 'Year', 'Sex', 'Citizenship', 'UNIT', 'VALUE'], dtype='object')

In [112]:
#preview the first five rows of the immigration data
df_est_immigration.head(n=5)

Unnamed: 0,Statistic Label,Year,Sex,Citizenship,UNIT,VALUE
0,Estimated Immigration (Persons in April),1996,Both sexes,All Countries,Thousand,39.2
1,Estimated Immigration (Persons in April),1996,Both sexes,EU14 excl Irl (countries in the EU pre 2004 ex...,Thousand,5.0
2,Estimated Immigration (Persons in April),1996,Both sexes,EU15 to EU27 (accession countries joined post ...,Thousand,
3,Estimated Immigration (Persons in April),1996,Both sexes,United Kingdom,Thousand,8.3
4,Estimated Immigration (Persons in April),1996,Both sexes,Ireland,Thousand,17.7


In [113]:
#preview the final five rows of the immigration data
df_est_immigration.tail(n=5)

Unnamed: 0,Statistic Label,Year,Sex,Citizenship,UNIT,VALUE
499,Estimated Immigration (Persons in April),2023,Female,EU14 excl Irl (countries in the EU pre 2004 ex...,Thousand,7.7
500,Estimated Immigration (Persons in April),2023,Female,EU15 to EU27 (accession countries joined post ...,Thousand,5.8
501,Estimated Immigration (Persons in April),2023,Female,United Kingdom,Thousand,2.5
502,Estimated Immigration (Persons in April),2023,Female,Ireland,Thousand,14.3
503,Estimated Immigration (Persons in April),2023,Female,"All countries excluding Ireland,United Kingdom...",Thousand,45.3


In [114]:
#check whether any UNIT value other than 'Thousand' is present (True = yes)
#the mask is negated BEFORE calling pandas.Series.any: ~(mask).any() would negate
#the result of .any() instead, which only tells whether 'Thousand' never occurs
#and therefore misses a column that mixes 'Thousand' with other units
(~df_est_immigration['UNIT'].isin(['Thousand'])).any()

False

In [115]:
#check whether any Statistic Label value other than the expected one is present (True = yes)
#the mask is negated BEFORE calling pandas.Series.any: ~(mask).any() would negate
#the result of .any() instead and miss a column containing a mix of labels
(~df_est_immigration['Statistic Label'].isin(['Estimated Immigration (Persons in April)'])).any()

False

In [116]:
#remove the columns that are not important or useful for the analysis
drop_cols_2 = ['Statistic Label', 'UNIT']
df_est_immigration = df_est_immigration.drop(columns=drop_cols_2)

In [117]:
#confirm the shape after the two columns were dropped
df_est_immigration.shape

(504, 4)

In [118]:
#count the missing values in each column
df_est_immigration.isna().sum()

Year            0
Sex             0
Citizenship     0
VALUE          27
dtype: int64

In [119]:
#show only the rows in which at least one column is missing
df_est_immigration.loc[df_est_immigration.isna().any(axis=1)]

Unnamed: 0,Year,Sex,Citizenship,VALUE
2,1996,Both sexes,EU15 to EU27 (accession countries joined post ...,
8,1996,Male,EU15 to EU27 (accession countries joined post ...,
14,1996,Female,EU15 to EU27 (accession countries joined post ...,
20,1997,Both sexes,EU15 to EU27 (accession countries joined post ...,
26,1997,Male,EU15 to EU27 (accession countries joined post ...,
32,1997,Female,EU15 to EU27 (accession countries joined post ...,
38,1998,Both sexes,EU15 to EU27 (accession countries joined post ...,
44,1998,Male,EU15 to EU27 (accession countries joined post ...,
50,1998,Female,EU15 to EU27 (accession countries joined post ...,
56,1999,Both sexes,EU15 to EU27 (accession countries joined post ...,


In [120]:
#discard every row that contains a missing value, then confirm none are left
df_est_immigration = df_est_immigration.dropna(axis=0, how='any')
df_est_immigration.isna().sum()

Year           0
Sex            0
Citizenship    0
VALUE          0
dtype: int64

In [121]:
#summary of the frame after the rows with missing values were removed
df_est_immigration.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 477 entries, 0 to 503
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Year         477 non-null    int64  
 1   Sex          477 non-null    object 
 2   Citizenship  477 non-null    object 
 3   VALUE        477 non-null    float64
dtypes: float64(1), int64(1), object(2)
memory usage: 18.6+ KB


In [122]:
#remove fully duplicated rows, if any; the first occurrence is the one kept (pandas default)
df_est_immigration = df_est_immigration.drop_duplicates()

In [123]:
#rename VALUE column to a more specific name and show the resulting column labels
#rename() returns a new frame that is assigned back to the same name, instead of
#mutating the existing object with inplace=True
df_est_immigration = df_est_immigration.rename(columns={'VALUE': 'Population (t)'})
df_est_immigration.columns

Index(['Year', 'Sex', 'Citizenship', 'Population (t)'], dtype='object')

> ## Estimated emigration dataset

In [124]:
#dimensions of the freshly loaded emigration data as (rows, columns)
df_est_emigration.shape

(324, 6)

In [125]:
#print a summary of the frame: index range, column names, non-null counts and dtypes
df_est_emigration.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 324 entries, 0 to 323
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Statistic Label  324 non-null    object 
 1   Year             324 non-null    int64  
 2   Sex              324 non-null    object 
 3   Citizenship      324 non-null    object 
 4   UNIT             324 non-null    object 
 5   VALUE            324 non-null    float64
dtypes: float64(1), int64(1), object(4)
memory usage: 15.3+ KB


In [126]:
#list the column labels of the emigration dataset
df_est_emigration.columns

Index(['Statistic Label', 'Year', 'Sex', 'Citizenship', 'UNIT', 'VALUE'], dtype='object')

In [127]:
#preview the first five rows of the emigration data
df_est_emigration.head(n=5)

Unnamed: 0,Statistic Label,Year,Sex,Citizenship,UNIT,VALUE
0,Estimated Emigration (Persons in April),2006,Both sexes,All Countries,Thousand,36.0
1,Estimated Emigration (Persons in April),2006,Both sexes,EU14 excl Irl (countries in the EU pre 2004 ex...,Thousand,5.1
2,Estimated Emigration (Persons in April),2006,Both sexes,EU15 to EU27 (accession countries joined post ...,Thousand,7.2
3,Estimated Emigration (Persons in April),2006,Both sexes,United Kingdom,Thousand,2.2
4,Estimated Emigration (Persons in April),2006,Both sexes,Ireland,Thousand,15.3


In [128]:
#preview the final five rows of the emigration data
df_est_emigration.tail(n=5)

Unnamed: 0,Statistic Label,Year,Sex,Citizenship,UNIT,VALUE
319,Estimated Emigration (Persons in April),2023,Female,EU14 excl Irl (countries in the EU pre 2004 ex...,Thousand,4.3
320,Estimated Emigration (Persons in April),2023,Female,EU15 to EU27 (accession countries joined post ...,Thousand,3.1
321,Estimated Emigration (Persons in April),2023,Female,United Kingdom,Thousand,2.7
322,Estimated Emigration (Persons in April),2023,Female,Ireland,Thousand,16.0
323,Estimated Emigration (Persons in April),2023,Female,"All countries excluding Ireland,United Kingdom...",Thousand,6.8


In [129]:
#check whether any UNIT value other than 'Thousand' is present (True = yes)
#the mask is negated BEFORE calling pandas.Series.any: ~(mask).any() would negate
#the result of .any() instead, which only tells whether 'Thousand' never occurs
#and therefore misses a column that mixes 'Thousand' with other units
(~df_est_emigration['UNIT'].isin(['Thousand'])).any()

False

In [130]:
#check whether any Statistic Label value other than the expected one is present (True = yes)
#the mask is negated BEFORE calling pandas.Series.any: ~(mask).any() would negate
#the result of .any() instead and miss a column containing a mix of labels
(~df_est_emigration['Statistic Label'].isin(['Estimated Emigration (Persons in April)'])).any()

False

In [131]:
#remove the columns that are not important or useful for the analysis, then check the new shape
drop_cols_2 = ['Statistic Label', 'UNIT']
df_est_emigration = df_est_emigration.drop(columns=drop_cols_2)
df_est_emigration.shape

(324, 4)

In [132]:
#count the missing values in each column
df_est_emigration.isna().sum()

Year           0
Sex            0
Citizenship    0
VALUE          0
dtype: int64

In [133]:
#remove fully duplicated rows, if any; the first occurrence is the one kept (pandas default)
df_est_emigration = df_est_emigration.drop_duplicates()

In [134]:
#rename VALUE column to a more specific name and show the resulting column labels
#rename() returns a new frame that is assigned back to the same name, instead of
#mutating the existing object with inplace=True
df_est_emigration = df_est_emigration.rename(columns={'VALUE': 'Population (t)'})
df_est_emigration.columns

Index(['Year', 'Sex', 'Citizenship', 'Population (t)'], dtype='object')

# Exploratory Data Analysis
---

In [None]:
# importing required libraries
import pandas as pd
import numpy as np
import seaborn as sns

# Statistical Analysis
---

> ## Descriptive statistics

In [36]:
#get descriptive statistics (count, mean, std, min, quartiles, max) of the numeric columns
df_est_population.describe()

Unnamed: 0,Year,Population (t)
count,6669.0,6669.0
mean,2017.0,75.67046
std,3.741938,281.512172
min,2011.0,1.2
25%,2014.0,12.5
50%,2017.0,20.7
75%,2020.0,42.2
max,2023.0,5281.6


In [88]:
#get descriptive statistics (count, mean, std, min, quartiles, max) of the numeric columns
df_est_immigration.describe()

Unnamed: 0,Year,Population (t)
count,477.0,477.0
mean,2010.037736,18.060168
std,7.956198,21.330315
min,1996.0,1.1
25%,2003.0,5.0
50%,2010.0,9.7
75%,2017.0,22.6
max,2023.0,151.1


In [89]:
#get descriptive statistics (count, mean, std, min, quartiles, max) of the numeric columns
df_est_emigration.describe()

Unnamed: 0,Year,Population (t)
count,324.0,324.0
mean,2014.5,13.737963
std,5.196152,16.095976
min,2006.0,0.6
25%,2010.0,3.6
50%,2014.5,6.7
75%,2019.0,17.8
max,2023.0,83.0


> ## Binomal Distribution

> ## Poisson Distribution

> ## Normal Distribution

# Machine Learning Analysis
---