In [1]:
import os
from pathlib import Path

import pandas as pd

pd.options.mode.copy_on_write = True
pd.options.future.infer_string = True
pd.options.plotting.backend = "plotly"


current_dir = Path(os.getcwd())

PROJECT = current_dir.parent.resolve()
SRC = PROJECT.parent.resolve()
ROOT = PROJECT.joinpath("..", "..").resolve()

BLD = ROOT.joinpath("bld").resolve()

# Folders inside idos_ppp
DATA = PROJECT.joinpath("data").resolve()
DATA_MGT = PROJECT.joinpath("data_management").resolve()
ANALYSIS = PROJECT.joinpath("analysis").resolve()
FINAL = PROJECT.joinpath("final").resolve()

DOCUMENTS = ROOT.joinpath("documents").resolve()

print(ROOT)

/Users/sergeimolinari/Desktop/IDOS/IDOS-PPP


# TRY THE NEW DATASET FOR POSSIBLE ERRORS

In [2]:
from idos_ppp.config import DATA
from idos_ppp.parameters import sheet_names

data_file = DATA
raw_data_path = DATA / "Data_2007_2010_2013_2016_2019.xlsx"

# Open the dataset file using pandas
try:
    raw_dta = {
        sheet: pd.read_excel(raw_data_path, sheet_name=sheet, header=1)
        for sheet in sheet_names
    }
    print("Data loaded successfully.")
    # Display the first few rows of one of the dataframes
    for sheet, df in raw_dta.items():
        print(f"First few rows of {sheet}:")
        print(df.head())
except FileNotFoundError:
    print(f"Error: The file {raw_data_path} was not found.")
except Exception as e:
    print(f"An error occurred: {e}")

Data loaded successfully.
First few rows of Transformed 2007:
    Unnamed: 0   Unnamed: 1 FFP X1   1-2 UCDP WEF GCI Subpillar: 5. Security  \
0  Afghanistan  Afghanistan       0         0                            XXX   
1      Albania      Albania    0.44         1                       0.691667   
2      Algeria      Algeria    0.43  0.458492                       0.768333   
3       Angola       Angola    0.24  0.899611                           0.86   
4    Argentina    Argentina    0.72         1                           0.69   

  1-4 Political Terror 1-5 FFP P3 Protection  \
0                    0       0.18      0.045   
1                 0.75       0.46   0.668333   
2                 0.25       0.26       0.43   
3                  0.5       0.25        0.5   
4                 0.75       0.63       0.72   

  WEF GCR 2nd pillar: Infrastructure 2-1-2 UN: Telecom (2008)   ...  \
0                                XXX                   0.01576  ...   
1                         

In [None]:
'''
def are_csv_files_identical(file_path1, file_path2):
    """Check if two CSV files are identical.

    Parameters:
    - file_path1: str or Path, path to the first CSV file.
    - file_path2: str or Path, path to the second CSV file.

    Returns:
    - bool: True if the files are identical, False otherwise.
    """
    try:
        # Read the CSV files into DataFrames
        df1 = pd.read_csv(file_path1)
        df2 = pd.read_csv(file_path2)

        # Compare the DataFrames
        return df1.equals(df2)
    except Exception as e:
        print(f"An error occurred: {e}")
        return False


# Example usage
directory = Path(BLD / "data")  # Replace with your directory path
file1 = directory / "clean_data_new.csv"
file2 = directory / "clean_data.csv"

if are_csv_files_identical(file1, file2):
    print("The CSV files are identical.")
else:
    print("The CSV files are not identical.")
'''

In [3]:
clean_data_2 = pd.read_csv(BLD / "data" / "clean_data.csv")
clean_data_2.describe

<bound method NDFrame.describe of     country_alpha3 country_name  year  1_1_ffp_x1  1_2_ucdp  \
0              AFG  Afghanistan  2007        0.00  0.000000   
1              ALB      Albania  2007        0.44  1.000000   
2              DZA      Algeria  2007        0.43  0.458492   
3              AGO       Angola  2007        0.24  0.899611   
4              ARG    Argentina  2007        0.72  1.000000   
..             ...          ...   ...         ...       ...   
725            URY      Uruguay  2019        0.77  1.000000   
726            VNM      Vietnam  2019        0.51  1.000000   
727            YEM  Yemen, Rep.  2019        0.00  0.000000   
728            ZMB       Zambia  2019        0.30  1.000000   
729            ZWE     Zimbabwe  2019        0.27  1.000000   

     1_3_wef_gci_subpillar:_5._security  1_4_political_terror  1_5_ffp_p3  \
0                                   NaN                  0.00        0.18   
1                              0.691667                

In [None]:
clean_data_2.iloc[:, 0]

In [None]:
clean_data_2

In [None]:
from idos_ppp.parameters import middle_east_north_africa_countries

# Check if a particular country is in the "country_name" column
for country in middle_east_north_africa_countries:
    if country in clean_data_2["country_name"].values:
        print(f"{country} is in the Country column.")
    else:
        print(f"{country} is not in the Country column.")

# PLAY WITH LISTS OF COUNTRIES

In [10]:
# Display the DataFrames
output_dir = BLD / "data" / "subsets"
european_union_countries_data = pd.read_pickle(
    output_dir / "european_union_countries_data.pkl",
)
european_union_countries_data.columns

Index(['country_name', 'continent', '1_1_ffp_x1', '1_2_ucdp',
       '1_3_wef_gci_subpillar:_5._security', '1_4_political_terror',
       '1_5_ffp_p3', 'protection', '2_1_1_wef_gcr_2nd_pillar:_infrastructure',
       '2_1_2_un:_telecom',
       '2_2_1_government_expend._on_primary_and_secondary_education,_total_(%_of_gdp)',
       '2_2_2_wef_gcr_indicator_quality_of_primary_education',
       '2_3_1_domestic_general_government_health_expenditure_(%_of_gdp)',
       '2_3_2_out_of_pocket_expenditure_(%_of_total_national_health_care_spending)',
       '2_4_1_public_social_protection_(excl._health)_expenditure_(%_of_gdp)',
       '2_4_2_coverage_of_older_persons_by_sp_benefits',
       '2_5_1_public_expenditure_on_socials_safety_nets_(%_of_gdp)',
       '2_5_2_vulnerable_persons_covered_by_social_assistance',
       '2_6_1_share_of_wage_employment_on_work_age_pop',
       '2_6_2_working_poverty_head_count_rate_(percentage_of_persons_living_in_poverty_in_spite_of_being_employed)',
       '2

In [11]:
merged_data = pd.read_pickle(BLD / "data" / "merged_data.pkl")
# print(merged_data.head())
print(merged_data.columns)
print(len(merged_data.columns))
# print(merged_data.describe())
# print(merged_data.dtypes)
# print(merged_data.shape)
# print(merged_data.isnull().sum()) # To see if there are any missing values in the dataset.
# for column in merged_data.select_dtypes(include=['object']).columns:
#     print(f"Unique values in {column}: {merged_data[column].unique()}") # If you have categorical columns, you might want to see the unique values.
# float_columns = merged_data.select_dtypes(include=['float64'])
# correlation_matrix = float_columns.corr()
# print(correlation_matrix)
# print(merged_data.info())

Index(['country_name', 'continent', '1_1_ffp_x1', '1_2_ucdp',
       '1_3_wef_gci_subpillar:_5._security', '1_4_political_terror',
       '1_5_ffp_p3', 'protection', '2_1_1_wef_gcr_2nd_pillar:_infrastructure',
       '2_1_2_un:_telecom',
       '2_2_1_government_expend._on_primary_and_secondary_education,_total_(%_of_gdp)',
       '2_2_2_wef_gcr_indicator_quality_of_primary_education',
       '2_3_1_domestic_general_government_health_expenditure_(%_of_gdp)',
       '2_3_2_out_of_pocket_expenditure_(%_of_total_national_health_care_spending)',
       '2_4_1_public_social_protection_(excl._health)_expenditure_(%_of_gdp)',
       '2_4_2_coverage_of_older_persons_by_sp_benefits',
       '2_5_1_public_expenditure_on_socials_safety_nets_(%_of_gdp)',
       '2_5_2_vulnerable_persons_covered_by_social_assistance',
       '2_6_1_share_of_wage_employment_on_work_age_pop',
       '2_6_2_working_poverty_head_count_rate_(percentage_of_persons_living_in_poverty_in_spite_of_being_employed)',
       '2

In [None]:
'''
def are_feather_files_identical(file_path1, file_path2):
    """Check if two CSV files are identical.

    Parameters:
    - file_path1: str or Path, path to the first CSV file.
    - file_path2: str or Path, path to the second CSV file.

    Returns:
    - bool: True if the files are identical, False otherwise.
    """
    try:
        # Read the CSV files into DataFrames
        df1 = pd.read_feather(file_path1)
        df2 = pd.read_feather(file_path2)

        # Compare the DataFrames
        return df1.equals(df2)
    except Exception as e:
        print(f"An error occurred: {e}")
        return False


# Example usage
directory = Path(BLD / "analysis")  # Replace with your directory path
file1 = directory / "yearly_correlations_continent.arrow"
file2 = directory / "yearly_continent_correlations.arrow"

if are_feather_files_identical(file1, file2):
    print("The arrow files are identical.")
else:
    print("The arrow files are not identical.")
'''

In [None]:
prova3_stat_yrl_data = pd.read_pickle(BLD / "analysis" / "statistical_analysis" / "middle_east_north_africa_countries_yearly_statistics.pkl")
prova3_stat_yrl_data

MultiIndex([(         'year',       ''),
            (   'protection',   'mean'),
            (   'protection', 'median'),
            (   'protection',    'std'),
            (   'protection',    'min'),
            (   'protection',    'max'),
            (    'provision',   'mean'),
            (    'provision', 'median'),
            (    'provision',    'std'),
            (    'provision',    'min'),
            (    'provision',    'max'),
            ('participation',   'mean'),
            ('participation', 'median'),
            ('participation',    'std'),
            ('participation',    'min'),
            ('participation',    'max'),
            (   'protection',  'range'),
            (    'provision',  'range'),
            ('participation',  'range')],
           )