In [None]:
import os
from pathlib import Path

import pandas as pd

# Session-wide pandas configuration.
pd.set_option("mode.copy_on_write", True)  # opt in to copy-on-write behaviour
pd.set_option("future.infer_string", True)  # opt in to the future string inference
pd.set_option("plotting.backend", "plotly")  # .plot() calls go through plotly


# Directory layout, derived from the current working directory: PROJECT is the
# folder above it, SRC the folder above PROJECT, and ROOT the folder above SRC.
current_dir = Path.cwd()

PROJECT = current_dir.parent.resolve()
# PROJECT is already absolute and symlink-free, so plain .parent steps are enough.
SRC = PROJECT.parent
ROOT = SRC.parent

BLD = (ROOT / "bld").resolve()

# Folders inside idos_ppp
DATA, DATA_MGT, ANALYSIS, FINAL = (
    (PROJECT / folder).resolve()
    for folder in ("data", "data_management", "analysis", "final")
)

DOCUMENTS = (ROOT / "documents").resolve()

print(ROOT)

# TRY THE NEW DATASET FOR POSSIBLE ERRORS

In [None]:
from idos_ppp.config import DATA
from idos_ppp.parameters import sheet_names

data_file = DATA
raw_data_path = DATA / "Data_2007_2010_2013_2016_2019.xlsx"

# Load every sheet listed in `sheet_names` from the raw workbook into a dict of
# DataFrames keyed by sheet name. Problems are reported with a message instead of
# a traceback so the rest of the notebook can still be run.
try:
    # Passing the whole list makes pandas open and parse the workbook once,
    # rather than re-reading the file for every single sheet.
    raw_dta = pd.read_excel(raw_data_path, sheet_name=list(sheet_names), header=1)
    print("Data loaded successfully.")
    # Display the first few rows of each of the dataframes
    for sheet, df in raw_dta.items():
        print(f"First few rows of {sheet}:")
        print(df.head())
except FileNotFoundError:
    print(f"Error: The file {raw_data_path} was not found.")
except Exception as e:
    print(f"An error occurred: {e}")

In [None]:
def are_csv_files_identical(file_path1, file_path2):
    """Tell whether two CSV files parse into equal DataFrames.

    Both files are read with ``pd.read_csv`` (default settings) and the
    resulting frames are compared with ``DataFrame.equals``.

    Parameters:
    - file_path1: str or Path, location of the first CSV file.
    - file_path2: str or Path, location of the second CSV file.

    Returns:
    - bool: True when the two frames are equal. False when they differ, and
      also when reading or comparing fails (the error is printed, not raised).
    """
    try:
        first, second = [pd.read_csv(path) for path in (file_path1, file_path2)]
        identical = first.equals(second)
    except Exception as e:
        print(f"An error occurred: {e}")
        identical = False
    return identical


# Compare the two clean-data CSV exports found in the build folder
directory = BLD / "data"
file1 = directory / "clean_data_new.csv"
file2 = directory / "clean_data.csv"

csv_files_match = are_csv_files_identical(file1, file2)
print(
    "The CSV files are identical."
    if csv_files_match
    else "The CSV files are not identical."
)

In [None]:
clean_data_2 = pd.read_csv(BLD / "data" / "clean_data.csv")
# describe must be *called*: the bare attribute only shows the bound method's
# repr, not the summary statistics of the columns.
clean_data_2.describe()

In [None]:
# Inspect the first column by position.
# NOTE(review): presumably the index that was written out with the CSV — confirm.
clean_data_2.iloc[:, 0]

In [None]:
# Display the cleaned dataset that was read from clean_data.csv.
clean_data_2

In [None]:
from idos_ppp.parameters import middle_east_north_africa_countries

# Report, for every country in the MENA list, whether it occurs in the
# "country_name" column of the cleaned data.
clean_country_names = clean_data_2["country_name"].values
for country in middle_east_north_africa_countries:
    country_found = country in clean_country_names
    print(
        f"{country} is in the Country column."
        if country_found
        else f"{country} is not in the Country column."
    )

# PLAY WITH LISTS OF COUNTRIES

In [None]:
# Country names making up the European Union group (27 entries, alphabetical).
# The names are matched against the "country_name" column of the merged data
# further down, so the spelling has to follow that column
# (e.g. "Czech Republic", "Slovak Republic").
european_union = [
    "Austria",
    "Belgium",
    "Bulgaria",
    "Croatia",
    "Cyprus",
    "Czech Republic",
    "Denmark",
    "Estonia",
    "Finland",
    "France",
    "Germany",
    "Greece",
    "Hungary",
    "Ireland",
    "Italy",
    "Latvia",
    "Lithuania",
    "Luxembourg",
    "Malta",
    "Netherlands",
    "Poland",
    "Portugal",
    "Romania",
    "Slovak Republic",
    "Slovenia",
    "Spain",
    "Sweden",
]
# Show the number of entries as the cell output.
len(european_union)

In [None]:
# Conflict / post-conflict (from Heydemann 2025)
conflict_countries = ["Libya", "Syrian Arab Republic", "Yemen, Rep."]

# Iraq and Lebanon plus the conflict countries listed above
conflict_and_postconflict_countries = ["Iraq", "Lebanon", *conflict_countries]

# GCC countries (and repressive ones) (from Heydemann 2025)
gcc_high_income_countries = [
    "Bahrain",
    "Kuwait",
    "Oman",
    "Qatar",
    "Saudi Arabia",
    "United Arab Emirates",
]

# Tunisia post-2020
repressive_countries = ["Egypt, Arab Rep.", "Jordan", "Morocco", "Tunisia"]

# Both groups above combined, GCC members first
gcc_and_repressive_countries = [*gcc_high_income_countries, *repressive_countries]

# Lookup of every group by the name of its variable
country_lists = dict(
    conflict_countries=conflict_countries,
    conflict_and_postconflict_countries=conflict_and_postconflict_countries,
    gcc_high_income_countries=gcc_high_income_countries,
    repressive_countries=repressive_countries,
    gcc_and_repressive_countries=gcc_and_repressive_countries,
)
# For each country group, count how many of its members occur in the
# "country_name" column, then print the counts in the order of `country_lists`.
names_in_clean_data = clean_data_2["country_name"].values
members_found_per_list = [
    sum(1 for country in country_list if country in names_in_clean_data)
    for country_list in country_lists.values()
]
print(*members_found_per_list, sep=", ")

In [None]:
## Check Merged Dataset

# Locations of the pickled merged data and of its CSV copy
merged_data_dir = BLD / "data"
pkl_file_path = merged_data_dir / "merged_data.pkl"
csv_file_path = merged_data_dir / "merged_data.csv"

# Read the pickle and write the same frame next to it as CSV
df = pd.read_pickle(pkl_file_path)
df.to_csv(csv_file_path)

# Print every entry of the country column
print(list(df["country_name"]))


# Report, country by country, whether each EU member occurs in the merged data
for country in european_union:
    country_in_merged = country in df["country_name"].values
    print(
        f"{country} is in the Country column."
        if country_in_merged
        else f"{country} is not in the Country column."
    )

In [None]:
# Load the pickled European Union subset and show it as the cell output
output_dir = BLD / "data" / "subsets"
eu_subset_file = output_dir / "european_union_countries_data.pkl"
european_union_countries_data = pd.read_pickle(eu_subset_file)
european_union_countries_data

In [None]:
merged_data = pd.read_pickle(BLD / "data" / "merged_data.pkl")

# Column overview: the labels first, then how many there are.
merged_columns = merged_data.columns
print(merged_columns)
print(len(merged_columns))

# Further checks that can be run ad hoc on `merged_data`:
#   .head(), .describe(), .dtypes, .shape, .info()
#   .isnull().sum()                                 -> missing values per column
#   .select_dtypes(include=["object"]) + .unique()  -> values of categorical columns
#   .select_dtypes(include=["float64"]).corr()      -> correlation matrix of float columns

In [None]:
from idos_ppp.analysis.idos_dataanalysis import (
    calculate_yearly_prot_prov_continent_correlations,
    calculate_yearly_prot_prov_correlations,
)

# Compute both yearly correlation tables for the European Union subset, store
# them as Feather files and print them. Problems are reported with a message
# instead of a traceback so the rest of the notebook can still be run.
input_dir = BLD / "data" / "subsets"
input_file = input_dir / "european_union_countries_data.pkl"
output_dir_1 = BLD / "analysis" / "yearly_prot_prov_correlations.arrow"
output_dir_2 = BLD / "analysis" / "yearly_prot_prov_correlations_continent.arrow"


try:
    european_union_countries_data = pd.read_pickle(input_file)
    yearly_correlations_df = calculate_yearly_prot_prov_correlations(
        european_union_countries_data
    )
    yearly_correlations_continent_df = (
        calculate_yearly_prot_prov_continent_correlations(european_union_countries_data)
    )
    yearly_correlations_df.to_feather(output_dir_1)
    yearly_correlations_continent_df.to_feather(output_dir_2)
    print("Data loaded successfully.")
    # Display the resulting dataframes
    print(yearly_correlations_df, yearly_correlations_continent_df)
except FileNotFoundError as e:
    # Name the path that was actually missing (the pickle, or an output location
    # whose folder does not exist) rather than the input directory.
    print(f"Error: The file {e.filename or input_file} was not found.")
except Exception as e:
    print(f"An error occurred: {e}")

In [None]:
def are_feather_files_identical(file_path1, file_path2):
    """Tell whether two Feather files load into equal DataFrames.

    Both files are read with ``pd.read_feather`` and the resulting frames are
    compared with ``DataFrame.equals``.

    Parameters:
    - file_path1: str or Path, location of the first Feather file.
    - file_path2: str or Path, location of the second Feather file.

    Returns:
    - bool: True when the two frames are equal. False when they differ, and
      also when reading or comparing fails (the error is printed, not raised).
    """
    try:
        first, second = [pd.read_feather(path) for path in (file_path1, file_path2)]
        identical = first.equals(second)
    except Exception as e:
        print(f"An error occurred: {e}")
        identical = False
    return identical


# Compare two continent-correlation Feather files found in the analysis folder
directory = BLD / "analysis"
file1 = directory / "yearly_correlations_continent.arrow"
file2 = directory / "yearly_continent_correlations.arrow"

arrow_files_match = are_feather_files_identical(file1, file2)
print(
    "The arrow files are identical."
    if arrow_files_match
    else "The arrow files are not identical."
)