In [None]:
import os
from pathlib import Path

import pandas as pd

# Notebook-wide pandas settings: copy-on-write, string dtype inference,
# and plotly as the backend behind `DataFrame.plot`.
pd.set_option("mode.copy_on_write", True)
pd.set_option("future.infer_string", True)
pd.set_option("plotting.backend", "plotly")


# Every location below is derived from the directory the kernel runs in.
current_dir = Path.cwd()

# Walk upwards: parent of the working directory, its parent, and two levels
# above PROJECT.
PROJECT = current_dir.parent.resolve()
SRC = PROJECT.parent.resolve()
ROOT = (PROJECT / ".." / "..").resolve()

# Build folder directly under ROOT.
BLD = (ROOT / "bld").resolve()

# Folders inside idos_ppp
DATA = (PROJECT / "data").resolve()
DATA_MGT = (PROJECT / "data_management").resolve()
ANALYSIS = (PROJECT / "analysis").resolve()
FINAL = (PROJECT / "final").resolve()

# Documents folder directly under ROOT.
DOCUMENTS = (ROOT / "documents").resolve()

# Echo the resolved root so a wrong working directory is noticed immediately.
print(ROOT)

# THE ORIGINAL DATASET

In [None]:
from idos_ppp.config import DATA
from idos_ppp.parameters import sheet_names

# Alias of DATA; not used further in this cell.
data_file = DATA
raw_data_path = DATA / "Data_2007_2010_2013_2016_2019.xlsx"

# Read every sheet listed in `sheet_names` into its own DataFrame, keyed by
# sheet name. `header=1` takes the column labels from the second row.
#
# Only the missing-file case gets a friendly hint, and it is re-raised so the
# cell visibly fails. Any other error propagates with its full traceback
# instead of being reduced to a one-line message that leaves `raw_dta`
# undefined.
try:
    raw_dta = {
        sheet: pd.read_excel(raw_data_path, sheet_name=sheet, header=1)
        for sheet in sheet_names
    }
except FileNotFoundError:
    print(f"Error: The file {raw_data_path} was not found.")
    raise

print("Data loaded successfully.")
# Preview the first rows of each loaded sheet.
for sheet, df in raw_dta.items():
    print(f"First few rows of {sheet}:")
    print(df.head())

In [None]:
# Disabled code: everything below sits inside a bare string literal, so none
# of it is executed. If enabled, the text would define
# `are_csv_files_identical` (reads two CSV files with `pd.read_csv` and
# compares them with `DataFrame.equals`, returning False on any exception)
# and apply it to `clean_data_new.csv` and `clean_data.csv` in `BLD / "data"`.
# NOTE(review): as the last expression of the cell, this string is the value
# the cell evaluates to.
'''
def are_csv_files_identical(file_path1, file_path2):
    """Check if two CSV files are identical.

    Parameters:
    - file_path1: str or Path, path to the first CSV file.
    - file_path2: str or Path, path to the second CSV file.

    Returns:
    - bool: True if the files are identical, False otherwise.
    """
    try:
        # Read the CSV files into DataFrames
        df1 = pd.read_csv(file_path1)
        df2 = pd.read_csv(file_path2)

        # Compare the DataFrames
        return df1.equals(df2)
    except Exception as e:
        print(f"An error occurred: {e}")
        return False


# Example usage
directory = Path(BLD / "data")  # Replace with your directory path
file1 = directory / "clean_data_new.csv"
file2 = directory / "clean_data.csv"

if are_csv_files_identical(file1, file2):
    print("The CSV files are identical.")
else:
    print("The CSV files are not identical.")
'''

# THE CLEANED DATASET

In [None]:
# Load the cleaned dataset from the build folder and show summary statistics.
# `describe` has to be called: the bare attribute only displays the
# bound-method repr, not the statistics table.
clean_data_2 = pd.read_csv(BLD / "data" / "clean_data.csv")
clean_data_2.describe()

In [None]:
# Select the first column by position (all rows, column index 0) and show it.
clean_data_2.iloc[:, 0]

In [None]:
# Show the cleaned DataFrame as the cell output.
clean_data_2

# SUBSETS, ANALYSIS AND PLOTTING

In [None]:
# Load the European Union subset from the build folder and list its column
# labels (the cell shows the columns, not the frame itself).
output_dir = BLD.joinpath("data", "subsets")
european_union_countries_data = pd.read_pickle(
    output_dir.joinpath("european_union_countries_data.pkl"),
)
european_union_countries_data.columns

In [None]:
# Load the merged dataset and report its column labels and how many there are.
merged_data = pd.read_pickle(BLD.joinpath("data", "merged_data.pkl"))
print(merged_data.columns)
print(len(merged_data.columns))

# Optional diagnostics, kept disabled -- uncomment the ones you need:
#   print(merged_data.head())
#   print(merged_data.describe())
#   print(merged_data.dtypes)
#   print(merged_data.shape)
#   print(merged_data.info())
#   print(merged_data.isnull().sum())   # missing values per column
#
#   # unique values of the object-typed (categorical) columns
#   for column in merged_data.select_dtypes(include=['object']).columns:
#       print(f"Unique values in {column}: {merged_data[column].unique()}")
#
#   # correlation matrix of the float64 columns
#   float_columns = merged_data.select_dtypes(include=['float64'])
#   correlation_matrix = float_columns.corr()
#   print(correlation_matrix)

In [None]:
# Disabled code: everything below sits inside a bare string literal, so none
# of it is executed. If enabled, the text would define
# `are_feather_files_identical` (reads two files with `pd.read_feather` and
# compares them with `DataFrame.equals`, returning False on any exception)
# and apply it to two `.arrow` files in `BLD / "analysis"`.
# NOTE(review): the docstring and comments inside the string still say "CSV"
# although the helper calls `pd.read_feather`.
# NOTE(review): as the last expression of the cell, this string is the value
# the cell evaluates to.
'''
def are_feather_files_identical(file_path1, file_path2):
    """Check if two CSV files are identical.

    Parameters:
    - file_path1: str or Path, path to the first CSV file.
    - file_path2: str or Path, path to the second CSV file.

    Returns:
    - bool: True if the files are identical, False otherwise.
    """
    try:
        # Read the CSV files into DataFrames
        df1 = pd.read_feather(file_path1)
        df2 = pd.read_feather(file_path2)

        # Compare the DataFrames
        return df1.equals(df2)
    except Exception as e:
        print(f"An error occurred: {e}")
        return False


# Example usage
directory = Path(BLD / "analysis")  # Replace with your directory path
file1 = directory / "yearly_correlations_continent.arrow"
file2 = directory / "yearly_continent_correlations.arrow"

if are_feather_files_identical(file1, file2):
    print("The arrow files are identical.")
else:
    print("The arrow files are not identical.")
'''

In [None]:
# Load the pickled yearly continent correlations from the analysis build
# folder and show the result.
prova3_stat_yrl_data = pd.read_pickle(
    BLD.joinpath(
        "analysis",
        "prot_prov_correlations",
        "yearly_prot_prov_continent_correlations.pkl",
    ),
)
prova3_stat_yrl_data

In [None]:
from idos_ppp.parameters import country_lists

# Folder holding the pickled yearly statistics.
statistics_dir = BLD.joinpath("analysis", "statistical_analysis")

# Map each country-list name to its yearly-statistics pickle ...
inputs_stat_plots = {
    list_name: statistics_dir / f"{list_name}_yearly_statistics.pkl"
    for list_name in country_lists
}
# ... and add the entry for the merged dataframe.
inputs_stat_plots["merged_dataframe_countries"] = (
    statistics_dir / "merged_dataframe_yearly_statistics.pkl"
)

inputs_stat_plots

In [None]:
# NOTE(review): `country_lists` is imported here but not used in this cell.
from idos_ppp.parameters import country_lists

# Load the conflict / post-conflict subset from the build folder and show it.
prova3_data = pd.read_pickle(
    BLD.joinpath("data", "subsets", "conflict_and_postconflict_countries_data.pkl"),
)
prova3_data

In [None]:
# Load the conflict / post-conflict subset from the build folder and show it.
conflict_and_postconflict_countries_data = pd.read_pickle(
    BLD.joinpath("data", "subsets", "conflict_and_postconflict_countries_data.pkl"),
)
conflict_and_postconflict_countries_data