In [4]:
import pandas as pd
from scipy.stats.mstats import winsorize 
from pathlib import Path

def winsorize_series(series, limits=(0.01, 0.01)):
    """Return a winsorized copy of *series*, leaving missing values untouched.

    ``scipy.stats.mstats.winsorize`` with its default ``nan_policy`` sorts
    NaNs into the upper tail, where they are either overwritten with a finite
    value or consume the upper quota so that real outliers are not clipped.
    Winsorizing only the non-missing values avoids both problems and gives the
    same result as a direct call when the series contains no NaNs.

    Args:
        series: pandas Series of numeric values (may contain NaN).
        limits: ``(lower, upper)`` fractions to clip on each tail, passed
            straight through to ``winsorize``.

    Returns:
        A new Series with the same index as *series*; the input is not
        modified.
    """
    result = series.copy()
    present = result.notna()
    # winsorize raises on an empty array, so bail out when nothing is present.
    if not present.any():
        return result
    clipped = winsorize(result[present].to_numpy(), limits=limits)
    # winsorize returns a masked array; .data is the underlying ndarray.
    result.loc[present] = clipped.data
    return result


# The guard is true when run as a notebook cell or a script, so all names
# below are still defined at module level exactly as before.
if __name__ == "__main__":
    # Get the base directory = parent of the current working directory
    BASE_DIR = Path.cwd().parent

    # build data path
    data_path = BASE_DIR / "1. data" / "processed"

    # read both files
    df_prowess = pd.read_csv(data_path / "Prowess_IT_Data_Processed.csv")
    df_acetp = pd.read_csv(data_path / "ACETP_IT_Data_Processed.csv")

    # add source column so each row's origin survives the concat
    df_prowess["source"] = "prowess"
    df_acetp["source"] = "acetp"

    # keep only columns present in both frames, in Prowess column order
    # ('source' is included because it was just added to both)
    common_cols = [col for col in df_prowess.columns if col in df_acetp.columns]

    # keep ACETP rows whose cin does not appear in Prowess (Prowess wins on overlap)
    df_acetp_unique = df_acetp[~df_acetp["cin"].isin(df_prowess["cin"])]

    # combine
    df_final = pd.concat(
        [df_prowess[common_cols], df_acetp_unique[common_cols]],
        ignore_index=True,
    )

    print("Prowess rows:", df_prowess.shape[0])
    print("ACETP unique rows added:", df_acetp_unique.shape[0])
    print("Final shape:", df_final.shape)

    # clip the bottom and top 1% of 'pli'; missing values stay missing
    df_final["pli"] = winsorize_series(df_final["pli"], limits=(0.01, 0.01))

    # save to CSV
    df_final.to_csv(data_path / "Combined_IT_Data_Final.csv", index=False)


Prowess rows: 5787
ACETP unique rows added: 1459
Final shape: (7246, 15)
