## By sector feature selection:

In [45]:
import os
import pandas as pd

# Specify the path to your data folder
data_folder = r'C:\Users\matth\Risk-Lab-\data'

# Initialize an empty list to hold each company's transposed DataFrame
data_frames = []

# Traverse through all subfolders and files in the data folder
for root, _, files in os.walk(data_folder):
    for file in files:
        if file.endswith('.csv'):
            # Generate the full path for each CSV file
            file_path = os.path.join(root, file)
            
            # Read the CSV file into a DataFrame (assuming index is the date)
            df = pd.read_csv(file_path, index_col=0)
            
            # Transpose the DataFrame
            df_transposed = df.T
            
            # Prefix columns with the first four letters of the file name
            prefix = file[:4].upper()  # Uppercase prefix for consistency
            df_transposed.columns = [f"{prefix}_{col}" for col in df_transposed.columns]
            
            # Reset index and rename to include a unique Date column
            df_transposed.reset_index(inplace=True)
            df_transposed = df_transposed.rename(columns={'index': f'Date_{prefix}'})
            
            # Drop any duplicate columns in this individual DataFrame
            df_transposed = df_transposed.loc[:, ~df_transposed.columns.duplicated()]
            
            # Append the cleaned DataFrame to the list
            data_frames.append(df_transposed)

# Concatenate all DataFrames along the columns
combined_df = pd.concat(data_frames, axis=1)

# Consolidate date columns into a single 'Date' column
date_columns = [col for col in combined_df.columns if col.startswith('Date_')]
combined_df['Date'] = pd.to_datetime(combined_df[date_columns].bfill(axis=1).iloc[:, 0], errors='coerce')

# Drop the individual 'Date_' columns after creating a main 'Date' column
combined_df = combined_df.drop(columns=date_columns).set_index('Date').sort_index()

# Display the final combined DataFrame
print(combined_df)
# combined_df.to_csv('path_to_save_combined_data.csv')  # Uncomment to save to a CSV file


                                               QUAT_nan QUAT_Total Assets  \
Date                                                                        
NaT   "=BDH($A$1, $B8, "19900101","","sort=a","dates...               NaN   
NaT                                                 NaN               NaN   
NaT                                                 NaN               NaN   
NaT                                                 NaN               NaN   
NaT                                                 NaN               NaN   
...                                                 ...               ...   
NaT                                                 NaN               NaN   
NaT                                                 NaN               NaN   
NaT                                                 NaN               NaN   
NaT                                                 NaN               NaN   
NaT                                                 NaN               NaN   

In [46]:
combined_df.head()

Unnamed: 0_level_0,QUAT_nan,QUAT_Total Assets,"QUAT_ + Cash, Cash Equivalents & STI",QUAT_ + Cash & Cash Equivalents,QUAT_ + ST Investments,QUAT_ + Accounts & Notes Receiv,"QUAT_ + Accounts Receivable, Net","QUAT_ + Notes Receivable, Net",QUAT_ + Unbilled Revenues,QUAT_ + Inventories,...,SO_Y_Nominations & Governance Oversight,SO_Y_Size of Nomination Committee,SO_Y_Num of Independent Directors on Nomination Cmte,SO_Y_Number of Nomination Committee Meetings,SO_Y_Nomination Committee Meeting Attendance Percentage,SO_Y_Sustainability Governance,SO_Y_Verification Type,SO_Y_Employee CSR Training,SO_Y_Tenure,SO_Y_Board Duration (Years)
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
NaT,"""=BDH($A$1, $B8, ""19900101"","""",""sort=a"",""dates...",,CASH_CASH_EQTY_STI_DETAILED,BS_CASH_NEAR_CASH_ITEM,BS_MKT_SEC_OTHER_ST_INVEST,BS_ACCT_NOTE_RCV,BS_ACCTS_REC_EXCL_NOTES_REC,NOTES_RECEIVABLE,BS_UNBILLED_REVENUES,BS_INVENTORIES,...,,SIZE_OF_NOMINATION_COMMITTEE,NUM_IND_DIR_ON_NOM_CMTE,NUM_OF_NOMINATION_CMTE_MTG,NOMINATION_CMTE_MTG_ATTEND_PCT,,VERIFICATION_TYPE,EMPLOYEE_CSR_TRAINING,,BOARD_DURATION
NaT,,,,,,,,,,,...,#N/A Mandatory parameter [FIELDS] cannot be empty,,,,,#N/A Mandatory parameter [FIELDS] cannot be empty,,,#N/A Mandatory parameter [FIELDS] cannot be empty,
NaT,,,,,,,,,,,...,,,,,,,,,,
NaT,,,,,,,,,,,...,,,,,,,,,,
NaT,,,,,,,,,,,...,,,,,,,,,,


In [44]:
import os
import pandas as pd

# Specify the path to your data folder
data_folder = r'C:\Users\matth\Risk-Lab-\data'

# Initialize an empty list to hold each company's transposed DataFrame
data_frames = []

# Traverse through all subfolders and files in the data folder
for root, _, files in os.walk(data_folder):
    for file in files:
        if file.endswith('.csv'):
            # Generate the full path for each CSV file
            file_path = os.path.join(root, file)
            
            # Read the CSV file into a DataFrame
            df = pd.read_csv(file_path)
            
            # Remove the first five rows
            df_cleaned = df.iloc[5:]
            
            # Extract the date row (row 6) and drop the first value and the first two columns
            date_row = df_cleaned.iloc[0, 1:].values  # Remove the first value
            df_cleaned = df_cleaned.iloc[:, 2:]  # Remove the first two columns
            
            # Transpose the DataFrame
            df_transposed = df_cleaned.T
            
            # Ensure that date_row matches the length of the transposed DataFrame's index
            if len(date_row) != df_transposed.shape[0]:
                print(f"Warning: Length of date_row ({len(date_row)}) does not match transposed DataFrame index ({df_transposed.shape[0]}) for file {file}. Adjusting date_row to match.")
                date_row = date_row[:df_transposed.shape[0]]  # Trim or handle as needed

            # Set the date row as the index
            df_transposed.index = date_row
            
            # Prefix columns with the first four letters of the file name
            prefix = file[:4].upper()  # Uppercase prefix for consistency
            df_transposed.columns = [f"{prefix}_{col}" for col in df_transposed.columns]
            
            # Drop any duplicate indices in the transposed DataFrame
            df_transposed = df_transposed[~df_transposed.index.duplicated(keep='first')]
            
            # Append the cleaned DataFrame to the list
            data_frames.append(df_transposed)

# Concatenate all DataFrames along the columns
if data_frames:  # Check if there are DataFrames to concatenate
    combined_df = pd.concat(data_frames, axis=1)

    # Determine the largest index value
    largest_index_value = combined_df.index.max()

    # Create a new row called 'index' that matches the largest index value
    index_row = pd.Series([largest_index_value] * combined_df.shape[1], index=combined_df.columns, name='index')
    
    # Append the new index row to the DataFrame
    combined_df = combined_df.append(index_row)

    # Display the draft combined DataFrame
    print(combined_df)

    # Save the combined DataFrame to a CSV file if needed
    # combined_df.to_csv('path_to_save_combined_data.csv')
else:
    print("No DataFrames to combine.")










TypeError: '>=' not supported between instances of 'str' and 'float'

In [39]:
combined_df.head()

Unnamed: 0,Date_1,Date_2,Date_3,Date_4,Date_5,Date_6,Date_7,Date_8,Date_9,Date_10,...,SO_Y_294,SO_Y_295,SO_Y_296,SO_Y_297,SO_Y_298,SO_Y_299,SO_Y_300,SO_Y_301,SO_Y_302,SO_Y_303
0,,,,,,,,,,,...,,,,#N/A Mandatory parameter [FIELDS] cannot be empty,#N/A Mandatory parameter [FIELDS] cannot be empty,,,#N/A Mandatory parameter [FIELDS] cannot be empty,#N/A Mandatory parameter [FIELDS] cannot be empty,
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,1031.500977,1.0,1256.516,1.0,1384.495,1.0,1578.456,1.0,1919.093,1.0,...,,,,,,,,,,
