In [1]:
import pandas as pd
import numpy as np
import os
import statsmodels.api as sm

In [2]:

def process_file(file_path, text_columns_start_row=0):
    """
    Load a tab-delimited text file, coerce its middle columns to numeric, and
    drop middle columns that contain no numeric data.

    The row at position ``text_columns_start_row`` is treated as a description
    row: it is kept verbatim while every later row of the middle columns is
    passed through ``pd.to_numeric(errors='coerce')``. The first two and the
    last two columns are kept unchanged.

    Parameters:
    - file_path: Path to the tab-delimited text file.
    - text_columns_start_row: Positional index of the description row; rows
      before it are ignored (default is 0).

    Returns:
    - recombined_df: DataFrame made of the first two columns, the surviving
      middle columns and the last two columns, with the original row labels.
      Returns None (after printing what is missing) when 'collection_title' or
      'promoted_subjectkey' is absent from the column names or from the
      description row, or when there are no rows to process.
    """
    # Step 1: Load the data into a pandas DataFrame
    df = pd.read_csv(file_path, delimiter='\t', low_memory=False)

    # Only the description row and everything after it takes part in processing.
    rows = df.iloc[text_columns_start_row:]
    if rows.empty:
        print("No rows to process.")
        return None

    # Step 2: Split into leading text columns, middle columns, trailing text columns
    df_first_two_columns = rows.iloc[:, :2]
    df_numeric = rows.iloc[:, 2:-2]
    df_last_two_columns = rows.iloc[:, -2:]

    # Step 3: Convert everything below the description row to numeric
    # (non-numeric values become NaN), then put the description row back on top.
    # The original row labels are kept (no ignore_index) so that the three
    # pieces still line up row-for-row when they are concatenated side by side.
    first_row = df_numeric.iloc[0]
    numeric_body = df_numeric.iloc[1:].apply(pd.to_numeric, errors='coerce')
    df_numeric = pd.concat([pd.DataFrame([first_row]), numeric_body])

    # Step 4: Drop middle columns that are NaN in every row below the description row
    columns_to_drop = [
        col for col in df_numeric.columns
        if df_numeric[col].iloc[1:].isna().all()
    ]
    non_nan_columns = df_numeric.drop(columns=columns_to_drop)

    # Step 5: Recombine the three pieces (aligned on the shared row labels)
    recombined_df = pd.concat(
        [df_first_two_columns, non_nan_columns, df_last_two_columns], axis=1
    )

    # Step 6: Both key fields must appear in the description row and in the
    # column names; otherwise report what is missing and return None.
    required_fields = ('collection_title', 'promoted_subjectkey')
    description_row = recombined_df.iloc[0]
    all_present = True

    for field in required_fields:
        if not any(field in str(value) for value in description_row):
            print(f"{field.capitalize()} is missing from the column descriptions.")
            all_present = False
    for field in required_fields:
        if field not in recombined_df.columns:
            print(f"{field.capitalize()} is missing from the column names.")
            all_present = False

    return recombined_df if all_present else None

def process_all_files(root_folder, file_list, text_columns_start_row=0):
    """
    Run process_file over a list of files and collect the usable results.

    Parameters:
    - root_folder: Folder that contains the text files.
    - file_list: Names of the files (relative to root_folder) to process.
    - text_columns_start_row: Forwarded unchanged to process_file (default is 0).

    Returns:
    - result_dict: Dictionary mapping file name -> processed DataFrame. Files
      for which process_file did not return a DataFrame are left out.
    """
    result_dict = {}

    for file_name in file_list:
        file_path = os.path.join(root_folder, file_name)
        print(f"Processing {file_name}...")

        processed_df = process_file(file_path, text_columns_start_row)

        # Anything that is not a DataFrame means the file was rejected; skip it.
        if not isinstance(processed_df, pd.DataFrame):
            continue
        result_dict[file_name] = processed_df

    return result_dict



In [3]:
#Get a list of all text files from the root folder

def get_text_files(directory):
    """
    Return the names of the regular '.txt' files directly inside `directory`.

    The names are returned in sorted order. os.listdir makes no ordering
    guarantee, and later steps that take "the first file" would otherwise
    depend on the platform's directory order.

    Parameters:
    - directory: Path of the folder to scan (not searched recursively).

    Returns:
    - text_files: Sorted list of file names (not full paths) ending in '.txt'.
    """
    text_files = sorted(
        name for name in os.listdir(directory)
        if name.endswith('.txt') and os.path.isfile(os.path.join(directory, name))
    )
    return text_files

# Folder holding the CATIE text files. Set the CATIE_DATA_DIR environment
# variable to run the notebook on another machine; when it is not set, the
# original local path is used so existing behaviour is unchanged.
root_folder = os.environ.get(
    "CATIE_DATA_DIR",
    r"C:\Users\Senna\Desktop\Iigaya_lab\catie\catie_text_data\catie",
)
file_list = get_text_files(root_folder)
print(file_list)

['aesposys01.txt', 'aims01.txt', 'cata01.txt', 'cgis01.txt', 'clgry01.txt', 'dai01.txt', 'demo01.txt', 'dgsposys01.txt', 'dosecomp01.txt', 'ecg01.txt', 'endphase01.txt', 'endstudy01.txt', 'fint01.txt', 'hair01.txt', 'itaq01.txt', 'keyvars01.txt', 'lab01.txt', 'maccomp01.txt', 'macvlnce01.txt', 'med01.txt', 'meddispn01.txt', 'ndar_aggregate.txt', 'ndar_subject01.txt', 'neurobatt01.txt', 'package_info.txt', 'panss01.txt', 'qol01.txt', 'sae01.txt', 'scid_ph01.txt', 'screen01.txt', 'sf1201.txt', 'surf01.txt', 'surfq01.txt', 'timeto01.txt', 'viol01.txt', 'vitals01.txt']


In [4]:
# Process all files and store the results in a dictionary keyed by file name.
# Only files for which process_file returned a DataFrame end up in the dictionary.
processed_data = process_all_files(root_folder, file_list)
# Example inspection of a single file: processed_data['med01.txt']

Processing aesposys01.txt...
Processing aims01.txt...
Processing cata01.txt...
Processing cgis01.txt...
Processing clgry01.txt...
Processing dai01.txt...
Processing demo01.txt...
Processing dgsposys01.txt...
Processing dosecomp01.txt...
Processing ecg01.txt...
Processing endphase01.txt...
Processing endstudy01.txt...
Processing fint01.txt...
Processing hair01.txt...
Processing itaq01.txt...
Processing keyvars01.txt...
Processing lab01.txt...
Processing maccomp01.txt...
Processing macvlnce01.txt...
Processing med01.txt...
Processing meddispn01.txt...
Processing ndar_aggregate.txt...
Collection_title is missing from the column descriptions.
Promoted_subjectkey is missing from the column descriptions.
Collection_title is missing from the column names.
Promoted_subjectkey is missing from the column names.
Processing ndar_subject01.txt...
Processing neurobatt01.txt...
Processing package_info.txt...
Collection_title is missing from the column descriptions.
Promoted_subjectkey is missing from

In [5]:
# Merge every DataFrame that passed the promoted_subjectkey / collection_title check.
# No 'on' is given, so each merge joins on all columns the two frames share; how='outer'
# keeps every row from both sides and fills NaN where a frame has no value for a column.

# Start from the first DataFrame in the dictionary, then fold in the rest in order
file_keys = list(processed_data)
merged_df = processed_data[file_keys[0]]

for key in file_keys[1:]:
    merged_df = merged_df.merge(processed_data[key], how='outer')

In [6]:
# Inspect 'treat_1', one candidate for the phase 1 medication column
# (the other candidate considered was 'med2g').
# The description row is still part of the data, so its text shows up among the values.
# merged_df.loc['med2g']
merged_df['treat_1'].unique()

array(['Treatment for Phase 1', nan, 5.0, 1.0, 3.0, 2.0, 4.0],
      dtype=object)

In [None]:
# Plan: for each promoted_subjectkey, keep the rows with non-NaN values of treat_1, dcr_tae1, and dcr_eff1.
# Using that promoted_subjectkey as the key, look up the values of 'cocaine', 'opiates', 'pcp', 'meth', and 'thc'.
# Then fit a logistic regression.

In [7]:
# Step 1: Keep the rows where 'treat_1' is present, then collect the distinct
# 'promoted_subjectkey' values that occur in those rows
filtered_df = merged_df.loc[merged_df['treat_1'].notna()]
promoted_keys = filtered_df.loc[:, 'promoted_subjectkey'].unique()

In [9]:
# Step 2: Restrict to rows whose 'promoted_subjectkey' was found in Step 1,
# keeping only the key, the two outcome columns and the five drug columns
selected_df = merged_df.loc[
    merged_df['promoted_subjectkey'].isin(promoted_keys),
    ['promoted_subjectkey', 'dcr_tae1', 'dcr_eff1', 'cocaine', 'opiates', 'pcp', 'meth', 'thc'],
]

In [10]:
# Step 2b: Collapse to one row per 'promoted_subjectkey'
#   - 'dcr_tae1': first non-NA value within the group
#   - drug columns: group maximum, so a single positive row marks the whole group
# Groups that end up without a 'dcr_tae1' value are removed.
aggregated_df = (
    selected_df
    .groupby('promoted_subjectkey')
    .agg({
        'dcr_tae1': 'first',
        **dict.fromkeys(['cocaine', 'opiates', 'pcp', 'meth', 'thc'], 'max'),
    })
    .dropna(subset=['dcr_tae1'])
)



In [13]:
# Coerce outcome and predictors to numeric; anything non-numeric becomes NaN
y = pd.to_numeric(aggregated_df['dcr_tae1'], errors='coerce')
X = aggregated_df.loc[:, ['cocaine', 'opiates', 'pcp', 'meth', 'thc']].apply(
    lambda column: pd.to_numeric(column, errors='coerce')
)

In [15]:
# Put predictors and outcome side by side so that rows with any NaN are
# dropped from both at once, then split them apart again
data = pd.concat([X, y], axis='columns').dropna(how='any')
y = data.loc[:, 'dcr_tae1']
X = data.loc[:, ['cocaine', 'opiates', 'pcp', 'meth', 'thc']]

In [16]:
# Add constant term for intercept.
# Note: X is reassigned here, so re-running this cell starts from the already-augmented X.
X = sm.add_constant(X)

# Fit logistic regression model of dcr_tae1 on the five drug columns
model = sm.Logit(y, X)
result = model.fit()

# Print the summary
print(result.summary())


# Interpretation: none of the drug variables was a significant predictor for dcr_tae1.
# Caveat: the summary below reports "converged: False", so the coefficients and
# p-values come from a fit that did not converge and should be treated as provisional.

         Current function value: 0.421846
         Iterations: 35
                           Logit Regression Results                           
Dep. Variable:               dcr_tae1   No. Observations:                 1007
Model:                          Logit   Df Residuals:                     1001
Method:                           MLE   Df Model:                            5
Date:                Sun, 03 Nov 2024   Pseudo R-squ.:                0.001838
Time:                        20:43:36   Log-Likelihood:                -424.80
converged:                      False   LL-Null:                       -425.58
Covariance Type:            nonrobust   LLR p-value:                    0.9055
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const         -1.7042      0.101    -16.934      0.000      -1.901      -1.507
cocaine    -7.185e-05      0.000     -0.252      0.801      -0.00

  return 1/(1+np.exp(-X))
  return 1/(1+np.exp(-X))


In [18]:
result = model.fit(maxiter=100) # still failed to converge after raising the iteration limit to 100

         Current function value: 0.421846
         Iterations: 100


