In [1]:
import pandas as pd
import numpy as np
import os
import statsmodels.api as sm

In [2]:

def process_file(file_path, text_columns_start_row=0):
    """
    Loads a tab-delimited text file and keeps its text columns plus the usable numeric columns.

    The row at position `text_columns_start_row` is treated as the description row and is
    kept as the first row of the result. In every column except the first two and the last
    two, the rows below the description row are converted with `pd.to_numeric`
    (non-numeric values become NaN), and columns left without any non-NaN value are dropped.

    Parameters:
    - file_path: Path to the tab-delimited text file.
    - text_columns_start_row: Positional index of the description row; rows before it are
      discarded (default is 0).

    Returns:
    - recombined_df: DataFrame made of the first two columns, the retained numeric columns
      and the last two columns, indexed from 0. None is returned (after printing the reason)
      when no rows remain, or when 'collection_title' / 'promoted_subjectkey' is missing
      from the description row or from the column names.
    """
    # Step 1: Load the data and discard the rows before the description row.
    # Resetting the index keeps every slice below on the same 0..n-1 labels, so the
    # final column-wise concat lines rows up even when text_columns_start_row > 0.
    df = pd.read_csv(file_path, delimiter='\t', low_memory=False)
    df = df.iloc[text_columns_start_row:].reset_index(drop=True)

    if len(df) == 0:
        print("No rows found at or after the description row position.")
        return None

    # Step 2: Split into the leading text columns, the numeric middle and the trailing text columns
    df_first_two_columns = df.iloc[:, :2]
    df_middle = df.iloc[:, 2:-2]
    df_last_two_columns = df.iloc[:, -2:]

    # Step 3: Coerce everything below the description row to numeric, then put the
    # description row back on top so it stays the first row of the result.
    first_row = df_middle.iloc[0]
    numeric_rows = df_middle.iloc[1:].apply(pd.to_numeric, errors='coerce')
    df_numeric = pd.concat([pd.DataFrame([first_row]), numeric_rows], ignore_index=True)

    # Step 4: Drop the columns whose data rows (everything but the description) are all NaN
    data_rows = df_numeric.iloc[1:]
    columns_to_drop = [col for col in df_numeric.columns if data_rows[col].isna().all()]
    non_nan_columns = df_numeric.drop(columns=columns_to_drop)

    # Step 5: Recombine the first two text columns, the numeric subset and the last two text columns
    recombined_df = pd.concat([df_first_two_columns, non_nan_columns, df_last_two_columns], axis=1)

    # Step 6: Both required names must appear in the description row and in the
    # column names; print one warning per missing item and reject the file otherwise.
    required_names = ('collection_title', 'promoted_subjectkey')
    description_text = recombined_df.iloc[0].astype(str)
    is_valid = True

    for name in required_names:
        if not description_text.str.contains(name, regex=False).any():
            print(f"{name.capitalize()} is missing from the column descriptions.")
            is_valid = False
    for name in required_names:
        if name not in recombined_df.columns:
            print(f"{name.capitalize()} is missing from the column names.")
            is_valid = False

    if is_valid:
        return recombined_df
    return None

def process_all_files(root_folder, file_list, text_columns_start_row=0):
    """
    Runs `process_file` on each listed file and collects the DataFrames it returns.

    Parameters:
    - root_folder: Folder that contains the text files.
    - file_list: File names (relative to `root_folder`) to process.
    - text_columns_start_row: Forwarded unchanged to `process_file` (default is 0).

    Returns:
    - result_dict: Dictionary mapping each file name to its processed DataFrame.
      Files for which `process_file` does not return a DataFrame are left out.
    """
    frames_by_name = {}

    for name in file_list:
        full_path = os.path.join(root_folder, name)
        print(f"Processing {name}...")

        frame = process_file(full_path, text_columns_start_row)

        # Anything that is not a DataFrame (e.g. None) is skipped.
        if not isinstance(frame, pd.DataFrame):
            continue
        frames_by_name[name] = frame

    return frames_by_name



In [3]:
#Get a list of all text files from the root folder

def get_text_files(directory):
    """
    Lists the regular `.txt` files located directly inside `directory`.

    Parameters:
    - directory: Path of the folder to scan (sub-folders are not searched).

    Returns:
    - text_files: File names (not full paths) ending in '.txt', sorted alphabetically.
      `os.listdir` returns entries in arbitrary order, so sorting keeps the processing
      and merge order the same on every machine and run.
    """
    return sorted(
        name
        for name in os.listdir(directory)
        if name.endswith('.txt') and os.path.isfile(os.path.join(directory, name))
    )

# Location of the CATIE text files. Set the CATIE_DATA_DIR environment variable to
# point at another copy of the data; the original local path is kept as the fallback.
root_folder = os.environ.get(
    "CATIE_DATA_DIR",
    r"C:\Users\Senna\Desktop\Iigaya_lab\catie\catie_text_data\catie",
)
file_list = get_text_files(root_folder)
print(file_list)

['aesposys01.txt', 'aims01.txt', 'cata01.txt', 'cgis01.txt', 'clgry01.txt', 'dai01.txt', 'demo01.txt', 'dgsposys01.txt', 'dosecomp01.txt', 'ecg01.txt', 'endphase01.txt', 'endstudy01.txt', 'fint01.txt', 'hair01.txt', 'itaq01.txt', 'keyvars01.txt', 'lab01.txt', 'maccomp01.txt', 'macvlnce01.txt', 'med01.txt', 'meddispn01.txt', 'ndar_aggregate.txt', 'ndar_subject01.txt', 'neurobatt01.txt', 'package_info.txt', 'panss01.txt', 'qol01.txt', 'sae01.txt', 'scid_ph01.txt', 'screen01.txt', 'sf1201.txt', 'surf01.txt', 'surfq01.txt', 'timeto01.txt', 'viol01.txt', 'vitals01.txt']


In [4]:
# Run every listed file through process_all_files; the result maps each file name
# to its processed DataFrame.
processed_data = process_all_files(root_folder=root_folder, file_list=file_list)

Processing aesposys01.txt...
Processing aims01.txt...
Processing cata01.txt...
Processing cgis01.txt...
Processing clgry01.txt...
Processing dai01.txt...
Processing demo01.txt...
Processing dgsposys01.txt...
Processing dosecomp01.txt...
Processing ecg01.txt...
Processing endphase01.txt...
Processing endstudy01.txt...
Processing fint01.txt...
Processing hair01.txt...
Processing itaq01.txt...
Processing keyvars01.txt...
Processing lab01.txt...
Processing maccomp01.txt...
Processing macvlnce01.txt...
Processing med01.txt...
Processing meddispn01.txt...
Processing ndar_aggregate.txt...
Collection_title is missing from the column descriptions.
Promoted_subjectkey is missing from the column descriptions.
Collection_title is missing from the column names.
Promoted_subjectkey is missing from the column names.
Processing ndar_subject01.txt...
Processing neurobatt01.txt...
Processing package_info.txt...
Collection_title is missing from the column descriptions.
Promoted_subjectkey is missing from

In [5]:
# Combine every processed DataFrame into one table.
# pd.merge without `on` joins on all columns the two frames share; how='outer' keeps
# every row from both sides and fills NaN where a frame has no value for that row.

frames = list(processed_data.values())

# Start from the first DataFrame, then fold the remaining ones in, in dictionary order
merged_df = frames[0]
for frame in frames[1:]:
    merged_df = pd.merge(merged_df, frame, how='outer')

In [6]:
# Another candidate for the phase 1 medication column: list the distinct values of 'treat_1'
pd.unique(merged_df['treat_1'])

array(['Treatment for Phase 1', nan, 5.0, 1.0, 3.0, 2.0, 4.0],
      dtype=object)

In [None]:
# Plan:
# 1. For each promoted_subjectkey, get the rows with non-NaN values of treat_1, dcr_tae1 and dcr_eff1.
# 2. Using that promoted_subjectkey as the key, find the values of 'cocaine', 'opiates', 'pcp', 'meth' and 'thc'.
# 3. Run a logistic regression.

In [86]:
# Step 1: Keep the rows that have a 'treat_1' value and collect their distinct 'promoted_subjectkey' values
filtered_df = merged_df.dropna(subset=['treat_1'])
promoted_keys = pd.unique(filtered_df['promoted_subjectkey'])

In [87]:
# Step 2: Take every row belonging to those subject keys, restricted to the columns used below
selected_df = merged_df.loc[
    merged_df['promoted_subjectkey'].isin(promoted_keys),
    ['promoted_subjectkey', 'dcr_tae1', 'dcr_eff1', 'cocaine', 'opiates', 'pcp', 'meth', 'thc'],
]

In [88]:
# Step 2b: Collapse to one row per 'promoted_subjectkey'
aggregated_df = (
    selected_df
    .groupby('promoted_subjectkey')
    .agg({
        # first non-NA value found for the subject
        **dict.fromkeys(['dcr_tae1', 'dcr_eff1'], 'first'),
        # largest value found for the subject, so a '1' in any row is captured
        **dict.fromkeys(['cocaine', 'opiates', 'pcp', 'meth', 'thc'], 'max'),
    })
    .dropna(subset=['dcr_tae1'])  # subjects without a 'dcr_tae1' value are removed
)

In [89]:
# Inspect the aggregate: one row per promoted_subjectkey
aggregated_df

Unnamed: 0_level_0,dcr_tae1,dcr_eff1,cocaine,opiates,pcp,meth,thc
promoted_subjectkey,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
NDAR_INVAB125GA8,0,1,0.0,0.0,0.0,0.0,0.0
NDAR_INVAB164FVP,1,0,,,,,
NDAR_INVAB636BG0,0,0,0.0,0.0,0.0,0.0,
NDAR_INVAC199RGN,0,0,,,,,
NDAR_INVAC503FLF,0,0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...
NDAR_INVZY901AJ9,0,0,0.0,0.0,0.0,154.5,1.0
NDAR_INVZZ168GJJ,1,0,0.0,0.0,0.0,0.0,0.0
NDAR_INVZZ241YP6,0,1,0.0,0.0,0.0,0.0,0.0
NDAR_INVZZ243NW0,0,0,,,,,


In [90]:
# Columns to convert to numeric (the five drug columns plus the two dcr_* outcome columns)
drug_columns = ['cocaine', 'opiates', 'pcp', 'meth', 'thc', 'dcr_tae1', 'dcr_eff1']

# Convert all of them in one pass; values that cannot be parsed become NaN
aggregated_df[drug_columns] = aggregated_df[drug_columns].apply(pd.to_numeric, errors='coerce')

print(aggregated_df[drug_columns].dtypes)


cocaine     float64
opiates     float64
pcp         float64
meth        float64
thc         float64
dcr_tae1    float64
dcr_eff1    float64
dtype: object


In [91]:
substance_values = aggregated_df[['cocaine', 'opiates', 'pcp', 'meth', 'thc']]

# 'drugs': 1 when at least one of the five drug columns is above 0, otherwise 0
aggregated_df['drugs'] = substance_values.gt(0).any(axis=1).astype(int)
# 'drugs_sum': row-wise total of the five drug columns
aggregated_df['drugs_sum'] = substance_values.sum(axis=1)

In [98]:
# Predictors: the five drug columns plus the two derived summaries
X = aggregated_df.loc[:, ['cocaine', 'opiates', 'pcp', 'meth', 'thc', 'drugs', 'drugs_sum']]
# Outcomes: 'dcr_tae1' and 'dcr_eff1', coerced to numeric (unparseable values become NaN)
y_se = pd.to_numeric(aggregated_df.loc[:, 'dcr_tae1'], errors='coerce')
y_eff = pd.to_numeric(aggregated_df.loc[:, 'dcr_eff1'], errors='coerce')

In [99]:
# Put predictors and both outcomes side by side, keeping only the fully observed rows
data = X.assign(dcr_tae1=y_se, dcr_eff1=y_eff).dropna()
data

Unnamed: 0_level_0,cocaine,opiates,pcp,meth,thc,drugs,drugs_sum,dcr_tae1,dcr_eff1
promoted_subjectkey,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
NDAR_INVAB125GA8,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,1.0
NDAR_INVAC503FLF,0.0,0.0,0.0,0.0,1.0,1,1.0,0.0,0.0
NDAR_INVAD098BRG,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0
NDAR_INVAD465YGX,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0
NDAR_INVAD713UKY,0.0,0.0,0.0,0.0,0.0,0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...
NDAR_INVZY740HGN,434.0,0.0,0.0,0.0,0.0,1,434.0,0.0,0.0
NDAR_INVZY888ZC7,0.0,0.0,0.0,0.0,0.0,0,0.0,1.0,0.0
NDAR_INVZY901AJ9,0.0,0.0,0.0,154.5,1.0,1,155.5,0.0,0.0
NDAR_INVZZ168GJJ,0.0,0.0,0.0,0.0,0.0,0,0.0,1.0,0.0


In [100]:
# Three alternative predictor sets, all taken from the complete-case table
X_each = data.loc[:, ['cocaine', 'opiates', 'pcp', 'meth', 'thc']]
X_drugs_bin = data.loc[:, ['drugs']]
X_drugs_sum = data.loc[:, ['drugs_sum']]
# Outcomes restricted to the same rows
y_se = data.loc[:, 'dcr_tae1']
y_eff = data.loc[:, 'dcr_eff1']

In [101]:
# Per-column counts of values that would break the fit
print(X_each.isna().sum())  # NaN values
print(X_each.eq(np.inf).sum())  # +inf values
print(X_each.eq(-np.inf).sum())  # -inf values

cocaine    0
opiates    0
pcp        0
meth       0
thc        0
dtype: int64
cocaine    0
opiates    0
pcp        0
meth       0
thc        0
dtype: int64
cocaine    0
opiates    0
pcp        0
meth       0
thc        0
dtype: int64


In [102]:
# Add the intercept column (shown as 'const' in the output below) to the per-drug predictors
X = sm.add_constant(X_each)
X

Unnamed: 0_level_0,const,cocaine,opiates,pcp,meth,thc
promoted_subjectkey,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
NDAR_INVAB125GA8,1.0,0.0,0.0,0.0,0.0,0.0
NDAR_INVAC503FLF,1.0,0.0,0.0,0.0,0.0,1.0
NDAR_INVAD098BRG,1.0,0.0,0.0,0.0,0.0,0.0
NDAR_INVAD465YGX,1.0,0.0,0.0,0.0,0.0,0.0
NDAR_INVAD713UKY,1.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...
NDAR_INVZY740HGN,1.0,434.0,0.0,0.0,0.0,0.0
NDAR_INVZY888ZC7,1.0,0.0,0.0,0.0,0.0,0.0
NDAR_INVZY901AJ9,1.0,0.0,0.0,0.0,154.5,1.0
NDAR_INVZZ168GJJ,1.0,0.0,0.0,0.0,0.0,0.0


In [103]:
# Logistic regression: each drug column vs dcr_tae1
# X here is the design matrix built with sm.add_constant(X_each) in the previous cell,
# so the intercept column is already present.
model = sm.Logit(endog=y_se, exog=X)
result = model.fit()

print(result.summary())

         Current function value: 0.421846
         Iterations: 35
                           Logit Regression Results                           
Dep. Variable:               dcr_tae1   No. Observations:                 1007
Model:                          Logit   Df Residuals:                     1001
Method:                           MLE   Df Model:                            5
Date:                Tue, 19 Nov 2024   Pseudo R-squ.:                0.001838
Time:                        13:29:12   Log-Likelihood:                -424.80
converged:                      False   LL-Null:                       -425.58
Covariance Type:            nonrobust   LLR p-value:                    0.9055
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const         -1.7042      0.101    -16.934      0.000      -1.901      -1.507
cocaine    -7.185e-05      0.000     -0.252      0.801      -0.00

  return 1/(1+np.exp(-X))
  return 1/(1+np.exp(-X))


In [18]:
# Refit the current `model` with the iteration limit raised to 100
result = model.fit(maxiter=100) # still failed to converge (the output stops at 100 iterations)

         Current function value: 0.421846
         Iterations: 100




In [106]:
# Logistic regression: binary 'drugs' indicator vs dcr_tae1
# Design matrix = intercept column + 'drugs'
X = sm.add_constant(X_drugs_bin)

model = sm.Logit(endog=y_se, exog=X)
result = model.fit()

print(result.summary())


Optimization terminated successfully.
         Current function value: 0.421129
         Iterations 6
                           Logit Regression Results                           
Dep. Variable:               dcr_tae1   No. Observations:                 1007
Model:                          Logit   Df Residuals:                     1005
Method:                           MLE   Df Model:                            1
Date:                Tue, 19 Nov 2024   Pseudo R-squ.:                0.003534
Time:                        13:30:03   Log-Likelihood:                -424.08
converged:                       True   LL-Null:                       -425.58
Covariance Type:            nonrobust   LLR p-value:                   0.08283
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const         -1.6192      0.108    -14.940      0.000      -1.832      -1.407
drugs         -0.3209      0.

In [108]:
# Logistic regression: drugs binary vs dcr_eff1
# Rebuild the design matrix (intercept + 'drugs') in this cell instead of reusing whatever
# X currently holds: X is rebound to other predictor sets elsewhere in the notebook, so
# running cells out of order would otherwise fit this model on the wrong predictors.
X = sm.add_constant(X_drugs_bin)

model = sm.Logit(y_eff, X)
result = model.fit()

print(result.summary())

Optimization terminated successfully.
         Current function value: 0.554775
         Iterations 5
                           Logit Regression Results                           
Dep. Variable:               dcr_eff1   No. Observations:                 1007
Model:                          Logit   Df Residuals:                     1005
Method:                           MLE   Df Model:                            1
Date:                Tue, 19 Nov 2024   Pseudo R-squ.:                0.002158
Time:                        13:30:52   Log-Likelihood:                -558.66
converged:                       True   LL-Null:                       -559.87
Covariance Type:            nonrobust   LLR p-value:                    0.1201
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const         -1.0411      0.092    -11.356      0.000      -1.221      -0.861
drugs         -0.2366      0.

In [113]:
# Logistic regression: 'drugs_sum' vs dcr_tae1
# Design matrix = intercept column + 'drugs_sum'
X = sm.add_constant(X_drugs_sum)

model = sm.Logit(endog=y_se, exog=X)
result = model.fit()

print(result.summary())


Optimization terminated successfully.
         Current function value: 0.422562
         Iterations 6
                           Logit Regression Results                           
Dep. Variable:               dcr_tae1   No. Observations:                 1007
Model:                          Logit   Df Residuals:                     1005
Method:                           MLE   Df Model:                            1
Date:                Tue, 19 Nov 2024   Pseudo R-squ.:               0.0001435
Time:                        13:31:56   Log-Likelihood:                -425.52
converged:                       True   LL-Null:                       -425.58
Covariance Type:            nonrobust   LLR p-value:                    0.7268
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const         -1.7284      0.090    -19.171      0.000      -1.905      -1.552
drugs_sum  -9.623e-05      0.

In [114]:
# Logistic regression: drugs sum vs dcr_eff1
# Rebuild the design matrix (intercept + 'drugs_sum') in this cell instead of reusing
# whatever X currently holds, so the fit does not depend on which cell ran last.
X = sm.add_constant(X_drugs_sum)

model = sm.Logit(y_eff, X)
result = model.fit()

print(result.summary())


Optimization terminated successfully.
         Current function value: 0.555378
         Iterations 5
                           Logit Regression Results                           
Dep. Variable:               dcr_eff1   No. Observations:                 1007
Model:                          Logit   Df Residuals:                     1005
Method:                           MLE   Df Model:                            1
Date:                Tue, 19 Nov 2024   Pseudo R-squ.:                0.001073
Time:                        13:32:08   Log-Likelihood:                -559.27
converged:                       True   LL-Null:                       -559.87
Covariance Type:            nonrobust   LLR p-value:                    0.2730
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const         -1.1118      0.075    -14.828      0.000      -1.259      -0.965
drugs_sum     -0.0003      0.