In [1]:
# import libraries
import pandas as pd
import os

In [2]:
# load all datasets
data_paths = {
    "experienced_sign": "experienced_sign.csv",
    "goals": "goals.csv",
    "head_hit_location": "head_hit_location.csv",
    "immediate_symptoms_resulting": "immediate_symptoms_resulting.csv",
    "incident_head_hit_location": "incident_head_hit_location.csv",
    "new_resulting_factors": "new_resulting_factors_utf8.csv",
    "nontbi_condition": "nontbi_condition.csv",
    "patient_info": "patient_info.csv",
    "patient_therapies": "patient_therapies.csv",
    "patient_types": "patient_types.csv",
    "register_tracking_list": "register_tracking_list.csv",
    "registered_factors": "registered_factors.csv",
    "registered_sdoh": "registered_sdoh.csv",
    "session": "session.csv",
    "symptom_details": "symptom_details.csv",
    "symptom_list": "symptom_list.csv",
    "tbi_from": "tbi_from.csv",
    "tbi_incident": "tbi_incident.csv",
    "therapies_list": "therapies_list.csv",
    "tracking_list": "tracking_list.csv",
    "user_info": "user_info.csv"
}

In [3]:
# load all dataframes
dataframes = {}
for name, path in data_paths.items():
    if os.path.exists(path):
        try:
            dataframes[name] = pd.read_csv(path, error_bad_lines=False, warn_bad_lines=True)
        except Exception as e:
            print(f"Error reading {path}: {e}")
    else:
        print(f"Warning: File {path} not found.")



In [4]:
# initial inspection
for name, df in dataframes.items():
    if isinstance(df, pd.DataFrame):
        print(f'{name}: shape = {df.shape}')
        print(f'Columns: {list(df.columns)}\n')

In [5]:
def fill_missing(df):
    for col in df.columns:
        if df[col].dtype == 'object':
            df[col] = df[col].fillna('Unknown')
        else:
            df[col] = df[col].fillna(-1)
    return df

for name in dataframes:
    if isinstance(dataframes[name], pd.DataFrame):
        dataframes[name] = fill_missing(dataframes[name])


In [6]:
# merge datasets
merged_df = dataframes.get('patient_info')

if merged_df is not None:
    merge_candidates = [
        ('patient_types', 'patient_id'),
        ('patient_therapies', 'patient_id'),
        ('tbi_incident', 'patient_id'),
        ('new_resulting_factors', 'patient_id'),
        ('nontbi_condition', 'patient_id')
    ]

    for table_name, key in merge_candidates:
        if table_name in dataframes and key in dataframes[table_name].columns:
            merged_df = merged_df.merge(dataframes[table_name], on=key, how='left')


In [7]:
# merge user_infor if user_id exists
if 'user_info' in dataframes and 'user_id' in merged_df.columns and 'user_id' in dataframes['user_info'].columns:
    merged_df = merged_df.merge(dataframes['user_info'], on='user_id', how='left')

In [11]:
# final cleaning
merged_df = merged_df.loc[:, ~merged_df.columns.duplicated()]

AttributeError: 'NoneType' object has no attribute 'loc'

In [10]:
# quick overview of final merged table
print('final merged dataset shape:', merged_df.shape)
print('columns:\n', list(merged_df.columns))

AttributeError: 'NoneType' object has no attribute 'shape'

In [43]:
# preview the first few rows
if merged_df is not None:
    print(merged_df.head())
else:
    print("Error: patient_info dataset not found.")

                             patient_id first_name_x last_name_x  \
0  5c96ba1a-8b2d-49bc-8e8e-b07761948286       Robin       Lopez    
1  eda39327-b38f-41de-a46a-8782787369b7       Justin       Macks   
2  6a7f7ce9-f63e-4651-b941-a25ce116de74       sandra     kinstle   
3  096d402a-5fcd-41dd-b59b-b3089cf06742         Jing          Gu   
4  1715cc3c-bea6-4d7e-aad3-b2f96338ba60      Bootsie     Brenner   

  date_of_birth_x gender_x patient_type external_id    patient_sub_type  id_x  \
0      1973-05-23   female    caregiver      Unkown              Unkown   NaN   
1      1991-07-10     male    caregiver      Unkown              Unkown   NaN   
2      1955-02-15   female    caregiver      Unkown              Unkown   NaN   
3      1971-12-24     male       Stroke      Unkown  PreviousStrokeUser   NaN   
4      1958-01-28   female       Stroke      Unkown  PreviousStrokeUser   NaN   

  user_id  ...  country referral_group veteran ethnicity race city state  \
0     NaN  ...      NaN     

In [45]:
# save the merged dataframe to a CSV file
merged_df.to_csv('merged_patient_data.csv', index=False)