In [1]:
import pandas as pd

In [2]:
import csv
import sys
import argparse

In [3]:
def filter_csv_by_prefix(input_file: str, output_file: str, prefixes: list[str]) -> None:
    """
    Copy the columns of a CSV file whose headers start with given prefixes into a new CSV file.

    Parameters:
    ----------
    input_file: The path to the input CSV file.
    output_file: The path to the output CSV file where the filtered data will be saved.
    prefixes : A list of prefixes; columns whose headers start with any of these prefixes will be included.

    Returns:
    --------
    None

    Notes:
    ------
    Problems are reported with ``print`` instead of being raised to the caller:
    a path that cannot be found, or an input with no header row / no header
    matching any prefix. When no column matches, the output file is never opened.
    """
    try:
        with open(input_file, mode='r', newline='', encoding='utf-8') as csvfile:
            reader = csv.DictReader(csvfile)

            # DictReader.fieldnames is None when the file has no header row
            # (e.g. an empty file); treat that the same as "no columns".
            header = reader.fieldnames or []

            # str.startswith accepts a tuple and matches if any element is a prefix.
            wanted = tuple(prefixes)
            matching_columns = [col for col in header if col.startswith(wanted)]

            if not matching_columns:
                raise ValueError(f"No columns found starting with the provided prefixes: {prefixes}")

            with open(output_file, mode='w', newline='', encoding='utf-8') as outfile:
                # extrasaction='ignore' makes the writer drop every key that is not in
                # matching_columns, so the full reader rows can be passed straight through.
                writer = csv.DictWriter(outfile, fieldnames=matching_columns, extrasaction='ignore')
                writer.writeheader()
                writer.writerows(reader)

        print(f"Filtered CSV written to {output_file} with columns: {matching_columns}")

    except FileNotFoundError as exc:
        # exc.filename is the path that actually failed to open, which may be the
        # output path (missing directory) rather than the input file.
        print(f"Error: File '{exc.filename}' not found.")
    except ValueError as ve:
        print(f"Error: {ve}")




In [None]:
# Command-line entry point, kept commented out while working in the notebook.
# if __name__ == "__main__":
#     parser = argparse.ArgumentParser(description="Filter specific columns from a CSV file.")
#     parser.add_argument("input_file", help="Path to the input CSV file")
#     parser.add_argument("output_file", help="Path to the output CSV file")
#     parser.add_argument("codes", help="Comma-separated list of UKB codes to extract")

#     args = parser.parse_args()

#     prefixes = args.codes.split(",")

#     filter_csv_by_prefix(args.input_file, args.output_file, prefixes)

# Description -> code notes. Written as a dict because the bare `"key": "value"`
# lines are a SyntaxError if this cell is ever run.
# NOTE(review): presumably these are candidate values for the `codes` argument above — confirm.
UKB_FIELD_CODES = {
    "sex": "22001",
    "age at recruitment": "21022",
}


In [34]:
# Read only the first 10 rows of the export: enough to inspect the column
# headers without loading the whole file.
icd_df = pd.read_csv(
    "./4041061_11_23.csv",
    nrows=10,
)

In [35]:
# Display the sample frame to inspect the available column headers.
icd_df

Unnamed: 0,eid,3-0.0,3-1.0,3-2.0,3-3.0,4-0.0,4-1.0,4-2.0,4-3.0,5-0.0,...,105010-2.0,105010-3.0,105010-4.0,105030-0.0,105030-1.0,105030-2.0,105030-3.0,105030-4.0,110005-0.0,110006-0.0
0,1000014,430,,,,704.0,,,,196,...,,,,2010-02-27T11:58:32,,,,,0.0,2014-03-27
1,1000023,864,,369.0,,,,405.0,,222,...,,,,,,,,,2.0,2015-01-30
2,1000030,909,,,,794.0,,,,109,...,2011-07-05T07:53:12,,,2010-03-09T16:17:10,,2011-07-05T07:24:51,,,,
3,1000041,250,,,,512.0,,,,233,...,,,,,,,,,,
4,1000059,448,,,,458.0,,,,737,...,,,,,,,,,,
5,1000062,498,,,,1062.0,,,,684,...,,,,2009-10-07T17:39:25,,,,,,
6,1000077,305,,,,600.0,,,,287,...,,,,,,,,,0.0,2015-01-23
7,1000086,438,,442.0,437.0,364.0,,612.0,298.0,214,...,,,,,2011-03-07T14:25:13,,,,0.0,2014-05-23
8,1000095,985,,,,673.0,,,,284,...,2011-07-21T13:06:07,,,,2011-03-04T09:25:52,2011-07-21T12:43:16,,,2.0,2015-08-13
9,1000100,476,,,,485.0,,,,319,...,,2011-11-23T11:13:16,2012-05-27T21:34:02,,,,2011-11-23T11:01:09,2012-05-27T21:20:36,2.0,2013-07-15


In [37]:
# Columns to load from the full file: the `eid` identifier plus every column of
# field 41202. Headers have the form "41202-0.0", so anchor on the "41202-"
# prefix; a plain substring test would also pick up any header that merely
# contains those digits somewhere.
diagnoses_columns = ["eid"] + [col for col in icd_df.columns if col.startswith("41202-")]

- Load only the `eid` column and the `41202` diagnosis columns from the full file.

In [38]:
# Re-read the file in full, restricted to the columns selected in
# diagnoses_columns. This replaces the sample frame previously bound to icd_df.
# NOTE(review): the recorded output shows a warning echoing this call; if it is a
# mixed-dtype warning, consider passing explicit dtypes — confirm.
icd_df = pd.read_csv(
    "./4041061_11_23.csv",
    usecols=diagnoses_columns,
)

  icd_df = pd.read_csv("./4041061_11_23.csv", usecols=diagnoses_columns)


In [39]:
# Collapse each row's selected columns into a single Python list.
# diagnoses_columns starts with "eid", so that value is part of every list too.
icd_df['all_diagnoses'] = icd_df[diagnoses_columns].apply(
    lambda record: record.tolist(),
    axis=1,
)


In [40]:
# Keep only the string entries of each list; anything that is not a str
# (missing-value placeholders, numeric values) is dropped.
icd_df["all_diagnoses"] = icd_df["all_diagnoses"].apply(
    lambda entries: [entry for entry in entries if isinstance(entry, str)]
)

In [41]:
# Keep only the codes whose leading letter is F or G (case-insensitive).
# A substring test ('"f" in code') would also keep any code that has one of
# those letters in a later position, so anchor the match at the start.
icd_df["all_diagnoses"] = icd_df["all_diagnoses"].apply(
    lambda codes: [code for code in codes if code.upper().startswith(("F", "G"))]
)

In [46]:
# Preview the first rows of the filtered all_diagnoses lists.
icd_df["all_diagnoses"].head()

0    []
1    []
2    []
3    []
4    []
Name: all_diagnoses, dtype: object

In [47]:
# Boolean flag: True for rows whose all_diagnoses list is non-empty.
icd_df["has_diagnosis"] = icd_df["all_diagnoses"].apply(len) > 0

In [48]:
# Display the boolean has_diagnosis column.
icd_df["has_diagnosis"]

0         False
1         False
2         False
3         False
4         False
          ...  
502304    False
502305    False
502306    False
502307    False
502308    False
Name: has_diagnosis, Length: 502309, dtype: bool

In [50]:
# Boolean-mask selection: show only the rows where has_diagnosis is True.
icd_df[icd_df["has_diagnosis"]]

Unnamed: 0,eid,41202-0.0,41202-0.1,41202-0.2,41202-0.3,41202-0.4,41202-0.5,41202-0.6,41202-0.7,41202-0.8,...,41202-0.72,41202-0.73,41202-0.74,41202-0.75,41202-0.76,41202-0.77,41202-0.78,41202-0.79,all_diagnoses,has_diagnosis
21,1000229,D125,D333,F103,G403,G409,G910,G919,G930,H020,...,,,,,,,,,"[F103, G403, G409, G910, G919, G930]",True
35,1000363,F450,I269,J929,N840,N920,R104,,,,...,,,,,,,,,[F450],True
53,1000544,F059,F209,F259,J690,K628,L290,N390,N394,N40,...,,,,,,,,,"[F059, F209, F259]",True
55,1000567,A419,C504,C509,F059,,,,,,...,,,,,,,,,[F059],True
59,1000604,G560,H269,I839,K409,R51,S4220,Z006,,,...,,,,,,,,,[G560],True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
502276,6025074,F171,J181,J440,J449,R074,R101,,,,...,,,,,,,,,[F171],True
502281,6025127,G122,G459,G839,I839,J690,K590,K805,K85,K918,...,,,,,,,,,"[G122, G459, G839]",True
502284,6025153,D252,G560,K358,M940,N029,N328,N832,R103,R104,...,,,,,,,,,[G560],True
502289,6025203,C502,C509,G431,,,,,,,...,,,,,,,,,[G431],True
