Switch to the new true string dtype in Pandas version 1.x

Simplifies some of our string column handling and keeps us up-to-date with the latest major version series. Requires ID3C version 2020.1, which requires Pandas 1.x.
seattleflu · Feb 11, 2020 · 22f2751 · 22f2751
1 parent d4a636a
commit 22f2751
Show file tree

Hide file tree

Showing 6 changed files with 220 additions and 184 deletions.
diff --git a/Pipfile.lock b/Pipfile.lock
diff --git a/lib/seattleflu/id3c/cli/command/__init__.py b/lib/seattleflu/id3c/cli/command/__init__.py
@@ -42,24 +42,14 @@ def age_ceiling(age: float, max_age=85) -> float:
 
 def trim_whitespace(df: pd.DataFrame) -> pd.DataFrame:
     """ Trims leading and trailing whitespace from strings in *df* """
-    str_columns = df.columns[every_value_is_str_or_na(df)]
-
-    # Guard against AttributeErrors from entirely empty non-object dtype columns
-    str_columns = list(df[str_columns].select_dtypes(include='object'))
+    # Select string dtype columns straight from df to guard against AttributeErrors from .str on non-string dtype columns
+    str_columns: List[str] = list(df.select_dtypes(include='string'))
 
     df[str_columns] = df[str_columns].apply(lambda column: column.str.strip())
 
     return df
 
 
-def every_value_is_str_or_na(df: pd.DataFrame):
-    """
-    Evaluates whether every value in the columns of a given DataFrame *df* is
-    either a string or NA.
-    """
-    return df.applymap(lambda col: isinstance(col, str) or pd.isna(col)).all()
-
-
 def barcode_quality_control(clinical_records: pd.DataFrame, output: str) -> None:
     """ Perform quality control on barcodes """
     missing_barcodes = missing_barcode(clinical_records)

diff --git a/lib/seattleflu/id3c/cli/command/clinical.py b/lib/seattleflu/id3c/cli/command/clinical.py
@@ -63,7 +63,7 @@ def parse_uw(uw_filename, output):
 
     read_uw = partial(
         read,
-        dtype = {'tract_identifier': 'str'},
+        dtype = {'tract_identifier': 'string'},
         parse_dates = ['Collection.Date', 'LabDtTm'],
         na_values = ['NA', '', 'Unknown', 'NULL'],
     )
@@ -138,7 +138,7 @@ def create_unique_identifier(df: pd.DataFrame):
     #   -trs, 2 Dec 2019
 
     df['identifier'] = (df['labMRN'] + df['LabAccNum'] + \
-                        df['encountered'].astype(str)
+                        df['encountered'].astype('string')
                         ).str.lower()
     return df.drop_duplicates(subset="identifier")
 
@@ -189,7 +189,7 @@ def parse_sch(sch_filename, output):
     All clinical records parsed are output to stdout as newline-delimited JSON
     records.  You will likely want to redirect stdout to a file.
     """
-    dtypes = {'census_tract': 'str'}
+    dtypes = {'census_tract': 'string'}
     clinical_records = pd.read_csv(sch_filename, dtype=dtypes)
     clinical_records = trim_whitespace(clinical_records)
     clinical_records = add_provenance(clinical_records, sch_filename)
@@ -255,7 +255,7 @@ def create_encounter_identifier(df: pd.DataFrame) -> pd.DataFrame:
     modified DataFrame.
     """
     df["identifier"] = (
-        df["individual"] + df["encountered"].astype(str)
+        df["individual"] + df["encountered"].astype('string')
         ).str.lower()
 
     return df
@@ -329,7 +329,7 @@ def add_kp_manifest_data(df: pd.DataFrame, manifest_filename: str) -> pd.DataFra
     given clinical records DataFrame *df*
     """
     barcode = 'Barcode ID (Sample ID)'
-    dtypes = {barcode: str}
+    dtypes = {barcode: 'string'}
 
     manifest_data = pd.read_excel(manifest_filename, sheet_name='KP', dtype=dtypes)
 

diff --git a/lib/seattleflu/id3c/cli/command/longitudinal.py b/lib/seattleflu/id3c/cli/command/longitudinal.py
@@ -351,8 +351,8 @@ def create_identifiers(df: pd.DataFrame) -> pd.DataFrame:
     given DataFrame *df*.
     Contains some hard-coded values that may need to be updated in the future.
     """
-    df['individual'] = 'sch/year-1/childcare/' + df['study_id'].astype('str')
-    df['identifier'] = df['individual'] + '/' + df['week'].astype('str')
+    df['individual'] = 'sch/year-1/childcare/' + df['study_id'].astype('string')
+    df['identifier'] = df['individual'] + '/' + df['week'].astype('string')
 
     return df
 

diff --git a/locations/Snakefile b/locations/Snakefile
@@ -25,7 +25,7 @@ with open("data/omitted-states.txt", encoding = "UTF-8") as file:
 
 states = (
     pandas
-    .read_csv("data/states.txt", sep = "|", dtype = object)
+    .read_csv("data/states.txt", sep = "|", dtype = 'string')
     .rename(columns = {
         "STATE": "fips_code",
         "STUSAB": "usps_code",

diff --git a/setup.py b/setup.py
@@ -56,7 +56,7 @@
     python_requires = ">=3.6",
 
     install_requires = [
-        "id3c >=2019.1",
+        "id3c >=2020.1",
         "click >=7.0",
         "regex",
         "requests",
@@ -66,7 +66,7 @@
 
     extras_require = {
         "locations": [
-            "pandas",
+            "pandas >=1.0.1,<2",
             "snakemake",
         ],
     },