Skip to content

Commit

Permalink
Switch to the new true string dtype in Pandas version 1.x
Browse files Browse the repository at this point in the history
Simplifies some of our string column handling and keeps us up to date
with the latest major version series of Pandas.

Requires ID3C version 2020.1, which requires Pandas 1.x.
  • Loading branch information
tsibley committed Feb 11, 2020
1 parent d4a636a commit 22f2751
Show file tree
Hide file tree
Showing 6 changed files with 220 additions and 184 deletions.
370 changes: 208 additions & 162 deletions Pipfile.lock

Large diffs are not rendered by default.

14 changes: 2 additions & 12 deletions lib/seattleflu/id3c/cli/command/__init__.py
Expand Up @@ -42,24 +42,14 @@ def age_ceiling(age: float, max_age=85) -> float:

def trim_whitespace(df: pd.DataFrame) -> pd.DataFrame:
    """
    Trims leading and trailing whitespace from strings in *df*.

    A column is trimmed when it either has the dedicated ``string`` dtype or
    has ``object`` dtype and contains nothing but strings and NA values.  All
    other columns (numeric, datetime, mixed-type object, …) are left untouched.

    *df* is modified in place and also returned.
    """
    # Columns of the dedicated string dtype always support the .str accessor.
    str_columns = list(df.select_dtypes(include='string'))

    # Object columns are only safe to trim when every value is a str or NA;
    # calling .str on mixed-type data raises an AttributeError.  The two dtype
    # selections are combined (not chained), since no column can be both
    # object and string dtype at once.
    for name in df.select_dtypes(include='object'):
        if name in str_columns:
            continue

        only_str_or_na = df[name].map(
            lambda value: isinstance(value, str) or pd.isna(value)
        ).all()

        if only_str_or_na:
            str_columns.append(name)

    # Nothing to trim; skip the column-wise assignment entirely.
    if not str_columns:
        return df

    df[str_columns] = df[str_columns].apply(lambda column: column.str.strip())

    return df


def every_value_is_str_or_na(df: pd.DataFrame) -> pd.Series:
    """
    Evaluates whether every value in the columns of a given DataFrame *df* is
    either a string or NA.

    Returns a boolean Series indexed by the column labels of *df*, with one
    entry per column.
    """
    def is_str_or_na(value) -> bool:
        return isinstance(value, str) or pd.isna(value)

    # DataFrame.applymap is deprecated as of pandas 2.1 in favor of
    # DataFrame.map.  Look the method up on the class (not the instance) so a
    # column that happens to be named "map" cannot shadow it, and only fall
    # back to applymap on older pandas versions.
    elementwise = getattr(pd.DataFrame, "map", None) or pd.DataFrame.applymap

    return elementwise(df, is_str_or_na).all()


def barcode_quality_control(clinical_records: pd.DataFrame, output: str) -> None:
""" Perform quality control on barcodes """
missing_barcodes = missing_barcode(clinical_records)
Expand Down
10 changes: 5 additions & 5 deletions lib/seattleflu/id3c/cli/command/clinical.py
Expand Up @@ -63,7 +63,7 @@ def parse_uw(uw_filename, output):

read_uw = partial(
read,
dtype = {'tract_identifier': 'str'},
dtype = {'tract_identifier': 'string'},
parse_dates = ['Collection.Date', 'LabDtTm'],
na_values = ['NA', '', 'Unknown', 'NULL'],
)
Expand Down Expand Up @@ -138,7 +138,7 @@ def create_unique_identifier(df: pd.DataFrame):
# -trs, 2 Dec 2019

df['identifier'] = (df['labMRN'] + df['LabAccNum'] + \
df['encountered'].astype(str)
df['encountered'].astype('string')
).str.lower()
return df.drop_duplicates(subset="identifier")

Expand Down Expand Up @@ -189,7 +189,7 @@ def parse_sch(sch_filename, output):
All clinical records parsed are output to stdout as newline-delimited JSON
records. You will likely want to redirect stdout to a file.
"""
dtypes = {'census_tract': 'str'}
dtypes = {'census_tract': 'string'}
clinical_records = pd.read_csv(sch_filename, dtype=dtypes)
clinical_records = trim_whitespace(clinical_records)
clinical_records = add_provenance(clinical_records, sch_filename)
Expand Down Expand Up @@ -255,7 +255,7 @@ def create_encounter_identifier(df: pd.DataFrame) -> pd.DataFrame:
modified DataFrame.
"""
df["identifier"] = (
df["individual"] + df["encountered"].astype(str)
df["individual"] + df["encountered"].astype('string')
).str.lower()

return df
Expand Down Expand Up @@ -329,7 +329,7 @@ def add_kp_manifest_data(df: pd.DataFrame, manifest_filename: str) -> pd.DataFra
given clinical records DataFrame *df*
"""
barcode = 'Barcode ID (Sample ID)'
dtypes = {barcode: str}
dtypes = {barcode: 'string'}

manifest_data = pd.read_excel(manifest_filename, sheet_name='KP', dtype=dtypes)

Expand Down
4 changes: 2 additions & 2 deletions lib/seattleflu/id3c/cli/command/longitudinal.py
Expand Up @@ -351,8 +351,8 @@ def create_identifiers(df: pd.DataFrame) -> pd.DataFrame:
given DataFrame *df*.
Contains some hard-coded values that may need to be updated in the future.
"""
df['individual'] = 'sch/year-1/childcare/' + df['study_id'].astype('str')
df['identifier'] = df['individual'] + '/' + df['week'].astype('str')
df['individual'] = 'sch/year-1/childcare/' + df['study_id'].astype('string')
df['identifier'] = df['individual'] + '/' + df['week'].astype('string')

return df

Expand Down
2 changes: 1 addition & 1 deletion locations/Snakefile
Expand Up @@ -25,7 +25,7 @@ with open("data/omitted-states.txt", encoding = "UTF-8") as file:

states = (
pandas
.read_csv("data/states.txt", sep = "|", dtype = object)
.read_csv("data/states.txt", sep = "|", dtype = 'string')
.rename(columns = {
"STATE": "fips_code",
"STUSAB": "usps_code",
Expand Down
4 changes: 2 additions & 2 deletions setup.py
Expand Up @@ -56,7 +56,7 @@
python_requires = ">=3.6",

install_requires = [
"id3c >=2019.1",
"id3c >=2020.1",
"click >=7.0",
"regex",
"requests",
Expand All @@ -66,7 +66,7 @@

extras_require = {
"locations": [
"pandas",
"pandas >=1.0.1,<2",
"snakemake",
],
},
Expand Down

0 comments on commit 22f2751

Please sign in to comment.