Skip to content
Permalink
Browse files

Merge branch 'upgrade-pandas'

  • Loading branch information
tsibley committed Feb 11, 2020
2 parents d4a636a + 22f2751 commit 0ee2b27f20ea76a027af1490f694c95fb11421bd

Large diffs are not rendered by default.

@@ -42,24 +42,14 @@ def age_ceiling(age: float, max_age=85) -> float:

def trim_whitespace(df: pd.DataFrame) -> pd.DataFrame:
    """
    Trims leading and trailing whitespace from strings in *df*.

    Only columns that hold nothing but strings and NA values are touched:
    columns of the dedicated ``string`` dtype, plus ``object`` dtype columns
    whose every value is a ``str`` or NA.  All other columns are left as-is.

    *df* is modified in place and also returned.
    """
    def is_str_or_na(value) -> bool:
        return isinstance(value, str) or pd.isna(value)

    # Restrict to object/string dtypes up front to guard against
    # AttributeErrors from entirely empty columns of other dtypes (e.g. an
    # all-NaN float column), which pass the value check below but have no
    # .str accessor.
    candidates = df.select_dtypes(include=['object', 'string'])

    # Cast to object before checking so the predicate sees plain Python
    # values regardless of the column's storage dtype.
    str_columns = [
        name
        for name, column in candidates.items()
        if column.astype(object).map(is_str_or_na).all()
    ]

    # Nothing to strip; skip the no-op assignment on an empty selection.
    if str_columns:
        df[str_columns] = df[str_columns].apply(lambda column: column.str.strip())

    return df


def every_value_is_str_or_na(df: pd.DataFrame):
    """
    Evaluates whether every value in the columns of a given DataFrame *df* is
    either a string or NA.

    Returns a boolean Series indexed by the columns of *df*; an entry is True
    when all values in that column are ``str`` instances or NA.
    """
    def is_str_or_na(value) -> bool:
        return isinstance(value, str) or pd.isna(value)

    # Checked column-by-column with Series.map rather than
    # DataFrame.applymap: applymap is deprecated in newer pandas releases,
    # while its replacement (DataFrame.map) does not exist in older ones.
    # Casting to object first means the predicate sees plain Python values
    # whatever the column's dtype is.
    return pd.Series(
        [
            bool(column.astype(object).map(is_str_or_na).all())
            for _, column in df.items()
        ],
        index=df.columns,
        dtype=bool,
    )


def barcode_quality_control(clinical_records: pd.DataFrame, output: str) -> None:
""" Perform quality control on barcodes """
missing_barcodes = missing_barcode(clinical_records)
@@ -63,7 +63,7 @@ def parse_uw(uw_filename, output):

read_uw = partial(
read,
dtype = {'tract_identifier': 'str'},
dtype = {'tract_identifier': 'string'},
parse_dates = ['Collection.Date', 'LabDtTm'],
na_values = ['NA', '', 'Unknown', 'NULL'],
)
@@ -138,7 +138,7 @@ def create_unique_identifier(df: pd.DataFrame):
# -trs, 2 Dec 2019

df['identifier'] = (df['labMRN'] + df['LabAccNum'] + \
df['encountered'].astype(str)
df['encountered'].astype('string')
).str.lower()
return df.drop_duplicates(subset="identifier")

@@ -189,7 +189,7 @@ def parse_sch(sch_filename, output):
All clinical records parsed are output to stdout as newline-delimited JSON
records. You will likely want to redirect stdout to a file.
"""
dtypes = {'census_tract': 'str'}
dtypes = {'census_tract': 'string'}
clinical_records = pd.read_csv(sch_filename, dtype=dtypes)
clinical_records = trim_whitespace(clinical_records)
clinical_records = add_provenance(clinical_records, sch_filename)
@@ -255,7 +255,7 @@ def create_encounter_identifier(df: pd.DataFrame) -> pd.DataFrame:
modified DataFrame.
"""
df["identifier"] = (
df["individual"] + df["encountered"].astype(str)
df["individual"] + df["encountered"].astype('string')
).str.lower()

return df
@@ -329,7 +329,7 @@ def add_kp_manifest_data(df: pd.DataFrame, manifest_filename: str) -> pd.DataFra
given clinical records DataFrame *df*
"""
barcode = 'Barcode ID (Sample ID)'
dtypes = {barcode: str}
dtypes = {barcode: 'string'}

manifest_data = pd.read_excel(manifest_filename, sheet_name='KP', dtype=dtypes)

@@ -351,8 +351,8 @@ def create_identifiers(df: pd.DataFrame) -> pd.DataFrame:
given DataFrame *df*.
Contains some hard-coded values that may need to be updated in the future.
"""
df['individual'] = 'sch/year-1/childcare/' + df['study_id'].astype('str')
df['identifier'] = df['individual'] + '/' + df['week'].astype('str')
df['individual'] = 'sch/year-1/childcare/' + df['study_id'].astype('string')
df['identifier'] = df['individual'] + '/' + df['week'].astype('string')

return df

@@ -25,7 +25,7 @@ with open("data/omitted-states.txt", encoding = "UTF-8") as file:

states = (
pandas
.read_csv("data/states.txt", sep = "|", dtype = object)
.read_csv("data/states.txt", sep = "|", dtype = 'string')
.rename(columns = {
"STATE": "fips_code",
"STUSAB": "usps_code",
@@ -56,7 +56,7 @@
python_requires = ">=3.6",

install_requires = [
"id3c >=2019.1",
"id3c >=2020.1",
"click >=7.0",
"regex",
"requests",
@@ -66,7 +66,7 @@

extras_require = {
"locations": [
"pandas",
"pandas >=1.0.1,<2",
"snakemake",
],
},

0 comments on commit 0ee2b27

Please sign in to comment.
You can’t perform that action at this time.