In [1]:
import pandas as pd
import os
import plotly.express as px
import plotly.graph_objects as go

In [2]:
# Location of the input CSV. Set the GRANTS_DATA_CSV environment variable to run
# this notebook on another machine; when it is unset, the original local path is
# used so existing behaviour is unchanged.
path = os.environ.get(
    "GRANTS_DATA_CSV",
    "/Users/sivaguganjayachandran/PycharmProjects/corewell_health/siva/data/data.csv",
)
df = pd.read_csv(path)

In [3]:
import pandas as pd
import numpy as np

# --- Normalize orgs ---
def normalize_org(text: str) -> "str | None":
    """Map free text to one of the tracked organisation labels.

    The input is converted with ``str()`` and upper-cased, then searched for
    organisation name fragments. The first fragment found wins, checked in
    this order: Spectrum, Beaumont, Corewell.

    Parameters
    ----------
    text : value to search; any object is accepted because it is passed
        through ``str()`` first.

    Returns
    -------
    "Spectrum", "Beaumont" or "Corewell", or ``None`` when no fragment occurs.
    """
    haystack = str(text).upper()

    # Order matters: text naming several organisations resolves to the first
    # entry that matches. "BEAUMON" (no trailing T) also covers "BEAUMONT".
    fragments = (
        ("SPECTRUM", "Spectrum"),
        ("BEAUMON", "Beaumont"),
        ("COREWELL", "Corewell"),
    )
    for fragment, label in fragments:
        if fragment in haystack:
            return label

    return None  # ignore others

# --- Build grant key ---
def make_grant_key(r):
    """Build a string key identifying the grant that a row belongs to.

    Rows with a non-null ``id`` that converts to ``int`` get ``"id:<int>"``.
    Every other row gets a composite key built from the title (first 150
    characters), start date, expiry date and obligated funds.

    Parameters
    ----------
    r : row-like object supporting ``.get`` (for example a ``pd.Series``
        produced by ``DataFrame.apply(axis=1)``, or a ``dict``).

    Returns
    -------
    str
        The grant key.
    """
    raw_id = r.get("id")
    if pd.notnull(raw_id):
        try:
            return f"id:{int(raw_id)}"
        except (TypeError, ValueError, OverflowError):
            # The id is present but not convertible to an integer
            # (e.g. "abc" or inf): fall through to the composite key.
            pass

    # A missing title is None/NaN rather than a string (the .get default only
    # applies when the key itself is absent), and those cannot be sliced.
    # Treat a missing title as empty so the composite key can still be built.
    title = r.get("title", "")
    if not isinstance(title, str):
        title = "" if pd.isnull(title) else str(title)

    return (
        f"title:{title[:150]}"
        f"|start:{r.get('startDate')}"
        f"|exp:{r.get('expDate')}"
        f"|funds:{r.get('fundsObligatedAmt')}"
    )

# --- Main logic ---
def add_org_flags(df: pd.DataFrame) -> pd.DataFrame:
    """Tag every grant with the organisations found on its rows and
    return one row per grant.

    Steps:
      1. Parse ``date``, ``startDate`` and ``expDate`` (when present) as
         datetimes; unparseable values become ``NaT``.
      2. Add ``grant_key`` (via ``make_grant_key``) and ``_org`` (via
         ``normalize_org`` applied to orgCandidate + awardeeName + title).
      3. Per ``grant_key``, collect the distinct non-empty ``_org`` labels and
         derive ``org_combo_flag`` (labels joined with " & ", or "Other"),
         ``org_count_flag`` (number of labels) and ``is_multi_org_flag``
         (more than one label).
      4. Collapse to one row per ``grant_key``.

    Parameters
    ----------
    df : input frame. It is not modified; all work happens on a copy.

    Returns
    -------
    pd.DataFrame
        One row per ``grant_key`` (sorted by key, fresh integer index), with
        ``grant_key`` as the first column, followed by the input columns,
        ``_org`` and the three flag columns.
    """
    flag_cols = ["org_combo_flag", "org_count_flag", "is_multi_org_flag"]

    # Work on a copy so the caller's frame keeps its original dtypes/columns.
    # Drop flag columns left over from a previous run; otherwise the merge
    # below would produce *_x / *_y duplicates.
    df = df.drop(columns=flag_cols, errors="ignore").copy()

    # ensure dates are parsed
    for c in ["date", "startDate", "expDate"]:
        if c in df.columns:
            df[c] = pd.to_datetime(df[c], errors="coerce")

    df["grant_key"] = df.apply(make_grant_key, axis=1)

    # find org for each row (search orgCandidate, awardeeName, title)
    # NOTE(review): the title is part of the searched text, so a title that
    # merely contains one of the name fragments is attributed to that
    # organisation - confirm this is intended.
    df["_org"] = df.apply(
        lambda r: normalize_org(
            f"{r.get('orgCandidate','')} {r.get('awardeeName','')} {r.get('title','')}"
        ),
        axis=1
    )

    # collect unique orgs per grant (None / empty labels are discarded, so
    # only Corewell/Spectrum/Beaumont remain)
    combo = (
        df.groupby("grant_key")["_org"]
          .apply(lambda s: sorted({o for o in s if o}))
          .reset_index()
    )

    def list_to_flag(orgs):
        return " & ".join(orgs) if orgs else "Other"

    combo["org_combo_flag"] = combo["_org"].apply(list_to_flag)
    combo["org_count_flag"] = combo["_org"].apply(len)
    combo["is_multi_org_flag"] = combo["org_count_flag"] > 1
    combo = combo.drop(columns=["_org"])

    # merge flags back
    df = df.merge(combo, on="grant_key", how="left")

    # deduplicate: one row per grant_key.
    # groupby already returns the keys sorted and keeps the input row order
    # inside each group, so no separate sort is needed. The previous
    # sort_values call used the default sort algorithm, which is not
    # guaranteed stable and could make the surviving row depend on how ties
    # were reordered. Note that .first() takes the first non-null value of
    # each column within the group.
    df_simple = df.groupby("grant_key", as_index=False).first()

    return df_simple

# --- Usage ---
# `df` is the raw frame read with pd.read_csv in an earlier cell.
df_simple = add_org_flags(df)

# Quick check: first rows of the grant key and the derived flag columns
df_simple.head().loc[
    :, ["grant_key", "org_combo_flag", "is_multi_org_flag", "org_count_flag"]
]

Unnamed: 0,grant_key,org_combo_flag,is_multi_org_flag,org_count_flag
0,id:1002878,Beaumont & Corewell & Spectrum,True,3
1,id:1004847,Beaumont & Corewell & Spectrum,True,3
2,id:1006950,Beaumont & Corewell & Spectrum,True,3
3,id:1007261,Beaumont & Corewell & Spectrum,True,3
4,id:1009244,Beaumont & Corewell & Spectrum,True,3


In [4]:
# Display the de-duplicated frame (one row per grant_key) with all of its columns.
df_simple

Unnamed: 0.1,grant_key,Unnamed: 0,id,agency,awardeeName,awardeeCity,awardeeStateCode,piFirstName,piLastName,title,date,startDate,expDate,fundsObligatedAmt,abstractText,orgCandidate,_org,org_combo_flag,org_count_flag,is_multi_org_flag
0,id:1002878,1396,1002878,NSF,University of Connecticut Health Center,FARMINGTON,CT,Raquell,Holmes,Pilot: Improvisational Theater for Computing ...,2010-09-08,2010-09-01,2012-08-31,98058,Improvisational Theater for Computing Scientis...,Beaumont Health,Beaumont,Beaumont & Corewell & Spectrum,3,True
1,id:1004847,907,1004847,NSF,University of North Texas Health Science Cente...,FORT WORTH,TX,Rita,Patterson,2010 Summer Bioengineering Conference: June 16...,2010-04-12,2010-04-15,2011-03-31,20000,1004847 Patterson The Summer Bioengineerin...,Spectrum Health,Spectrum,Beaumont & Corewell & Spectrum,3,True
2,id:1006950,403,1006950,NSF,University of New Mexico Health Sciences Center,ALBUQUERQUE,NM,Stephanie,Ruby,EAGER: Shifting the Spliceosome's Gears,2010-01-04,2010-02-01,2011-01-31,30000,Intellectual merit. Spliceosomal RNAs are thou...,Corewell Health,Corewell,Beaumont & Corewell & Spectrum,3,True
3,id:1007261,375,1007261,NSF,Health Research Incorporated/New York State De...,MENANDS,NY,Liaquat,Husain,Collaborative Research: Sources of Black Carb...,2010-09-16,2010-10-01,2015-03-31,273917,Particles are important in the global climate ...,Corewell Health,Corewell,Beaumont & Corewell & Spectrum,3,True
4,id:1009244,1386,1009244,NSF,Florida Department of Health Division of Infor...,TALLAHASSEE,FL,Kendra,Goff,CNH: Collaborative Research: Modeling the Dyna...,2010-09-29,2010-10-01,2015-09-30,143032,"Project Abstract Around the world, harmful al...",Beaumont Health,Beaumont,Beaumont & Corewell & Spectrum,3,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
508,id:964517,412,964517,NSF,Louisiana State University Health Sciences Center,NEW ORLEANS,LA,Siqiong,Liu,Physiological Consequences of Glutamate Recept...,2010-05-04,2009-09-01,2011-08-31,55168,The long-term goal of this project is to revea...,Corewell Health,Corewell,Beaumont & Corewell & Spectrum,3,True
509,id:964613,1392,964613,NSF,The University of Texas Health Science Center ...,HOUSTON,TX,Elmer,Bernstam,III: Medium: Collaborative Research: Data Mini...,2010-09-07,2010-09-01,2015-08-31,599607,A clinical data warehouse (CDW) is a repositor...,Beaumont Health,Beaumont,Beaumont & Corewell & Spectrum,3,True
510,id:964728,912,964728,NSF,University of Toledo Health Science Campus,TOLEDO,OH,Robert,Blumenthal,Regulation of Type II Restriction-Modification...,2010-03-03,2010-03-01,2013-08-31,569999,Intellectual Merit: The biosphere is dominate...,Spectrum Health,Spectrum,Beaumont & Corewell & Spectrum,3,True
511,id:966482,939,966482,NSF,"Loyola University of Chicago, Health Sciences ...",MAYWOOD,IL,Charles,"Webber, Jr",DHB COLLABORATIVE: Lifespan Dynamics of Cognit...,2009-12-04,2009-01-11,2012-12-31,38132,How well can you walk and talk at the same tim...,Spectrum Health,Spectrum,Beaumont & Corewell & Spectrum,3,True


In [6]:
# Persist the de-duplicated frame. index=False keeps the row index out of the
# file; writing it would add an unnamed first column that shows up as an extra
# "Unnamed: ..." column when the CSV is read back.
df_simple.to_csv("cleaned_data.csv", index=False)