# In [1]:
import random
import pandas as pd
from faker import Faker
from datetime import datetime

# Seed BOTH random sources so repeated runs generate the same synthetic values:
# `random.seed` only covers the stdlib `random` draws, while every `fake.*`
# call uses Faker's own generator, which must be seeded via `Faker.seed`.
# (Values derived from the current date/time still vary between runs.)
fake = Faker()
random.seed(42)
Faker.seed(42)

# Table sizes
n_hcps = 1_000        # number of rows in the HCP master table
n_activities = 5_000  # number of rows in the activities table

# Lookup values sampled by the generators below.
# Each designation is a (code, description) pair.
designations = list({
    "MD": "Medical Doctor",
    "DO": "Doctor of Osteopathy",
    "NP": "Nurse Practitioner",
    "PA": "Physician Assistant",
}.items())

specialties = [
    "Internal Medicine",
    "Rheumatology",
    "Pulmonary Diseases",
    "Gastroenterology",
    "Oncology",
    "Hospitalist",
    "Family Medicine",
    "Infectious Disease",
    "Nurse Practitioner",
]

# Two-letter state abbreviations used for licenses, addresses and territories.
states = [
    "NY", "CA", "TX", "FL", "AZ",
    "IL", "NJ", "LA", "MI", "OH",
]

# --------- HCP MASTER ---------
# Builds `n_hcps` synthetic prescriber records (one dict per row) and collects
# them into the `hcp_master_df` DataFrame. `pres_eid` is sequential, starting
# at 10000001; all other fields are random or Faker-generated.

# Status reason code and description are drawn together as a pair, so a row can
# never combine e.g. code "RETR" with the description "Location Unknown".
status_reasons = [("", ""), ("RETR", "Retired"), ("CNTLOC", "Location Unknown")]

# Take the run timestamp once so every row of this table shares the same
# sys_run_dt / execution_date, even if generation crosses a second or midnight.
hcp_run_ts = datetime.now()
hcp_sys_run_dt = hcp_run_ts.strftime("%Y-%m-%d %H:%M:%S")
hcp_execution_date = hcp_run_ts.strftime("%Y%m%d")
current_year = hcp_run_ts.year

hcp_master = []
for i in range(1, n_hcps+1):
    pres_eid = 10000000 + i
    first, last = fake.first_name(), fake.last_name()
    gender = random.choice(["M","F"])
    age = random.randint(30,80)
    # Derive experience from age (graduation at age 24-30) instead of drawing
    # the graduation year independently, so experience can never exceed what
    # the person's age allows, and use the current year rather than a fixed one.
    exp = age - random.randint(24,30)
    grad_year = current_year - exp
    desig, desig_desc = random.choice(designations)
    state = random.choice(states)
    specialty = random.choice(specialties)
    reason_code, reason_desc = random.choice(status_reasons)

    hcp_master.append({
        "pres_eid": pres_eid,
        "medical_education_number": random.randint(1000000000,9999999999),
        # License number is the state abbreviation followed by five digits.
        "state_license_number": f"{state}{random.randint(10000,99999)}",
        "medical_school_name": fake.company() + " School of Medicine",
        "graduation_year": grad_year,
        "years_of_experience": exp,
        "npi_number": random.randint(1000000000,9999999999),
        "first_name": first,
        # Middle name is stored as a single initial.
        "middle_name": fake.first_name()[0],
        "last_name": last,
        "email": f"{first.lower()}{last.lower()}@health.org",
        "marketing_email": f"{first.lower()}.{last.lower()}@promo.com",
        "phone_number": fake.numerify("##########"),
        "gender": gender,
        "age": age,
        "individual_type": "PRES",
        "professional_designation": desig,
        "professional_designation_description": desig_desc,
        "primary_address_1": fake.street_address(),
        "primary_address_2": "",
        "primary_address_3": "",
        "primary_address_4": "",
        "address_type": "BUSINESS",
        "city": fake.city(),
        "country": "US",
        "county": fake.city_suffix(),
        "state": state,
        "zip": fake.zipcode(),
        "primary_specialty": specialty,
        # Roughly half of the rows get an empty secondary specialty.
        "secondary_specialty": random.choice([random.choice(specialties),""]),
        "kaiser_flag": random.choice(["Y","N"]),
        "ama_flag": random.choice(["Y","N"]),
        "pdrp_flag": random.choice(["Y","N"]),
        "kol_flag": random.choice(["Y","N"]),
        "telemedicine_flg": random.choice(["Y","N"]),
        "new_tro_writer": random.choice(["Y","N"]),
        "onc_universe_flag": random.choice(["Y","N"]),
        "hiv_universe_flag": random.choice(["Y","N"]),
        "pres_status": random.choice(["ACTIVE","INACTIVE"]),
        "pres_status_reason_code": reason_code,
        "pres_status_reason_description": reason_desc,
        "total_referrals_received": round(random.uniform(0,100),1),
        "total_referrals_sent": round(random.uniform(0,200),1),
        "social_media_mentions": random.randint(0,50),
        "citations_per_publication": round(random.uniform(0,30),1),
        "total_payments": round(random.uniform(0,10000),2),
        "total_works": random.randint(0,20),
        "total_clinical_trials": random.randint(0,5),
        "total_recent_publications": random.randint(0,5),
        "total_congresses": random.randint(0,10),
        "no_of_patients": random.randint(0,2000),
        "ety_super_clas": "HCP",
        "do_not_call_vod__c": random.choice(["true","false"]),
        "persondonotcall": random.choice(["true","false"]),
        "email_optout_flg": random.choice(["true","false"]),
        "last_activity_date": fake.date_between(start_date="-3y", end_date="today"),
        "attribute_1_name": "HBV",
        "attribute_1_value": f"HBV: {random.randint(0,10)}",
        "attribute_2_name": "HCV",
        "attribute_2_value": f"HCV: {random.randint(0,10)}",
        "attribute_3_name": "PBC",
        "attribute_3_value": f"PBC: {random.randint(0,10)}",
        "attribute_4_name": "Note",
        "attribute_4_value": fake.word(),
        "attribute_5_name": "Tag",
        "attribute_5_value": fake.word(),
        "hbv_universe_flag": random.choice(["Y","N"]),
        "hcv_universe_flag": random.choice(["Y","N"]),
        "pbc_universe_flag": random.choice(["Y","N"]),
        "no_of_patients_hbv": random.randint(0,200),
        "no_of_patients_hcv": random.randint(0,200),
        "no_of_patients_pbc": random.randint(0,200),
        "sys_run_dt": hcp_sys_run_dt,
        "execution_date": hcp_execution_date,
        # NOTE(review): a fresh UUID is generated per row; confirm whether a
        # single id per run is intended for execution_cycle_id.
        "execution_cycle_id": fake.uuid4()
    })

hcp_master_df = pd.DataFrame(hcp_master)

# --------- ACTIVITY DATA ---------
# Builds `n_activities` synthetic activity records, each attached to a randomly
# chosen prescriber id from `hcp_master_df`, and collects them into
# `activity_df`.

# Draw prescriber ids from a plain list built once. `random.choice` indexes its
# argument as seq[i]; on a pandas Series that is a label lookup, which only
# works while the index happens to be the default 0..n-1 range.
pres_eids = hcp_master_df["pres_eid"].tolist()

# Take the run timestamp once so every row of this table shares the same
# sys_run_dt / execution_date, even if generation crosses a second or midnight.
activity_run_ts = datetime.now()
activity_sys_run_dt = activity_run_ts.strftime("%Y-%m-%d %H:%M:%S")
activity_execution_date = activity_run_ts.strftime("%Y%m%d")

activity = []
for _ in range(n_activities):
    pres_eid = random.choice(pres_eids)
    product = random.choice(["VEKLURY","BIKTARVY","TRODELVY"])
    # NOTE(review): ta is drawn independently of product (and indication);
    # confirm whether they should be tied together.
    ta = random.choice(["HIV","ONC","VKY"])
    # Territory name: "<city> - <state abbreviation><3-digit number>".
    territory = fake.city() + " - " + random.choice(states) + str(random.randint(100,999))

    activity.append({
        "pres_eid": pres_eid,
        "ta": ta,
        "product_name": product,
        "ndc_code": fake.ean(length=13),
        "indication": random.choice(["mTNBC","mUC","COVID","HBV","HCV"]),
        "territory_id": fake.bothify(text="???###"),
        # Sales force name is the therapeutic-area code suffixed with "TS".
        "sales_force_name": ta+"TS",
        "channel": random.choice(["CALLS","Zoom","Email","Telephone"]),
        "transaction_id": fake.uuid4(),
        "transaction_datetime": fake.date_time_this_decade(),
        "territory_name": territory,
        "rep_id": fake.uuid4(),
        "rep_name": fake.name(),
        "call_type": random.choice(["In-Person","Zoom","Telephone"]),
        "call_purpose": random.choice(["Detail Only","Group Detail","Follow Up"]),
        "parent_call": fake.uuid4(),
        "clm_flag": random.choice([True,False]),
        "time_spent": round(random.uniform(5,60),1),
        "status_vod__c": random.choice(["Submitted_vod","Delivered_vod","Opened_vod"]),
        "calls_excl_flg": random.choice(["Y","N"]),
        # About half of the rows carry the product as sample name; the rest
        # leave it empty.
        "sample_name": product if random.random()<0.5 else "",
        "ndc_name": product+" Sample",
        "sample_count": random.randint(0,20),
        "email_status": random.choice(["Delivered_vod","Opened_vod","Failed_vod"]),
        "email_sent": random.randint(0,5),
        "email_opened": random.randint(0,5),
        "email_last_open_date": fake.date_this_year(),
        "email_clicked": random.randint(0,5),
        "email_last_clicked_date": fake.date_this_year(),
        "last_device": random.choice(["iPad","Browser","iPhone"]),
        "email_subject": fake.sentence(),
        "valid_consent_exist": random.choice(["TRUE","FALSE"]),
        "document_description": fake.bs(),
        "action": random.choice(["OPENED_VOD","SUBMITTED_VOD","CLICKED_VOD"]),
        "click_url_vod__c": fake.url(),
        "event_msg_vod__c": fake.sentence(),
        "device_type_vod__c": random.choice(["iPad_vod","Laptop","Tablet"]),
        "gilead_asset_id": fake.uuid4(),
        "gilead_nba_suggestion_id": fake.uuid4(),
        "campaign_type": random.choice(["NBA","Non-NBA"]),
        "data_src_nm": random.choice(["Veeva CRM","EmailSys","CallCenter"]),
        "form_strg_cd": fake.lexify("FORM????"),
        "bus_mth_strt_dt": fake.date_this_year(),
        "bus_wk_end_dt": fake.date_this_year(),
        "attribute_1_name": "Note",
        "attribute_1_value": fake.word(),
        "attribute_2_name": "Category",
        "attribute_2_value": fake.word(),
        "attribute_3_name": "Score",
        "attribute_3_value": str(random.randint(1,100)),
        "attribute_4_name": "Risk",
        "attribute_4_value": random.choice(["Low","Medium","High"]),
        "attribute_5_name": "Remark",
        "attribute_5_value": fake.word(),
        "current_terr_id": fake.bothify("???###"),
        "sys_run_dt": activity_sys_run_dt,
        "execution_date": activity_execution_date,
        "execution_cycle_id": fake.uuid4()
    })

activity_df = pd.DataFrame(activity)

# Write both tables to CSV files in the working directory, leaving out the
# DataFrame index column.
for csv_name, frame in (
    ("hcp_master_full.csv", hcp_master_df),
    ("hcp_activity_full.csv", activity_df),
):
    frame.to_csv(csv_name, index=False)
