In [14]:
from typing import Optional

import pandas as pd

In [15]:
suffixes_to_remove = ["?fbclid=", "+&", "?_x_tr_", "?back="]
search_cache_identifier = "/search?q=cache:"
sacommunity_url = "https://sacommunity.org"

def clean_landing_page_column(text: str) -> str:
    if search_cache_identifier in text:
        text = text[text.index(sacommunity_url):].replace(sacommunity_url, "")

    for suffix_to_remove in suffixes_to_remove:
        if suffix_to_remove in text:
            text = text[:text.index(suffix_to_remove)]

    # remove underscore
    text = text.replace("_", " ")
    # remove /org/
    text = text.replace("/org/", "")

    return text.strip()

def get_organization_id(text: str) -> str:
    if "-" in text:
        return int(text[:text.index("-")])
    else:
        return None
    
def get_organization_name(text: str) -> str:
    if "-" in text:
        return text[text.index("-") + 1:]
    else:
        return None

# test texts
inputs = [
    "/org/196236-Dave's_Angels_Playgroup?fbclid=IwAR05WAQ0z5mwY7v1UEVmkDITFg7sDh8pcD8taJ3oGH4336EpkNZeP81BIKc",
    "/search?q=cache:UTs_a-1ZNgEJ:https://sacommunity.org/org/196341-Neighbourhood_Watch_-_Linden_Park_249+&cd=63&hl=en&ct=clnk&gl=bj",
    "/org/201669-Gifted_&_Talented_Children's_Association_of_SA_Inc.?_x_tr_sl=en&_x_tr_tl=th&_x_tr_hl=th&_x_tr_pto=sc",
    "/org/201830-Aged_Rights_Advocacy_Service_Inc.?back=https://www.google.com/search?client=safari&as_qdr=all&as_occt=any&safe=active&as_q=Age+advocate+for+South+Australia&channel=aplab&source=a-app1&hl=en",
    "/org/201950-SA_Ambulance_Service?_x_tr_sl=en&_x_tr_tl=fr&_x_tr_hl=fr&_x_tr_pto=nui,sc"
]

for input in inputs:
    print(clean_landing_page_column(input))
    

196236-Dave's Angels Playgroup
196341-Neighbourhood Watch - Linden Park 249
201669-Gifted & Talented Children's Association of SA Inc.
201830-Aged Rights Advocacy Service Inc.
201950-SA Ambulance Service


In [16]:
def clean_and_extract_organization(df_ga_orig: pd.DataFrame) -> pd.DataFrame:
    df_ga = df_ga_orig.dropna().copy()
    df_ga['organization_id_name'] = df_ga['Landing Page'].apply(clean_landing_page_column)
    df_ga['organization_id'] = df_ga['organization_id_name'].apply(get_organization_id)
    df_ga['organization_name'] = df_ga['organization_id_name'].apply(get_organization_name)
    return df_ga[["Landing Page", "organization_id_name","organization_id","organization_name", "Sessions"]]

In [29]:
def get_combined_data(df_org_id_and_session, df_sacommunity, df_google_analytics):
    results = []
    for index, row in df_org_id_and_session.iterrows():
        org_id = index
        
        session_count = row["Sessions"]
        
        # organization name from sa-community file
        org_names_sa_community = df_sacommunity[df_sacommunity['ID_19'] == org_id]["Org_name"].values
        organization_name_sa_community = ''
        is_record_available_in_sacommunity_db = False
        if len(org_names_sa_community) > 0:
            organization_name_sa_community = org_names_sa_community[0]
            is_record_available_in_sacommunity_db = True
       
        # organization name from google analytics file
        org_names_google = df_google_analytics[df_google_analytics["organization_id"] == org_id]["organization_name"].values
        organization_name_google = ''
        if len(org_names_google) > 0:
            organization_name_google = org_names_google[0]

        results.append({
            'org_id': org_id,
            'sessions_count': session_count,
            'organization_name_sa_community': organization_name_sa_community,
            'organization_name_google': organization_name_google,
            'is_record_available_in_sacommunity_db': is_record_available_in_sacommunity_db,
        })

    return pd.DataFrame(results)

In [30]:
def data_preprocessing(landing_page_file_path, sa_community_data_file_path):
    df_google_analytics = pd.read_excel(landing_page_file_path, sheet_name='Dataset1')
    df_google_analytics_cleaned = clean_and_extract_organization(df_google_analytics)
    df_grp_org_id = df_google_analytics_cleaned.groupby(by=['organization_id']).sum("Sessions")

    df_sacommunity_data = pd.read_csv(sa_community_data_file_path)

    print('google analytics data')
    display(df_google_analytics_cleaned.head())

    print('SA community data')
    display(df_sacommunity_data.head())

    return get_combined_data(df_grp_org_id, df_sacommunity_data, df_google_analytics_cleaned)
    

In [31]:
landing_page_file_path = './data/Burnside Council Landing page 2021-2022.xlsx'
sa_community_data_file_path = './data/Burnside Council-cu_export_2023-08-16_Data.Gov.au_export.csv'

data_df = data_preprocessing(landing_page_file_path, sa_community_data_file_path)
data_df

google analytics data


  warn("Workbook contains no default style, apply openpyxl's default")


Unnamed: 0,Landing Page,organization_id_name,organization_id,organization_name,Sessions
0,/org/201829-Cats_Assistance_To_Sterilise_Inc.,201829-Cats Assistance To Sterilise Inc.,201829,Cats Assistance To Sterilise Inc.,632
1,/org/202703-Drug_and_Alcohol_Services_SA_-_Wit...,202703-Drug and Alcohol Services SA - Withdraw...,202703,Drug and Alcohol Services SA - Withdrawal Serv...,295
2,/org/236722-Justices_of_the_Peace_-_City_of_Bu...,236722-Justices of the Peace - City of Burnside,236722,Justices of the Peace - City of Burnside,237
3,/org/201950-SA_Ambulance_Service,201950-SA Ambulance Service,201950,SA Ambulance Service,234
4,/org/201612-Dementia_Australia,201612-Dementia Australia,201612,Dementia Australia,214


SA community data


Unnamed: 0,ID_19,Org_name,Street_Address_Line_1,Street_Address_Line_2,Suburb,State,Postal_Code,Country,Postal_Address_Line_1,Postal_Address_Line_2,...,Organisati_Eligibility,Organisati_Services,Organisation_Created_Date,Organisation_Last_updated,IM_Screen_Name_1,IM_Screen_Name_2,IM_Screen_Name_3,IM_Screen_Name_4,IM_Screen_Name_5,IM_Screen_Name_6
0,194023,Motorcycling South Australia Inc.,"Motorcycling South Australia Inc., 251 The Pde",,Beulah Park,South Australia,5067.0,Australia,,,...,,Coordination and promotion of motorcycle sport...,2009-11-23 21:44:28,2022-11-16 11:54:08,https://www.facebook.com/motorcyclingsa/ 7,,,,,
1,194025,Orienteering SA,,,Glenside,South Australia,5065.0,Australia,c/o 5/355 Angas St,,...,,Coordination and promotion of orienteering in ...,2009-11-23 21:44:28,2022-07-24 15:14:31,https://www.facebook.com/OrienteeringSA 7,https://www.youtube.com/channel/UCSrZVpB1et3Jo...,,,,
2,194813,Burnside Library & Information Service,401 Greenhill Rd,,Tusmore,South Australia,5065.0,Australia,PO Box 9,,...,,Public library\r\nHome Energy Toolkit availabl...,2009-11-23 21:44:58,2022-12-07 10:38:10,BurnsideLibrary 7,BurnsideLibrary 4,burnsidelibrary 1,,,
3,196167,Burnside Family Church,88 Lockwood Rd,,Burnside,South Australia,5066.0,Australia,,,...,,Sunday Service 10am\r\nFri night kid's club an...,2009-11-23 21:45:53,2023-01-25 13:53:04,burnsidefamilychurch 7,BFamilyChurch 4,,,,
4,196171,Linden Park Primary School and OSHC/VAC,"Linden Park Primary School, 14 Hay Rd",,Linden Park,South Australia,5065.0,Australia,,,...,,Primary education - Reception to Year 7\r\nOut...,2009-11-23 21:45:53,2023-01-25 14:06:43,https://www.facebook.com/groups/3219860600/abo...,,,,,


Unnamed: 0,org_id,sessions_count,organization_name_sa_community,organization_name_google,is_record_available_in_sacommunity_db
0,194813,34,Burnside Library & Information Service,Burnside Library & Information Service,True
1,196167,11,Burnside Family Church,Burnside Family Church,True
2,196171,30,Linden Park Primary School and OSHC/VAC,Linden Park Primary School and OSHC VAC,True
3,196173,4,Burnside Primary School and OSHC/Vac Care,Burnside Primary School and OSHC Vac Care,True
4,196174,4,Rose Park Primary School,Rose Park Primary School,True
...,...,...,...,...,...
200,236280,58,Italian Folk Ensemble,Italian Folk Ensemble,True
201,236332,1,,Gentle Touch Orthodontics,False
202,236722,237,Justices of the Peace - Burnside,Justices of the Peace - City of Burnside,True
203,237283,7,Salvos Stores - Kensington Gardens,Salvos Stores - Kensington Gardens,True


In [32]:
# these records are problematic, they are found in google analytics, but not in sacommunity council based export 
data_df[data_df["is_record_available_in_sacommunity_db"] == False]

Unnamed: 0,org_id,sessions_count,organization_name_sa_community,organization_name_google,is_record_available_in_sacommunity_db
20,196208,19,,Zonta Club of Adelaide Torrens Inc.,False
35,196237,28,,Australian Girls Choir,False
68,196316,7,,Burnside Residents Action Group (Status unknow...,False
84,197170,38,,Rural & Remote Mental Health Service,False
105,201253,4,,St Matthew's Homes Inc.,False
109,201553,16,,Cancer Council SA,False
114,201673,14,,SPELD SA Inc.,False
129,202376,16,,Wendy's Nursery School and ELC,False
137,202837,5,,Rare Fruit Society SA Inc.,False
140,203109,19,,Natural Resources Adelaide & Mount Lofty Ranges,False


In [1]:
from selenium import webdriver
from selenium.webdriver.common.by import By

from selenium import webdriver
from chromedriver_py import binary_path # this will get you the path variable



# # deprecated but works in older selenium versions
# # driver = webdriver.Chrome(executable_path=binary_path)
# driver.get("http://www.python.org")
# assert "Python" in driver.title

def test_eight_components():
    driver = webdriver.Chrome(executable_path=binary_path)

    driver.get("https://www.selenium.dev/selenium/web/web-form.html")

    title = driver.title
    assert title == "Web form"

    driver.implicitly_wait(10) # 0.5

    text_box = driver.find_element(by=By.NAME, value="my-text")
    submit_button = driver.find_element(by=By.CSS_SELECTOR, value="button")

    text_box.send_keys("Selenium")
    submit_button.click()

    message = driver.find_element(by=By.ID, value="message")
    value = message.text
    assert value == "Received!"

    driver.quit()

test_eight_components()

  driver = webdriver.Chrome(executable_path=binary_path)


In [4]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from chromedriver_py import binary_path # this will get you the path variable

async def get_text_by_xpath(driver, x_path):
    return await driver.find_element(by=By.CSS_SELECTOR, value=x_path).text

driver = webdriver.Chrome(executable_path=binary_path)

driver.get("https://lga-sa.maps.arcgis.com/apps/instant/lookup/index.html?appid=db6cce7b773746b4a1d4ce544435f9da&find=130%20L%27Estrange%20Street%2C%20Glenunga")

# driver.implicitly_wait(60)
print('page source ', driver.page_source)
# //*[@id="FindMyCouncil_944448"]/div/div/div/div/div/div/table/tbody/tr[1]/th
# h1 = driver.find_element(by=By.XPATH, value='//*[@id="FindMyCouncil_944448"]/div/div/div/div/div/div/table/tbody/tr[1]/th')
# v1 = driver.find_element(by=By.XPATH, value='//*[@id="FindMyCouncil_944448"]/div/div/div/div/div/div/table/tbody/tr[1]/td')
# h2 = driver.find_element(by=By.XPATH, value='//*[@id="FindMyCouncil_944448"]/div/div/div/div/div/div/table/tbody/tr[2]/th')
# v2 = driver.find_element(by=By.XPATH, value='//*[@id="FindMyCouncil_944448"]/div/div/div/div/div/div/table/tbody/tr[2]/td')
# h3 = driver.find_element(by=By.XPATH, value='//*[@id="FindMyCouncil_944448"]/div/div/div/div/div/div/table/tbody/tr[3]/th')
# v3 = driver.find_element(by=By.XPATH, value='//*[@id="FindMyCouncil_944448"]/div/div/div/div/div/div/table/tbody/tr[3]/td')
print('getting element for council')
print(await get_text_by_xpath(driver, '#FindMyCouncil_944448 > div > div > div > div > div > div > table > tbody > tr:nth-child(1) > th'))
print()
# print(v1.text)

driver.quit

  driver = webdriver.Chrome(executable_path=binary_path)


page source  <html class="hydrated" calcite-hydrated=""><head>
  <meta charset="utf-8"><style data-styles="">instant-apps-interactive-legend,instant-apps-export,instant-apps-interactive-legend-group-legend-element-caption,instant-apps-measurement,instant-apps-control-panel,instant-apps-filter-list,instant-apps-keyboard-shortcuts,instant-apps-popover,instant-apps-popovers,instant-apps-scoreboard,instant-apps-social-share,instant-apps-interactive-legend-layer-element-caption,instant-apps-interactive-legend-count,instant-apps-interactive-legend-group-legend-element,instant-apps-interactive-legend-layer-element,instant-apps-interactive-legend-legend-element,instant-apps-interactive-legend-relationship,instant-apps-interactive-legend-legend-element-caption,instant-apps-interactive-legend-classic,instant-apps-header,instant-apps-measurement-tool{visibility:hidden}.hydrated{visibility:inherit}</style><style data-styles="">calcite-icon,calcite-value-list,calcite-value-list-item,calcite-list,ca

NoSuchElementException: Message: no such element: Unable to locate element: {"method":"css selector","selector":"#FindMyCouncil_944448 > div > div > div > div > div > div > table > tbody > tr:nth-child(1) > th"}
  (Session info: chrome=117.0.5938.92)
Stacktrace:
0   chromedriver_mac-arm64              0x0000000101112d98 chromedriver_mac-arm64 + 4337048
1   chromedriver_mac-arm64              0x000000010110ae14 chromedriver_mac-arm64 + 4304404
2   chromedriver_mac-arm64              0x0000000100d37a5c chromedriver_mac-arm64 + 293468
3   chromedriver_mac-arm64              0x0000000100d7cd50 chromedriver_mac-arm64 + 576848
4   chromedriver_mac-arm64              0x0000000100db7908 chromedriver_mac-arm64 + 817416
5   chromedriver_mac-arm64              0x0000000100d70a5c chromedriver_mac-arm64 + 526940
6   chromedriver_mac-arm64              0x0000000100d71908 chromedriver_mac-arm64 + 530696
7   chromedriver_mac-arm64              0x00000001010d8de4 chromedriver_mac-arm64 + 4099556
8   chromedriver_mac-arm64              0x00000001010dd2a0 chromedriver_mac-arm64 + 4117152
9   chromedriver_mac-arm64              0x00000001010e352c chromedriver_mac-arm64 + 4142380
10  chromedriver_mac-arm64              0x00000001010ddda0 chromedriver_mac-arm64 + 4119968
11  chromedriver_mac-arm64              0x00000001010b5a74 chromedriver_mac-arm64 + 3955316
12  chromedriver_mac-arm64              0x00000001010faa48 chromedriver_mac-arm64 + 4237896
13  chromedriver_mac-arm64              0x00000001010fabc4 chromedriver_mac-arm64 + 4238276
14  chromedriver_mac-arm64              0x000000010110aa8c chromedriver_mac-arm64 + 4303500
15  libsystem_pthread.dylib             0x00000001865affa8 _pthread_start + 148
16  libsystem_pthread.dylib             0x00000001865aada0 thread_start + 8
