In [258]:
from typing import Optional

import pandas as pd

In [259]:
# Markers after which the remainder of a landing-page path is discarded
# (everything from the first occurrence of a marker onwards is cut off).
suffixes_to_remove = ["?fbclid=", "+&", "?_x_tr_", "?back="]
# Substring that marks a cached-page path wrapping the full site URL.
search_cache_identifier = "/search?q=cache:"
# Site origin removed from cached-page paths so only the relative path remains.
sacommunity_url = "https://sacommunity.org"

def clean_landing_page_column(text: str) -> str:
    """Reduce a raw landing-page path to an "<id>-<organisation name>" string.

    Steps, in order:
      1. For cached-page paths, drop everything before the embedded site URL
         and remove the site origin itself.
      2. Cut the text at the first occurrence of each marker listed in
         ``suffixes_to_remove``.
      3. Replace underscores with spaces and remove "/org/".
      4. Strip surrounding whitespace.

    Args:
        text: raw landing-page value.

    Returns:
        The cleaned text. Input that matches none of the patterns is returned
        with only the underscore/"/org/"/whitespace clean-up applied.
    """
    if search_cache_identifier in text:
        url_start = text.find(sacommunity_url)
        # A cache path that does not embed the expected origin is left as-is
        # rather than raising ValueError and aborting a whole DataFrame.apply.
        if url_start != -1:
            text = text[url_start:].replace(sacommunity_url, "")

    for suffix_to_remove in suffixes_to_remove:
        cut_at = text.find(suffix_to_remove)
        if cut_at != -1:
            text = text[:cut_at]

    # Underscores stand in for spaces in the path.
    text = text.replace("_", " ")
    # Drop the "/org/" path segment so the text starts at the organisation id.
    text = text.replace("/org/", "")

    return text.strip()

def get_organization_id(text: str) -> Optional[int]:
    """Extract the numeric organisation id that precedes the first hyphen.

    Args:
        text: cleaned landing-page text, expected as "<id>-<name>".

    Returns:
        The id as an int, or None when the text has no hyphen or the part
        before the first hyphen is not an integer (e.g. "about-us").
    """
    prefix, separator, _ = text.partition("-")
    if not separator:
        return None
    try:
        return int(prefix)
    except ValueError:
        # Not an organisation page; report "no id" instead of failing the
        # whole column transformation.
        return None
    
def get_organization_name(text: str) -> Optional[str]:
    """Return the part of the text that follows the first hyphen.

    Args:
        text: cleaned landing-page text, expected as "<id>-<name>".

    Returns:
        Everything after the first hyphen (later hyphens are kept as part of
        the name), or None when the text contains no hyphen.
    """
    _, separator, name = text.partition("-")
    return name if separator else None

# Sample landing-page values, one per kind of noise the cleaner has to remove,
# paired below with the cleaned text expected for each.
inputs = [
    "/org/196236-Dave's_Angels_Playgroup?fbclid=IwAR05WAQ0z5mwY7v1UEVmkDITFg7sDh8pcD8taJ3oGH4336EpkNZeP81BIKc",
    "/search?q=cache:UTs_a-1ZNgEJ:https://sacommunity.org/org/196341-Neighbourhood_Watch_-_Linden_Park_249+&cd=63&hl=en&ct=clnk&gl=bj",
    "/org/201669-Gifted_&_Talented_Children's_Association_of_SA_Inc.?_x_tr_sl=en&_x_tr_tl=th&_x_tr_hl=th&_x_tr_pto=sc",
    "/org/201830-Aged_Rights_Advocacy_Service_Inc.?back=https://www.google.com/search?client=safari&as_qdr=all&as_occt=any&safe=active&as_q=Age+advocate+for+South+Australia&channel=aplab&source=a-app1&hl=en",
    "/org/201950-SA_Ambulance_Service?_x_tr_sl=en&_x_tr_tl=fr&_x_tr_hl=fr&_x_tr_pto=nui,sc"
]
expected_outputs = [
    "196236-Dave's Angels Playgroup",
    "196341-Neighbourhood Watch - Linden Park 249",
    "201669-Gifted & Talented Children's Association of SA Inc.",
    "201830-Aged Rights Advocacy Service Inc.",
    "201950-SA Ambulance Service",
]

# The loop variable is deliberately not called `input`, which would shadow the
# builtin input() for the rest of the kernel session.
for landing_page, expected in zip(inputs, expected_outputs):
    cleaned = clean_landing_page_column(landing_page)
    # Fail loudly if the cleaner regresses on a known sample.
    assert cleaned == expected, (landing_page, cleaned)
    print(cleaned)
    

196236-Dave's Angels Playgroup
196341-Neighbourhood Watch - Linden Park 249
201669-Gifted & Talented Children's Association of SA Inc.
201830-Aged Rights Advocacy Service Inc.
201950-SA Ambulance Service


In [260]:
def clean_and_extract_organization(df_ga_orig: pd.DataFrame) -> pd.DataFrame:
    """Add organisation id/name columns derived from the 'Landing Page' column.

    Rows containing any missing value are dropped before processing, and the
    input frame itself is not modified.

    Args:
        df_ga_orig: frame with at least 'Landing Page' and 'Sessions' columns.

    Returns:
        A frame with the columns 'Landing Page', 'organization_id_name',
        'organization_id', 'organization_name' and 'Sessions', in that order.
    """
    selected_columns = [
        "Landing Page",
        "organization_id_name",
        "organization_id",
        "organization_name",
        "Sessions",
    ]

    # Work on a copy of the complete rows only.
    complete_rows = df_ga_orig.dropna().copy()

    # Clean once, then split the "<id>-<name>" text into its two parts.
    id_and_name = complete_rows['Landing Page'].apply(clean_landing_page_column)
    complete_rows['organization_id_name'] = id_and_name
    complete_rows['organization_id'] = id_and_name.apply(get_organization_id)
    complete_rows['organization_name'] = id_and_name.apply(get_organization_name)

    return complete_rows[selected_columns]

In [283]:
def get_combined_data(df_org_id_and_session, df_sacommunity):
    """Attach the SAcommunity organisation name to each organisation's sessions.

    Args:
        df_org_id_and_session: frame indexed by organisation id with a
            "Sessions" column.
        df_sacommunity: frame with "ID_19" (organisation id) and "Org_name"
            columns.

    Returns:
        A DataFrame with one row per row of ``df_org_id_and_session`` and the
        columns ``org_id``, ``sessions_count``, ``organization_name`` (empty
        string when the id is not in ``df_sacommunity``) and
        ``is_record_available_in_sacommunity_db``. The columns are present
        even when there are no rows.
    """
    output_columns = [
        'org_id',
        'sessions_count',
        'organization_name',
        'is_record_available_in_sacommunity_db',
    ]

    # Build the id -> name lookup once instead of scanning df_sacommunity for
    # every row; setdefault keeps the first name when an id is duplicated.
    name_by_org_id = {}
    for directory_id, directory_name in zip(df_sacommunity['ID_19'], df_sacommunity['Org_name']):
        name_by_org_id.setdefault(directory_id, directory_name)

    results = []
    for org_id, row in df_org_id_and_session.iterrows():
        is_record_available_in_sacommunity_db = org_id in name_by_org_id
        results.append({
            'org_id': org_id,
            'sessions_count': row["Sessions"],
            'organization_name': name_by_org_id.get(org_id, ''),
            'is_record_available_in_sacommunity_db': is_record_available_in_sacommunity_db,
        })

    # Passing the columns explicitly keeps the schema for empty input, so
    # downstream column filters do not fail with KeyError.
    return pd.DataFrame(results, columns=output_columns)

In [284]:
def data_preprocessing(landing_page_filepath, sa_community_data_file_path):
    """Load both data files and build the per-organisation session summary.

    Args:
        landing_page_filepath: path to the Excel workbook whose 'Dataset1'
            sheet holds the landing-page data ('Landing Page' and 'Sessions'
            columns).
        sa_community_data_file_path: path to the SAcommunity CSV export
            ('ID_19' and 'Org_name' columns).

    Returns:
        The DataFrame produced by ``get_combined_data``: one row per
        organisation id with its total sessions and matched name.
    """
    # Use the function's own parameter; a differently spelled name here would
    # only resolve if a stray global happened to exist in the kernel.
    df_google_analytics = pd.read_excel(landing_page_filepath, sheet_name='Dataset1')
    df_cleaned = clean_and_extract_organization(df_google_analytics)
    # Select the column explicitly: sum()'s first positional argument is
    # `numeric_only`, not a column name.
    df_grp_org_id = df_cleaned.groupby(by=['organization_id'])[['Sessions']].sum()

    df_sacommunity_data = pd.read_csv(sa_community_data_file_path)

    return get_combined_data(df_grp_org_id, df_sacommunity_data)
    

In [285]:
# Input files, given relative to the notebook's working directory:
# the landing-page workbook and the SAcommunity CSV export.
landing_page_filepath = './data/Burnside Council Landing page 2021-2022.xlsx'
sa_community_data_file_path = './data/Burnside Council-cu_export_2023-08-16_Data.Gov.au_export.csv'

# Build the per-organisation summary and show it as the cell output.
data_df = data_preprocessing(landing_page_filepath, sa_community_data_file_path)
data_df

  warn("Workbook contains no default style, apply openpyxl's default")


Unnamed: 0,org_id,sessions_count,organization_name,is_record_available_in_sacommunity_db
0,194813,34,Burnside Library & Information Service,True
1,196167,11,Burnside Family Church,True
2,196171,30,Linden Park Primary School and OSHC/VAC,True
3,196173,4,Burnside Primary School and OSHC/Vac Care,True
4,196174,4,Rose Park Primary School,True
...,...,...,...,...
200,236280,58,Italian Folk Ensemble,True
201,236332,1,,False
202,236722,237,Justices of the Peace - Burnside,True
203,237283,7,Salvos Stores - Kensington Gardens,True


In [286]:
# Problematic records: organisation ids that appear in the Google Analytics
# data but have no matching entry in the SAcommunity council-based export.
data_df[data_df["is_record_available_in_sacommunity_db"] == False]

Unnamed: 0,org_id,sessions_count,organization_name,is_record_available_in_sacommunity_db
20,196208,19,,False
35,196237,28,,False
68,196316,7,,False
84,197170,38,,False
105,201253,4,,False
109,201553,16,,False
114,201673,14,,False
129,202376,16,,False
137,202837,5,,False
140,203109,19,,False
