In [6]:
import pandas as pd

In [7]:
def clean_df(df, extra_col):
    # Drop first row (empty)
    df = df.drop(df.index[0])

    # Add year column name
    df.iloc[0, 0] = "year"

    # Make years the header
    df.columns = df.iloc[0]
    df = df.drop(df.index[0])

    # Drop all rows starting with Nan
    df = df.dropna(subset=[df.columns[0]])

    # # Remove indicator group titles
    patterns_to_remove = [
        "Utemiljö",
        "Missbruksproblem", 
        "Utomhusstörningar",
        "Andel uppfattat minst ett problem",
        "Utsatthet för brott",
        "Oro för att utsättas för brott",
        "Konkret känsla av otrygghet",
        "Polisens agerande mot problem",
        "Tillit"
    ]
    mask = ~df.iloc[:, 0].astype(str).str.startswith(tuple(patterns_to_remove))
    df = df[mask]

    # Transpose df
    df = df.transpose()

    # Make the first row the header
    df.columns = df.iloc[0]
    df = df.drop(df.index[0])

    # Reset index
    df = df.reset_index()

    # Rename first column to "year"
    df.columns.values[0] = "year"

    # Remove rows with 'Year' == '2020_1' or '2016_1'
    df = df[~df['year'].isin(['2020_1', '2016_1'])]

    # Rename '2020_2' to '2020' and '2016_2' to '2016', etc
    df['year'] = df['year'].replace({'2020_2': '2020', '2016_2': '2016', '2006*': '2006'})
    
    # Rename special characters ä, å and ö
    df.columns = df.columns.str.replace('ä', 'a').str.replace('å', 'a').str.replace('ö', 'o')

    # Rename columns to snake_casing for easier coding
    df.columns = df.columns.str.replace(' ', '_').str.replace('.', '').str.replace(',', '').str.lower()

    # Transform years to numbers
    df["year"] = df["year"].astype(int)

    # Transform numeric columns to numbers
    df = df.apply(lambda x: pd.to_numeric(x.astype(str).str.replace(',', '.'), errors="coerce") if x.dtype == 'object' else x)

    # Add extra column
    if isinstance(extra_col, dict):
        for col_name, col_value in extra_col.items():
            df.insert(0, col_name, col_value)

    return df

In [8]:
helsingborg_raw = pd.read_csv('Resultatbild - LPO Helsingborg.csv', sep=";")
kristianstad_raw = pd.read_csv('Resultatbild - LPO Kristianstad.csv', sep=";")
landskrona_raw = pd.read_csv('Resultatbild - LPO Landskrona.csv', sep=";")
lund_raw = pd.read_csv('Resultatbild - LPO Lund.csv', sep=";")
lund_raw = pd.read_csv('Resultatbild - LPO Lund.csv', sep=";")
malmo_raw = pd.read_csv('Resultatbild - PO Malmo.csv', sep=";")

# Clean all raw data
helsingborg_df = clean_df(helsingborg_raw, {'municipality': 'Helsinborg'})
kristianstad_df = clean_df(kristianstad_raw, {'municipality': 'Kristiansstad'})
landskrona_df = clean_df(landskrona_raw, {'municipality': 'LandsKrona'})
lund_df = clean_df(lund_raw, {'municipality': 'Lund'})
malmo_df = clean_df(malmo_raw, {'municipality': 'Malmö'})

# Join clean dfs
df = pd.concat([
    helsingborg_df,
    kristianstad_df,
    landskrona_df,
    lund_df,
    malmo_df
])


print(f'{df.size} entries')

# Sample df
df.sample(20)

3485 entries


year,municipality,year.1,nedskrapning,skadegorelse,berusade_personer_utomhus,narkotikapaverkade_personer_utomhus,bostader_tillhall_for_alkoholmissbrukare,bostader_tillhall_for_narkotikamissbrukare,observerat_narkotikaforsaljning_i_omradet,folk_brakar_och_slass_utomhus,...,restaurang_bar_eller_disco,sportevenemang,foreningsmoten_kurser_och_liknande,åka_buss_eller_tag,andel_avstatt_fran_nagon_typ_av_aktivitet,polisen_bryr_sig_om_de_lokala_problemen,polisen_bryr_sig_inte_om_de_lokala_problemen,relationskvot,boende_skulle_ej_agera_vid_slagsmal,svag_sammanhallning_i_bostadsomradet
9,Helsinborg,2016,42.09,40.93,22.05,17.86,8.89,9.9,,15.86,...,10.77,12.34,4.86,13.72,25.94,47.22,15.32,32.45,,
13,Lund,2010,30.23,33.22,13.26,9.22,5.04,4.38,,7.31,...,3.62,1.57,1.64,7.19,9.45,43.71,11.19,25.6,,
3,LandsKrona,2021,36.95,29.82,12.78,12.46,4.61,5.94,8.12,11.09,...,8.89,4.26,3.64,11.19,15.4,49.44,12.67,25.62,17.79,11.41
16,LandsKrona,2008,36.29,40.65,22.69,11.22,9.71,8.22,,18.67,...,12.46,3.91,5.27,11.98,22.55,38.32,20.82,54.35,,
15,Kristiansstad,2009,30.78,32.36,16.02,9.61,6.66,6.98,,11.62,...,5.09,2.76,2.48,8.68,12.34,47.12,12.18,25.84,,
6,LandsKrona,2019,40.06,34.74,15.14,14.07,7.32,8.61,,12.4,...,11.46,5.46,5.55,14.4,20.28,49.17,14.42,29.32,19.15,13.08
1,Helsinborg,2023,42.11,39.27,16.53,18.59,8.28,10.0,18.28,12.72,...,9.4,7.35,3.4,14.02,20.47,53.98,11.24,20.83,19.76,14.55
8,Malmö,2017,47.16,39.82,21.2,21.7,9.84,11.24,,16.92,...,10.03,6.83,5.52,13.95,20.67,49.83,14.14,28.37,22.66,18.44
12,Helsinborg,2012,41.1,37.88,16.46,11.33,7.39,7.7,,10.5,...,5.05,3.6,3.56,9.34,15.28,43.84,12.47,28.44,,
7,Lund,2017,35.28,31.07,12.8,8.91,3.98,4.2,,7.32,...,5.04,2.75,2.42,8.29,11.96,43.18,11.25,26.05,14.63,14.14


In [9]:
df.to_csv('recurring.csv')