Imports and function to get the population of the departments

In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
from unidecode import unidecode
import geopandas


def crawlpop():
    wikiurl = 'https://en.wikipedia.org/wiki/List_of_Colombian_departments_by_population'
    table_class="wikitable sortable jquery-tablesorter"
    response=requests.get(wikiurl)
    #print(response.status_code)
    soup = BeautifulSoup(response.text, 'html.parser')
    coltable=soup.find('table',{'class':"wikitable"})
    df=pd.read_html(str(coltable))
    df=pd.DataFrame(df[0])
    #print(df.head())
    dfpop = pd.DataFrame(columns=["dep","population"])
    dfpop["dep"] = df["Department"]
    dfpop["population"] = df["Population (2020)[1]"]
    #print(dfpop)
    dfpop.to_csv("data/temppop.csv")

crawlpop()

Preparing the data

In [3]:
df = pd.read_csv("data/DV.csv", low_memory= False)
#taken from https://www.kaggle.com/datasets/oscardavidperilla/domestic-violence-in-colombia
df2 = pd.read_csv("data/temppop.csv", encoding="utf-8")
df = df[df.GENERO != "NO REPORTA"]
df = df[df.GENERO != "-"]
df = df[df.MUNICIPIO != "NO REPORTA"]
df = df[df["ARMAS MEDIOS"] != "NO REPORTA"]
df = df[df["ARMAS MEDIOS"] != "NO REPORTADO"]
df = df[df["ARMAS MEDIOS"] != "-"]
df = df.dropna()
df["DEPARTAMENTO"] = df["DEPARTAMENTO"].apply(lambda x: unidecode(x))
df.loc[df["MUNICIPIO"] == "BOGOTÁ D.C. (CT)", "DEPARTAMENTO"] = "SANTAFE DE BOGOTA D.C"
deps = []
#deps = [['ATLÁNTICO', 'BOYACÁ', 'CAQUETÁ', 'CASANARE', 'CUNDINAMARCA', 'SUCRE', 'VALLE', 'HUILA', 'ANTIOQUIA', 'ARAUCA', 'BOLÍVAR', 'CALDAS', 'CAUCA', 'CESAR', 'CHOCÓ', 'CÓRDOBA', 'MAGDALENA', 'META', 'NARIÑO', 'NORTE DE SANTANDER', 'PUTUMAYO', 'RISARALDA', 'SANTANDER', 'TOLIMA', 'VAUPÉS', 'GUAVIARE', 'GUAJIRA', 'QUINDÍO', 'AMAZONAS', 'VICHADA', 'GUAINÍA', 'SAN ANDRÉS', 'NO REPORTA']]
for x in df["DEPARTAMENTO"]:
   if x not in deps:
        deps.append(x)
  
df2.dep = df2.dep.str.upper()

#df2["population"] = df2["population"].replace([df2[df2["dep"] == "CUNDINAMARCA"]["population"]], df2[df2["dep"] == "CUNDINAMARCA"]["population"].values + df2[df2["dep"] == "BOGOTÁ"]["population"].values)
#df2 = df2.drop(df2[df2["dep"] == "BOGOTÁ"].index)
df2["dep"] = df2["dep"].replace("BOGOTÁ", "SANTAFE DE BOGOTA D.C")
df2["dep"] = df2["dep"].replace("LA GUAJIRA", "GUAJIRA")
df2["dep"] = df2["dep"].replace("SAN ANDRÉS Y PROVIDENCIA", "SAN ANDRES")
df2["dep"] = df2["dep"].replace("VALLE DEL CAUCA", "VALLE")
df2["dep"] = df2["dep"].apply(lambda x: unidecode(x))

Create a list that for each department divides the set of cases into different factors

In [4]:
a = []
for x in df:
    if x == "ARMAS MEDIOS" or x == "GENERO" or x == "GRUPO ETARIO":
        for y in df[x].unique():
            dftemp = df[df[x] == y]
            dftemp = dftemp["DEPARTAMENTO"].value_counts().to_dict()
            match x:
                case "ARMAS MEDIOS":
                    z = "Weapon_of_crime"
                    match y:
                        case "ARMA BLANCA / CORTOPUNZANTE":
                            z2 = "stabbing_weapon"
                        case "ARMA DE FUEGO":
                            z2 = "firearm"
                        case "CONTUNDENTES":
                            z2 = "blunt_weapon"
                        case "CORTANTES":
                            z2 = "slashing_weapon"
                        case "CORTOPUNZANTES":
                            z2 = "sharp_weapon2"
                        case "PUNZANTES":
                            z2 = "sharp_weapon"
                        case "SIN EMPLEO DE ARMAS":
                            z2 = "no_weapon" 
                        case "ESCOPOLAMINA":
                            z2 = "narcotics"   
                case "GENERO":
                    z = "gender"
                    match  y:
                        case "MASCULINO":
                            z2 = "male" 
                        case "FEMENINO":
                            z2 = "female"   
                case "GRUPO ETARIO":
                    z = "agegroup"
                    match y:
                        case "ADULTOS":
                            z2 = "adult"
                        case "ADOLESCENTES":
                            z2 = "teenager"
                        case "MENORES":
                            z2 = "minor"

            #for key in dftemp.keys():
                #dftemp[key] = (dftemp[key], z, z2)
            dftemp = [z, z2, dftemp]
        #dftemp.append(y)
            a.append(dftemp)


Check if the Data is correct

In [5]:
dep1 = sorted(deps)
dep2 = df2.sort_values("dep").reset_index()["dep"]

depvergleich = pd.DataFrame(columns=["dep1" , "dep2"])

depvergleich["dep1"] = dep1
depvergleich["dep2"] = dep2

for i in range(len(dep1)):
    if dep1[i] != dep2[i]:
        print(i, dep1[i], dep2[i])

writes data into df.csv

In [7]:
df.to_csv("data/dv_data.csv")

Creates dataframe with the number of cases as well as the population of each department, as well as a normalized number of cases (cases / (population/1000)). And writes the Dataframe into data.csv

In [8]:

#b = df.groupby("DEPARTAMENTO").count()
dft = df["DEPARTAMENTO"].value_counts().to_dict().items()

data = pd.DataFrame(columns = ["dep", "cases", "population"])

dep = []
cases = []
population = []
norm = []

for x,y in dft:
    depart = unidecode(x)
    dep.append(depart)
    cases.append(y)
    p = int(df2[df2["dep"] == x].population)
    population.append(p)
    norm.append(round((y/(p/1000)), 2))

for i in range(len(a)):
    sublist = []
    var1 = a[i][0]
    var2 = a[i][1]
    for x,y in dft:
        if x in a[i][2]:
            varcas = a[i][2][x]
        else:
            varcas = 0
        sublist.append(varcas)
    #would write all combinations of the chosen factors from a into the dataframe
    #data["{}_{}".format(var1, var2)] = sublist


data["dep"] = dep
data["cases"] = cases
data["population"] = population
#data["norm"] = norm

data.to_csv("data/popdata.csv")

  p = int(df2[df2["dep"] == x].population)


Write all the factors into the geojson as properties (not needed on the current version)

In [10]:
with open("data/col.geojson") as file:
    dfgeo = geopandas.read_file(file)

#norm2 = []
cases2 = []
population2 = []
for index, row in dfgeo.iterrows():
    if row["NOMBRE_DPT"] in dep:
        #n = float(data[data["dep"] == row["NOMBRE_DPT"]].norm)
        c = int(data[data["dep"] == row["NOMBRE_DPT"]].cases)  
        p = int(data[data["dep"] == row["NOMBRE_DPT"]].population)
    else:
        n = 0.0
        c = 0
        p = 0
    #norm2.append(n)
    cases2.append(c)
    population2.append(p)

#dfgeo["norm"] = norm2
dfgeo["cases"] = cases2
dfgeo["population"] = population2
for i in range(len(a)):
    sublist = []
    var1 = a[i][0]
    var2 = a[i][1]
    for index, row in dfgeo.iterrows():
        if row["NOMBRE_DPT"] in a[i][2]:
            varcas = a[i][2][row["NOMBRE_DPT"]]
        else:
            varcas = 0
        sublist.append(varcas)
    dfgeo["{}__{}".format(var1, var2)] = sublist

#dfgeo.to_file("data/col_with_attributes.geojson", driver='GeoJSON')

  c = int(data[data["dep"] == row["NOMBRE_DPT"]].cases)
  p = int(data[data["dep"] == row["NOMBRE_DPT"]].population)
