In [None]:
#Python 3.11.2
#Import packages
import pandas as pd
import numpy as np
import pylab as pl
import random
import matplotlib.pyplot as plt
from collections import defaultdict
import json # for pretty printing
import geopandas as gpd
import os

We have three different data sources. 

1. The data collected by Bodil corresponds to the plague period.
2. The information from the TABVERK database includes the population size of the parishes in the years following the plague.
3. The geographical information (polygons) for some parishes. This information doesn't correspond to the plague period.

Our goal is to create a unique database for our project: Plague spread across Scania, Sweden, from 1710 to 1715.

In [None]:
# Resolve the two input files against the current working directory.
appendix6_file = "Appendix6Bodil.csv"
allParishes_file = "allParishesScania.xlsx"

# Absolute directory that each (relative) file name resolves into
script_dir_Appendix6 = os.path.dirname(os.path.abspath(appendix6_file))
script_dir_allParishes = os.path.dirname(os.path.abspath(allParishes_file))

# Full paths handed to the readers further down
appendix6_path = os.path.join(script_dir_Appendix6, appendix6_file)
allParishes_path = os.path.join(script_dir_allParishes, allParishes_file)

First, we read the different data sources (.xlsx, .csv, and .shp files).

In [None]:
# Plague parishes (Bodil's Appendix 6), stored as a comma-separated UTF-8 file
plagueParishesScania = pd.read_csv(
    appendix6_path,
    sep=",",
    encoding="utf-8",
)
# Every parish in Scania during the plague period, stored as an Excel sheet
allParishesScania = pd.read_excel(allParishes_path)


Converting all values to uppercase and checking the type

In [None]:
# Cast every cell to text and upper-case it, presumably so that the merge keys
# of the two sources match regardless of letter case.
# NOTE(review): astype(str) also turns missing values into the text 'NAN' and
# numeric columns into strings - confirm this is intended.
allParishesScania = allParishesScania.astype(str).apply(
    lambda col: col.str.upper())
plagueParishesScania = plagueParishesScania.astype(str).apply(
    lambda col: col.str.upper())
type(plagueParishesScania)
type(allParishesScania)


Visualizing the DataFrames

In [None]:
#allParishesScania.head()

In [None]:
#plagueParishesScania.head()

Merging the two datasets (allParishesScania and plagueParishesScania)

In [None]:
# Left join: every row of allParishesScania is kept, and plague records are
# attached wherever both the parish name and the region agree.
merge_keys = ['ParishName', 'Region']
parishesScania = allParishesScania.merge(
    plagueParishesScania, how='left', on=merge_keys)


Checking that the merged data frame keeps all the outbreaks for each parish

In [None]:
#parishesScania.loc[parishesScania['ParishName'] == 'NÄSUM']

Defining a function for extracting the names of the parishes in the data frame

In [None]:
def get_Names(data: pd.DataFrame, heading:str) -> list:
    """Return the values stored in column ``heading`` of ``data`` as a list.

    Values are returned in row order and duplicates are kept.
    """
    selected_column = data[heading]
    return selected_column.to_list()

Filtering the data frame by region and then getting the names of the parishes:

In [None]:
# Parish names of the merged table (one entry per row, duplicates included)
parishesScania_names = get_Names(data=parishesScania, heading='ParishName')
len(parishesScania_names)

In [None]:
# Keep only the parishes whose region is SOUTHEAST, then list their names
is_southeast = allParishesScania['Region'] == 'SOUTHEAST'
southeastParishes = allParishesScania.loc[is_southeast]
southeastParishes_names = get_Names(data=southeastParishes, heading='ParishName')

Reading the census file:

In [None]:
# Build the path to the census file: <home>/Desktop/CensusScania/FILE01_FALD.txt
file_name = 'FILE01_FALD.txt'
folder_name = 'CensusScania'
home_directory = os.path.expanduser("~")
census_path = os.path.join(home_directory, "Desktop", folder_name, file_name)

# The census file is semicolon-separated; this first read loads every column
censusSweden = pd.read_csv(census_path, sep=';')
censusSweden.shape


Checking the memory usage

In [None]:
#censusSweden.info(memory_usage='deep')

Checking the names of all columns in the data

In [None]:
# Column labels of the full census table, kept for inspection
columns = censusSweden.columns

Calling the data only with specific columns

In [None]:
# Columns to load from the census file, with the meaning of each code
census_columns = [
    'LANGENNMN',   # Standard name of the county for the geographical area in plain text
    'GEOIDNMN',    # Standard name of the geographical area in plain text, i.e. not a source name
    'GEOIDTYP',    # Type of breakdown of the geographical area: 0 = Assembly, 1 = Pastorate, 2 = Other type, 3 = Several parishes, 9 = Part of a parish
    'AR',          # Year
    'KON',         # 1 = Man, 2 = Female (1 is used below; per the original note, 2 could be used for the total population)
    'BEF_TOT',     # Total population at source
    'BEF_GENTOT',  # Total population, generated
]
# Re-read the census file, this time loading only the columns listed above
censusSweden = pd.read_csv(census_path, sep=';', usecols=census_columns)


Processing the census data so that it corresponds only to Scania and to the first population size registered for each parish.

In [None]:
# Rows from the two counties selected for Scania, restricted to KON == 1
scania_counties = ['KRISTIANSTADS LÄN', 'MALMÖHUS LÄN']
in_scania = censusSweden['LANGENNMN'].isin(scania_counties)
has_kon_one = censusSweden['KON'] == 1
censusScania = censusSweden.loc[in_scania & has_kon_one]
censusScania.shape

Process the data from Scania only to keep the first population size registered for each parish.
This was done following two approaches: the first one by grouping the data by parish name and then selecting the minimum year. The second approach is exhaustively exploring the given DataFrame and keeping the required information in a dictionary. The information required in our case corresponds to the position associated with each parish name and the minimum year, according to the original DataFrame. 

1. First approach:

In [None]:
# %%timeit
# Group the Scania census rows by parish name. A scalar key (not a
# one-element list) is used so that the groups are keyed by plain strings.
parish_grp = censusScania.groupby('GEOIDNMN')
# One array per parish holding its (single) unique name
parish_grp_name = parish_grp['GEOIDNMN'].unique().tolist()
# Flatten to a plain list of parish names
parish_names = [parish_name[0] for parish_name in parish_grp_name]
#print(parish_names)

# Earliest registered year of each parish, broadcast back onto every row
first_year = parish_grp['AR'].transform('min')

# Keep the rows recorded in that earliest year. All rows tied on the minimum
# year are kept. The stable sort by parish name reproduces the group-by-group
# ordering of the previous loop without growing a frame via repeated concat.
popSizeScania = (
    censusScania.loc[censusScania['AR'] == first_year]
    .sort_values('GEOIDNMN', kind='stable')
)

print(popSizeScania.shape)

2. Second approach

In [None]:
#%%timeit
# For every parish name, remember the earliest year seen and the positional
# index of the first row carrying that year.
aux_dict = {}

# Single pass over the two columns instead of two .iloc lookups per row
for position, (name_i, ar_i) in enumerate(
        zip(censusScania['GEOIDNMN'], censusScania['AR'])):
    record = aux_dict.get(name_i)
    # Strict '<' keeps the first occurrence when the minimum year is repeated
    if record is None or ar_i < record['min']:
        aux_dict[name_i] = {'min': ar_i, 'position': position}

# Dict order is first-appearance order of each parish name
final_positions = [value['position'] for value in aux_dict.values()]
popSizeScania = censusScania.iloc[final_positions]
print(popSizeScania.shape)
    

In [None]:
#popSizeScania.loc[popSizeScania['GEOIDNMN'] == 'STORA KÖPINGE']

In [None]:
#popSizeScania.iloc[100:150]

In [None]:
# Parish names present in the census-derived population table
popSizeScania_names = get_Names(data=popSizeScania, heading='GEOIDNMN')

In [None]:
def delete_strings(data: pd.DataFrame, column: str, list_expr: list[str]) -> pd.DataFrame:
    """Remove every pattern in ``list_expr`` from the text values of ``data``.

    Parameters
    ----------
    data : the table to clean. It is never modified in place.
    column : name of the column to clean. When ``data`` is a DataFrame and
        ``column`` is not one of its columns (for example ``''``), the
        patterns are removed from every column, as before.
    list_expr : patterns to delete, applied one after another. Each one is
        interpreted as a regular expression, so metacharacters such as
        ``(``, ``)`` and ``.`` must be escaped to be matched literally.

    Returns
    -------
    A new object of the same kind as ``data`` with the matches replaced by ''.
    """
    if isinstance(data, pd.DataFrame) and column not in data.columns:
        # No usable column name: clean the whole table column by column
        cleaned = data
        for expr in list_expr:
            cleaned = cleaned.apply(
                lambda col: col.replace({expr: ''}, regex=True))
        return cleaned

    # Only the requested column is touched; copy first so the caller's
    # object is left unchanged
    cleaned = data.copy()
    for expr in list_expr:
        cleaned[column] = cleaned[column].replace({expr: ''}, regex=True)
    return cleaned


Reading the geographical information by parish

In [None]:
# Get the current working directory
current_directory = os.getcwd()
parent_directory = os.path.dirname(current_directory)
shapefile_directory = os.path.join(parent_directory, "MapScaniaSweden")

parishScania_path = os.path.join(shapefile_directory, "Parishes1720_1890.shp")
parishScaniaMap = gpd.read_file(parishScania_path)
selected_columns = ['G_NAME', 'geometry']
parishScaniaMap = parishScaniaMap[selected_columns]

In [None]:
# Text fragments to strip from the map table. The column argument is '',
# which is not a column of the table, so delete_strings processes every column.
map_name_patterns = ['FÖRSAMLING', 'L LÄN', 'S LÄN', 'HELIGA TREFALDIGHETS']
parishScaniaMap = delete_strings(parishScaniaMap, '', map_name_patterns)

In [None]:
# Preview the first ten rows of the parish map table
parishScaniaMap.head(10)

In [None]:
# Parish names as they appear in the map table
parishScaniaMap_names = get_Names(data=parishScaniaMap, heading='G_NAME')

In [None]:
def check_name(data: pd.DataFrame, column: pd.Series, name: str):
    """Return the rows of ``data`` whose ``column`` value contains ``name``.

    ``name`` is handed to ``Series.str.contains`` unchanged, so it is matched
    as a substring pattern rather than as a whole value. Missing values in
    ``column`` count as non-matching.
    """
    is_match = column.str.contains(pat=name, na=False)
    matching_rows = data.loc[is_match]
    return matching_rows

In [None]:
def filter_data_by_name(data: pd.DataFrame, column: pd.Series, input_names: list[str]):
    """Return the names from ``input_names`` that match no row of ``data``.

    Each name is looked up with ``check_name(data, column, name)``; a name is
    reported when that lookup comes back with zero rows. The input order is
    preserved and repeated names are reported once per occurrence.
    """
    return [
        candidate
        for candidate in input_names
        if len(check_name(data, column, candidate)) == 0
    ]

In [None]:
# Count the merged-table parish names that have no match in the population table
names_without_population = filter_data_by_name(
    popSizeScania, popSizeScania['GEOIDNMN'], parishesScania_names)
print(len(names_without_population))

In [None]:
# Southeast parishes whose name matches no row of the population table.
# The shared helper replaces the hand-written loop, and the list is printed
# once instead of once per unmatched parish.
southeastParishesNoPop = filter_data_by_name(
    popSizeScania, popSizeScania['GEOIDNMN'], southeastParishes_names)
print(southeastParishesNoPop)
    

In [None]:
# Population-table parish names that match no name in the map table
filter_data_by_name(
    data=parishScaniaMap,
    column=parishScaniaMap['G_NAME'],
    input_names=popSizeScania_names,
)

In [None]:
# Fragments to strip from the census parish names. delete_strings applies
# them as regular expressions, so the parentheses and the full stop are
# escaped: unescaped, '(...)' is a capture group that never matches the
# literal brackets, and '.' matches any character.
census_name_patterns = [
    ', DEL KRISTIANSTAD',
    'PASTORAT',
    r', DEL AV \(FROSTA HÄRAD, MALMÖHUS LÄN\)',
    r'GARNISONSFÖRS\.',
    'SLOTTSFÖRSAMLING',
    r'\(MALMÖ SF\)',
]
censusScania = delete_strings(censusScania, 'GEOIDNMN', census_name_patterns)

In [None]:
# Southeast parishes whose name matches no row of the population table,
# computed with the shared helper and printed once.
# NOTE(review): popSizeScania was built from censusScania before the string
# clean-up, so this list only changes if popSizeScania is recomputed first.
southeastParishesNoPop = filter_data_by_name(
    popSizeScania, popSizeScania['GEOIDNMN'], southeastParishes_names)
print(southeastParishesNoPop)
    

In [None]:
# Southeast parishes with no match in the population table (shared helper
# instead of a copy of the lookup loop)
southeastParishesNoPop = filter_data_by_name(
    popSizeScania, popSizeScania['GEOIDNMN'], southeastParishes_names)
    

In [None]:
# Southeast parishes with no match in the population table (shared helper
# instead of a copy of the lookup loop)
southeastParishesNoPop = filter_data_by_name(
    popSizeScania, popSizeScania['GEOIDNMN'], southeastParishes_names)
    

In [None]:
# Southeast parishes with no match in the population table (shared helper
# instead of a copy of the lookup loop)
southeastParishesNoPop = filter_data_by_name(
    popSizeScania, popSizeScania['GEOIDNMN'], southeastParishes_names)
    

In [None]:
# Southeast parishes with no match in the population table (shared helper
# instead of a copy of the lookup loop)
southeastParishesNoPop = filter_data_by_name(
    popSizeScania, popSizeScania['GEOIDNMN'], southeastParishes_names)
    

In [None]:
# Southeast parishes with no match in the population table (shared helper
# instead of a copy of the lookup loop)
southeastParishesNoPop = filter_data_by_name(
    popSizeScania, popSizeScania['GEOIDNMN'], southeastParishes_names)
    

In [None]:
# Southeast parishes with no match in the population table (shared helper
# instead of a copy of the lookup loop)
southeastParishesNoPop = filter_data_by_name(
    popSizeScania, popSizeScania['GEOIDNMN'], southeastParishes_names)
    

Check the regular-expression code in Mathematica for eliminating some strings

In [None]:

# Parishes of the merged table whose name matches no row of the population
# table, computed with the shared helper instead of a copy of its loop
parishesScaniaNoPop = filter_data_by_name(
    popSizeScania, popSizeScania['GEOIDNMN'], parishesScania_names)

    

In [None]:
# Show the matches for both search terms in one output. A cell displays only
# its last expression, so two bare calls would silently drop the first result.
# The dict keys become the outer index level, labelling each group of rows.
pd.concat({
    'YSTAD': check_name(popSizeScania, popSizeScania['GEOIDNMN'], 'YSTAD'),
    'OLOFS': check_name(popSizeScania, popSizeScania['GEOIDNMN'], 'OLOFS'),
})


In [None]:
# Inspect the Python type of the parish-name column of the Scania census table
type(censusScania['GEOIDNMN'])

In [None]:
# Boolean mask of the population-table rows whose parish name contains
# 'NORRA RÖRUM'; rows with a missing name are treated as non-matching
filt2 = popSizeScania['GEOIDNMN'].str.contains('NORRA RÖRUM', na=False)
#popSizeScania.loc[filt2]

In [None]:
# Print the parish name and year of the third row (position 2) of the population table
name_and_year = popSizeScania[['GEOIDNMN', 'AR']]
third_row = name_and_year.iloc[2]
print(third_row)