In [1]:
#Python 3.11.2
#Import packages
import json # for pretty printing
import random
from collections import defaultdict
from pathlib import Path

import pandas as pd
import numpy as np
import pylab as pl
import matplotlib.pyplot as plt
import geopandas as gpd

We have three different data sources. 

1. The data collected by Bodil corresponds to the plague period.
2. The information from the TABVERK database includes the population size of the parishes in the years following the plague.
3. The geographical information (polygons) for some parishes. This information doesn't correspond to the plague period.

Our goal is to create a unique database for our project: Plague spread across Scania, Sweden, from 1710 to 1715.

First, we read the different data sources (.xlsx, .csv, and .shp files).

In [15]:
# Build the workbook location from the current user's home directory instead of
# hard-coding one account's '/Users/<name>' prefix. On the original author's
# machine (home directory /Users/polislizarralde) this resolves to the same
# file as before. str() keeps the variable a plain string for any later use.
allParishesScania_path = str(
    Path.home() / 'PythonPlayground/docs/PlagueProject/data/allParishesScania.xlsx'
)
# Load the parish table from the Excel workbook into a DataFrame.
allParishesScania = pd.read_excel(allParishesScania_path)

Converting every value to an uppercase string and checking the type of the result

In [5]:
# Cast every column to string and uppercase it, so text comparisons made later
# in the notebook (e.g. against 'SOUTHEAST' in 'Region') do not depend on the
# letter case used in the source file.
# NOTE(review): astype(str) also stringifies numeric columns and, in pandas
# versions that represent missing values as NaN, turns them into the literal
# text 'NAN' -- confirm this is intended.
allParishesScania = allParishesScania.apply(lambda x: x.astype(str).str.upper())
# A bare `.head()` in the middle of a cell is evaluated and thrown away: only
# the last expression of a cell is echoed. display() (provided by
# IPython/Jupyter) renders the preview explicitly.
display(allParishesScania.head())
type(allParishesScania)

pandas.core.frame.DataFrame

Defining a function for extracting the names of the parishes in the data frame

In [16]:
def get_Names(data: pd.DataFrame, heading: str) -> list:
    """Return the values stored in one column of a data frame as a list.

    Parameters
    ----------
    data : pd.DataFrame
        Table to read from.
    heading : str
        Label of the column whose values are wanted.

    Returns
    -------
    list
        The column's values, in row order.
    """
    selected_column = data[heading]
    return selected_column.to_list()

# Count the entries in the 'Parish Name' column (one entry per row of the table).
len(get_Names(allParishesScania, 'Parish Name')) 

398

Filtering the data frame by region and then getting the names of the parishes:

In [18]:
# Keep only the rows whose 'Region' value is exactly 'SOUTHEAST'.
southeastParishes = allParishesScania[allParishesScania['Region'].eq('SOUTHEAST')]
# Collect the 'Parish Name' values of those rows as a list.
southeastParishes_names = get_Names(data=southeastParishes, heading='Parish Name')

Reading the census file:

In [19]:
# Build the census file location from the current user's home directory rather
# than hard-coding one account's '/Users/<name>' prefix. On the original
# author's machine this resolves to the same file as before. str() keeps the
# variable a plain string for the later cell that reuses it.
census_path = str(Path.home() / 'Desktop/CensusScania/FILE01_FALD.csv')
# The file is semicolon-separated, hence sep=';'.
censusSweden = pd.read_csv(census_path, sep=';')
# Show (rows, columns) as a quick sanity check of the load.
censusSweden.shape

(102360, 50)

Checking the memory usage

In [12]:
# Print the per-column summary of the census table. memory_usage='deep' asks
# pandas to measure the actual memory used by object columns rather than
# estimating it.
censusSweden.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 102360 entries, 0 to 102359
Data columns (total 50 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   LANGEN       102360 non-null  int64  
 1   LANGENNMN    102360 non-null  object 
 2   GEOID        102360 non-null  int64  
 3   GEOIDNMN     102360 non-null  object 
 4   GEOIDTYP     102360 non-null  int64  
 5   AR           102360 non-null  int64  
 6   FORMNR       102360 non-null  int64  
 7   KON          102360 non-null  int64  
 8   FORMID       102360 non-null  int64  
 9   ALD00        102360 non-null  int64  
 10  ALD01        102360 non-null  int64  
 11  ALD03        102360 non-null  int64  
 12  ALD03_1      102360 non-null  int64  
 13  ALD05        102360 non-null  int64  
 14  ALD05_2      102360 non-null  int64  
 15  ALD10        102360 non-null  int64  
 16  ALD15        102360 non-null  int64  
 17  ALD15_1      102360 non-null  int64  
 18  ALD15_2      102360 non-

Listing the column names of the census data frame:

In [13]:
# Inspect the column labels of the census table.
censusSweden.columns

Index(['LANGEN', 'LANGENNMN', 'GEOID', 'GEOIDNMN', 'GEOIDTYP', 'AR', 'FORMNR',
       'KON', 'FORMID', 'ALD00', 'ALD01', 'ALD03', 'ALD03_1', 'ALD05',
       'ALD05_2', 'ALD10', 'ALD15', 'ALD15_1', 'ALD15_2', 'ALD20', 'ALD25',
       'ALD30', 'ALD35', 'ALD40', 'ALD45', 'ALD50', 'ALD55', 'ALD60',
       'ALD60_1', 'ALD60_2', 'ALD65', 'ALD70', 'ALD75', 'ALD80', 'ALD85',
       'ALD90', 'ALD90_N', 'ALD95', 'ALD101_N', 'ALD100', 'ALD101', 'ALD102',
       'ALD103', 'ALD104', 'ALD105_N', 'BEF_SUM', 'BEF_TOT', 'BEF_GENSUM',
       'BEF_GENTOT', 'Unnamed: 49'],
      dtype='object')

In [None]:
# Re-read the census file, this time loading only the first nine columns and
# the four BEF_* columns; the ALD* columns and 'Unnamed: 49' are left out.
censusSweden = pd.read_csv(
    census_path,
    sep=';',
    usecols=[
        'LANGEN', 'LANGENNMN', 'GEOID', 'GEOIDNMN', 'GEOIDTYP',
        'AR', 'FORMNR', 'KON', 'FORMID',
        'BEF_SUM', 'BEF_TOT', 'BEF_GENSUM', 'BEF_GENTOT',
    ],
)
