* [#53 SBL dataset släpps 2024 Strings not Things](https://github.com/salgo60/Svenskaforsamlingar/issues/53)

In [1]:
import time 
from datetime import datetime
start_time = time.time() 
start_datetime = datetime.fromtimestamp(start_time) 
print("Start time:", start_datetime.strftime('%Y-%m-%d %H:%M:%S'))

Start time: 2024-05-14 13:50:12


In [2]:
import requests
import pandas as pd
from io import StringIO

# URL of the file to be fetched
url = "https://filer.riksarkivet.se/registerdata/SBL/csv/SBL_2023.csv"

# Send a GET request to the URL
response = requests.get(url)

# Check if the request was successful
if response.status_code == 200:
    # Use StringIO to read the CSV content into a pandas DataFrame
    csv_content = StringIO(response.text)
    
    # Print the first few lines to inspect the content
    #first_lines = csv_content.getvalue().split('\n')[:5]
    #for line in first_lines:
    #    print(line)
    
    # Reset the StringIO object to start reading from the beginning
    csv_content.seek(0)
    
    # Read the CSV into a DataFrame using the correct delimiter
    df = pd.read_csv(csv_content, delimiter=';')  # Adjust the delimiter if needed

    print("First 5 rows of the DataFrame:")
    print(df.head())  # Display the first 5 rows
    
    print("\nDataFrame info:")
    print(df.info())  # Display concise summary of the DataFrame
    
    print("\nDataFrame description:")
    print(df.describe(include='all'))  # Display summary statistics of the DataFrame

    print("\nColumns in the DataFrame:")
    print(df.columns)  # Display the column names
    
    print("\nData types of each column:")
    print(df.dtypes)  # Display the data types of each column
    

    # Save the DataFrame to a CSV file
    df.to_csv("SBL_2023_local.csv", index=False)
    print("Data has been downloaded, loaded into pandas, and saved as SBL_2023_local.csv")
else:
    print(f"Failed to download file. Status code: {response.status_code}")


First 5 rows of the DataFrame:
   Article id             Svenskt biografiskt lexikon (SBL): URI  \
0        5490  https://sok.riksarkivet.se/sbl/Presentation.as...   
1        5491  https://sok.riksarkivet.se/sbl/Presentation.as...   
2        5492  https://sok.riksarkivet.se/sbl/Presentation.as...   
3        5493  https://sok.riksarkivet.se/sbl/Presentation.as...   
4        5494  https://sok.riksarkivet.se/sbl/Presentation.as...   

  Type of article  SBL volume number  Page number in volume  Surname  \
0  Family article                1.0                    1.0   Abelin   
1  Person article                1.0                    5.0   Abelin   
2  Person article                1.0                    1.0   Abelin   
3  Person article                1.0                    9.0   Abelin   
4  Person article                1.0                   11.0  Abenius   

   Given name Gender      Occupation, royal title, rank  \
0       släkt      -                                NaN   
1    Gust

In [3]:
def inspect_column_values(df, column_name):
    # Check if the specified column exists
    if column_name in df.columns:
        # Get the counts of unique values in the specified column
        value_counts = df[column_name].value_counts()
        
        # Convert the Series to a DataFrame for better display
        value_counts_df = value_counts.reset_index()
        value_counts_df.columns = [column_name, 'Number of Occurrences']
        
        return value_counts_df
    else:
        print(f"The column '{column_name}' does not exist in the DataFrame.")
        return None

def inspect_top_column_values(df, column_name, top_n):
    value_counts_df = inspect_column_values(df, column_name)
    if value_counts_df is not None:
        # Display the top 'n' values
        top_value_counts_df = value_counts_df.head(top_n)
        print(f"Top {top_n} counts of unique values in '{column_name}' column:")
        print(top_value_counts_df)


In [4]:
# Check if the column "Prefix to year of birth" exists
column_name = 'Prefix to year of birth'
inspect_top_column_values(df, column_name,100)

Top 100 counts of unique values in 'Prefix to year of birth' column:
       Prefix to year of birth  Number of Occurrences
0                      omkring                     18
1                  troligen pĺ                     10
2                     troligen                      6
3                         omkr                      3
4                       senast                      3
5                    senast ca                      1
6                  i början av                      1
7                     enl uppg                      1
8                      Omkring                      1
9                        efter                      1
10                   slutet av                      1
11                        trol                      1
12          troligen senast pĺ                      1
13  troligen senare hälften av                      1
14              senast omkring                      1
15                   mitten av                      1
16           

In [5]:
column_name = 'Prefix to year of death'
inspect_top_column_values(df, column_name,100)

Top 100 counts of unique values in 'Prefix to year of death' column:
   Prefix to year of death  Number of Occurrences
0                 tidigast                     35
1                 levde pĺ                     26
2                   senast                     25
3                  omkring                     24
4                 troligen                     20
5                    efter                     12
6                     före                      8
7                    levde                      7
8               levde ännu                      6
9            troligen före                      3
10                  mellan                      3
11                    omkr                      2
12              kort efter                      2
13           levde omkring                      2
14               början av                      1
15              levde omkr                      1
16          senast omkring                      1
17          levde troligen     

In [6]:
column_name = 'Type of article'
inspect_top_column_values(df, column_name,10)

Top 10 counts of unique values in 'Type of article' column:
  Type of article  Number of Occurrences
0  Person article                   7652
1  Family article                   1754


### odd SBL Volume number 0

In [7]:
column_name = 'SBL volume number'
inspect_top_column_values(df, column_name,4)

Top 4 counts of unique values in 'SBL volume number' column:
   SBL volume number  Number of Occurrences
0               16.0                    433
1               17.0                    412
2               18.0                    399
3               21.0                    391


In [8]:
column_name = 'Page number in volume'
inspect_top_column_values(df, column_name,40)

Top 40 counts of unique values in 'Page number in volume' column:
    Page number in volume  Number of Occurrences
0                     1.0                     44
1                   209.0                     24
2                   301.0                     22
3                    23.0                     21
4                   589.0                     20
5                   497.0                     20
6                   295.0                     19
7                   182.0                     19
8                   439.0                     19
9                   456.0                     19
10                   43.0                     18
11                  354.0                     18
12                   69.0                     18
13                  260.0                     18
14                  339.0                     18
15                  109.0                     18
16                   71.0                     18
17                  111.0                     18
18 

In [9]:
def inspect_top_column_values(df, column_name, top_n):
    value_counts_df = inspect_column_values(df, column_name)
    if value_counts_df is not None:
        # Display the top 'n' values
        top_value_counts_df = value_counts_df.head(top_n)
        print(f"Top {top_n} counts of unique values in '{column_name}' column:")
        print(top_value_counts_df)


In [10]:
# Specify the column name to inspect
column_name = 'Surname'
top_n = 40  # Number of top values to display
    
# Call the function with the DataFrame and column name
inspect_top_column_values(df, column_name, top_n)

Top 40 counts of unique values in 'Surname' column:
         Surname  Number of Occurrences
0        Nilsson                     55
1       Svensson                     44
2      Johansson                     43
3          Ekman                     37
4           Berg                     28
5      Andersson                     27
6       Lindberg                     25
7         Olsson                     24
8      Lindström                     23
9        Larsson                     21
10     Kjellberg                     19
11        Mörner                     19
12      Ericsson                     19
13       Bergman                     18
14         Bonde                     18
15       Fleming                     18
16        Bielke                     18
17       Persson                     18
18     Söderberg                     18
19  De la Gardie                     17
20      Lundgren                     17
21          Dahl                     16
22          Horn            

In [11]:
column_name = "Curriculum vitae"
inspect_top_column_values(df, column_name, 100)

Top 100 counts of unique values in 'Curriculum vitae' column:
                                     Curriculum vitae  Number of Occurrences
0                                                                        724
1   7. Fredrik von Essen, [a:15503:den föregĺendes...                      1
2   Eugénie (Charlotta Eugénie Augusta Amalia Albe...                      1
3   Eugen Napoleon Nicolaus, Sveriges (till 1905 ä...                      1
4   Eufemia Eriksdotter, svensk prinsessa, f. omkr...                      1
..                                                ...                    ...
95  Alexander Eriskein (von Erskein, Ersskein), ti...                      1
96  Ernst (Ernest), Petter, f. 2 april 1714, d. 19...                      1
97  1. Per Henrik Gustaf Peterson, f. 23 mars 1815...                      1
98  3. Jarl Ludvig Ernberg, f. 20 aug. 1863 i Karl...                      1
99  2. Axel Ernberg, [a:15446:den föregĺendes] bro...                      1

[100 rows x 2

In [12]:
column_name = "Gender"
inspect_top_column_values(df, column_name, 100)

Top 100 counts of unique values in 'Gender' column:
  Gender  Number of Occurrences
0      m                   7066
1      -                   1805
2      f                    534
3      M                      1


In [13]:
column_name= "Occupation, royal title, rank" 
inspect_top_column_values(df, column_name, 10)

Top 10 counts of unique values in 'Occupation, royal title, rank' column:
  Occupation, royal title, rank  Number of Occurrences
0                   Arméofficer                    144
1                         Präst                    114
2                        Biskop                    100
3                        Läkare                     91
4                    Författare                     84
5                        Mĺlare                     74
6                          Kung                     71
7                      Arkitekt                     68
8                  Skĺdespelare                     64
9                Industriidkare                     63


In [14]:
column_name="Place of birth" 
inspect_top_column_values(df, column_name, 100)

Top 100 counts of unique values in 'Place of birth' column:
                    Place of birth  Number of Occurrences
0                  Stockholms stad                    435
1                         Tyskland                    223
2                          Finland                    222
3                 Klara församling                    121
4       Uppsala domkyrkoförsamling                    118
..                             ...                    ...
95           Brännkyrka församling                      9
96               Avesta församling                      9
97                 Lovö församling                      8
98     Trelleborgs stadsförsamling                      8
99  Östra Ryds församling (AB-län)                      8

[100 rows x 2 columns]


In [15]:
column_name="Comment on place of birth"
inspect_top_column_values(df, column_name, 100)

Top 100 counts of unique values in 'Comment on place of birth' column:
   Comment on place of birth  Number of Occurrences
0                      i Ĺbo                     20
1                i Stralsund                     18
2              i Helsingfors                     16
3                    i Paris                     14
4                i Köpenhamn                     12
..                       ...                    ...
95       pĺ Herrevadskloster                      2
96               i Kockstorp                      2
97             pĺ Voxna bruk                      2
98                pĺ Sĺtenäs                      2
99              pĺ Claestorp                      2

[100 rows x 2 columns]


In [16]:
column_name="Place of birth (physical location)" 
inspect_top_column_values(df, column_name, 100)

Top 100 counts of unique values in 'Place of birth (physical location)' column:
   Place of birth (physical location)  Number of Occurrences
0                     Stockholms stad                      3
1                            Tyskland                      2
2                             Finland                      1
3                Ransäters församling                      1
4                   Jakobs församling                      1
5       Stockholms domkyrkoförsamling                      1
6        Stora Kopparbergs församling                      1
7                   Hasslö församling                      1
8                 Lekaryds församling                      1
9                    Klara församling                      1
10                   Vrena församling                      1
11                  Högsjö församling                      1
12          Trolle-Ljungby församling                      1
13                   Solna församling                      1
14   

In [17]:
column_name="Comment on place of death" 
inspect_top_column_values(df, column_name, 100)

Top 100 counts of unique values in 'Comment on place of death' column:
     Comment on place of death  Number of Occurrences
0                        i Ĺbo                     47
1                  i Djursholm                     35
2                      i Paris                     28
3                  i Köpenhamn                     17
4                        i Rom                     13
..                         ...                    ...
95                      i Bern                      2
96                   i Pommern                      2
97                i Heidelberg                      2
98  i Los Angeles, Kalifornien                      2
99                    i Altona                      2

[100 rows x 2 columns]


In [18]:
column_name="Archive" 
inspect_top_column_values(df, column_name, 100)

Top 100 counts of unique values in 'Archive' column:
                                              Archive  Number of Occurrences
0                                                                       2054
1                                   Brev frĺn S i KB.                      5
2                     Strödda brev frĺn L i KB o UUB.                      4
3                          Strödda brev frĺn L i UUB.                      3
4                              Brev frĺn M i KB o RA.                      3
..                                                ...                    ...
95  En saml brev till H i KB. Brev frĺn H i RA (bl...                      1
96  Egenhändiga nedskrifter av H :s dikter saknas ...                      1
97  Delar av H:s arkiv i LUB (bl a brev frĺn T Hol...                      1
98  H:s arkiv (brev till honom, predikn:ar5 i LUB....                      1
99                                  Brev frĺn H i KB.                      1

[100 rows x 2 columns]

In [19]:
column_name="Printed works" 
inspect_top_column_values(df, column_name, 100)

Top 100 counts of unique values in 'Printed works' column:
                                        Printed works  Number of Occurrences
0                                                                       1736
1                            Handskrifter: se texten.                      3
2   Tryckta arbeten: Anföranden och motioner i and...                      3
3                        Tryckta skrifter: se texten.                      3
4   Tryckta arbeten: De s. k. »Skevikstraktaterna»...                      2
..                                                ...                    ...
95  Tryckta arbeten: F:s brev av Malmö 16 juni 167...                      1
96  Handskrifter, källeditioner och allmän littera...                      1
97  Tryckta arbeten: Til Hans Kongl. Höghet Kron-p...                      1
98  Tryckta arbeten: Brev till okänd i Tyskland (2...                      1
99  Tryckt arbete: Tal, hĺllit wid... Carl Hĺrlema...                      1

[100 rows x 2 co

In [20]:
column_name="Sources" 
inspect_top_column_values(df, column_name, 100)


Top 100 counts of unique values in 'Sources' column:
                                              Sources  Number of Occurrences
0                                                                         79
1             Källor: jfr ovan under släktöversikten.                      4
2    Källa: G. Elgenstierna, Sv. släktkalendern 1918.                      3
3                Källor: Se ovan under Donner, släkt.                      3
4                      Källa: Personliga meddelanden.                      2
..                                                ...                    ...
95  Källor: ovan anförda Fersenska arkiv. – Handl....                      1
96  Källor: F:s ovannämnda papper i RA; Klara förs...                      1
97  Källor: F:s papper och anförda brev. – Krigsex...                      1
98  Källor: Ovan angivna handskrifter; Görvel ĺker...                      1
99  Källor: F:s egna skrifter samt ovan nämnda ark...                      1

[100 rows x 2 columns]

In [21]:
column_name="Article author" 
inspect_top_column_values(df, column_name, 100)


Top 100 counts of unique values in 'Article author' column:
       Article author  Number of Occurrences
0   Bengt Hildebrand.                    337
1                                        220
2    Hans Gillingstam                    173
3               H G-m                    168
4        Olle Franzén                    165
..                ...                    ...
95     Olle Hellström                     11
96     Anders Jarlert                     11
97        Brita Linde                     11
98    Lennart Hedwall                     11
99    Staffan Högberg                     11

[100 rows x 2 columns]


In [22]:
column_name="Image file 9" 
inspect_top_column_values(df, column_name, 100)

Top 100 counts of unique values in 'Image file 9' column:
                                         Image file 9  Number of Occurrences
0   https://sok.riksarkivet.se/sbl/bilder/6973_8_0...                      1
1                                  godsägarekammarrĺd                      1
2                                                   H                      1
3         Eric Gabriel L (1782—1848) överstelöj tnant                      1
4                                          -52)(1692—                      1
5                                                LILL                      1
6                                        L(1796-1858)                      1
7   1 Gustaf Napoleon M (1829-1901)    1 major, di...                      1
8      1 1 Gustaf N (1816-1902) överste, förestĺndare                      1
9                                     grosshandlare o                      1
10  lOtto Gustar Nordensköld(1780-1861) amiral1841...                      1
11                

In [23]:
end_time = time.time()

# Calculate the elapsed time
elapsed_time = end_time - start_time

# Print the elapsed time
print(f"Elapsed time: {elapsed_time} seconds")

Elapsed time: 7.7952728271484375 seconds
