* [#53 SBL dataset släpps 2024 Strings not Things](https://github.com/salgo60/Svenskaforsamlingar/issues/53)

In [5]:
import requests
import pandas as pd
from io import StringIO

# URL of the file to be fetched
url = "https://filer.riksarkivet.se/registerdata/SBL/csv/SBL_2023.csv"

# Send a GET request to the URL
response = requests.get(url)

# Check if the request was successful
if response.status_code == 200:
    # Use StringIO to read the CSV content into a pandas DataFrame
    csv_content = StringIO(response.text)
    
    # Print the first few lines to inspect the content
    #first_lines = csv_content.getvalue().split('\n')[:5]
    #for line in first_lines:
    #    print(line)
    
    # Reset the StringIO object to start reading from the beginning
    csv_content.seek(0)
    
    # Read the CSV into a DataFrame using the correct delimiter
    df = pd.read_csv(csv_content, delimiter=';')  # Adjust the delimiter if needed

    print("First 5 rows of the DataFrame:")
    print(df.head())  # Display the first 5 rows
    
    print("\nDataFrame info:")
    print(df.info())  # Display concise summary of the DataFrame
    
    print("\nDataFrame description:")
    print(df.describe(include='all'))  # Display summary statistics of the DataFrame

    print("\nColumns in the DataFrame:")
    print(df.columns)  # Display the column names
    
    print("\nData types of each column:")
    print(df.dtypes)  # Display the data types of each column
    

    # Save the DataFrame to a CSV file
    df.to_csv("SBL_2023_local.csv", index=False)
    print("Data has been downloaded, loaded into pandas, and saved as SBL_2023_local.csv")
else:
    print(f"Failed to download file. Status code: {response.status_code}")


First 5 rows of the DataFrame:
   Article id             Svenskt biografiskt lexikon (SBL): URI  \
0        5490  https://sok.riksarkivet.se/sbl/Presentation.as...   
1        5491  https://sok.riksarkivet.se/sbl/Presentation.as...   
2        5492  https://sok.riksarkivet.se/sbl/Presentation.as...   
3        5493  https://sok.riksarkivet.se/sbl/Presentation.as...   
4        5494  https://sok.riksarkivet.se/sbl/Presentation.as...   

  Type of article  SBL volume number  Page number in volume  Surname  \
0  Family article                1.0                    1.0   Abelin   
1  Person article                1.0                    5.0   Abelin   
2  Person article                1.0                    1.0   Abelin   
3  Person article                1.0                    9.0   Abelin   
4  Person article                1.0                   11.0  Abenius   

   Given name Gender      Occupation, royal title, rank  \
0       släkt      -                                NaN   
1    Gust

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9406 entries, 0 to 9405
Data columns (total 47 columns):
 #   Column                                  Non-Null Count  Dtype  
---  ------                                  --------------  -----  
 0   Article id                              9406 non-null   int64  
 1   Svenskt biografiskt lexikon (SBL): URI  9406 non-null   object 
 2   Type of article                         9406 non-null   object 
 3   SBL volume number                       9405 non-null   float64
 4   Page number in volume                   9404 non-null   float64
 5   Surname                                 9406 non-null   object 
 6   Given name                              8949 non-null   object 
 7   Gender                                  9406 non-null   object 
 8   Occupation, royal title, rank           7604 non-null   object 
 9   Prefix to year of birth                 53 non-null     object 
 10  Year of birth                           7620 non-null   obje

In [27]:
def inspect_column_values(df, column_name):
    # Check if the specified column exists
    if column_name in df.columns:
        # Get the counts of unique values in the specified column
        value_counts = df[column_name].value_counts()
        
        # Convert the Series to a DataFrame for better display
        value_counts_df = value_counts.reset_index()
        value_counts_df.columns = [column_name, 'Number of Occurrences']
        
        return value_counts_df
    else:
        print(f"The column '{column_name}' does not exist in the DataFrame.")
        return None

def inspect_top_column_values(df, column_name, top_n):
    value_counts_df = inspect_column_values(df, column_name)
    if value_counts_df is not None:
        # Display the top 'n' values
        top_value_counts_df = value_counts_df.head(top_n)
        print(f"Top {top_n} counts of unique values in '{column_name}' column:")
        print(top_value_counts_df)


In [28]:
# Check if the column "Prefix to year of birth" exists
column_name = 'Prefix to year of birth'
inspect_top_column_values(df, column_name,100)

Counts of unique values in 'Prefix to year of birth' column:
       Prefix to year of birth  Number of Occurrences
0                      omkring                     18
1                  troligen pĺ                     10
2                     troligen                      6
3                         omkr                      3
4                       senast                      3
5                    senast ca                      1
6                  i början av                      1
7                     enl uppg                      1
8                      Omkring                      1
9                        efter                      1
10                   slutet av                      1
11                        trol                      1
12          troligen senast pĺ                      1
13  troligen senare hälften av                      1
14              senast omkring                      1
15                   mitten av                      1
16                   

In [29]:
column_name = 'Prefix to year of death'
inspect_top_column_values(df, column_name,100)

Counts of unique values in 'Prefix to year of death' column:
   Prefix to year of death  Number of Occurrences
0                 tidigast                     35
1                 levde pĺ                     26
2                   senast                     25
3                  omkring                     24
4                 troligen                     20
5                    efter                     12
6                     före                      8
7                    levde                      7
8               levde ännu                      6
9            troligen före                      3
10                  mellan                      3
11                    omkr                      2
12              kort efter                      2
13           levde omkring                      2
14               början av                      1
15              levde omkr                      1
16          senast omkring                      1
17          levde troligen             

In [30]:
column_name = 'Type of article'
inspect_top_column_values(df, column_name,10)

Counts of unique values in 'Type of article' column:
  Type of article  Number of Occurrences
0  Person article                   7652
1  Family article                   1754


### odd SBL Volume number 0

In [34]:
column_name = 'SBL volume number'
inspect_top_column_values(df, column_name,4)

Counts of unique values in 'SBL volume number' column:
    SBL volume number  Number of Occurrences
0                16.0                    433
1                17.0                    412
2                18.0                    399
3                21.0                    391
4                19.0                    385
5                20.0                    371
6                22.0                    351
7                23.0                    345
8                24.0                    333
9                25.0                    323
10                1.0                    320
11                2.0                    311
12               11.0                    281
13                3.0                    268
14               27.0                    267
15               29.0                    267
16                6.0                    263
17               26.0                    262
18               13.0                    259
19               28.0                    258


In [35]:
column_name = 'Page number in volume'
inspect_top_column_values(df, column_name,40)

Counts of unique values in 'Page number in volume' column:
     Page number in volume  Number of Occurrences
0                      1.0                     44
1                    209.0                     24
2                    301.0                     22
3                     23.0                     21
4                    589.0                     20
..                     ...                    ...
797                  797.0                      2
798                  800.0                      1
799                  801.0                      1
800                  799.0                      1
801                  804.0                      1

[802 rows x 2 columns]


In [21]:
def inspect_top_column_values(df, column_name, top_n):
    value_counts_df = inspect_column_values(df, column_name)
    if value_counts_df is not None:
        # Display the top 'n' values
        top_value_counts_df = value_counts_df.head(top_n)
        print(f"Top {top_n} counts of unique values in '{column_name}' column:")
        print(top_value_counts_df)


In [23]:
# Specify the column name to inspect
column_name = 'Surname'
top_n = 40  # Number of top values to display
    
# Call the function with the DataFrame and column name
inspect_top_column_values(df, column_name, top_n)

Counts of unique values in 'Surname' column:
                     Surname  Number of Occurrences
0                    Nilsson                     55
1                   Svensson                     44
2                  Johansson                     43
3                      Ekman                     37
4                       Berg                     28
...                      ...                    ...
4703                 Murberg                      1
4704                  Hjärre                      1
4705                  Hjörne                      1
4706                    Hoas                      1
4707  Bure och Bureus, släkt                      1

[4708 rows x 2 columns]


In [38]:
column_name = "Curriculum vitae"
inspect_top_column_values(df, column_name, 100)

Counts of unique values in 'Curriculum vitae' column:
                                       Curriculum vitae  Number of Occurrences
0                                                                          724
1     7. Fredrik von Essen, [a:15503:den föregĺendes...                      1
2     Eugénie (Charlotta Eugénie Augusta Amalia Albe...                      1
3     Eugen Napoleon Nicolaus, Sveriges (till 1905 ä...                      1
4     Eufemia Eriksdotter, svensk prinsessa, f. omkr...                      1
...                                                 ...                    ...
7571  Lindvall, Carl August, f 14 dec 1829 i Karlskr...                      1
7572  Lindström, Nils Ĺke Johannes, f 13 (enl fb 3) ...                      1
7573  Lindström, Gustaf Hilding Sigfrid, f 19 april ...                      1
7574  Lindström, Gustaf Rune, f 28 april 1916 i Väst...                      1
7575  Söderberg, Hjalmar Emil Fredrik, f 2 jun 1869 ...                      