In [1]:
import requests
import pandas as pd
from transformers import AutoTokenizer, AutoModel
import torch
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Define the API endpoint URL
url = "https://soilwise-he.containers.wur.nl/cat/collections/metadata:main/items"

# list to gather all data through API requests
soilwise_data_json = []

# configuration
limit = 50  # the max items the API provides is 50
max_pages = 10  # upper bound on the number of pages requested (max_pages * limit items)
request_timeout = 30  # seconds; without a timeout a stalled connection would hang the cell indefinitely
headers = {"Accept": "application/json"}  # identical for every request, so built once

# loop through pages to retrieve the next part of the data
for i in range(max_pages):
    print(f'loop num {i}')
    offset = limit * i  # skip the items already fetched by previous pages
    params = {'limit': limit, 'offset': offset}

    # Make the API request (raises requests.Timeout instead of blocking forever)
    response = requests.get(url, headers=headers, params=params, timeout=request_timeout)

    # Check if the request was successful
    if response.status_code == 200:
        # Parse the JSON response
        data = response.json()
        features = data.get('features') or []
        if not features:
            # An empty page means the catalogue is exhausted; further offsets would be empty too
            print(f"no items returned at offset {offset}, stopping")
            break
        # extend() appends in place instead of rebuilding the whole list on every page
        soilwise_data_json.extend(features)
        print(f"id of first item extracted in this loop is {features[0]['id']}")
    else:
        print(f"Failed to retrieve data: {response.status_code}")

# Show the first item as a sample of the payload structure (None when nothing was retrieved)
soilwise_data_json[0] if soilwise_data_json else None

loop num 0
id of first item extracted in this loop is 2e755407-6fab-4b3f-832d-fa76ea535ab0
loop num 1
id of first item extracted in this loop is 0b88bd89-2f3d-4005-ac15-2c8c0c6ef614
loop num 2
id of first item extracted in this loop is 16111ae4-69ba-477f-a6aa-b6fc4c9c5d58
loop num 3
id of first item extracted in this loop is 1f9cac1e-ac71-4007-bec1-bdf979fa4403
loop num 4
id of first item extracted in this loop is 2a3d1b59-350c-4db6-a28c-ed31407c1fa4
loop num 5
id of first item extracted in this loop is 362a01d3-3af1-400d-9436-840779995a4d
loop num 6
id of first item extracted in this loop is 417ff2b0-229d-421f-9c41-25e838090eed
loop num 7
id of first item extracted in this loop is 4dd7c40f-0fb9-4229-aaee-408d118856f0
loop num 8
id of first item extracted in this loop is 58d1aa0b-864a-4cf7-98dc-5915ba659aa8
loop num 9
id of first item extracted in this loop is 6409f4c7-0683-4471-b8f6-17b6dc8b2291


{'id': '2e755407-6fab-4b3f-832d-fa76ea535ab0',
 'type': 'Feature',
 'geometry': {'type': 'Polygon',
  'coordinates': [[[9.14, 53.11],
    [9.14, 54.96],
    [10.77, 54.96],
    [10.77, 53.11],
    [9.14, 53.11]]]},
 'time': '2024-06-20',
 'properties': {'themes': [{'scheme': None, 'concepts': []},
   {'scheme': None, 'concepts': []}],
  'license': 'CC BY',
  'updated': '2024-06-20',
  'type': 'service',
  'created': '2020-03-06',
  'language': 'eng',
  'title': "WMS Service of the dataset 'V140 Kiel: Geographical reference and description of trial plots'",
  'description': "This WMS  Service includes spatial information used by datasets 'WMS Service of the dataset 'V140 Kiel: Geographical reference and description of trial plots''",
  'formats': ['CSV'],
  'keywords': ['infoMapAccessService',
   'Soil',
   'Organic amendments',
   'Geographical information systems',
   'Field experimentation',
   'Farm area',
   'data collection'],
  'providers': [{'name': 'Steffen Rothardt',
    'orga

In [3]:
# Sanity check: report how many items were gathered in soilwise_data_json
print(f"total number of items extracted is {len(soilwise_data_json)}")

total number of items extracted is 500


In [4]:
# Extract the relevant fields for each item.
# The payload can contain explicit nulls (e.g. the theme 'scheme' values), and
# dict.get(key, default) returns None for a key that is present with a null value,
# so `or` is used to fall back to an empty string / list in that case as well.
extracted_data = []
# iterate through the json and extract the fields needed for each item
for item in soilwise_data_json:
    properties = item.get('properties') or {}
    extracted_item = {
        'id': item['id'],
        'title': properties.get('title') or "",  # empty string if field is missing or null
        'description': properties.get('description') or "",  # empty string if field is missing or null
        'keywords': properties.get('keywords') or [],  # empty list if field is missing or null
    }
    extracted_data.append(extracted_item)

# Convert to DataFrame; naming the columns keeps the frame well-formed even when no items were retrieved
soilwise_data_df = pd.DataFrame(extracted_data, columns=['id', 'title', 'description', 'keywords'])

# checking for duplicates (the same id returned on more than one page):
print(f"number of rows of dataframe is {soilwise_data_df.shape[0]}")
soilwise_data_df = soilwise_data_df.drop_duplicates(subset=['id'])
print(f"number of rows of dataframe after dropping duplicates is {soilwise_data_df.shape[0]}")


soilwise_data_df

number of rows of dataframe is 500
number of rows of dataframe after dropping duplicates is 500


Unnamed: 0,id,title,description,keywords
0,2e755407-6fab-4b3f-832d-fa76ea535ab0,WMS Service of the dataset 'V140 Kiel: Geograp...,This WMS Service includes spatial information...,"[infoMapAccessService, Soil, Organic amendment..."
1,0007bad6-848d-4763-9813-d5ed21cde6ee,Interactive effects of microplastics with othe...,Our study reveals the effects of GCFs on a soi...,"[Soil, microplastics, opendata, Multiple level..."
2,00682004-c6b9-4c1d-8b40-3afff8bbec69,SUSALPS temperature and volumetric soil water ...,Grassland is a precious good. Grassland contri...,"[environmental factors, water, Soil analysis, ..."
3,0086CC52-6F67-4393-99BE-7D3AB1B84160,Bodenübersichtskarte der Bundesrepublik Deutsc...,Die hier vorgestellte Bodenübersichtskarte im ...,"[Boden, Soil, Bodenart, Bodenauslaugung, Boden..."
4,166e8d03-d047-4031-ad12-113c0acb0f60,WMS Service of the dataset 'Impact of biopores...,This WMS Service includes spatial information ...,[infoMapAccessService]
...,...,...,...,...
495,6bf5829e-9fcd-46fc-be6b-6b790c3bfc4a,Raw data X-ray diffraction (XRD) Fe-Al-Hydroxi...,X-ray diffraction pattern were created for the...,"[Soil, Laboratory experimentation, Phosphate f..."
496,6c162e8d-415f-4012-baec-f5e4dfcdc1f7,SUSALPS temperature and volumetric soil water ...,Grassland is a precious good. Grassland contri...,"[environmental factors, water, Soil analysis, ..."
497,6b534803-ce1d-48b3-b330-a8b68cf68234,International long-term experiment (LTE) 'Orga...,Child table of long-term field experiment 'Org...,"[Boden, Langzeitversuch, Landwirtschaft, Versu..."
498,6b94532e-8096-42c8-90f3-bc55a84d9b7a,Plant and arthropod data along gradients in ke...,The data was collected along transects from th...,"[Carabidae, Araneae, vegetation, plants, biodi..."


In [5]:
def _join_keywords(keywords):
    """Return the keywords as one comma-separated string.

    Lists/tuples are joined with ', ' (None entries are skipped), a bare string
    is used as-is, and anything else (None, NaN, ...) yields an empty string.
    """
    if isinstance(keywords, str):
        return keywords
    if isinstance(keywords, (list, tuple)):
        return ', '.join(str(keyword) for keyword in keywords if keyword is not None)
    return ""


# new column containing the text of three other fields: "<title> <description> <keywords>".
# Missing titles/descriptions are replaced by empty strings first; otherwise a single
# None/NaN would turn the whole concatenated text into NaN.
soilwise_data_df['embedding_text'] = (
    soilwise_data_df['title'].fillna("").astype(str)
    + " "
    + soilwise_data_df['description'].fillna("").astype(str)
    + " "
    + soilwise_data_df['keywords'].apply(_join_keywords)
)

soilwise_data_df

Unnamed: 0,id,title,description,keywords,embedding_text
0,2e755407-6fab-4b3f-832d-fa76ea535ab0,WMS Service of the dataset 'V140 Kiel: Geograp...,This WMS Service includes spatial information...,"[infoMapAccessService, Soil, Organic amendment...",WMS Service of the dataset 'V140 Kiel: Geograp...
1,0007bad6-848d-4763-9813-d5ed21cde6ee,Interactive effects of microplastics with othe...,Our study reveals the effects of GCFs on a soi...,"[Soil, microplastics, opendata, Multiple level...",Interactive effects of microplastics with othe...
2,00682004-c6b9-4c1d-8b40-3afff8bbec69,SUSALPS temperature and volumetric soil water ...,Grassland is a precious good. Grassland contri...,"[environmental factors, water, Soil analysis, ...",SUSALPS temperature and volumetric soil water ...
3,0086CC52-6F67-4393-99BE-7D3AB1B84160,Bodenübersichtskarte der Bundesrepublik Deutsc...,Die hier vorgestellte Bodenübersichtskarte im ...,"[Boden, Soil, Bodenart, Bodenauslaugung, Boden...",Bodenübersichtskarte der Bundesrepublik Deutsc...
4,166e8d03-d047-4031-ad12-113c0acb0f60,WMS Service of the dataset 'Impact of biopores...,This WMS Service includes spatial information ...,[infoMapAccessService],WMS Service of the dataset 'Impact of biopores...
...,...,...,...,...,...
495,6bf5829e-9fcd-46fc-be6b-6b790c3bfc4a,Raw data X-ray diffraction (XRD) Fe-Al-Hydroxi...,X-ray diffraction pattern were created for the...,"[Soil, Laboratory experimentation, Phosphate f...",Raw data X-ray diffraction (XRD) Fe-Al-Hydroxi...
496,6c162e8d-415f-4012-baec-f5e4dfcdc1f7,SUSALPS temperature and volumetric soil water ...,Grassland is a precious good. Grassland contri...,"[environmental factors, water, Soil analysis, ...",SUSALPS temperature and volumetric soil water ...
497,6b534803-ce1d-48b3-b330-a8b68cf68234,International long-term experiment (LTE) 'Orga...,Child table of long-term field experiment 'Org...,"[Boden, Langzeitversuch, Landwirtschaft, Versu...",International long-term experiment (LTE) 'Orga...
498,6b94532e-8096-42c8-90f3-bc55a84d9b7a,Plant and arthropod data along gradients in ke...,The data was collected along transects from th...,"[Carabidae, Araneae, vegetation, plants, biodi...",Plant and arthropod data along gradients in ke...


In [6]:
# There are multiple ways to convert text into contextual embeddings. The current option uses transformer models available via the Hugging Face platform (BERT, DistilBERT, ... -> free).
# Other options are contextually richer LLMs (e.g. paid: presumably GPT, or free: Llama 3 -> more resources needed; possibly overkill, added value unclear).

# Set up with pre-trained model
model_name = 'distilbert-base-uncased'  # other models like 'bert-base-uncased' are also possible

# load tokenizer for selected model; on failure the error is printed and re-raised so the cell stops
try:
    tokenizer = AutoTokenizer.from_pretrained(model_name)
except Exception as e:
    print(f"Error loading tokenizer: {e}")
    raise

# load selected model; on failure the error is printed and re-raised so the cell stops
try:
    model = AutoModel.from_pretrained(model_name)
except Exception as e:
    print(f"Error loading model: {e}")
    raise

# Function to convert text into embeddings: each token is converted into a vector and the result is the mean of all token vectors.
# Other approaches are also possible, e.g. dedicated sentence-level embedding models.
def get_embeddings(text):
    """Embed ``text`` by mean-pooling the model's last hidden state.

    Uses the module-level ``tokenizer`` and ``model``.

    Parameters
    ----------
    text : str or list of str
        Text(s) to embed; input longer than 512 tokens is truncated.

    Returns
    -------
    numpy.ndarray
        One pooled vector per input text, stacked along the first axis
        (a single string yields an array with one row).
    """
    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=512)
    # Inference only: no_grad avoids building an autograd graph for every call
    with torch.no_grad():
        outputs = model(**inputs)
        hidden_states = outputs.last_hidden_state
        # Weight every token position by its attention mask so that padding tokens
        # (present when several texts of different length are passed) do not dilute the mean.
        # For a single, unpadded text the mask is all ones and this is the plain mean.
        mask = inputs['attention_mask'].unsqueeze(-1).to(hidden_states.dtype)
        token_counts = mask.sum(dim=1).clamp(min=1)  # guard against division by zero
        embeddings = (hidden_states * mask).sum(dim=1) / token_counts
    return embeddings.detach().cpu().numpy()

# Compute one embedding per row from its combined text and keep it in a new column
soilwise_data_df['embedding_vector'] = soilwise_data_df['embedding_text'].map(get_embeddings)

soilwise_data_df



Unnamed: 0,id,title,description,keywords,embedding_text,embedding_vector
0,2e755407-6fab-4b3f-832d-fa76ea535ab0,WMS Service of the dataset 'V140 Kiel: Geograp...,This WMS Service includes spatial information...,"[infoMapAccessService, Soil, Organic amendment...",WMS Service of the dataset 'V140 Kiel: Geograp...,"[[-0.17030187, 0.13650666, 0.2770244, 0.108737..."
1,0007bad6-848d-4763-9813-d5ed21cde6ee,Interactive effects of microplastics with othe...,Our study reveals the effects of GCFs on a soi...,"[Soil, microplastics, opendata, Multiple level...",Interactive effects of microplastics with othe...,"[[-0.39462548, 0.16400608, 0.23973578, 0.03026..."
2,00682004-c6b9-4c1d-8b40-3afff8bbec69,SUSALPS temperature and volumetric soil water ...,Grassland is a precious good. Grassland contri...,"[environmental factors, water, Soil analysis, ...",SUSALPS temperature and volumetric soil water ...,"[[-0.36373228, 0.178177, 0.31347483, 0.0329990..."
3,0086CC52-6F67-4393-99BE-7D3AB1B84160,Bodenübersichtskarte der Bundesrepublik Deutsc...,Die hier vorgestellte Bodenübersichtskarte im ...,"[Boden, Soil, Bodenart, Bodenauslaugung, Boden...",Bodenübersichtskarte der Bundesrepublik Deutsc...,"[[-0.27584743, 0.15106831, 0.32263365, -0.0882..."
4,166e8d03-d047-4031-ad12-113c0acb0f60,WMS Service of the dataset 'Impact of biopores...,This WMS Service includes spatial information ...,[infoMapAccessService],WMS Service of the dataset 'Impact of biopores...,"[[-0.31326234, 0.0015449612, 0.1717597, 0.0829..."
...,...,...,...,...,...,...
495,6bf5829e-9fcd-46fc-be6b-6b790c3bfc4a,Raw data X-ray diffraction (XRD) Fe-Al-Hydroxi...,X-ray diffraction pattern were created for the...,"[Soil, Laboratory experimentation, Phosphate f...",Raw data X-ray diffraction (XRD) Fe-Al-Hydroxi...,"[[-0.2278534, 0.21750745, 0.27475446, 0.049966..."
496,6c162e8d-415f-4012-baec-f5e4dfcdc1f7,SUSALPS temperature and volumetric soil water ...,Grassland is a precious good. Grassland contri...,"[environmental factors, water, Soil analysis, ...",SUSALPS temperature and volumetric soil water ...,"[[-0.39330292, 0.18862987, 0.2994763, 0.029022..."
497,6b534803-ce1d-48b3-b330-a8b68cf68234,International long-term experiment (LTE) 'Orga...,Child table of long-term field experiment 'Org...,"[Boden, Langzeitversuch, Landwirtschaft, Versu...",International long-term experiment (LTE) 'Orga...,"[[-0.2252936, 0.007054888, 0.37689325, -0.0574..."
498,6b94532e-8096-42c8-90f3-bc55a84d9b7a,Plant and arthropod data along gradients in ke...,The data was collected along transects from th...,"[Carabidae, Araneae, vegetation, plants, biodi...",Plant and arthropod data along gradients in ke...,"[[-0.23280281, 0.050286908, 0.20093569, 0.1021..."


In [7]:
# Stack the per-row embedding arrays vertically into one 2-D matrix (one row per item)
embeddings = np.vstack(soilwise_data_df['embedding_vector'].tolist())
embeddings

array([[-0.17030187,  0.13650666,  0.2770244 , ..., -0.35212368,
        -0.05479907, -0.07483386],
       [-0.39462548,  0.16400608,  0.23973578, ..., -0.18875982,
        -0.10465307,  0.11799067],
       [-0.36373228,  0.178177  ,  0.31347483, ..., -0.18288752,
         0.00129264,  0.04782647],
       ...,
       [-0.2252936 ,  0.00705489,  0.37689325, ..., -0.16420749,
         0.00849463,  0.03140154],
       [-0.23280281,  0.05028691,  0.20093569, ..., -0.00432465,
        -0.02672851,  0.12153253],
       [-0.06137275,  0.05484051,  0.07245835, ..., -0.18225572,
         0.03796749, -0.04953733]], dtype=float32)

In [8]:
# Pairwise cosine similarity of every embedding row against every other row
# (Y=None compares the rows of X with each other)
cosine_sim_matrix = cosine_similarity(X=embeddings, Y=None)
cosine_sim_matrix

array([[0.9999999 , 0.83749855, 0.9053575 , ..., 0.9163643 , 0.89826673,
        0.89456457],
       [0.83749855, 1.        , 0.9249915 , ..., 0.8690746 , 0.8709456 ,
        0.8461438 ],
       [0.9053575 , 0.9249915 , 1.        , ..., 0.92494893, 0.92488074,
        0.9110644 ],
       ...,
       [0.9163643 , 0.8690746 , 0.92494893, ..., 1.        , 0.9026931 ,
        0.8937949 ],
       [0.89826673, 0.8709456 , 0.92488074, ..., 0.9026931 , 0.9999997 ,
        0.9351011 ],
       [0.89456457, 0.8461438 , 0.9110644 , ..., 0.8937949 , 0.9351011 ,
        1.0000001 ]], dtype=float32)

In [9]:
# Wrap the matrix in a DataFrame for readability: rows and columns are both labelled
# with the item ids, and the axes are named id_1 (rows) and id_2 (columns)
cosine_sim_df = pd.DataFrame(
    cosine_sim_matrix,
    index=pd.Index(soilwise_data_df['id'], name='id_1'),
    columns=pd.Index(soilwise_data_df['id'], name='id_2'),
)
cosine_sim_df

id_2,2e755407-6fab-4b3f-832d-fa76ea535ab0,0007bad6-848d-4763-9813-d5ed21cde6ee,00682004-c6b9-4c1d-8b40-3afff8bbec69,0086CC52-6F67-4393-99BE-7D3AB1B84160,166e8d03-d047-4031-ad12-113c0acb0f60,3418ee89-8331-49a9-b3d2-e14b8fde88be,01da627d-9ad7-4414-9f32-222a139c2407,025b61e6-5e9b-4c03-9e48-fa22be02106f,00bee634-47e6-490b-89ba-2464c9f09c31,0160f3e2-aa36-431c-96f7-871dc41e5f8c,...,6a78a4f7-a704-479c-b4e4-c9d1ee5006a7,6ab178db-0ae7-4220-8408-7462eceacd2c,6b4ca0f5-ec93-44a6-a598-4627ebbe030f,6b50777b-0e75-435d-ba4f-33d5b7478d8c,6b664e1e-15ff-4bcb-8cd6-fef048a653a6,6bf5829e-9fcd-46fc-be6b-6b790c3bfc4a,6c162e8d-415f-4012-baec-f5e4dfcdc1f7,6b534803-ce1d-48b3-b330-a8b68cf68234,6b94532e-8096-42c8-90f3-bc55a84d9b7a,6bf7090c-49ca-4c9a-8bc8-d33eaeadffff
id_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2e755407-6fab-4b3f-832d-fa76ea535ab0,1.000000,0.837499,0.905357,0.756249,0.943036,0.964710,0.904812,0.908985,0.878137,0.934925,...,0.915295,0.902793,0.915353,0.901317,0.910968,0.843095,0.902935,0.916364,0.898267,0.894565
0007bad6-848d-4763-9813-d5ed21cde6ee,0.837499,1.000000,0.924991,0.743742,0.862128,0.857295,0.859960,0.861815,0.848999,0.864564,...,0.867913,0.911991,0.855451,0.838591,0.867042,0.875652,0.923659,0.869075,0.870946,0.846144
00682004-c6b9-4c1d-8b40-3afff8bbec69,0.905357,0.924991,1.000000,0.803746,0.887707,0.907745,0.916748,0.909231,0.945141,0.930151,...,0.939148,0.954048,0.915153,0.876680,0.904290,0.901749,0.998693,0.924949,0.924881,0.911064
0086CC52-6F67-4393-99BE-7D3AB1B84160,0.756249,0.743742,0.803746,1.000000,0.750391,0.760891,0.791619,0.846567,0.782612,0.792814,...,0.773110,0.812557,0.861095,0.751292,0.759876,0.811276,0.805562,0.877909,0.780243,0.791499
166e8d03-d047-4031-ad12-113c0acb0f60,0.943036,0.862128,0.887707,0.750391,1.000000,0.948491,0.878893,0.863960,0.845276,0.933692,...,0.870322,0.894061,0.867978,0.878055,0.881746,0.822440,0.887205,0.880287,0.878503,0.872132
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6bf5829e-9fcd-46fc-be6b-6b790c3bfc4a,0.843095,0.875652,0.901749,0.811276,0.822440,0.844209,0.881392,0.879292,0.902342,0.857662,...,0.895197,0.894026,0.883159,0.836954,0.868149,1.000000,0.897326,0.898859,0.888531,0.885969
6c162e8d-415f-4012-baec-f5e4dfcdc1f7,0.902935,0.923659,0.998693,0.805562,0.887205,0.907795,0.915765,0.909997,0.941901,0.929610,...,0.936461,0.951050,0.915594,0.874049,0.900258,0.897326,1.000000,0.924791,0.923099,0.907053
6b534803-ce1d-48b3-b330-a8b68cf68234,0.916364,0.869075,0.924949,0.877909,0.880287,0.907364,0.915736,0.978325,0.914095,0.910096,...,0.920034,0.930832,0.988459,0.890047,0.901214,0.898859,0.924791,1.000000,0.902693,0.893795
6b94532e-8096-42c8-90f3-bc55a84d9b7a,0.898267,0.870946,0.924881,0.780243,0.878503,0.893524,0.916185,0.902987,0.911233,0.899124,...,0.910884,0.927460,0.905349,0.896427,0.886446,0.888531,0.923099,0.902693,1.000000,0.935101


In [10]:
# Reshape the square similarity matrix into long format: one row per (id_1, id_2) pair,
# drop the pairs that compare an item with itself, and list the most similar pairs first
stacked_sim_df = (
    cosine_sim_df
    .stack()
    .rename_axis(['id_1', 'id_2'])
    .rename('similarity_metric')
    .reset_index()
    .loc[lambda pairs: pairs['id_1'] != pairs['id_2']]
    .sort_values(by=['similarity_metric'], ascending=False)
)
stacked_sim_df

Unnamed: 0,id_1,id_2,similarity_metric
238865,68749995-c4bf-4f80-94e5-43c2291c99be,5187f8c5-38ef-4b07-bc26-a5e257a8ef59,1.000000
182977,5187f8c5-38ef-4b07-bc26-a5e257a8ef59,68749995-c4bf-4f80-94e5-43c2291c99be,1.000000
166895,49bebaf8-bae4-4748-8e5c-ce80c0406953,56fcf114-1c1e-46ac-b21a-b43ff7441335,0.999961
197833,56fcf114-1c1e-46ac-b21a-b43ff7441335,49bebaf8-bae4-4748-8e5c-ce80c0406953,0.999961
106863,2c47b34b-dad7-4cc0-baeb-320072589108,54eadc0c-c7d2-42fe-8cda-16daa97be878,0.999960
...,...,...,...
182691,5187f8c5-38ef-4b07-bc26-a5e257a8ef59,285d213f-23e2-47da-b66a-f2f458962b5e,0.597436
182648,5187f8c5-38ef-4b07-bc26-a5e257a8ef59,1e1a3d96-6ac0-4370-8e61-4936ea99674c,0.589482
238648,68749995-c4bf-4f80-94e5-43c2291c99be,1e1a3d96-6ac0-4370-8e61-4936ea99674c,0.589482
74365,1e1a3d96-6ac0-4370-8e61-4936ea99674c,5187f8c5-38ef-4b07-bc26-a5e257a8ef59,0.589482


In [11]:
# Attach title, description and keywords of both items of every pair.
# The metadata columns get an explicit _1 / _2 suffix before merging instead of relying on
# pandas' automatic _x / _y suffixes, and the merge starts from the three pair columns only,
# so the cell can be re-run without failing on columns added by a previous run.
item_info = soilwise_data_df[['id', 'title', 'description', 'keywords']]
stacked_sim_df = (
    stacked_sim_df[['id_1', 'id_2', 'similarity_metric']]
    # validate: every id must match at most one metadata row, otherwise pairs would be duplicated
    .merge(item_info.add_suffix('_1'), how='left', on='id_1', validate='many_to_one')
    .merge(item_info.add_suffix('_2'), how='left', on='id_2', validate='many_to_one')
)
stacked_sim_df = stacked_sim_df[['similarity_metric','id_1','title_1','description_1','keywords_1','id_2','title_2','description_2','keywords_2']]
stacked_sim_df

Unnamed: 0,similarity_metric,id_1,title_1,description_1,keywords_1,id_2,title_2,description_2,keywords_2
0,1.000000,68749995-c4bf-4f80-94e5-43c2291c99be,|,|,"[Boden, infoMapAccessService]",5187f8c5-38ef-4b07-bc26-a5e257a8ef59,|,|,"[Boden, infoMapAccessService]"
1,1.000000,5187f8c5-38ef-4b07-bc26-a5e257a8ef59,|,|,"[Boden, infoMapAccessService]",68749995-c4bf-4f80-94e5-43c2291c99be,|,|,"[Boden, infoMapAccessService]"
2,0.999961,49bebaf8-bae4-4748-8e5c-ce80c0406953,SUSALPS temperature and volumetric soil water ...,Grassland is a precious good. Grassland contri...,"[environmental factors, water, Soil analysis, ...",56fcf114-1c1e-46ac-b21a-b43ff7441335,SUSALPS temperature and volumetric soil water ...,Grassland is a precious good. Grassland contri...,"[environmental factors, water, Soil analysis, ..."
3,0.999961,56fcf114-1c1e-46ac-b21a-b43ff7441335,SUSALPS temperature and volumetric soil water ...,Grassland is a precious good. Grassland contri...,"[environmental factors, water, Soil analysis, ...",49bebaf8-bae4-4748-8e5c-ce80c0406953,SUSALPS temperature and volumetric soil water ...,Grassland is a precious good. Grassland contri...,"[environmental factors, water, Soil analysis, ..."
4,0.999960,2c47b34b-dad7-4cc0-baeb-320072589108,SUSALPS temperature and volumetric soil water ...,Grassland is a precious good. Grassland contri...,"[environmental factors, water, Soil analysis, ...",54eadc0c-c7d2-42fe-8cda-16daa97be878,SUSALPS temperature and volumetric soil water ...,Grassland is a precious good. Grassland contri...,"[environmental factors, water, Soil analysis, ..."
...,...,...,...,...,...,...,...,...,...
249495,0.597436,5187f8c5-38ef-4b07-bc26-a5e257a8ef59,|,|,"[Boden, infoMapAccessService]",285d213f-23e2-47da-b66a-f2f458962b5e,Servicio WMS (Web Map Service) Inventario Naci...,El Servicio Web de Mapas denominado Inventario...,"[Nacional, 1.3.0, WMS, OGC, infoManagementServ..."
249496,0.589482,5187f8c5-38ef-4b07-bc26-a5e257a8ef59,|,|,"[Boden, infoMapAccessService]",1e1a3d96-6ac0-4370-8e61-4936ea99674c,Inventario Nacional de Erosión de Suelos (2002...,"Este Inventario pretende localizar, cuantifica...","[Suelo, Soil, Nacional, España, erosión]"
249497,0.589482,68749995-c4bf-4f80-94e5-43c2291c99be,|,|,"[Boden, infoMapAccessService]",1e1a3d96-6ac0-4370-8e61-4936ea99674c,Inventario Nacional de Erosión de Suelos (2002...,"Este Inventario pretende localizar, cuantifica...","[Suelo, Soil, Nacional, España, erosión]"
249498,0.589482,1e1a3d96-6ac0-4370-8e61-4936ea99674c,Inventario Nacional de Erosión de Suelos (2002...,"Este Inventario pretende localizar, cuantifica...","[Suelo, Soil, Nacional, España, erosión]",5187f8c5-38ef-4b07-bc26-a5e257a8ef59,|,|,"[Boden, infoMapAccessService]"


In [15]:
# Write the pairwise similarity table to disk without the row index.
# Note: the fields are tab-separated even though the file name ends in .csv
stacked_sim_df.to_csv(
    'duplicate_datasheet_soilwise.csv',
    index=False,
    sep='\t',
)