In [5]:
import pandas as pd
from rdflib import Graph, Namespace, Literal, URIRef
from rdflib.namespace import RDF, RDFS, XSD
import re

In [6]:
# Load CSV
# The path is relative to the notebook's working directory. Forward slashes are
# used because they are accepted on Windows, Linux and macOS alike, whereas a
# backslash-separated literal only resolves on Windows.
file_path = "data/coffees_categorized/all_coffees_categorized.csv"
df = pd.read_csv(file_path)

# Create the RDF graph that collects every triple produced below
g = Graph()

# Namespaces: ontology terms live under COFFEE, individual records under DATA
COFFEE = Namespace("http://coffee-quality.org/ontology/")
DATA = Namespace("http://coffee-quality.org/data/")

# Register the prefix bindings on the graph
for prefix, namespace in (("coffee", COFFEE), ("data", DATA), ("xsd", XSD)):
    g.bind(prefix, namespace)

def clean_uri_component(text):
    """Clean text for use in URIs.

    Parameters
    ----------
    text : Any
        Value to normalise. It is converted with ``str()``; values that
        ``pd.isna`` reports as missing are accepted.

    Returns
    -------
    str
        The stripped text with every character other than word characters,
        whitespace and hyphens removed, and each run of whitespace and/or
        underscores collapsed into a single underscore. ``"Unknown"`` is
        returned for missing values and for text with nothing left after
        cleaning.
    """
    if pd.isna(text):
        return "Unknown"
    text = str(text).strip()
    # Remove special characters (anything that is not a word char, whitespace or hyphen)
    text = re.sub(r'[^\w\s-]', '', text)
    # Collapse runs of whitespace/underscores into a single underscore
    text = re.sub(r'[\s_]+', '_', text)
    # Input made only of special characters or blanks cleans down to "";
    # use the same placeholder as for missing values instead of an empty component.
    return text or "Unknown"

def get_country_uri(country):
    """Return the ontology URI for a country-of-origin label.

    The lookup is an exact match on ``country``; any label that is not listed
    resolves to ``COFFEE.Unknown``.
    """
    # Country label as it appears in the data -> local name of the COFFEE term
    local_names = {
        'Mexico': 'Mexico',
        'Brazil': 'Brazil',
        'Guatemala': 'Guatemala',
        'Peru': 'Peru',
        'China': 'China',
        'Costa Rica': 'CostaRica',
        'Uganda': 'Uganda',
        'Taiwan': 'Taiwan',
        'Tanzania': 'Tanzania',
        'Kenya': 'Kenya',
        'Colombia': 'Colombia',
        'Panama': 'Panama',
        'El Salvador': 'ElSalvador',
        'Indonesia': 'Indonesia',
        'Honduras': 'Honduras',
        'United States (Hawaii)': 'UnitedStatesHawaii',
        'United States (Puerto Rico)': 'UnitedStatesPuertoRico',
        'Nicaragua': 'Nicaragua',
        'Ethiopia': 'Ethiopia',
    }
    local_name = local_names.get(country, 'Unknown')
    return COFFEE[local_name]

def get_color_uri(color):
    """Return the ontology URI for a bean color value.

    ``color`` is converted with ``str()`` and lower-cased before the lookup;
    values that are not listed resolve to ``COFFEE.Unknown``.
    """
    color_key = str(color).lower()
    # Lower-cased color label -> local name of the COFFEE term
    known_colors = {
        'green': 'Green',
        'blue-green': 'BlueGreen',
        'yellow': 'Yellow',
        'yellow-green': 'YellowGreen',
        'brown': 'Brown',
        'brown-green': 'BrownGreen',
        'unknown': 'Unknown',
    }
    if color_key in known_colors:
        return COFFEE[known_colors[color_key]]
    return COFFEE.Unknown

def get_processing_uri(method):
    """Return the ontology URI for a processing-method label.

    The lookup is an exact match on ``method``; labels that are not listed
    resolve to ``COFFEE.Unknown``.
    """
    # (label as it appears in the data, ontology term)
    label_term_pairs = (
        ('Washed / Wet', COFFEE.WashedWet),
        ('Natural / Dry', COFFEE.NaturalDry),
        ('Semi-washed / Semi-pulped', COFFEE.SemiWashed),
        ('Pulped natural / honey', COFFEE.PulpedNaturalHoney),
        ('Other', COFFEE.OtherProcessing),
        ('Unknown', COFFEE.Unknown),
    )
    return dict(label_term_pairs).get(method, COFFEE.Unknown)

# Process each row
# Columns whose cell value is copied straight into a typed literal:
# (predicate, CSV column, XSD datatype), listed in the order the triples are added.
literal_columns = [
    # Quality scores (as decimals)
    (COFFEE.aromaScore, 'Aroma', XSD.decimal),
    (COFFEE.flavorScore, 'Flavor', XSD.decimal),
    (COFFEE.aftertasteScore, 'Aftertaste', XSD.decimal),
    (COFFEE.acidityScore, 'Acidity', XSD.decimal),
    (COFFEE.bodyScore, 'Body', XSD.decimal),
    (COFFEE.balanceScore, 'Balance', XSD.decimal),
    (COFFEE.uniformityScore, 'Uniformity', XSD.decimal),
    (COFFEE.cleanCupScore, 'Clean_Cup', XSD.decimal),
    (COFFEE.sweetnessScore, 'Sweetness', XSD.decimal),
    (COFFEE.cupperPoints, 'Cupper_Points', XSD.decimal),
    # Overall quality metrics
    (COFFEE.totalCupPoints, 'Total_Cup_Points', XSD.decimal),
    (COFFEE.qualityCategory, 'Quality_Category', XSD.integer),
    # Taste categories string
    (COFFEE.tasteCategories, 'Taste_Categories', XSD.string),
    # Quality label booleans
    (COFFEE.hasHighQuality, 'Quality_Label_High Quality', XSD.boolean),
    (COFFEE.hasMediumQuality, 'Quality_Label_Medium Quality', XSD.boolean),
    (COFFEE.hasLowQuality, 'Quality_Label_Low Quality', XSD.boolean),
    # Taste category binary indicators
    (COFFEE.isAromatic, 'Category_Aromatic', XSD.integer),
    (COFFEE.isComplex, 'Category_Complex', XSD.integer),
    (COFFEE.isFullBodied, 'Category_Full-bodied', XSD.integer),
    (COFFEE.isLingering, 'Category_Lingering', XSD.integer),
    (COFFEE.isSmooth, 'Category_Smooth', XSD.integer),
    (COFFEE.isSour, 'Category_Sour', XSD.integer),
    (COFFEE.isSweet, 'Category_Sweet', XSD.integer),
]

for idx, row in df.iterrows():
    # One resource per row, named after the row's index label
    lot_uri = DATA[f"lot_{idx}"]

    # Declare it as a CoffeeLot
    g.add((lot_uri, RDF.type, COFFEE.CoffeeLot))

    # Basic properties: species and harvest year are plain string literals,
    # while country, color and processing method point at ontology terms.
    species = Literal(row['Species'], datatype=XSD.string)
    g.add((lot_uri, COFFEE.hasSpecies, species))
    country_term = get_country_uri(row['Country_of_Origin'])
    g.add((lot_uri, COFFEE.fromCountry, country_term))
    harvest_year = Literal(str(row['Harvest_Year']), datatype=XSD.string)
    g.add((lot_uri, COFFEE.harvestYear, harvest_year))
    color_term = get_color_uri(row['Color'])
    g.add((lot_uri, COFFEE.hasColor, color_term))
    processing_term = get_processing_uri(row['Processing_Method'])
    g.add((lot_uri, COFFEE.hasProcessingMethod, processing_term))

    # Scores, overall metrics, labels and category indicators
    for predicate, column, datatype in literal_columns:
        g.add((lot_uri, predicate, Literal(row[column], datatype=datatype)))

# Write the graph to disk in Turtle format
turtle_file = 'coffee_data.ttl'
g.serialize(destination=turtle_file, format='turtle')

# Report how many rows were read and how many triples the graph holds
lot_count = len(df)
triple_count = len(g)
print(f"Successfully converted {lot_count} coffee lots to RDF")
print(f"Total triples: {triple_count}")

# Optional: write a second copy as RDF/XML for compatibility
rdfxml_file = 'coffee_data.rdf'
g.serialize(destination=rdfxml_file, format='xml')
print("Also saved as RDF/XML format")

Successfully converted 1517 coffee lots to RDF
Total triples: 43993
Also saved as RDF/XML format


In [4]:
# Column overview (dtypes and non-null counts), then the first five rows
df.info()
display(df.head(n=5))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1517 entries, 0 to 1516
Data columns (total 28 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Species                       1517 non-null   object 
 1   Country_of_Origin             1517 non-null   object 
 2   Harvest_Year                  1517 non-null   object 
 3   Color                         1517 non-null   object 
 4   Processing_Method             1517 non-null   object 
 5   Aroma                         1517 non-null   float64
 6   Flavor                        1517 non-null   float64
 7   Aftertaste                    1517 non-null   float64
 8   Acidity                       1517 non-null   float64
 9   Body                          1517 non-null   float64
 10  Balance                       1517 non-null   float64
 11  Uniformity                    1517 non-null   float64
 12  Clean_Cup                     1517 non-null   float64
 13  Swe

Unnamed: 0,Species,Country_of_Origin,Harvest_Year,Color,Processing_Method,Aroma,Flavor,Aftertaste,Acidity,Body,...,Quality_Label_High Quality,Quality_Label_Low Quality,Quality_Label_Medium Quality,Category_Aromatic,Category_Complex,Category_Full-bodied,Category_Lingering,Category_Smooth,Category_Sour,Category_Sweet
0,Arabica,Ethiopia,2014,green,Washed / Wet,8.67,8.83,8.67,8.75,8.5,...,True,False,False,1,1,1,1,0,1,1
1,Arabica,Ethiopia,2014,green,Washed / Wet,8.75,8.67,8.5,8.58,8.42,...,True,False,False,1,1,1,1,0,1,1
2,Arabica,Guatemala,Unknown,unknown,Unknown,8.42,8.5,8.42,8.42,8.33,...,True,False,False,1,1,1,1,0,1,1
3,Arabica,Ethiopia,2014,green,Natural / Dry,8.17,8.58,8.42,8.42,8.5,...,True,False,False,1,1,1,1,0,1,1
4,Arabica,Ethiopia,2014,green,Washed / Wet,8.25,8.5,8.25,8.5,8.42,...,True,False,False,1,1,1,1,0,1,1
