In [1]:
import spacy
import re

In [2]:
# Directory of the saved spaCy pipeline. The path is relative, so it is
# resolved against the current working directory of the kernel.
MODEL_PATH = "model_shareable/coffee_ner_blank_en_1"

# `nlp` is read as a module-level global by return_each_label.
nlp = spacy.load(MODEL_PATH)

In [3]:
def return_each_label(text, pipeline=None):
    """Run the NER pipeline over ``text`` and group entity texts by label.

    Args:
        text: Description to analyse. It is lower-cased before being handed to
            the pipeline. ``None`` or an empty string yields an empty dict
            instead of raising (``text_cleaner`` returns ``None`` for missing
            or placeholder descriptions).
        pipeline: Optional callable that takes a string and returns an object
            with an ``ents`` iterable whose items expose ``label_`` and
            ``text``. When omitted, the notebook-level ``nlp`` model is used.

    Returns:
        dict mapping each entity label to the list of entity texts found for
        it, in the order the pipeline reports them. Repeated mentions are kept.
    """
    # Nothing to analyse: avoid calling .lower() on None and skip the model.
    if not text:
        return {}

    # The global is only looked up when no explicit pipeline is supplied.
    model = nlp if pipeline is None else pipeline
    doc = model(text.lower())

    entities_by_label = {}
    for ent in doc.ents:
        entities_by_label.setdefault(ent.label_, []).append(ent.text)
    return entities_by_label

In [4]:
# copied from my coffee repo
def text_cleaner(input_string):
    """Normalise a raw description string before it is given to the NER model.

    The text is lower-cased, a few hard-coded unwanted substrings are removed,
    whitespace is collapsed, and every punctuation mark becomes its own
    space-separated token.

    Args:
        input_string: Raw description text. Falsy values (``None``, ``""``) and
            the exact placeholders ``'Not available'`` and
            ``'Blue Tokai Coffee Roasters'`` count as "no description".

    Returns:
        The cleaned string, or ``None`` when there is no description.
        Whitespace-only input comes back as ``""``.
    """
    placeholders = ('Not available', 'Blue Tokai Coffee Roasters')
    if not input_string or input_string in placeholders:
        return None

    # Step 1: everything downstream works on lower-case text.
    text = input_string.lower()

    # Step 2: drop known unwanted fragments.
    text = re.sub(r'(\bnbsp\b|\bNBSP\b)', '', text)
    # Only '&amp' is removed, so the ';' of an '&amp;' entity survives and ends
    # up as a separate token later on.
    literal_fixes = (
        ('grown at 4200 msl', ''),
        ('&amp', ''),
        ('&#39;', "'"),
    )
    for fragment, substitute in literal_fixes:
        text = text.replace(fragment, substitute)

    # Steps 3-4, applied in this exact order:
    regex_passes = (
        (r'\s+', ' '),                                  # collapse whitespace/newlines
        (r'([.,!?&|;:()\[\]{}-])\1+', r'\1'),           # "!!!" -> "!"
        (r"([.,!?&;:()\[\]{}'\"-])", r" \1 "),          # pad each punctuation mark
        (r'\s+', ' '),                                  # remove the extra padding
    )
    for pattern, replacement in regex_passes:
        text = re.sub(pattern, replacement, text)
    text = text.strip()

    # Step 5: trim a punctuation run at the very start or end. Parentheses and
    # quotes are not part of this set, so they are kept.
    text = re.sub(r'^[.,!?&;:\[\]{}-]+|[.,!?&;:\[\]{}-]+$', '', text)

    # Quotes and hyphens are deliberately left as space-separated tokens:
    # gluing them back onto the neighbouring words caused problems in NER.

    # Step 6: the trim above can expose a leading/trailing space.
    return text.strip()

In [5]:
text = """
Origin: Kerehaklu Estate, Aldur, Chikmagalur.
A single estate, naturally processed, city roast coffee. Grown at an average elevation of 1200masl.

Biodiversity: Block frequented by bison, barking deer, green pigeon and whistling schoolboy bird.

Process: Natural – Genetics: S6 Arabica – Block: Huli Bande Patte – Roast profile: City Roast – Drying: 31 days outside on raised beds followed by polyhouse – Ageing: in liner sacks for 40 days before milling

Sensory Experience: The essence of forest berries intertwined with sweet spices. A velvety sip that reveals the gradual embrace of rich milk chocolate, leaving behind a lingering aftertaste that caresses your palate with its buttery texture. Its bold body and invigorating acidity makes Pranoy’s Pride a harmonious and truly satisfying cup.

Terroir: This coffee flourishes amidst the lush indigenous Kerehaklu forest. The estate spans 270 acres in the Western Ghats, emphasizing biodiversity conservation.

Transform your coffee ritual: into an extraordinary sensory journey by immersing yourself in the soundscape by Anandit Sachdev, exclusively designed to complement the unique essence of this exceptional coffee! 

https://marcscoffeesmusicprogram.bandcamp.com/track/droplets
"""

In [6]:
# Clean the raw description first, then group the recognised entities by label.
cleaned_text = text_cleaner(text)
return_each_label(cleaned_text)

{'ESTATE': ['kerehaklu', 'kerehaklu'],
 'LOCATION': ['chikmagalur'],
 'NAME': ['bison'],
 'PROCESSING': ['natural –'],
 'COFFEE TYPE': ['arabica'],
 'TASTING NOTES': ['milk chocolate'],
 'AFTERTASTE': ['lingering'],
 'ACIDITY': ['invigorating']}

In [7]:
text ="""
Single origin Indian Coffee From Moganad Estate, Tamil Nadu which is a 100% Arabiaca coffee with 
an exquisite blend of balanced sweetness and brightness. A Medium Dark Roast coffee with 
flavor notes of Cocoa, Caramel and Nut which can also be enjoyed in a French press, moka pot, aeropress & espresso .
It is washed processed, grown at an altitude of around 4430 ft
"""

# The raw description is passed to the model without going through
# text_cleaner; return_each_label only lower-cases it.
# NOTE(review): the previous cell cleans its text first - confirm that
# skipping the cleaner here is intentional.
return_each_label(text)

{'ESTATE': ['moganad'],
 'LOCATION': ['tamil nadu'],
 'ROAST LEVEL': ['medium dark'],
 'TASTING NOTES': ['cocoa', 'caramel', 'nut'],
 'PROCESSING': ['washed']}

In [8]:
text="""
tasting notes : toffee , hazelnut , milk chocolate roast level : espresso location-chikmagalur , kn 
altitude- 1400m varietal- selection 13 
process-washed weight-250gm/1kg;
"""
# return_each_label lower-cases its input itself, so the text is passed as-is.
return_each_label(text)

{'TASTING NOTES': ['toffee', 'hazelnut', 'milk chocolate'],
 'ROAST LEVEL': ['espresso'],
 'LOCATION': ['chikmagalur'],
 'ELEVATION': ['1400m'],
 'PROCESSING': ['washed']}

In [9]:
text = """ 

our search for the immaculate espresso began when we were embraced by the warm welcome of ramdev and his parents shantha and nagesh as we drove through the gates of bettadamalali estate in early 2023 . a pristine farm nestled in the baba budan giri hills of chikmagalur , this place is more than a commercial operation – it is a garden of nature’s best delights . with scientific approaches to farming complemented by highly structured operations , bettadamalali produce some of the best washed arabicas we have encountered in recent times . the farm has been a long-term partner to our sister enterprise indcaffe – who have over the years exported several hundreds of tons of washed arabica for them to specialty houses in europe . we decided to grab our own little share of the treasure this year as we cycled through multiple lots to find this gem that would help us curate a delicious espresso . expect a sweet , rich-bodied liquor in your cup with a viscous creamy texture that will hug your palate closely before letting go . sweet honey and dark cocoa notes preceded by a savory aroma of roasted nuts together create a mesmerizing espresso that is testament to the great work of art that has been accomplished in the fields of bettadamalali estate . enjoy it black or hit it with milk ! honey cocoa roasted nuts our search for the immaculate espresso began when we were embraced by the warm welcome of ramdev and his parents shantha and nagesh as we drove through the gates of bettadamalali estate in early 2023 . a pristine farm nestled in the baba budan giri hills of chikmagalur , this place is more than a commercial operation – it is a garden of nature’s best delights . with scientific approaches to farming complemented by highly structured operations , bettadamalali produce some of the best washed arabicas we have encountered in recent times . 
the farm has been a long-term partner to our sister enterprise indcaffe – who have over the years exported several hundreds of tons of washed arabica for them to specialty houses in europe . we decided to grab our own little share of the treasure this year as we cycled through multiple lots to find this gem that would help us curate a delicious espresso . expect a sweet , rich-bodied liquor in your cup with a viscous creamy texture that will hug your palate closely before letting go . sweet honey and dark cocoa notes preceded by a savory aroma of roasted nuts together create a mesmerizing espresso that is testament to the great work of art that has been accomplished in the fields of bettadamalali estate . enjoy it black or hit it with milk

""" 
# return_each_label lower-cases its input itself, so the text is passed as-is.
return_each_label(text)

{'ESTATE': ['bettadamalali',
  'bettadamalali',
  'bettadamalali',
  'bettadamalali'],
 'COFFEE TYPE': ['arabicas', 'arabica', 'arabicas', 'arabica'],
 'BODY': ['sweet', 'rich', 'sweet', 'rich'],
 'TEXTURE': ['creamy', 'creamy'],
 'TASTING NOTES': ['sweet honey', 'dark cocoa', 'sweet honey', 'dark cocoa'],
 'AROMA': ['roasted nuts', 'roasted nuts']}