In [72]:
import re
import os
from enum import Enum
import numpy as np
import pandas as pd
import geopandas as gpd
import fiona
import folium
from folium.plugins import Geocoder, MarkerCluster
from transformers import pipeline
import spacy
import dateparser
from datetime import datetime
import geoparquet as gpq

In [2]:
# Load spaCy's small English pipeline.
# Note - the `en_core_web_sm` model package has to be installed separately
# before `spacy.load` can find it (the author obtained it from Hugging Face).
# NOTE(review): in the cells visible here `nlp` is only referenced from
# commented-out code inside `extract_date` - confirm it is still needed.
nlp = spacy.load("en_core_web_sm")

In [3]:
# Read in kml file of oldtime music gatherings
# Mark the KML driver as read/write in fiona's driver table before reading;
# presumably it is not enabled by default, so this must run first.
fiona.supported_drivers['KML'] = 'rw'
raw_points = gpd.read_file("./data/Old Time Open Jams.kml", driver="KML")
# Preview the first rows (columns shown in the output: Name, Description, geometry).
raw_points.head()

Unnamed: 0,Name,Description,geometry
0,"*Fairhope, AL",Jam n Folks meet every Tuesday (unless we are ...,POINT Z (-87.90006 30.52789 0.00000)
1,"*Anchorage, AK",Currently on hold<br>Irish session<br>Thursday...,POINT Z (-149.90654 61.19747 0.00000)
2,"*Ester, AK","The Golden Eagle Saloon<br>3630 Main St, Ester...",POINT Z (-148.01637 64.84680 0.00000)
3,"Fairbanks, AK",This jam is Zoom only until further notice. ...,POINT Z (-147.77080 64.83923 0.00000)
4,"*Fairbanks, AK",Irish Session ...,POINT Z (-147.71441 64.85332 0.00000)


In [4]:
#raw_points.to_csv("oldtime.csv")

In [5]:
#raw_points[raw_points["Description"].str.contains("picnic", case=False)].iloc[7]["Description"]

In [6]:
# Define pretrained model names
# `clf_model` is loaded below for the "zero-shot-classification" task and
# `q_model` for the "question-answering" task; both strings are also used by
# `load_model` as the name of the local directory holding the saved copy.
clf_model = "roberta-large-mnli"
q_model = "distilbert-base-cased-distilled-squad"

def load_model(model_task, model_name, models_root="../../ml_models/"):
    '''
    Download pretrained model, or load existing downloaded model

    Parameters
    ----------
    model_task : str
        Task name passed to `pipeline(task=...)`,
        e.g. "zero-shot-classification" or "question-answering".
    model_name : str
        Model identifier to download; also the name of the sub-directory
        (under `models_root`) where the local copy is stored.
    models_root : str, optional
        Directory that holds the locally saved models.

    Returns
    -------
    The object returned by `pipeline(...)` for the requested task.

    Any exception raised by `pipeline` propagates unchanged.
    '''
    # Trailing "" keeps the path ending in a separator, as before.
    model_path = os.path.join(models_root, model_name, "")

    # An empty directory (e.g. left behind by an interrupted earlier run) is
    # NOT a usable cache, so it is treated the same as a missing one.
    is_cached = os.path.isdir(model_path) and len(os.listdir(model_path)) > 0

    if is_cached:
        print("Loading saved model...\n")
        return pipeline(task=model_task, model=model_path, tokenizer=model_path)

    print("Downloading model to %s\n" % model_path)
    # Download first and only create the directory once that succeeded, so a
    # failed download cannot leave an empty directory that later runs would
    # mistake for a saved model.
    model = pipeline(task=model_task, model=model_name)
    os.makedirs(model_path, exist_ok=True)
    model.save_pretrained(save_directory=model_path)
    return model

# Load classifier model
# Builds (or loads from the local copy) the zero-shot classification pipeline
# for `clf_model`; `classify_event` later calls it as
# `classifier(text, candidate_labels=...)`.
classifier = load_model(model_task="zero-shot-classification", model_name=clf_model)

Loading saved model...



2023-03-26 22:31:41.894695: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
All model checkpoint layers were used when initializing TFRobertaForSequenceClassification.

All the layers of TFRobertaForSequenceClassification were initialized from the model checkpoint at ../../ml_models/roberta-large-mnli/.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaForSequenceClassification for predictions without further training.


In [7]:
# Test classifier
# Smoke test on a single festival description. In the recorded output the
# result is a dict with 'sequence', 'labels' and 'scores'; "annual event"
# wins only narrowly (~0.54 vs ~0.46).
test = "Fiddler's Picnic. 3 day event, first weekend in August.<br>Aug 4-6, 2023<br>https://www.facebook.com/Montana-Old-Time-Fiddlers-Picnic-403947783036154/?<br><br>Updated CR 1/2/23"
classifier(test, candidate_labels=["weekly gathering", "annual event"])

{'sequence': "Fiddler's Picnic. 3 day event, first weekend in August.<br>Aug 4-6, 2023<br>https://www.facebook.com/Montana-Old-Time-Fiddlers-Picnic-403947783036154/?<br><br>Updated CR 1/2/23",
 'labels': ['annual event', 'weekly gathering'],
 'scores': [0.5418637990951538, 0.4581362009048462]}

In [8]:
# Load question answering model
# Builds (or loads from the local copy) the question-answering pipeline for
# `q_model`; `extract_date` later calls it as
# `question_answerer(question=..., context=...)`.
question_answerer = load_model(model_task="question-answering", model_name=q_model)

Loading saved model...



Some layers from the model checkpoint at ../../ml_models/distilbert-base-cased-distilled-squad/ were not used when initializing TFDistilBertForQuestionAnswering: ['dropout_1126']
- This IS expected if you are initializing TFDistilBertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForQuestionAnswering were not initialized from the model checkpoint at ../../ml_models/distilbert-base-cased-distilled-squad/ and are newly initialized: ['dropout_93']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
# Smoke test of the question-answering pipeline on one festival description.
test = "Fiddler's Picnic. 3 day event, first weekend in August.<br>Aug 4-6, 2023<br>https://www.facebook.com/Montana-Old-Time-Fiddlers-Picnic-403947783036154/?<br><br>Updated CR 1/2/23"
# Alternative sample (no date announced in the text), kept for manual experiments:
#test2 = "93rd Annual Old Fiddler's Picnic<br>A Celebration of Music ~ A Reunion of Friends<br>No date set yet for 2023.<br>Join hundreds of musicians and music lovers to celebrate the 93rd Annual Chester County Old Fiddler's Picnic.<br><br> Hibernia Park <br> Gates open at 8:30 AM<br> Stage registration begins at 9 AM<br> Free Admission and Free Parking<br><br>https://www.chesco.org/2415/Old-Fiddlers-Picnic<br><br>JS 8/24/22"
# The result is read below through its 'score', 'start', 'end' and 'answer' keys.
preds = question_answerer(
    question="What date(s) does this event take place in 2023?",
    context=test
)
# Recorded output for `test`: score 0.759, answer "Aug 4-6" (start 59, end 66).
print(
    f"score: {round(preds['score'], 4)}, start: {preds['start']}, end: {preds['end']}, answer: {preds['answer']}"
)

score: 0.759, start: 59, end: 66, answer: Aug 4-6


In [10]:
#ner = pipeline("ner", grouped_entities=True)
#ner(test)

In [33]:
# Filter out non-oldtime gatherings
# `na=False` makes a missing description count as "no match": the row is kept
# and the mask stays purely boolean (negating a mask that contains NaN fails).
non_oldtime_mask = raw_points["Description"].str.contains(
    r"irish|celtic|scottish", case=False, na=False
)
# Filter first, then copy: `oldtime_events` becomes an independent frame that
# can receive new columns, and `raw_points` is left untouched, so this cell
# can be re-run safely.
oldtime_events = raw_points[~non_oldtime_mask].copy()
print("Number of gatherings: %d" % oldtime_events.shape[0])

# Remove dates that events were edited on the original map
# Only a date at the very END of the description is stripped (the pattern is
# anchored with `$`); the editor's initials in front of it are left in place.
oldtime_events["clean_desc"] = oldtime_events["Description"].str.replace(
    r"(\d+(\\|\/|-|\.)\d+(\\|\/|-|\.)\d+)$", "", regex=True
)
print(oldtime_events["clean_desc"].iloc[0])

Number of gatherings: 1156
Jam n Folks meet every Tuesday (unless we are playing someplace) at the Nix Senior Center at 1 Bayou Drive ( a block and half east of North Section Street) in Fairhope. We practice from 9:00am to 12:00pm.<br><br>You can also chat with us at our Facebook page: https://www.facebook.com/groups/345412158921659/<br><br>Instrumentation in Jam n Folks includes mountain dulcimer, hammered dulcimer, mandolins, guitars, violins, viola, recorders, krumhorns, penny whistles, accordion and percussion as required. They have even let the webmaster of this site get away with bringing his soprano and baritone ukuleles. Grab your instrument of choice, come on over and play some tunes with us.                                                     JS 


In [34]:
def classify_event(text: str, labels=None, verbose: bool = True) -> str:
    """Guess which kind of event a description refers to.

    Runs the zero-shot `classifier` pipeline on `text` and returns the
    candidate label with the highest score.

    Parameters
    ----------
    text : str
        Event description to classify.
    labels : sequence of str, optional
        Candidate labels. Defaults to ["weekly gathering", "annual event"],
        the two values the map-building cell checks for.
    verbose : bool, optional
        When True (default) print the chosen label, as before. Pass False to
        keep the output short when mapping over the whole frame.

    Returns
    -------
    str
        The best-scoring label.
    """
    if labels is None:
        labels = ["weekly gathering", "annual event"]

    clf_raw = classifier(text, candidate_labels=list(labels))

    # Pick the highest score explicitly rather than relying on the order in
    # which the pipeline returns its labels.
    best_idx = int(np.argmax(clf_raw["scores"]))
    clf = clf_raw["labels"][best_idx]

    if verbose:
        print("Event type:", clf)
    return clf

def extract_date(text: str, verbose: bool = True) -> str:
    """Extract a free-text guess of when an event takes place.

    Asks the `question_answerer` pipeline which days the event can be
    attended, using `text` as the context, and returns the answer span.

    Parameters
    ----------
    text : str
        Event description used as the context.
    verbose : bool, optional
        When True (default) print the answer, as before.

    Returns
    -------
    str
        The answer text from the pipeline, or "" when `text` is missing,
        empty or whitespace-only.
    """
    # The question-answering pipeline rejects an empty context, which would
    # abort a whole `.map(extract_date)` run; report "no answer" instead.
    if not isinstance(text, str) or not text.strip():
        if verbose:
            print("Answer:", "")
        return ""

    # Get text representation of guess as to what date the event occurs
    preds = question_answerer(
        question="What days of the month can I attend this event (no phone numbers)?",
        context=text
    )
    answer = preds['answer']
    if verbose:
        print("Answer:", answer)

    # TODO: Try to solve the "2nd Sundays" problem... could just write regex for conversion?
    # Sketch of the planned post-processing (currently disabled):
    #
    # Remove (any) second dates from date ranges (just want to know when it starts)
    #single_date = re.sub("-\d*(?=\s|,)", "", answer)
    #
    # Get date components
    #doc = nlp(single_date)
    #date_ents = [str(ent) for ent in doc.ents if ent.label_ == "DATE"]
    #
    # Convert to single string
    #date_ents_str = " ".join(date_ents)
    #
    # Remove elements that are not days of the week
    #date_ents_parsed = re.sub(pattern=r"\b(?!((sun|mon|tues|wednes|thurs|fri|sat)(day|\s*)\b))\w+", repl="", string=date_ents_str, flags=re.IGNORECASE)
    #
    # Get datetime object from text representation
    #date = dateparser.parse(date_string=date_ents_parsed, languages=["en"], settings={'PREFER_DATES_FROM': 'future'})
    #
    #return (answer, date, round(preds["score"], 2))
    return answer

# Earlier one-off experiments, kept for reference:
#test = oldtime_events["clean_desc"].iloc[-5]
#print(test, '\n')
#print(classify_event(test))
#print(extract_date(oldtime_events["clean_desc"].iloc[0]))
# Spot-check the classifier on the first 7 cleaned descriptions.
# Note: `classify_event` prints "Event type: ..." itself and its return value
# is printed again here, so each label shows up twice in the output.
for i in range(7):
    test = oldtime_events["clean_desc"].iloc[i]
    print(test)
    print(classify_event(test))
    #print(extract_date(test))
    print('\n')
#classify_event(test)

# NOTE(review): no "category" column is created in the cells visible here.
#print("NAs in category row: %d" % sum(oldtime_events["category"].isna()))

Jam n Folks meet every Tuesday (unless we are playing someplace) at the Nix Senior Center at 1 Bayou Drive ( a block and half east of North Section Street) in Fairhope. We practice from 9:00am to 12:00pm.<br><br>You can also chat with us at our Facebook page: https://www.facebook.com/groups/345412158921659/<br><br>Instrumentation in Jam n Folks includes mountain dulcimer, hammered dulcimer, mandolins, guitars, violins, viola, recorders, krumhorns, penny whistles, accordion and percussion as required. They have even let the webmaster of this site get away with bringing his soprano and baritone ukuleles. Grab your instrument of choice, come on over and play some tunes with us.                                                     JS 
Event type: weekly gathering
weekly gathering


The Golden Eagle Saloon<br>3630 Main St, Ester, AK, has irregular/impromptu mostly old time jam sessions. Contact Sina Anahita jmanahita@alaska.edu                                               JS 
Event type: we

In [35]:
# Add parsed dates to column (takes 3m 22s -> try to speed this up?)
# `extract_date` is called once per row of `clean_desc`, and each call prints
# an "Answer: ..." line, hence the long output below.
oldtime_events["date_guess"] = oldtime_events["clean_desc"].map(extract_date)

Answer: Tuesday
Answer: irregular/impromptu mostly old time jam sessions
Answer: Tuesday
Answer: impromptu session
Answer: 2nd Sunday's - Sept thru May
Answer: First Sundays 1-4pm
Answer: Thursday
Answer: Tuesdays, 7-9pm
Answer: Thursday
Answer: October through April
Answer: jam in Show Low Arizona
Answer: 3rd Saturdays
Answer: Tuesday
Answer: Thursdays 1-3pm
Answer: 1st and 3rd Tuesdays, 2-4pm
Answer: Thursday
Answer: evening
Answer: second Sunday
Answer: second and fourth Wednesdays
Answer: Monday
Answer: 1 to 4 pm
Answer: 8:00 to 10:30 pm
Answer: 2nd & 4th Thursdays
Answer: 4th Sunday
Answer: fourth Sunday
Answer: 3rd Sundays
Answer: 3rd Sundays
Answer: Wednesday
Answer: second Saturday
Answer: 9th instead of the first Sunday
Answer: 1st Sundays
Answer: 4th Sundays
Answer: jam sessions
Answer: Second Saturday
Answer: Tuesdays
Answer: 12-2pm
Answer: First Saturday 2pm-4pm
Answer: Sunday
Answer: fourth Saturday
Answer: Thursday 8PM – 11PM
Answer: 2nd Sundays
Answer: 2nd Friday
Answer:

In [36]:
# Classify events by type (takes upwards of 30m -> try to speed this up?)
# `classify_event` is called once per row of `clean_desc`, and each call
# prints an "Event type: ..." line. The resulting labels
# ("weekly gathering" / "annual event") decide the marker cluster on the map.
oldtime_events["type_guess"] = oldtime_events["clean_desc"].map(classify_event)

Event type: weekly gathering
Event type: weekly gathering
Event type: weekly gathering
Event type: annual event
Event type: weekly gathering
Event type: weekly gathering
Event type: weekly gathering
Event type: weekly gathering
Event type: weekly gathering
Event type: weekly gathering
Event type: annual event
Event type: weekly gathering
Event type: weekly gathering
Event type: weekly gathering
Event type: weekly gathering
Event type: weekly gathering
Event type: weekly gathering
Event type: weekly gathering
Event type: weekly gathering
Event type: weekly gathering
Event type: weekly gathering
Event type: weekly gathering
Event type: weekly gathering
Event type: weekly gathering
Event type: weekly gathering
Event type: weekly gathering
Event type: weekly gathering
Event type: weekly gathering
Event type: weekly gathering
Event type: weekly gathering
Event type: weekly gathering
Event type: weekly gathering
Event type: annual event
Event type: weekly gathering
Event type: weekly gatheri

In [51]:
# Write to disk
# Debugging leftovers from the attempt to write GeoParquet:
#oldtime_events.type
#for i in oldtime_events.columns:
#    print(type(oldtime_events[i].iloc[0]))
#oldtime_events.iloc[:5].to_geoparquet('./data/oldtime_events.geoparquet') -> writing to parquet throws crs error
# see https://github.com/darcy-r/geoparquet-python/issues/2
# GeoJSON is used instead; this saves the full frame including the
# `clean_desc`, `date_guess` and `type_guess` columns added above.
oldtime_events.to_file("./data/oldtime_events.geojson", driver="GeoJSON")

In [52]:
# Write to csv as backup
# Only the name, the ORIGINAL description (not `clean_desc`) and the two model
# guesses are exported; the geometry column is not included.
oldtime_events[["Name", "Description", "date_guess", "type_guess"]].to_csv("./data/oldtime_events_clean.csv")

In [75]:
# Base map with a fixed starting centre and zoom level
m = folium.Map(location=[47.116386, -101.299591], zoom_start=4)

# One marker cluster (= one toggleable layer) per event category
jam_mc = MarkerCluster(name="Jams")
festivals_mc = MarkerCluster(name="Festivals / Camps")

# type_guess value -> (target cluster, icon colour)
marker_styles = {
    "weekly gathering": (jam_mc, "green"),
    "annual event": (festivals_mc, "red"),
}

# Walk the rows once; events whose type_guess is neither label get no marker
event_rows = zip(
    oldtime_events["geometry"],
    oldtime_events["Description"],
    oldtime_events["type_guess"],
)
for geom, desc, event_type in event_rows:
    style = marker_styles.get(event_type)
    if style is None:
        continue
    cluster, colour = style
    # folium expects [lat, lon], i.e. [y, x] of the point geometry
    folium.Marker(
        location=[geom.y, geom.x],
        popup=desc,
        icon=folium.Icon(color=colour),
    ).add_to(cluster)

# Attach both clusters to the map
jam_mc.add_to(m)
festivals_mc.add_to(m)

# Layer control lets the viewer toggle the clusters; last expression displays the map
m.add_child(folium.map.LayerControl())

In [76]:
# Add the folium Geocoder plugin to the map.
# NOTE(review): re-running this cell without rebuilding `m` adds the plugin
# to the same map object again.
Geocoder().add_to(m)
# Display the map as the cell output.
m

In [70]:
#title_html = "<h3>Old Time Open Jams</h3>"
#m.get_root().html.add_child(folium.Element(title_html))
#m

In [71]:
# Export the map as a standalone HTML page.
# NOTE(review): this cell's execution count (71) is lower than those of the
# cells that build the map (75) and add the geocoder (76), so the saved
# index.html may predate the current `m` - re-run this cell last.
m.save("index.html")