In [105]:
import requests
import shutil
from pathlib import Path
import pandas as pd
from neo4j import GraphDatabase
import openai
from getpass import getpass
import re
import time
import random
import backoff

# Set up connections

In [2]:
openai_key = getpass()

 ········


In [66]:
neo4j_pwd = getpass()

 ········


In [3]:
openai.api_key = openai_key

In [67]:
bolt_uri = "neo4j+s://0932859c.databases.neo4j.io"
driver = GraphDatabase.driver(bolt_uri, auth=("neo4j", neo4j_pwd))

# Download data from UCI archive

In [4]:
response = requests.get("https://archive.ics.uci.edu/static/public/450/sports+articles+for+objectivity+analysis.zip", stream=True)
with open("sports_articles.zip",'wb') as output:
    output.write(response.content)  

In [5]:
shutil.unpack_archive("sports_articles.zip", "sports_articles")

In [6]:
p = Path('./sports_articles/Raw data')

In [7]:
articles = []
for d in p.iterdir():
    file_name = d.name
    try:
        with open(d, "rb") as f:
            text = f.read().decode("utf-8")
    except:
        with open(d, "rb") as f:
            text = f.read().decode("Windows-1252")
    articles.append({"article": file_name, "text": text})

In [8]:
articles_df = pd.DataFrame(articles)

In [9]:
articles_df.shape

(1000, 2)

# Pick out a few examples for few shot learning
I chose example with index 616 because it was short.

In [10]:
examples_df = articles_df.iloc[:2,:]

In [11]:
examples_df = examples_df._append(articles_df.loc[616])

In [12]:
articles_df.drop([0, 1, 616], inplace=True)

In [13]:
examples_df['entities'] = ""

In [14]:
print(examples_df.loc[0, "text"])

With a world title and an Olympic silver medal of their own already, Meryl Davis and Charlie White were quite happy to share this latest accomplishment.
  	
   	Davis and White won their fifth consecutive ice dance crown at the US Figure Skating Championships on Saturday, matching a record held by four other couples. As the audience stood and cheered, Davis knelt close to the ice for several seconds, her head bowed.
  	''Being in such an elite group of American ice dancers from the past and seeing that we belong with them, it's special,'' White said. ''All the hard work and our families and their dedication, our support group — you need a lot of things to come together to make that happen, including staying healthy. There are a lot of little things. I'm proud of us for being able to stick with it, and our continuing love for the sport has helped a lot.
  	''I'm pleased and I couldn't be more proud.''
  	Judy Schwomeyer and James Sladky (1968-72); Judy Blumberg and Michael Seibert (1981

Create examples of the types of entities we want the model to find.

In [15]:
examples_df.loc[0, "entities"] = "Sports:Ice dancing\nTeams:\nAthletes:Meryl Davis, Charlie, White, Judy Schwomeyer, James Sladky, Judy Blumberg, Michael Seibering, Namomi Lang, Peter Tchernyshev, Tanith Belbin, Ben Agosto, Madison Chock, Evan Bates, Maia Shibutani, Alex Shibutani, Marissa Castelli, Simon Shnapir, Tessa Virtue, Scott Moir\nSporting events:US Figure Skating Championships, Vancourver Olympics, 2010 World Figure Skating Championships, 2012 World Figure Skating Championships, 2011 World Figure Skating Championships, Figure Skating Grand Prix"

In [16]:
print(examples_df.loc[1, "text"])

The Croatia striker took advantage of a mistake from Christian Molinaro to put Bayern ahead in the 50th minute, intercepting the defender's back pass before rounding the goalkeeper.
  	The visitors put the game beyond doubt in the 72nd when Mandzukic crossed for Thomas Mueller to finish off a counterattack in the 72nd. Mueller earlier had a goal ruled out for offside.
  	
  	Bayern's 15th win from 19 games leaves it 11 points clear of Bayer Leverkusen, which was held to a 0-0 draw at Freiburg on Saturday. Defending champion Borussia Dortmund is third, a point further back.


In [17]:
examples_df.loc[1, "entities"] = "Sports:Soccer\nTeams:Bayern, Bayer Leverkusen, Frieburg, Borussia Dortmund\nAthletes:Christian Molinaro, Mario Mandzukic, Thomas Mueller\nSporting events:"

In [18]:
print(examples_df.loc[616, "text"])

Nico Rosberg led the way at the end of the first day of the second Formula One four-day test at the Circuit de Catalunya in Barcelona. The Mercedes driver posted a lap time of one minute 22.616 seconds, just 0.9 seconds slower than the pole time set by Lewis Hamilton in last year's Grand Prix.


Lotus' Kimi Raikkonen finished just 0.007sec adrift, with Ferrari's Fernando Alonso a further third of a second behind.


In [19]:
examples_df.loc[616, "entities"] = "Sports:Formula One\nTeams:Mercedes, Lotus, Ferrari\nAthletes: Nico Rosberg, Lewis Hamilton, Kimi Raikkonen, Fernando Alonso\nSporting events:Circuit de Catalunya in Barcelona, Grand Prix"

# See how long our articles are
We might not have enough tokens to pass the examples and long articles, so we'll break up the long ones.

In [20]:
articles_df['len'] = articles_df['text'].str.len()

In [21]:
articles_df.describe()

Unnamed: 0,len
count,997.0
mean,3972.991976
std,2741.899479
min,224.0
25%,1860.0
50%,3671.0
75%,5270.0
max,22373.0


# Send articles to openai to get entities.

In [None]:
result = response.json()
if ‘choices’ in result:
address = result[‘choices’][0][‘message’][‘content’]
return address
else:
retries += 1
print(f"Request failed. Retrying ({retries}/{max_retries})…")
time.sleep(2 ** retries) # Exponential backoff delay

In [126]:
ents_pattern = re.compile("Sports\:(.*)\nTeams\:(.*)\nAthletes\:(.*)\nSporting events\:(.*)", re.DOTALL)

@backoff.on_exception(backoff.expo, 
                      (openai.error.RateLimitError, 
                       openai.error.ServiceUnavailableError,
                       openai.error.APIError),
                     raise_on_giveup=False)
def get_entities(article):
    article_segments = [article[k:k+6000] for k in range(0, len(article), 6000)]
    sports, teams, athletes, events = [], [], [], []
    for segment in article_segments:
        messages = [
            {"role": "system", "content": "You extract entities in the following format:\nSports:<comma delimited list of strings>\nTeams:<comma delimited list of strings>\nAthletes:<comma delimited list of strings>\nSporting events:<comma delimited list of strings>"},
            {"role": "user", "content": examples_df.iloc[0, 1]},
            {"role": "assistant", "content": examples_df.iloc[0,2]},
            {"role": "user", "content": examples_df.iloc[1, 1]},
            {"role": "assistant", "content": examples_df.iloc[1,2]},
            {"role": "user", "content": examples_df.iloc[2,1]},
            {"role": "assistant", "content": examples_df.iloc[2,2]},
            {"role": "user", "content": segment}]
        retries = 0
        response = openai.ChatCompletion.create(
            model="gpt-3.5-turbo",
            messages=messages
            )
        result_match = ents_pattern.match(response['choices'][0]['message']['content'])
        if result_match:
            sports = sports + result_match.group(1).split(", ")
            teams = teams + result_match.group(2).split(", ")
            athletes = athletes + result_match.group(3).split(", ")
            events = events + result_match.group(4).split(", ")
        else:
            print(f"Result didn't match regex. {response}")
    entities = pd.Series({"sports": sports, "teams": teams, "athletes": athletes, "events": events})
    return entities

We ran into some time outs, so I did this in a few batches

In [None]:
entities_df = pd.DataFrame(columns=["sports", "teams", "athletes", "events"])

for idx, row in articles_df.iterrows():
    entities = get_entities(row['text'])
    entities.name = idx
    entities_df = entities_df._append(entities)
    if entities_df.shape[0] % 25 == 0:
        print(f"Processed {entities_df.shape[0]} articles.")

Processed 25 articles.
Processed 50 articles.
Result didn't match regex. {
  "id": "chatcmpl-7TUyjOXtI6SsvULEPC3znSYLaBdDo",
  "object": "chat.completion",
  "created": 1687264949,
  "model": "gpt-3.5-turbo-0301",
  "usage": {
    "prompt_tokens": 3126,
    "completion_tokens": 469,
    "total_tokens": 3595
  },
  "choices": [
    {
      "message": {
        "role": "assistant",
        "content": "valuating the game\u2026): \"Just like any other loss, it's painful. You put a lot into it. It's not the fact that it was a loss that's more difficult it's just the way we lost. Some defensive breakdowns and some big shots by Knight and Bynum. And it wasn't just that last play... It got to that point where it came down to a possession here or there. We weren't able to make the big play tonight.\n   (On the team's effort\u2026): \"The guys again had a great spirit about us, fought. We didn't shoot the ball the way that we're accustomed to, other than JJ of course. But we fought. We changed t

# Send the entities to Neo4j

In [87]:
output_df = articles_df.merge(entities_df, how="inner", left_index=True, right_index=True)

In [88]:
def send_row_to_neo4j(row):
    with driver.session() as session:
        session.run("""MERGE (a:Article {source: $article})
                       ON CREATE SET a.text = $text""", 
                    {"article": row["article"],
                     "text": row["text"]})
        session.run("""MATCH (a:Article {source: $article})
                       UNWIND $sports as sport
                       MERGE (s:Sport {name: toUpper(trim(sport))})
                       MERGE (a)-[:REFERENCES_SPORT]->(s)""",
                    {"article": row["article"],
                     "sports": row["sports"]})
        session.run("""MATCH (a:Article {source: $article})
                       UNWIND $athletes as athlete
                       MERGE (t:Athlete {name: toUpper(trim(athlete))})
                       MERGE (a)-[:REFERENCES_ATHLETE]->(t)""",
                    {"article": row["article"],
                     "athletes": row["athletes"]})
        session.run("""MATCH (a:Article {source: $article})
                       UNWIND $events as event
                       MERGE (e:Event {name: toUpper(trim(event))})
                       MERGE (a)-[:REFERENCES_EVENT]->(e)""",
                    {"article": row["article"],
                     "events": row["events"]})
                                                                                  

In [90]:
_ = output_df.apply(send_row_to_neo4j, axis=1)

# Look at results

In [91]:
with driver.session() as session:
    result = session.run("""MATCH (n) RETURN labels(n) as labels, count(*) as nodeCount""")
    result_df = pd.DataFrame([row.data() for row in result])
result_df

Unnamed: 0,labels,nodeCount
0,[Sport],69
1,[Athlete],1476
2,[Event],532
3,[Article],302


In [93]:
with driver.session() as session:
    result = session.run("""MATCH (s:Sport) 
                            RETURN s.name as sport, 
                            COUNT{ (s)<-[:REFERENCES_SPORT]-() } AS articleCount
                            ORDER BY articleCount DESC
                            LIMIT 10""")
    result_df = pd.DataFrame([row.data() for row in result])
result_df

Unnamed: 0,sport,articleCount
0,BASKETBALL,73
1,SOCCER,68
2,FOOTBALL,25
3,TENNIS,22
4,BASEBALL,15
5,NASCAR,11
6,GOLF,11
7,NFL,9
8,HOCKEY,8
9,FOOTBALL (NFL),7


In [95]:
with driver.session() as session:
    result = session.run("""MATCH (a:Athlete) 
                            RETURN a.name as athlete, 
                            COUNT{ (a)<-[:REFERENCES_ATHLETE]-() } AS articleCount
                            ORDER BY articleCount DESC
                            LIMIT 10""")
    result_df = pd.DataFrame([row.data() for row in result])
result_df

Unnamed: 0,athlete,articleCount
0,KOBE BRYANT,17
1,,16
2,LEBRON JAMES,15
3,TIGER WOODS,9
4,DWYANE WADE,9
5,CARMELO ANTHONY,7
6,SERENA WILLIAMS,7
7,DWIGHT HOWARD,7
8,ANDY MURRAY,7
9,NOVAK DJOKOVIC,7


In [99]:
with driver.session() as session:
    result = session.run("""MATCH (a:Athlete {name:"KOBE BRYANT"})-[:REFERENCES_ATHLETE]-(art)
                            MATCH (art)-[:REFERENCES_ATHLETE]->(n)
                            WHERE n <> a
                            RETURN n.name AS mentionedWithKobe, count(*) as articleCount
                            ORDER BY articleCount DESC
                            LIMIT 10""")
    result_df = pd.DataFrame([row.data() for row in result])
result_df

Unnamed: 0,mentionedWithKobe,articleCount
0,LEBRON JAMES,10
1,DWIGHT HOWARD,6
2,CARMELO ANTHONY,5
3,STEVE NASH,5
4,DWYANE WADE,4
5,PAU GASOL,4
6,RAY ALLEN,3
7,MICHAEL JORDAN,3
8,KEVIN DURANT,3
9,RAJON RONDO,2
