Skip to content

Workshop

Priya Jacob edited this page Dec 11, 2021 · 10 revisions

Welcome to the Pink Programming x Neo4j wiki!

Below are queries on a sample IMDB Movie dataset that you will use to load and query data to/from Neo4j

Inspecting your data source csv

//how many rows in all?

LOAD CSV WITH HEADERS FROM 'https://raw.githubusercontent.com/priya-jacob/pinkprogramming/main/movies.csv' AS row
RETURN COUNT(row)

//what does a row contain?

LOAD CSV WITH HEADERS FROM 'https://raw.githubusercontent.com/priya-jacob/pinkprogramming/main/movies.csv' AS row
RETURN row
LIMIT 1

//any missing values?

LOAD CSV WITH HEADERS FROM 'https://raw.githubusercontent.com/priya-jacob/pinkprogramming/main/movies.csv' AS row
WITH row 
WHERE row.gross = "null"
RETURN COUNT(row)

Load data

//lock & load - create constraints!

//earlier than neo4j ver 4.4 syntax
CREATE CONSTRAINT c_movie_title IF NOT EXISTS ON (m:Movie) ASSERT m.title IS UNIQUE;
CREATE CONSTRAINT c_movie_genre IF NOT EXISTS ON (g:Genre) ASSERT g.genre IS UNIQUE;
CREATE CONSTRAINT c_person_name IF NOT EXISTS ON (p:Person) ASSERT p.name IS UNIQUE;

//revised neo4j ver 4.4 syntax
CREATE CONSTRAINT c_movie_title IF NOT EXISTS FOR (n:Movie) REQUIRE n.title IS UNIQUE;
CREATE CONSTRAINT c_movie_genre IF NOT EXISTS FOR (n:Genre) REQUIRE n.genre IS UNIQUE;
CREATE CONSTRAINT c_person_name IF NOT EXISTS FOR (n:Person) REQUIRE n.name IS UNIQUE;

//lock & load - create nodes & relationships

LOAD CSV WITH HEADERS FROM 'https://raw.githubusercontent.com/priya-jacob/pinkprogramming/main/movies.csv' AS row
WITH row
MERGE (n:Movie {title: row.title})
SET 
n.year = row.year,
n.summary = row.summary,
n.runtime = toInteger(row.runtime),
n.certificate = row.certificate,
n.rating = toFloat(row.rating),
n.votes = toInteger(row.votes),
n.gross = toFloat(row.gross)
FOREACH (x IN split(row.genre, ',') | MERGE (g:Genre {genre: trim(x)}) MERGE (n)-[:GENRE]->(g))
FOREACH (x IN split(row.actors, ',') | MERGE (p:Person {name: trim(x)}) MERGE (p)-[:ACTOR]->(n))
FOREACH (x IN split(row.directors, ',') | MERGE (p:Person {name: trim(x)}) MERGE (p)-[:DIRECTOR]->(n))

Profile your graph

//check graph model

CALL db.schema.visualization()

//check graph model - alternative method

CALL apoc.meta.graph()

//check schema - constraints

CALL db.constraints()

//check schema - indexes

CALL db.indexes()

//check graph data statistics

CALL apoc.meta.stats()

Query your graph

//fetch all Person nodes

MATCH (n:Person)
RETURN n

//fetch all Movie nodes

MATCH (n:Movie)
RETURN n

//look for Tom Hanks

MATCH (n:Person {name:'Tom Hanks'})
RETURN n

//look for Tom Hanks - any difference between the two queries here?

MATCH (n:Person)
WHERE n.name = 'Tom Hanks'
RETURN n

//and one more with the new neo4j 4.4 cypher syntax!

MATCH (n:Person WHERE n.name = 'Tom Hanks')
RETURN n

//try string conditional filtering

MATCH (n:Movie)
WHERE n.title CONTAINS 'Knight'
RETURN n

//how many called Tom?

MATCH (n:Person)
WHERE n.name STARTS WITH 'Tom'
RETURN n

//did Tom act with Tom? notice something strange?

MATCH (p1:Person)-[a1:ACTOR]->(m:Movie)<-[a2:ACTOR]-(p2:Person)
WHERE p1.name STARTS WITH 'Tom'
AND p2.name STARTS WITH 'Tom'
RETURN p1.name, p2.name, m.title;

//break the symmetry!

MATCH (p1:Person)-[a1:ACTOR]->(m:Movie)<-[a2:ACTOR]-(p2:Person)
WHERE p1.name STARTS WITH 'Tom'
AND p2.name STARTS WITH 'Tom'
AND ID(p1) < ID(p2)
RETURN p1.name, p2.name, m.title;

//fetch all actors and their movies

MATCH p=(:Person)-[:ACTOR]->(:Movie)
RETURN p

//fetch all actors &/OR directors and their movies

MATCH p=(:Person)-[:ACTOR|DIRECTOR]->(:Movie)
RETURN p

//fetch those acted-in & directed a movie

MATCH p=(n:Person)-[:ACTOR]->(m:Movie)<-[:DIRECTOR]-(n)
RETURN p

//fetch those acted-in & directed a movie with their information

MATCH (n:Person)-[:ACTOR]->(m:Movie)<-[:DIRECTOR]-(n)
RETURN n.name AS person, COUNT(m) AS cnt, COLLECT(m.title) AS movies
ORDER BY cnt DESC

//movies with the most directors

MATCH (n:Movie)
WITH n, size(()-[:DIRECTOR]->(n)) AS cnt
WHERE cnt > 1
RETURN n.title AS movie, [(p)-[:DIRECTOR]->(n)|p.name] AS directors
ORDER BY cnt DESC

//top actors by #movies

MATCH (a:Person)-[:ACTOR]->(m:Movie)
WITH a, COLLECT(m.title) AS movies
WHERE size(movies) > 1
RETURN a.name AS actor, movies, size(movies) AS cnt
ORDER BY cnt DESC

//who’ve worked together the most?

MATCH (actor:Person)-[:ACTOR]->(m)<-[:ACTOR]-(coactor:Person)
WHERE ID(actor) < ID(coactor)
WITH actor, coactor, COUNT(m) AS cnt, COLLECT(m.title) AS movies
WHERE cnt > 1
RETURN actor.name, coactor.name, movies, cnt
ORDER BY cnt DESC  
LIMIT 5

//who must Tom act with? (a recommendation!)

MATCH (tom:Person {name: "Tom Hanks"})-[:ACTOR]->(m:Movie)<-[:ACTOR]-(coActors:Person)-[:ACTOR]->(oth:Movie)<-[:ACTOR]-(cocoActors:Person)
WHERE NOT EXISTS {(tom)-[:ACTOR]->()<-[:ACTOR]-(cocoActors)}
AND tom <> cocoActors
RETURN cocoActors.name AS recommended, collect(DISTINCT coActors.name) AS coactors, COUNT(DISTINCT coActors.name) AS strength 
ORDER BY strength DESC;

//how to get to Keanu Reeves from Tom Hanks?

MATCH path=shortestPath((:Person {name:'Tom Hanks'})-[:ACTOR|DIRECTOR*]-(:Person {name:'Keanu Reeves'}))
RETURN path

//or from Brad Pitt to Clint Eastwood? - feel free to find connections between your favorite pair of actors!

MATCH p=shortestPath((:Person {name:'Brad Pitt'})-[:ACTOR|DIRECTOR*]-(:Person {name:'Clint Eastwood'}))
RETURN p

//Tom Hanks’ connections

MATCH (tom:Person {name:"Tom Hanks"})
MATCH (oth:Person)
WHERE tom <> oth 
WITH tom, oth, shortestPath((tom)-[:ACTOR|DIRECTOR*..]-(oth)) AS path
WHERE length(path) > 0
WITH tom, oth, length(path) AS hops
ORDER BY hops ASC
RETURN hops/2 AS level, COLLECT(oth.name) AS connections

//Tom Hanks’ connections & mutual connections

MATCH (tom:Person {name:"Tom Hanks"})
MATCH (oth:Person)
WHERE tom <> oth 
WITH tom, oth, shortestPath((tom)-[:ACTOR|DIRECTOR*]-(oth)) AS path
WHERE length(path) > 0
WITH tom, oth, length(path) AS hops,
[x IN NODES(path) WHERE x <> tom AND x <> oth AND LABELS(x)=['Person'] |x.name] AS mutualConnections
RETURN oth.name AS connection, hops/2 AS level, mutualConnections
ORDER BY level ASC

//best years for movie-buffs?

MATCH (n:Movie)
RETURN n.year AS year, COUNT(n) AS cnt, AVG(n.rating) AS avgRating, MAX(n.votes) AS maxVotes, COLLECT(n.title)[..5] AS sampleTitles
ORDER BY cnt DESC, avgRating DESC

//to aid with node rule styling for neo4j bloom

MATCH (n:Movie)
RETURN min(n.gross), avg(n.gross), max(n.gross), percentileCont(n.gross, 0.5), percentileDisc(n.gross, 0.5)

MATCH (n:Movie)
RETURN min(n.votes), avg(n.votes), max(n.votes), percentileCont(n.votes, 0.5), percentileDisc(n.votes, 0.5)

MATCH (n:Movie)
RETURN min(n.rating), avg(n.rating), max(n.rating), percentileCont(n.rating, 0.5), percentileDisc(n.rating, 0.5)

//if you wish to delete what we just loaded!

//delete relationships first followed by nodes!
MATCH (:Person)-[r:ACTOR]->(:Movie)
DELETE r;
MATCH (:Person)-[r:DIRECTOR]->(:Movie)
DELETE r;
MATCH (:Movie)-[r:GENRE]->(:Genre)
DELETE r;
MATCH (n:Genre)
DELETE n;
MATCH (n:Person)
DELETE n;
MATCH (n:Movie)
DELETE n;

//or choose to delete everything in one go - use with caution - this will delete **everything** from your database!
MATCH (n)
DETACH DELETE n