In [1]:
# !pip uninstall spacy

In [2]:
# !python -m spacy download en_core_web_sm

## Import the Spacy library and load the model

In [1]:
import spacy

In [3]:
nlp = spacy.load('en_core_web_sm')

In [5]:
texts = [
        "Mark Zuckerberg is the CEO and founder of Facebook, which is based in Menlo Park.",
        "Elon Musk is the founder of SpaceX and the CEO of Tesla, located in Palo Alto.",
        "Sundar Pichai is the CEO of Google, and its headquarters is based in Mountain View.",
        "Bill Gates, the founder of Microsoft, is a prominent figure in the tech industry."
    ]

## Find the Named Entities in the Text

In [10]:
for text in texts:
    print(text,"\n")
    doc = nlp(text)
    for ent in doc.ents:
        print("text :",ent.text,"\nlabel :",ent.label_,'\nlabel defenition: ',spacy.explain(ent.label_))
    print()

Mark Zuckerberg is the CEO and founder of Facebook, which is based in Menlo Park. 

text : Mark Zuckerberg 
label : PERSON 
label defenition:  People, including fictional
text : Menlo Park 
label : GPE 
label defenition:  Countries, cities, states

Elon Musk is the founder of SpaceX and the CEO of Tesla, located in Palo Alto. 

text : Elon Musk 
label : PERSON 
label defenition:  People, including fictional
text : SpaceX 
label : PERSON 
label defenition:  People, including fictional
text : Tesla 
label : ORG 
label defenition:  Companies, agencies, institutions, etc.
text : Palo Alto 
label : GPE 
label defenition:  Countries, cities, states

Sundar Pichai is the CEO of Google, and its headquarters is based in Mountain View. 

text : Sundar Pichai 
label : PERSON 
label defenition:  People, including fictional
text : Google 
label : ORG 
label defenition:  Companies, agencies, institutions, etc.
text : Mountain View 
label : GPE 
label defenition:  Countries, cities, states

Bill Gate

## Use Graph Database to store the label and text

In [24]:
from neo4j import GraphDatabase
from spacy.matcher import Matcher
uri = "bolt://localhost:7687"
driver = GraphDatabase.driver(uri, auth=("neo4j", "sadiesadie"))

In [25]:
def execute_query(query, parameters=None):
    with driver.session() as session:
        session.run(query, parameters)

In [26]:
def create_entity_nodes(entities):
    for entity in entities:
        label = ""
        if entity["label"] == "PERSON":
            label = "Person"
        elif entity["label"] == "ORG":
            label = "Organization"
        elif entity["label"] == "GPE":
            label = "Location"
        elif entity["label"] == "CONCEPT":
            label = "Concept"
        
        query = f"""
        MERGE (n:{label} {{name: $name}})
        """
        parameters = {"name": entity["text"]}
        execute_query(query, parameters)

In [27]:
def create_relationship(subject, object, relationship):
    query = """
    MATCH (s {name: $subject_name})
    MATCH (o {name: $object_name})
    MERGE (s)-[:RELATIONSHIP {name: $relationship}]->(o)
    """
    parameters = {
        "subject_name": subject,
        "object_name": object,
        "relationship": relationship
    }
    execute_query(query, parameters)

In [28]:
def extract_ner_entities(text):
    doc = nlp(text)
    entities = [{"text": ent.text, "label": ent.label_} for ent in doc.ents]
    return entities

In [32]:
# def detect_high_level_concepts(text):
#     doc = nlp(text)
#     matcher = Matcher(nlp.vocab)
    
#     # Define patterns for high-level concepts
#     patterns = [
#         [{"LOWER": "ceo"}],
#         [{"LOWER": "founder"}],
#         [{"LOWER": "based"}, {"LOWER": "in"}],
#         # Add more patterns as needed
#     ]
    
#     matcher.add("HighLevelConcepts", patterns)
#     matches = matcher(doc)
    
#     concepts = []
#     for match_id, start, end in matches:
#         span = doc[start:end]
#         concepts.append({"text": span.text, "label": "CONCEPT"})
    
#     return concepts

In [35]:
def process_multiple_texts_and_store_in_neo4j(texts):
    for text in texts:
        print(f"Processing text: {text}")
        # Step 1: Extract entities using NER
        entities = extract_ner_entities(text)
        
        # Step 2: Detect high-level concepts
        # concepts = detect_high_level_concepts(text)
        
        # Combine entities and concepts
        all_entities = entities
        
        # Step 3: Store entities and concepts in Neo4j as nodes
        create_entity_nodes(all_entities)
        
        # Step 4: Create relationships
        doc = nlp(text)
        for sent in doc.sents:
            for token in sent:
                if token.dep_ in ["nsubj", "nsubjpass"]:
                    subject = token.text
                    for child in token.head.children:
                        print(child,'  chch')
                        if child.dep_ in ["dobj", "pobj"]:
                            print('hii',token,child)
                            object = child.text
                            relationship = token.head.text
                            create_relationship(subject, object, relationship)

In [36]:
sample_texts = [
        "Mark Zuckerberg is the CEO and founder of Facebook, which is based in Menlo Park.",
        "Elon Musk is the founder of SpaceX and the CEO of Tesla, located in Palo Alto.",
        "Sundar Pichai is the CEO of Google, and its headquarters is based in Mountain View.",
        "Bill Gates, the founder of Microsoft, is a prominent figure in the tech industry."
    ]
    
    # Process each text and store it in Neo4j
process_multiple_texts_and_store_in_neo4j(sample_texts)
    
    # Close the Neo4j connection when done
driver.close()

Processing text: Mark Zuckerberg is the CEO and founder of Facebook, which is based in Menlo Park.


  with driver.session() as session:


Zuckerberg   chch
CEO   chch
.   chch
which   chch
is   chch
in   chch
Processing text: Elon Musk is the founder of SpaceX and the CEO of Tesla, located in Palo Alto.
Musk   chch
founder   chch
.   chch
Processing text: Sundar Pichai is the CEO of Google, and its headquarters is based in Mountain View.
Pichai   chch
CEO   chch
,   chch
and   chch
based   chch
headquarters   chch
is   chch
in   chch
.   chch
Processing text: Bill Gates, the founder of Microsoft, is a prominent figure in the tech industry.
Gates   chch
figure   chch
.   chch
