In [90]:
import os
import fitz
import re
import nltk
import pandas as pd
from collections import Counter
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.util import ngrams    

In [91]:
nltk.download("punkt")
nltk.download("stopwords")    

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\soura\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\soura\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [92]:
input_folder = "Checking/"
output_folder = "25-3-2025_14_3/"
os.makedirs(output_folder, exist_ok=True) 

In [93]:
stop_words = set(stopwords.words("english")) 

In [105]:
unwanted_words = set({    
    "article", "et", "al", "in" , "to","and" , "for", "to", "a", "y", "is", "of", "all" ,"the","from", "are","terms", "conditions", "publication", "citation",
    "open", "access", "license", "cc", "by", "creative", "commons", "attribution",
    "shown", "method", "state", "date", "plot", "trials", "per", "cent", "new",
    "present", "iii", "iv", "v", "etc", "proc", "natl", "acad", "sci", "usa",
    "vol", "pp", "using", "also", "used", "based", "may", "however", "one", "two",
    "three", "four", "five", "data", "set", "including", "due", "figure", "table",
    "fig", "found", "work", "among", "study", "analysis", "different", "several",
    "order", "low", "high", "higher", "lower", "within", "between", "without",
    "results", "approach", "across", "group", "suggest", "suggests", "indicate",
    "indicates", "according", "within", "among", "amongst", "within", "even",
    "although", "further", "well", "known", "previously", "recent", "recently",
    "first", "second", "third", "various", "varied", "example", "examples",
    "others", "another", "obtained",  "show", "shows", "including",
    "would", "could", "can", "might", "many", "much", "several", "certain",
    "some", "such", "well", "particular", "various", "often", "sometimes",
    "always", "never", "previous", "past", "future", "new", "old", "young",
    "common", "uncommon", "rare", "frequent",
    "other", "additional", "extra", "further", 
    "pros", "cons", "effective", "ineffective", "efficacy", "efficiency", "slow",
    "early", "late", "earlier", "latest", "delayed",
    "quick", "quicker", "quickest", "rapid", "rapidly", "slowly", "gradual",
    "sudden", "short", "long", "shorter", "shortest", "longer", "longest",
    "temporary", "permanent", "transient", "persistent", 
    "mild", "moderate", "severe", "slight", "significant", "insignificant",
    "noticeable", "unnoticeable", "detectable",
    "wiley", "online", "library", "downloaded", "https", "see", "rules", "use", "oa", "articles", "governed", "applicable",
    "national", "center", "health", "statistics", "centers",
    "mortality", "public", "tapes", "american", "society", "atlanta", "tape", "reviews", "december", "volume", "nature",
    "publishing", "average", "annual", "percent", "change", "note", "trends", "analyzed", "joinpoint", "program",
    "total", "leading", "causes", "us", "standard", "end", "seer", "department",
    "individual", "b", "dvm", "phd", "year", "approximately", "fewer", "cases", "estimated","alabama", "alaska", "arizona", "arkansas", "california", "colorado", "connecticut", "delaware", "dist",
    "columbia", "florida", "georgia", "hawaii", "idaho", "illinois", "indiana", "iowa", "kansas", "kentucky",
    "louisiana", "maine", "maryland", "massachusetts", "michigan", "minnesota", "mississippi", "missouri",
    "montana", "nebraska", "nevada", "hampshire", "jersey", "mexico", "york", "north", "carolina", "dakota",
    "ohio", "oklahoma", "oregon", "pennsylvania", "rhode", "island", "south", "tennessee", "texas", "utah",
    "vermont", "virginia", "washington", "west", "wisconsin", "wyoming", "ca", "clin", "j", "apc", "aapc", "ons", "mp", "cl", "ries", "lag", "eisner", "naaccr" 
}) 

In [106]:
def extract_text(pdf_file):
    doc = fitz.open(pdf_file)
    text = ""
    capture = False
    for page in doc:
        page_text = page.get_text()
        if "abstract" in page_text.lower():
            capture = True
        if "references" in page_text.lower():
            capture = False
        if capture:
            text += page_text + " "
    return text.lower()    

In [107]:
global_ngram = {i: Counter() for i in range(1, 5)}  

In [108]:
for file_name in os.listdir(input_folder):
    if file_name.endswith(".pdf"):
        pdf_data = os.path.join(input_folder, file_name)
        raw_text = extract_text(pdf_data)

        if not raw_text:
            continue

        tokens = word_tokenize(raw_text)
        tokens = [word for word in tokens if word.isalpha() and word not in unwanted_words]

        for n in range(1, 5):
            global_ngram[n].update(Counter(ngrams(tokens, n)))  

In [109]:
ngram_file_path = os.path.join(output_folder, "ngrams.txt")
with open(ngram_file_path, "w", encoding="utf-8") as ngram_file:
    for n in range(1, 5):
        ngram_file.write(f"\n{n}-grams:\n")
        for ngram, count in global_ngram[n].items():
            ngram_file.write(f"{' '.join(ngram)}: {count}\n")
        ngram_file.write("\n" + "=" * 50 + "\n") 

In [110]:
def filter_ngrams(global_ngram):
    filtered_ngrams = {n: Counter() for n in range(1, 5)}
    
    for n in range(1, 5):
        for ngram, count in global_ngram[n].items():
            is_redundant = False
            for m in range(n + 1, 5):
                for higher_ngram in global_ngram[m]:
                    if set(ngram).issubset(set(higher_ngram)):
                        is_redundant = True
                        break
                if is_redundant:
                    break
            if not is_redundant:
                filtered_ngrams[n][ngram] = count

    return filtered_ngrams     

In [111]:
cleaned_ngrams = filter_ngrams(global_ngram) 

In [112]:
clean_ngram_file_path = os.path.join(output_folder, "clean_ngram.txt")
with open(clean_ngram_file_path, "w", encoding="utf-8") as clean_file:
    for n in range(1, 5):
        clean_file.write(f"\n{n}-grams (cleaned):\n")
        for ngram, count in cleaned_ngrams[n].items():
            clean_file.write(f"{' '.join(ngram)}: {count}\n")
        clean_file.write("\n" + "=" * 50 + "\n") 

In [113]:
def save_top_ngrams(filtered_ngrams, filename="top_200ngram.txt", top_n=200):
    with open(filename, "w", encoding="utf-8") as f:
        f.write("Top 200 Bigrams:\n")
        for ngram, count in filtered_ngrams[2].most_common(top_n):
            f.write(f"{' '.join(ngram)}: {count}\n")

        f.write("\nTop 200 Trigrams:\n")
        for ngram, count in filtered_ngrams[3].most_common(top_n):
            f.write(f"{' '.join(ngram)}: {count}\n")

        f.write("\nTop 200 Fourgrams:\n")
        for ngram, count in filtered_ngrams[4].most_common(top_n):
            f.write(f"{' '.join(ngram)}: {count}\n") 

In [114]:
save_top_ngrams(cleaned_ngrams, os.path.join(output_folder, "top_200ngram.txt"))  

In [118]:
def build_cooccurrence_matrix(input_folder, top_ngrams_file, output_file):
    paper_files = [f for f in os.listdir(input_folder) if f.endswith(".pdf")]
    fourgrams = []
    with open(top_ngrams_file, "r", encoding="utf-8") as f:
        lines = f.readlines()
        capture = False
        for line in lines:
            if "Top 200 Fourgrams:" in line:
                capture = True 
                continue
            if capture and ":" in line:
                fourgrams.append(line.split(":")[0].strip())

    cooccurrence_matrix = pd.DataFrame(0, index=fourgrams, columns=paper_files)

    for file_name in paper_files:
        pdf_data = os.path.join(input_folder, file_name)
        raw_text = extract_text(pdf_data)

        if not raw_text:
            continue

        tokens = [word for word in word_tokenize(raw_text) if word.isalpha() and word not in stop_words]
        paper_ngrams = Counter(ngrams(tokens, 4))

        for fourgram in fourgrams:
            fourgram_tuple = tuple(fourgram.split())
            cooccurrence_matrix.at[fourgram, file_name] = paper_ngrams.get(fourgram_tuple, 0)

    cooccurrence_matrix.to_csv(output_file)

top_ngrams_path = os.path.join(output_folder, "top_200ngram.txt")
cooccurrence_output = os.path.join(output_folder, "fourgram_cooccurrence.csv")

build_cooccurrence_matrix(input_folder, top_ngrams_path, cooccurrence_output) 

In [122]:
import pandas as pd
import os

file_path = os.path.join(output_folder, "fourgram_cooccurrence.csv")
cooccurrence_matrix = pd.read_csv(file_path, index_col=0)

jaccard_similarity = {}

for paper in cooccurrence_matrix.columns:
    paper_fourgrams = set(cooccurrence_matrix[paper][cooccurrence_matrix[paper] > 0].index)
    
    similarities = {}
    for fourgram in cooccurrence_matrix.index:
        fourgram_set = {fourgram}
        intersection = len(fourgram_set & paper_fourgrams)
        union = len(fourgram_set | paper_fourgrams)
        similarity = intersection / union if union != 0 else 0
        similarities[fourgram] = similarity
    
    jaccard_similarity[paper] = similarities

jaccard_df = pd.DataFrame(jaccard_similarity)
jaccard_df.to_csv(os.path.join(output_folder, "fourgram_jaccard_similarity.csv")) 

In [125]:
!pip install spacy
!python -m spacy download en_core_web_sm 

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
                                              0.0/12.8 MB ? eta -:--:--
                                              0.0/12.8 MB ? eta -:--:--
                                              0.0/12.8 MB ? eta -:--:--
                                             0.1/12.8 MB 558.5 kB/s eta 0:00:23
                                             0.1/12.8 MB 581.0 kB/s eta 0:00:22
                                             0.1/12.8 MB 653.6 kB/s eta 0:00:20
                                             0.1/12.8 MB 653.6 kB/s eta 0:00:20
                                             0.1/12.8 MB 653.6 kB/s eta 0:00:20
                                             0.3/12.8 MB 850.6 kB/s eta 0:00:15
                                             0.3/12.8 MB 922.8 kB/s eta 0:00:14
     -                                       0.4

In [126]:
import os
import spacy
import pandas as pd

output_folder = "25-3-2025_14_3/"
input_file = os.path.join(output_folder, "fourgram_cooccurrence.csv")

if not os.path.exists(input_file):
    raise FileNotFoundError(f"{input_file} not found!")

nlp = spacy.load("en_core_web_sm")

def analyze_dependency_structure(text):
    doc = nlp(text)
    return [(token.text, token.dep_, token.head.text, token.head.pos_) for token in doc]

df = pd.read_csv(input_file, index_col=0)

dependency_data = []
for fourgram in df.index:
    dependencies = analyze_dependency_structure(fourgram)
    for word, dep, head, head_pos in dependencies:
        dependency_data.append([fourgram, word, dep, head, head_pos])

dependency_df = pd.DataFrame(dependency_data, columns=["Fourgram", "Word", "Dependency", "Head", "Head_POS"])
output_file = os.path.join(output_folder, "fourgram_dependency_parsing.csv")
dependency_df.to_csv(output_file, index=False)

print("Dependency parsing completed. Results saved in:", output_file)
dependency_df.head()                     

Dependency parsing completed. Results saved in: 25-3-2025_14_3/fourgram_dependency_parsing.csv


Unnamed: 0,Fourgram,Word,Dependency,Head,Head_POS
0,male female male female,male,amod,female,NOUN
1,male female male female,female,amod,female,NOUN
2,male female male female,male,amod,female,NOUN
3,male female male female,female,ROOT,female,NOUN
4,cancer cancer journal clinicians,cancer,compound,cancer,NOUN


In [128]:
import os
import spacy
import pandas as pd

output_folder = "25-3-2025_14_3/"
input_file = os.path.join(output_folder, "fourgram_dependency_parsing.csv")

if not os.path.exists(input_file):
    raise FileNotFoundError(f"{input_file} not found!")

nlp = spacy.load("en_core_web_sm")

def extract_SVO(text):
    doc = nlp(text)
    subject, verb, obj = None, None, None
    
    for token in doc:
        if token.dep_ in {"nsubj", "nsubjpass"}:
            subject = token.text
        if token.dep_ == "ROOT":
            verb = token.text
        if token.dep_ in {"dobj", "pobj"}:
            obj = token.text
            
    return (subject, verb, obj) if subject and verb and obj else None

df = pd.read_csv(input_file)
svos = []

for fourgram in df["Fourgram"].unique():
    svo_triplet = extract_SVO(fourgram)
    if svo_triplet:
        svos.append([fourgram] + list(svo_triplet))

svo_df = pd.DataFrame(svos, columns=["Fourgram", "Subject", "Verb", "Object"])
output_file = os.path.join(output_folder, "fourgram_SVO_triplets.csv")
svo_df.to_csv(output_file, index=False) 
print("SVO extraction completed. Results saved in:", output_file)
svo_df.head() 

SVO extraction completed. Results saved in: 25-3-2025_14_3/fourgram_SVO_triplets.csv


Unnamed: 0,Fourgram,Subject,Verb,Object
0,incidence rates selected cancers,rates,selected,cancers
1,estimates not add rounding,estimates,add,rounding
2,rates selected cancers sex,rates,selected,sex
3,injuries accidents tional injuries,injuries,accidents,injuries
4,rates selected cancers race,rates,selected,race


In [133]:
import pandas as pd
import spacy

nlp = spacy.load("en_core_web_sm")

def extract_svo(text):
    doc = nlp(text)
    svos = []
    for token in doc:
        if token.dep_ in {"ROOT", "xcomp"} and token.pos_ == "VERB":
            subject = [w.text for w in token.lefts if w.dep_ in {"nsubj", "nsubjpass"}]
            objects = [w.text for w in token.rights if w.dep_ in {"dobj", "pobj", "attr"}]
            if subject and objects:
                svos.append((subject[0], token.text, objects[0]))
    return svos

def build_relation_mapping(fourgram_file, output_file):
    df = pd.read_csv(fourgram_file, index_col=0)
    relations = []
    
    for fourgram in df.index:
        text = fourgram.replace("_", " ")
        svo_triplets = extract_svo(text)
        
        for subj, verb, obj in svo_triplets:
            relations.append({"Fourgram": fourgram, "Subject": subj, "Verb": verb, "Object": obj})

    relation_df = pd.DataFrame(relations)
    relation_df.to_csv(output_file, index=False)

build_relation_mapping("25-3-2025_14_3/fourgram_cooccurrence.csv", "25-3-2025_14_3/relation_mapping.csv") 

In [1]:
import os
import spacy
import pandas as pd
import json

output_folder = "25-3-2025_14_3/"
input_file = os.path.join(output_folder, "fourgram_cooccurrence.csv")
if not os.path.exists(input_file):
    raise FileNotFoundError(f"{input_file} not found!")
    
nlp = spacy.load("en_core_web_sm")

def analyze_dependency_structure(text):
    doc = nlp(text)
    return [(token.text, token.dep_, token.head.text, token.head.pos_) for token in doc]

df = pd.read_csv(input_file, index_col=0)

dependency_dict = {}

for fourgram in df.index:
    dependencies = analyze_dependency_structure(fourgram)
    
    dependency_dict[fourgram] = [
        {"word": word, "dependency": dep, "head": head, "head_pos": head_pos}
        for word, dep, head, head_pos in dependencies
    ]

output_file = os.path.join(output_folder, "fourgram_dependency_parsing.json")

with open(output_file, "w") as json_file:
    json.dump(dependency_dict, json_file, indent=4)

print("Dependency parsing completed. Results saved in:", output_file) 

Dependency parsing completed. Results saved in: 25-3-2025_14_3/fourgram_dependency_parsing.json


In [16]:
import json
import csv
import os

output_folder = "25-3-2025_14_3/"
input_json = os.path.join(output_folder, "fourgram_dependency_parsing.json")
output_csv = os.path.join(output_folder, "relations_extracted.csv")

if not os.path.exists(input_json):
    raise FileNotFoundError(f"{input_json} not found!")

def extract_relations_from_json(input_json, output_csv):
    with open(input_json, "r", encoding="utf-8") as f:
        data = json.load(f)

    relations = []

    for fourgram, dependencies in data.items():
        subject, verb, obj = None, None, None
        possessor, modifier, preposition = None, None, None

        for dep in dependencies:
            word, dep_type, head, head_pos = dep["word"], dep["dependency"], dep["head"], dep["head_pos"]

            if dep_type == "nsubj":
                subject = word
            elif dep_type == "ROOT":
                verb = word
            elif dep_type in ("dobj", "obj"):
                obj = word
            elif dep_type == "poss":
                possessor = (word, head)
            elif dep_type == "amod":
                modifier = (word, head)
            elif dep_type == "prep":
                preposition = (word, head)

        if subject and verb and obj:
            relations.append((fourgram, "SVO", subject, verb, obj))

        if possessor:
            relations.append((fourgram, "Possessive", possessor[0], "owns", possessor[1]))

        if modifier:
            relations.append((fourgram, "Modifier", modifier[0], "describes", modifier[1]))

        if preposition:
            relations.append((fourgram, "Preposition", preposition[0], "related to", preposition[1]))

    with open(output_csv, "w", newline="", encoding="utf-8") as f:
        writer = csv.writer(f)
        writer.writerow(["Fourgram", "Relation Type", "Entity1", "Relation", "Entity2"])
        writer.writerows(relations)

    print(f"Relations extracted and saved to {output_csv}")

extract_relations_from_json(input_json, output_csv) 

Relations extracted and saved to 25-3-2025_14_3/relations_extracted.csv


In [1]:
pip install py2neo pandas 

Collecting py2neo
  Downloading py2neo-2021.2.4-py2.py3-none-any.whl (177 kB)
                                              0.0/177.2 kB ? eta -:--:--
                                              0.0/177.2 kB ? eta -:--:--
     ---------                                41.0/177.2 kB ? eta -:--:--
     -----------                             51.2/177.2 kB 1.3 MB/s eta 0:00:01
     --------------                        71.7/177.2 kB 660.6 kB/s eta 0:00:01
     --------------------                 102.4/177.2 kB 590.8 kB/s eta 0:00:01
     --------------------                 102.4/177.2 kB 590.8 kB/s eta 0:00:01
     --------------------                 102.4/177.2 kB 590.8 kB/s eta 0:00:01
     ---------------------------          133.1/177.2 kB 437.3 kB/s eta 0:00:01
     -------------------------------      153.6/177.2 kB 482.7 kB/s eta 0:00:01
     ------------------------------------ 177.2/177.2 kB 445.3 kB/s eta 0:00:00
Collecting interchange~=2021.0.4 (from py2neo)
  Downloading i

In [2]:
pip install neo4j 

Collecting neo4j
  Downloading neo4j-5.28.1-py3-none-any.whl (312 kB)
                                              0.0/312.3 kB ? eta -:--:--
     -------------                          112.6/312.3 kB 3.3 MB/s eta 0:00:01
     --------------                         122.9/312.3 kB 1.4 MB/s eta 0:00:01
     ---------------------------------      276.5/312.3 kB 2.1 MB/s eta 0:00:01
     -------------------------------------  307.2/312.3 kB 1.6 MB/s eta 0:00:01
     -------------------------------------- 312.3/312.3 kB 1.5 MB/s eta 0:00:00
Installing collected packages: neo4j
Successfully installed neo4j-5.28.1
Note: you may need to restart the kernel to use updated packages.


In [3]:
import pandas as pd
import os

output_folder = "25-3-2025_14_3/"
input_csv = os.path.join(output_folder, "fourgram_dependency_parsing.csv")

df = pd.read_csv(input_csv)
print(df.head())  

                           Fourgram    Word Dependency    Head Head_POS
0           male female male female    male       amod  female     NOUN
1           male female male female  female       amod  female     NOUN
2           male female male female    male       amod  female     NOUN
3           male female male female  female       ROOT  female     NOUN
4  cancer cancer journal clinicians  cancer   compound  cancer     NOUN


In [4]:
def extract_relations(df):
    relations = []
    
    for fourgram in df["Fourgram"].unique():
        subset = df[df["Fourgram"] == fourgram]
        
        subject, verb, obj = None, None, None

        for _, row in subset.iterrows():
            word, dep, head = row["Word"], row["Dependency"], row["Head"]

            if dep == "nsubj": 
                subject = word
            elif dep == "ROOT": 
                verb = word
            elif dep in ("dobj", "obj"): 
                obj = word

        if subject and verb and obj:
            relations.append((subject, verb, obj))

    return relations

relations = extract_relations(df)
print(relations[:5])  

[('rates', 'selected', 'cancers'), ('estimates', 'add', 'rounding'), ('rates', 'selected', 'sex'), ('injuries', 'accidents', 'injuries'), ('rates', 'selected', 'race')]


In [7]:
from neo4j import GraphDatabase

URI = "neo4j+s://79977030.databases.neo4j.io"
NEO4J_USER = "neo4j"
NEO4J_PASSWORD="whEzm2r-M-_QklgGy-fKMwfcRAyZcixTHlxjiU19xwk"
class Neo4jConnection:
    def __init__(self, uri, user, password):
        self._driver = GraphDatabase.driver(uri, auth=(user, password))

    def close(self):
        self._driver.close()

    def run_query(self, query, parameters={}):
        with self._driver.session() as session:
            return session.run(query, parameters)

# Connect to Neo4j
conn = Neo4jConnection(URI, NEO4J_USER, NEO4J_PASSWORD)
print("✅ Successfully connected to Neo4j!")  

✅ Successfully connected to Neo4j!


In [8]:
def store_relations_in_neo4j(conn, relations):
    query = """
    MERGE (s:Entity {name: $subject})
    MERGE (v:Action {name: $verb})
    MERGE (o:Entity {name: $object})
    MERGE (s)-[:PERFORMS]->(v)
    MERGE (v)-[:AFFECTS]->(o)
    """
    
    for subject, verb, obj in relations:
        conn.run_query(query, {"subject": subject, "verb": verb, "object": obj})

    print("✅ Relations stored successfully in Neo4j!")

store_relations_in_neo4j(conn, relations)
conn.close() 

✅ Relations stored successfully in Neo4j!


In [6]:
from neo4j import GraphDatabase

# Define the connection parameters for Neo4j
uri = "bolt://localhost:7687"  # Neo4j URI (replace if using Neo4j Aura or another URI)
user = "neo4j"  # Neo4j Username (default is "neo4j")
password = "Sourabh@123"  # Replace with your Neo4j password

# Create a connection to Neo4j
class Neo4jConnection:
    def __init__(self, uri, user, pwd):
        self._uri = uri
        self._user = user
        self._password = pwd
        self._driver = GraphDatabase.driver(uri, auth=(user, pwd))

    def close(self):
        self._driver.close()

    def query(self, query, parameters=None):
        with self._driver.session() as session:
            result = session.run(query, parameters)
            return result

# Instantiate the connection object
connection = Neo4jConnection(uri, user, password) 

In [8]:
import pandas as pd
svo_data = pd.read_csv("25-3-2025_14_3/fourgram_SVO_reaction.csv")
dependency_data = pd.read_csv("25-3-2025_14_3/fourgram_dependency_parsing.csv")
print(svo_data.head())
print(dependency_data.head()) 

     Subject       Verb    Object
0      rates   selected   cancers
1  estimates        add  rounding
2      rates   selected       sex
3   injuries  accidents  injuries
4      rates   selected      race
                           Fourgram    Word Dependency    Head Head_POS
0           male female male female    male       amod  female     NOUN
1           male female male female  female       amod  female     NOUN
2           male female male female    male       amod  female     NOUN
3           male female male female  female       ROOT  female     NOUN
4  cancer cancer journal clinicians  cancer   compound  cancer     NOUN


In [9]:
def extract_svo_triples(svo_data):
    svo_triples = []
    for _, row in svo_data.iterrows():
        subject = row['Subject']
        verb = row['Verb']
        obj = row['Object']
        svo_triples.append((subject, verb, obj))
    return svo_triples

svo_triples = extract_svo_triples(svo_data)
print(svo_triples[:5])  

[('rates', 'selected', 'cancers'), ('estimates', 'add', 'rounding'), ('rates', 'selected', 'sex'), ('injuries', 'accidents', 'injuries'), ('rates', 'selected', 'race')]


In [10]:
def store_svo_in_neo4j(subject, verb, obj):
    query = f"""
    MERGE (s:Subject {{name: '{subject}'}})
    MERGE (v:Verb {{action: '{verb}'}})
    MERGE (o:Object {{name: '{obj}'}})
    MERGE (s)-[:PERFORMS]->(v)-[:ACTS_ON]->(o)
    """
    connection.query(query)
    
for subject, verb, obj in svo_triples:
    store_svo_in_neo4j(subject, verb, obj) 

In [11]:
connection.close() 

In [12]:
from neo4j import GraphDatabase

uri = "bolt://localhost:7687"  
user = "neo4j" 
password = "Sourabh@123"

class Neo4jConnection:
    def __init__(self, uri, user, pwd):
        self._uri = uri
        self._user = user
        self._password = pwd
        self._driver = GraphDatabase.driver(uri, auth=(user, pwd))

    def close(self):
        self._driver.close()

    def query(self, query, parameters=None):
        with self._driver.session() as session:
            result = session.run(query, parameters)
            return result

        
connection = Neo4jConnection(uri, user, password)         

In [13]:
def store_dependency_in_neo4j(fourgram, word, dependency, head, head_pos):
    query = f"""
    MERGE (w:Word {{text: '{word}', fourgram: '{fourgram}'}})
    MERGE (h:Word {{text: '{head}', pos: '{head_pos}'}})
    MERGE (w)-[:{dependency.upper()}]->(h)
    """
    connection.query(query)
             
for _, row in dependency_data.iterrows():
    fourgram = row['Fourgram']
    word = row['Word']
    dependency = row['Dependency']
    head = row['Head']
    head_pos = row['Head_POS']
    
    store_dependency_in_neo4j(fourgram, word, dependency, head, head_pos) 

In [14]:
connection.close() 

In [3]:
import os
import spacy
import pandas as pd
import json

output_folder = "25-3-2025_14_3"
cooccurrence_file = os.path.join(output_folder, "fourgram_cooccurrence.csv")
jaccard_threshold = 0.3  

if not os.path.exists(cooccurrence_file):
    raise FileNotFoundError(f"{cooccurrence_file} not found!")

jac_df = pd.read_csv(cooccurrence_file, index_col=0)
jac_df["Avg_Jaccard"] = jac_df.mean(axis=1)

print("Jaccard Similarity Distribution:")
print(jac_df["Avg_Jaccard"].describe())

filtered_fourgrams = jac_df[jac_df["Avg_Jaccard"] >= jaccard_threshold].index.tolist()
print(f"Number of Fourgrams after filtering: {len(filtered_fourgrams)}")

nlp = spacy.load("en_core_web_sm")


dependency_data = {}
for fg in filtered_fourgrams:
    doc = nlp(fg.replace("_", " "))
    dependency_data[fg] = [
        {
            "word": token.text,
            "dependency": token.dep_,
            "head": token.head.text,
            "head_pos": token.head.pos_,
        }
        for token in doc
    ]

dep_json_path = os.path.join(output_folder, "dependency_parsing_jaccard.json")
with open(dep_json_path, "w") as f:
    json.dump(dependency_data, f, indent=4)

dep_rows = []
for fg, deps in dependency_data.items():
    for d in deps:
        dep_rows.append([fg, d["word"], d["dependency"], d["head"], d["head_pos"]])

dep_df = pd.DataFrame(dep_rows, columns=["Fourgram", "Word", "Dependency", "Head", "Head_POS"])
dep_csv_path = os.path.join(output_folder, "dependency_parsing_jaccard.csv")
dep_df.to_csv(dep_csv_path, index=False)

svo_list = []
for fg in filtered_fourgrams:
    doc = nlp(fg.replace("_", " "))
    subject, verb, obj = None, None, None
    for token in doc:
        if token.dep_ in {"nsubj", "nsubjpass"}:
            subject = token.text
        elif token.dep_ == "ROOT":
            verb = token.text
        elif token.dep_ in {"dobj", "pobj", "obj"}:
            obj = token.text
    if subject and verb and obj:
        svo_list.append({"Fourgram": fg, "Subject": subject, "Verb": verb, "Object": obj})

svo_json_path = os.path.join(output_folder, "SVO_jaccard.json")
with open(svo_json_path, "w") as f:
    json.dump(svo_list, f, indent=4)

svo_df = pd.DataFrame(svo_list)
svo_csv_path = os.path.join(output_folder, "SVO_jaccard.csv") 
svo_df.to_csv(svo_csv_path, index=False)

print("Dependency parsing and SVO extraction complete and saved.")

Jaccard Similarity Distribution:
count    200.000000
mean       0.641429
std        0.681139
min        0.000000
25%        0.000000
50%        0.571429
75%        0.857143
max        4.142857
Name: Avg_Jaccard, dtype: float64
Number of Fourgrams after filtering: 125
Dependency parsing and SVO extraction complete and saved.


In [4]:
import pandas as pd
import spacy

nlp = spacy.load("en_core_web_sm")

def extract_svo(text):
    doc = nlp(text)
    svos = []
    for token in doc:
        if token.dep_ in {"ROOT", "xcomp"} and token.pos_ == "VERB":
            subject = [w.text for w in token.lefts if w.dep_ in {"nsubj", "nsubjpass"}]
            objects = [w.text for w in token.rights if w.dep_ in {"dobj", "pobj", "attr"}]
            if subject and objects:
                svos.append((subject[0], token.text, objects[0]))
    return svos

def build_relation_mapping(fourgram_file, output_file):
    df = pd.read_csv(fourgram_file, index_col=0)
    relations = []
    
    for fourgram in df.index:
        text = fourgram.replace("_", " ")
        svo_triplets = extract_svo(text)
        
        for subj, verb, obj in svo_triplets:
            relations.append({"Fourgram": fourgram, "Subject": subj, "Verb": verb, "Object": obj})

    relation_df = pd.DataFrame(relations)
    relation_df.to_csv(output_file, index=False)

build_relation_mapping("25-3-2025_14_3/dependency_parsing_jaccard.csv", "25-3-2025_14_3/relation_mapping_jaccard.csv") 

In [12]:
import pandas as pd
import os

output_folder = "25-3-2025_14_3/"
input_csv = os.path.join(output_folder, "dependency_parsing_jaccard.csv")

df = pd.read_csv(input_csv)
print(df.head())          

                  Fourgram    Word Dependency    Head Head_POS
0  male female male female    male       amod  female     NOUN
1  male female male female  female       amod  female     NOUN
2  male female male female    male       amod  female     NOUN
3  male female male female  female       ROOT  female     NOUN
4  female male female male  female       amod    male     NOUN


In [9]:
def extract_relations(df):
    relations = []
    
    for fourgram in df["Fourgram"].unique():
        subset = df[df["Fourgram"] == fourgram]
        
        subject, verb, obj = None, None, None

        for _, row in subset.iterrows():
            word, dep, head = row["Word"], row["Dependency"], row["Head"]

            if dep == "nsubj": 
                subject = word
            elif dep == "ROOT": 
                verb = word
            elif dep in ("dobj", "obj"): 
                obj = word

        if subject and verb and obj:
            relations.append((subject, verb, obj))

    return relations 

relations = extract_relations(df)
print(relations[:5])   

[('rates', 'selected', 'cancers'), ('rates', 'selected', 'sex'), ('injuries', 'accidents', 'injuries'), ('rates', 'selected', 'race'), ('rates', 'adjusted', 'life')]


In [10]:
import pandas as pd
svo_data = pd.read_csv("25-3-2025_14_3/SVO_jaccard.csv")
dependency_data = pd.read_csv("25-3-2025_14_3/dependency_parsing_jaccard.csv")
print(svo_data.head())
print(dependency_data.head())  

                             Fourgram   Subject       Verb    Object
0    incidence rates selected cancers     rates   selected   cancers
1          rates selected cancers sex     rates   selected       sex
2  injuries accidents tional injuries  injuries  accidents  injuries
3         rates selected cancers race     rates   selected      race
4          rates adjusted normal life     rates   adjusted      life
                  Fourgram    Word Dependency    Head Head_POS
0  male female male female    male       amod  female     NOUN
1  male female male female  female       amod  female     NOUN
2  male female male female    male       amod  female     NOUN
3  male female male female  female       ROOT  female     NOUN
4  female male female male  female       amod    male     NOUN


In [11]:
def extract_svo_triples(svo_data):
    svo_triples = []
    for _, row in svo_data.iterrows():
        subject = row['Subject']
        verb = row['Verb']
        obj = row['Object']
        svo_triples.append((subject, verb, obj))
    return svo_triples

svo_triples = extract_svo_triples(svo_data)
print(svo_triples[:5])   

[('rates', 'selected', 'cancers'), ('rates', 'selected', 'sex'), ('injuries', 'accidents', 'injuries'), ('rates', 'selected', 'race'), ('rates', 'adjusted', 'life')]


In [1]:
from neo4j import GraphDatabase
import pandas as pd

uri = "bolt://localhost:7687"
username = "neo4j"
password = "Sourabh@123"  

driver = GraphDatabase.driver(uri, auth=(username, password)) 

In [2]:
def upload_svo_to_neo4j(svo_file):
    df = pd.read_csv(svo_file)

    def insert_svo(tx, subject, verb, obj):
        tx.run("""
            MERGE (s:Entity {name: $subject})
            MERGE (v:Action {name: $verb})
            MERGE (o:Entity {name: $object})
            MERGE (s)-[:PERFORMS]->(v)
            MERGE (v)-[:ACTS_ON]->(o)
        """, subject=subject, verb=verb, object=obj)

    with driver.session() as session:
        for _, row in df.iterrows():
            session.write_transaction(insert_svo, row['Subject'], row['Verb'], row['Object'])

upload_svo_to_neo4j("25-3-2025_14_3/SVO_jaccard.csv")
print("✅ SVO Triplets uploaded.") 

  session.write_transaction(insert_svo, row['Subject'], row['Verb'], row['Object'])


✅ SVO Triplets uploaded.


In [3]:
def upload_dependencies_to_neo4j(dep_file):
    df = pd.read_csv(dep_file)

    def insert_dependency(tx, word, head, dep_type):
        tx.run("""
            MERGE (w:Word {name: $word})
            MERGE (h:Word {name: $head})
            MERGE (w)-[:DEPENDS_ON {type: $dep_type}]->(h)
        """, word=word, head=head, dep_type=dep_type)

    with driver.session() as session:
        for _, row in df.iterrows():
            session.write_transaction(insert_dependency, row['Word'], row['Head'], row['Dependency'])

upload_dependencies_to_neo4j("25-3-2025_14_3/dependency_parsing_jaccard.csv")
print("✅ Dependency Parsing uploaded.") 

  session.write_transaction(insert_dependency, row['Word'], row['Head'], row['Dependency'])


✅ Dependency Parsing uploaded.
