In [None]:
import os
import re
import fitz              
import nltk
from collections import Counter, defaultdict
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords, wordnet
from nltk.util import ngrams
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer, PorterStemmer      

In [None]:
nltk.download("punkt")
nltk.download("stopwords")
nltk.download("wordnet")
nltk.download("omw-1.4")
nltk.download("averaged_perceptron_tagger")

In [None]:
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()
input_folder = "Cancer2/"
output_folder = "25-3-2025_14_29_3_5/"

In [None]:
standard_stopwords = set(stopwords.words('english'))  

In [None]:
unwanted_words = set([
    "article","http", "et", "al", "in", "to", "and", "for", "a", "y", "is", "of", "all", "the", "from", "are", "http",
    "terms", "conditions", "publication", "citation", "open", "access", "license", "cc", "by", "creative",
    "commons", "attribution", "shown", "method", "state", "date", "plot", "trials", "per", "cent", "new",
    "present", "iii", "iv", "v", "etc", "proc", "natl", "acad", "sci", "usa", "vol", "pp", "using", "also",
    "used", "based", "may", "however", "one", "two", "three", "four", "five", "data", "set", "including",
    "due", "figure", "table", "fig", "found", "work", "among", "study", "analysis", "different", "several",
    "order", "low", "high", "higher", "lower", "within", "between", "without", "results", "approach", "across",
    "group", "suggest", "suggests", "indicate", "indicates", "according", "amongst", "even", "although", "further",
    "well", "known", "previously", "recent", "recently", "first", "second", "example", "examples", "others",
    "another", "obtained", "show", "shows", "would", "could", "can", "might", "many", "much", "certain", "some",
    "such", "particular", "often", "sometimes", "always", "never", "previous", "past", "future", "uncommon", 
    "rare", "frequent", "other", "additional", "extra", "pros", "cons", "effective", "ineffective", "efficacy", 
    "efficiency", "slow", "early", "late", "earlier", "latest", "delayed", "quick", "quicker", "quickest", "rapid", 
    "rapidly", "slowly", "gradual", "sudden", "short", "long", "shorter", "shortest", "longer", "longest", "temporary", 
    "permanent", "transient", "persistent", "mild", "moderate", "severe", "slight", "significant", "insignificant", 
    "noticeable", "unnoticeable", "detectable", "wiley", "online", "library", "downloaded", "https", "see", "rules", 
    "use", "oa", "articles", "governed", "applicable", "national", "center", "health", "statistics", "centers", 
    "mortality", "public", "tapes", "american", "society", "atlanta", "tape", "reviews", "december", "volume", 
    "nature", "publishing", "average", "annual", "percent", "change", "note", "trends", "analyzed", "joinpoint", 
    "program", "total", "leading", "ca", "clin", "j", "apc", "aapc", "ons", "mp", "cl", "ries", "lag", "eisner", 
    "naaccr", "annu", "april", "author", "available", "c", "d", "e", "g", "h", "hruban", "i", "l", "maitra", "manuscript",
    "m", "n", "p", "page", "pathol", "pmc", "r", "rev", "s", "t", "u", "area", "as", "been", "by", "has", "have", "increased",
    "induced", "international", "mg", "meter", "not", "on", "reported", "research", "square", "sources", "studies", 
    "substrate", "technol", "than", "that", "this", "topical", "under", "was", "with", "years", "younger", "journal", 
    "notes", "tropic", "tr", "de", "carlos", "o", "publishers", "limited", "explore", "factor", "final", "edited", 
    "there", "funders", "x", "na", "number", "uses", "related", "text", "mining", "ai", "training", "similar", 
    "guest", "website", "email", "info", "law", "lib", "personal", "only", "march", "february", "august", "july", 
    "october", "june", "provided", "original", "properly", "cited", "http", "visit", "please", "repository", "more", 
    "publications", "document", "identical", "content", "version", "postprint", "except", "adjusted", "normal", 
    "dimension", "greatest", "cm", "you", "will", "be", "given", "note", "note:", "yes", "no", "line", "segment", 
    "control", "strictinin", "or", "published", "academy", "advertisement", "ak", "az", "co", "dc", "begin", "blackadar", 
    "charting", "combined", "copyright", "course", "day", "delivering", "director", "each", "exclude", "expected", 
    "fl", "ga", "hi", "id", "ia", "guide", "had", "hallmarks", "hereby", "highest", "included", "inclusion", "interpreted", 
    "january", "september", "november", "marked", "model", "must", "name", "offered", "occurrence", "office", 
    "peak", "pointed", "printing", "pubmed", "rights", "rough", "should", "signs", "source", "suppl","https", "support", 
    "system", "trend", "unexpected", "updated", "versus", "yes,", "no", "guest ","http"," march"," gut", "training",
    "similar", "guest","http", "jnci" ,"no"," at ","university", "rounded ","nearest" , "excludes","basal", 
    "rounded ","nearest"," exclude "," basal","deaths each age do", "each age do sum", "age do sum ages", 
    "do sum ages combined","aapck", "abdelmohsen", "adv", "ajcc", "appropriate", "artificial", "asr","authors",
    "committee", "competing", "credit", "csc", "curr", "current", "cumulative", "declare", "edited", "elmer", 
    "error","europe", "exp", "explore", "factor", "final", "financial", "form","funders", "inovação","instituto",
    "interdiscip", "interests", "investigação", "intelligence", "jobin", "joint", "leone", "link", "manuscripts", 
    "matulonis", "medium", "nat", "official", "opin", "our", "paper","percentage", "porto", "powell", "press", "primers", 
    "provide", "rank", "schwabe", "sites","statement", "statements", "supp", "survival", "there", "were", "world",
    "o", "q", "í", "w", "f", "k", "md", "none","explor" , "ther" ,"int","abstract", "author", "manuscript", "pubmed", "copyright", "doi", 
    "journal", "page", "volume", "dataset", "support", "evidence", "available", "online", "open", "access", "introduction", "conclusion", "american", 
    "society", "united", "states", "london", "canada", "china",  "europe", "japan", "india", "harvard", "medical", "school", "boston",  "western", "fruit", ])    

In [None]:
all_stopwords = unwanted_words.union(standard_stopwords) 

In [None]:
def get_wordnet_pos(tag):
    if tag.startswith('J'): return wordnet.ADJ
    elif tag.startswith('V'): return wordnet.VERB
    elif tag.startswith('N'): return wordnet.NOUN
    elif tag.startswith('R'): return wordnet.ADV
    else: return wordnet.NOUN

def lemmatize(tokens):
    tagged = pos_tag(tokens)
    return [lemmatizer.lemmatize(w, get_wordnet_pos(t)) for w, t in tagged]

In [None]:
def extract_text(pdf_file):
    doc = fitz.open(pdf_file)
    text = ""
    capture = False
    for page in doc:
        page_text = page.get_text()
        if "abstract" in page_text.lower():
            capture = True
        if "references" in page_text.lower():
            capture = False
        if capture:
            text += page_text + " "
    return text.lower()   

In [None]:
global_ngram = {i: Counter() for i in range(1, 5)}
ngram_documents = {i: defaultdict(set) for i in range(1, 5)} 

In [None]:
import os, re
from collections import defaultdict, Counter
from nltk.tokenize import word_tokenize
from nltk.util import ngrams
import spacy
from nltk.corpus import stopwords
import fitz  # PyMuPDF for PDF text extraction
from concurrent.futures import ThreadPoolExecutor

# Load spaCy
nlp = spacy.load("en_core_web_sm")
standard_stopwords = set(stopwords.words('english'))

# Your global unwanted_words list must already be defined elsewhere
unwanted_set = set(u.strip().lower() for u in unwanted_words)
all_stopwords = unwanted_set.union(standard_stopwords)

# Input/output setup
min_word_length = 3
thresholds = {1: 150, 2: 100, 3: 54, 4: 25}
input_folder = "Cancer2/"
output_folder = "25-3-2025_14_29_3_5/"
ngram_file_path = os.path.join(output_folder, "ngrams.txt")

def extract_text_from_pdf(file_path):
    doc = fitz.open(file_path)
    return " ".join(page.get_text("text") for page in doc)

def is_valid_alphanumeric(word):
    return bool(re.match(r'^[a-zA-Z]+\d+$', word)) or bool(re.match(r'^\d+[a-zA-Z]+$', word))

def is_broken_word(word):
    if is_valid_alphanumeric(word):
        return False
    return (
        len(word) < min_word_length or
        word.endswith('-') or
        word.isdigit() or
        not re.match(r'^[a-zA-Z0-9]+$', word)
    )

def valid_token(w):
    return not is_broken_word(w)

def clean_and_lemmatize(text):
    tokens = word_tokenize(text.lower())
    tokens = [w for w in tokens if valid_token(w)]
    doc = nlp(" ".join(tokens))
    lemmas = [token.lemma_.lower() for token in doc]
    return [lemma for lemma in lemmas if lemma not in all_stopwords]

def is_bad_hyphenated(phrase):
    if '-' not in phrase:
        return False
    return not re.match(r'^[a-zA-Z]+\s*-\s*\d+$', phrase)

def process_file(file_path, file_name):
    print(f"Processing file: {file_name}")
    local_ngrams = defaultdict(Counter)
    ngram_docs = defaultdict(set)

    raw_text = extract_text_from_pdf(file_path)
    if not raw_text:
        return local_ngrams, ngram_docs, file_name

    tokens = clean_and_lemmatize(raw_text)

    for n in range(1, 5):
        for ng in ngrams(tokens, n):
            phrase = " ".join(ng)
            if phrase not in unwanted_set and not is_bad_hyphenated(phrase):
                local_ngrams[n][ng] += 1
                ngram_docs[n].add(ng)

    return local_ngrams, ngram_docs, file_name

def process_pdfs_parallel(input_folder):
    global_ngram = defaultdict(Counter)
    ngram_documents = defaultdict(lambda: defaultdict(set))

    tasks = []
    with ThreadPoolExecutor() as executor:
        for file_name in os.listdir(input_folder):
            if file_name.endswith(".pdf"):
                file_path = os.path.join(input_folder, file_name)
                tasks.append(executor.submit(process_file, file_path, file_name))

        for task in tasks:
            local_ngrams, ngram_docs, file_name = task.result()
            for n in range(1, 5):
                global_ngram[n].update(local_ngrams[n])
                for ng in ngram_docs[n]:
                    ngram_documents[n][ng].add(file_name)

    os.makedirs(output_folder, exist_ok=True)

    for n in range(1, 5):
        with open(os.path.join(output_folder, f"ngram_{n}.txt"), "w", encoding="utf-8") as f:
            for ngram, count in global_ngram[n].items():
                f.write(f"{' '.join(ngram)}: {count}\n")

    with open(ngram_file_path, "w", encoding="utf-8") as f:
        for n in range(1, 5):
            f.write(f"\n{n}-grams (≥{thresholds[n]} times & in ≥5 documents):\n")
            for ngram, count in global_ngram[n].items():
                if count >= thresholds[n] and len(ngram_documents[n][ngram]) >= 5:
                    f.write(f"{' '.join(ngram)}: {count}\n")
            f.write("\n" + "=" * 50 + "\n")

# Run
process_pdfs_parallel(input_folder) 

In [None]:
import os
import re
import csv
from collections import defaultdict, Counter   

In [None]:
thresholds = {1: 150, 2: 100, 3: 54, 4: 25}
ngram_data = defaultdict(Counter)
pattern = re.compile(r'^(\d+)-grams.*:$')
current_n = 0  

In [None]:
file_path = "25-3-2025_14_29_3/ngrams.txt"
output_dir = "25-3-2025_14_29_3_5/"
os.makedirs(output_dir, exist_ok=True)  

In [None]:
with open(file_path, "r", encoding="utf-8") as f:
    lines = f.readlines() 

In [None]:
for line in lines:
    line = line.strip()
    if not line:
        continue
    match = pattern.match(line)
    if match:
        current_n = int(match.group(1))
        continue
    if current_n > 0 and ':' in line:
        try:
            ngram_str, count = line.rsplit(':', 1)
            ngram_tuple = tuple(ngram_str.strip().split())
            ngram_data[current_n][ngram_tuple] += int(count.strip())
        except ValueError:
            continue    

In [None]:
def get_subgrams(ngram, sub_len):
    return [tuple(ngram[i:i+sub_len]) for i in range(len(ngram) - sub_len + 1)]

# Remove counts of sub-grams from higher-order n-grams
for higher_n in range(4, 1, -1):
    for higher_ngram, higher_count in ngram_data[higher_n].items():
        for sub_len in range(1, higher_n):
            for sub_ngram in get_subgrams(higher_ngram, sub_len):
                if sub_ngram in ngram_data[sub_len]:
                    ngram_data[sub_len][sub_ngram] -= higher_count  

In [None]:
for n in list(ngram_data.keys()):
    ngram_data[n] = Counter({k: v for k, v in ngram_data[n].items() if v > 0})

# Function to check if an n-gram contains alphanumeric words (e.g., p53, cyp2d6)
def contains_valid_alphanum(ngram):
    # Regex checks if word is alphanumeric: either letters followed by digits or digits followed by letters
    return any(re.match(r'^[a-zA-Z]+\d+$', word) or re.match(r'^\d+[a-zA-Z]+$', word) for word in ngram)

# Apply thresholds but retain alphanumeric n-grams regardless of frequency
for n in ngram_data:
    threshold = thresholds[n]
    ngram_data[n] = Counter({
        k: v for k, v in ngram_data[n].items()
        if v >= threshold or contains_valid_alphanum(k)  # Keep alphanumeric n-grams even if below threshold
    })

# Write the cleaned n-grams to text and CSV files
txt_path = os.path.join(output_dir, "clean_ngram.txt")
csv_path = os.path.join(output_dir, "clean_ngram.csv")

with open(txt_path, "w", encoding="utf-8") as txt_out, open(csv_path, "w", newline="", encoding="utf-8") as csv_out:
    writer = csv.writer(csv_out)
    writer.writerow(["N-gram", "Frequency"])
    for n in sorted(ngram_data):
        txt_out.write(f"\n{n}-grams:\n")
        for ngram, count in ngram_data[n].most_common():
            txt_out.write(f"{' '.join(ngram)}: {count}\n")
            writer.writerow([" ".join(ngram), count])   

In [None]:
def save_top_clean_ngrams(filtered_ngrams, filename="top_1500_ngram.txt", top_n=1500):
    with open(filename, "w", encoding="utf-8") as f:
        f.write("Top 1500 unigrams:\n")
        for ngram, count in filtered_ngrams[1].most_common(top_n):     
            f.write(f"{' '.join(ngram)}: {count}\n")
        f.write("Top 1500 Bigrams:\n")
        for ngram, count in filtered_ngrams[2].most_common(top_n):
            f.write(f"{' '.join(ngram)}: {count}\n")

        f.write("\nTop 1500 Trigrams:\n")
        for ngram, count in filtered_ngrams[3].most_common(top_n):
            f.write(f"{' '.join(ngram)}: {count}\n")

        f.write("\nTop 1500 Fourgrams:\n")
        for ngram, count in filtered_ngrams[4].most_common(top_n):
            f.write(f"{' '.join(ngram)}: {count}\n")

# Save the top 1500 n-grams
top_ngrams_1500_path = os.path.join(output_dir, "top_1500_ngram.txt")
save_top_clean_ngrams(ngram_data, filename=top_ngrams_1500_path, top_n=1500)

print("Cleaned and top n-gram files saved.")  

In [None]:
import os
import re
import csv
from collections import defaultdict, Counter    

thresholds = {1: 150, 2: 100, 3: 54, 4: 25}
ngram_data = defaultdict(Counter)
pattern = re.compile(r'^(\d+)-grams.*:$')
current_n = 0   

file_path = "25-3-2025_14_29_3/ngrams.txt"
output_dir = "25-3-2025_14_29_3_5/"
os.makedirs(output_dir, exist_ok=True)   

# Read n-grams from file
with open(file_path, "r", encoding="utf-8") as f:
    lines = f.readlines()  

# Process the n-grams from the file
for line in lines:
    line = line.strip()
    if not line:
        continue
    match = pattern.match(line)
    if match:
        current_n = int(match.group(1))
        continue
    if current_n > 0 and ':' in line:
        try:
            ngram_str, count = line.rsplit(':', 1)
            ngram_tuple = tuple(ngram_str.strip().split())
            ngram_data[current_n][ngram_tuple] += int(count.strip())
        except ValueError:
            continue    

# Function to generate sub-grams
def get_subgrams(ngram, sub_len):
    return [tuple(ngram[i:i+sub_len]) for i in range(len(ngram) - sub_len + 1)]

# Remove counts of sub-grams from higher-order n-grams
for higher_n in range(4, 1, -1):
    for higher_ngram, higher_count in ngram_data[higher_n].items():
        for sub_len in range(1, higher_n):
            for sub_ngram in get_subgrams(higher_ngram, sub_len):
                if sub_ngram in ngram_data[sub_len]:
                    ngram_data[sub_len][sub_ngram] -= higher_count  

# Remove zero or negative frequencies
for n in list(ngram_data.keys()):
    ngram_data[n] = Counter({k: v for k, v in ngram_data[n].items() if v > 0})

# Function to keep alphanumeric n-grams even if below the threshold
def contains_valid_alphanum(ngram):
    return any(re.match(r'^[a-zA-Z]+\d+$', word) or re.match(r'^\d+[a-zA-Z]+$', word) for word in ngram)

# Apply thresholds but retain alphanumeric n-grams regardless of frequency
for n in ngram_data:
    threshold = thresholds[n]
    ngram_data[n] = Counter({
        k: v for k, v in ngram_data[n].items()
        if v >= threshold or contains_valid_alphanum(k)
    })

# Write the cleaned n-grams to text and CSV files
txt_path = os.path.join(output_dir, "clean_ngram.txt")
csv_path = os.path.join(output_dir, "clean_ngram.csv")

with open(txt_path, "w", encoding="utf-8") as txt_out, open(csv_path, "w", newline="", encoding="utf-8") as csv_out:
    writer = csv.writer(csv_out)
    writer.writerow(["N-gram", "Frequency"])
    for n in sorted(ngram_data):
        txt_out.write(f"\n{n}-grams:\n")
        for ngram, count in ngram_data[n].most_common():
            txt_out.write(f"{' '.join(ngram)}: {count}\n")
            writer.writerow([" ".join(ngram), count])   

# Function to save the top 1500 n-grams
def save_top_clean_ngrams(filtered_ngrams, filename="top_1500_ngram.txt", top_n=1500):
    with open(filename, "w", encoding="utf-8") as f:
        f.write("Top 1500 unigrams:\n")
        # Always include alphanumeric n-grams in the top 1500 list
        for ngram, count in filtered_ngrams[1].most_common(top_n):     
            f.write(f"{' '.join(ngram)}: {count}\n")
        f.write("Top 1500 Bigrams:\n")
        for ngram, count in filtered_ngrams[2].most_common(top_n):
            f.write(f"{' '.join(ngram)}: {count}\n")

        f.write("\nTop 1500 Trigrams:\n")
        for ngram, count in filtered_ngrams[3].most_common(top_n):
            f.write(f"{' '.join(ngram)}: {count}\n")

        f.write("\nTop 1500 Fourgrams:\n")
        for ngram, count in filtered_ngrams[4].most_common(top_n):
            f.write(f"{' '.join(ngram)}: {count}\n")

# Save the top 1500 n-grams
top_ngrams_1500_path = os.path.join(output_dir, "top_1500_ngram.txt")
save_top_clean_ngrams(ngram_data, filename=top_ngrams_1500_path, top_n=1500)

print("Cleaned and top n-gram files saved.")   

In [None]:
import os
import re
import csv
from collections import defaultdict, Counter    

thresholds = {1: 150, 2: 100, 3: 54, 4: 25}
ngram_data = defaultdict(Counter)
pattern = re.compile(r'^(\d+)-grams.*:$')
current_n = 0   

file_path = "25-3-2025_14_29_3/ngrams.txt"
output_dir = "25-3-2025_14_29_3_5/"
os.makedirs(output_dir, exist_ok=True)   

# Read n-grams from file
with open(file_path, "r", encoding="utf-8") as f:
    lines = f.readlines()  

# Process the n-grams from the file
for line in lines:
    line = line.strip()
    if not line:
        continue
    match = pattern.match(line)
    if match:
        current_n = int(match.group(1))
        continue
    if current_n > 0 and ':' in line:
        try:
            ngram_str, count = line.rsplit(':', 1)
            ngram_tuple = tuple(ngram_str.strip().split())
            ngram_data[current_n][ngram_tuple] += int(count.strip())
        except ValueError:
            continue    

# Function to generate sub-grams
def get_subgrams(ngram, sub_len):
    return [tuple(ngram[i:i+sub_len]) for i in range(len(ngram) - sub_len + 1)]

# Remove counts of sub-grams from higher-order n-grams
for higher_n in range(4, 1, -1):
    for higher_ngram, higher_count in ngram_data[higher_n].items():
        for sub_len in range(1, higher_n):
            for sub_ngram in get_subgrams(higher_ngram, sub_len):
                if sub_ngram in ngram_data[sub_len]:
                    ngram_data[sub_len][sub_ngram] -= higher_count  

# Remove zero or negative frequencies
for n in list(ngram_data.keys()):
    ngram_data[n] = Counter({k: v for k, v in ngram_data[n].items() if v > 0})

# Function to check for alphanumeric n-grams (must always be kept)
def contains_valid_alphanum(ngram):
    # Accept mixed alphanumeric words like TP53, IL-6, CD4+, BRCA1/2, H3K27me3
    return any(
        re.search(r'[a-zA-Z]', word) and re.search(r'[\d/+.-]', word)
        for word in ngram
    )

# Apply thresholds but retain alphanumeric n-grams regardless of frequency
for n in ngram_data:
    threshold = thresholds[n]
    ngram_data[n] = Counter({
        k: v for k, v in ngram_data[n].items()
        if v >= threshold or contains_valid_alphanum(k)  # Keep alphanumeric n-grams regardless of frequency
    })

# Write the cleaned n-grams to text and CSV files
txt_path = os.path.join(output_dir, "clean_ngram.txt")
csv_path = os.path.join(output_dir, "clean_ngram.csv")

with open(txt_path, "w", encoding="utf-8") as txt_out, open(csv_path, "w", newline="", encoding="utf-8") as csv_out:
    writer = csv.writer(csv_out)
    writer.writerow(["N-gram", "Frequency"])
    for n in sorted(ngram_data):
        txt_out.write(f"\n{n}-grams:\n")
        for ngram, count in ngram_data[n].most_common():
            txt_out.write(f"{' '.join(ngram)}: {count}\n")
            writer.writerow([" ".join(ngram), count])   

# Function to save the top 1500 n-grams
def save_top_clean_ngrams(filtered_ngrams, filename="top_1500_ngram.txt", top_n=1500):
    with open(filename, "w", encoding="utf-8") as f:
        f.write("Top 1500 unigrams:\n")
        # Always include alphanumeric n-grams in the top 1500 list
        for ngram, count in filtered_ngrams[1].most_common(top_n):     
            f.write(f"{' '.join(ngram)}: {count}\n")
        f.write("Top 1500 Bigrams:\n")
        for ngram, count in filtered_ngrams[2].most_common(top_n):
            f.write(f"{' '.join(ngram)}: {count}\n")

        f.write("\nTop 1500 Trigrams:\n")
        for ngram, count in filtered_ngrams[3].most_common(top_n):
            f.write(f"{' '.join(ngram)}: {count}\n")

        f.write("\nTop 1500 Fourgrams:\n")
        for ngram, count in filtered_ngrams[4].most_common(top_n):
            f.write(f"{' '.join(ngram)}: {count}\n")

# Save the top 1500 n-grams
top_ngrams_1500_path = os.path.join(output_dir, "top_1500_ngram.txt")
save_top_clean_ngrams(ngram_data, filename=top_ngrams_1500_path, top_n=1500)

print("Cleaned and top n-gram files saved.") 

In [None]:
import os
import re
import csv
from collections import defaultdict, Counter    

thresholds = {1: 150, 2: 100, 3: 54, 4: 25}
ngram_data = defaultdict(Counter)
pattern = re.compile(r'^(\d+)-grams.*:$')
current_n = 0   

file_path = "25-3-2025_14_29_3/ngrams.txt"
output_dir = "25-3-2025_14_29_3_5/"
os.makedirs(output_dir, exist_ok=True)   

# Read n-grams from file
with open(file_path, "r", encoding="utf-8") as f:
    lines = f.readlines()  

# Process the n-grams from the file
for line in lines:
    line = line.strip()
    if not line:
        continue
    match = pattern.match(line)
    if match:
        current_n = int(match.group(1))
        continue
    if current_n > 0 and ':' in line:
        try:
            ngram_str, count = line.rsplit(':', 1)
            ngram_tuple = tuple(ngram_str.strip().split())
            ngram_data[current_n][ngram_tuple] += int(count.strip())
        except ValueError:
            continue    

# Function to generate sub-grams
def get_subgrams(ngram, sub_len):
    return [tuple(ngram[i:i+sub_len]) for i in range(len(ngram) - sub_len + 1)]

# Function to detect alphanumerical tokens (like "p53", "H1N1")
def contains_valid_alphanum(ngram):
    return any(re.match(r'^[a-zA-Z]+\d+$', word) or re.match(r'^\d+[a-zA-Z]+$', word) for word in ngram)

# Subtract higher-order n-grams from lower-order ones
for higher_n in range(4, 1, -1):
    for higher_ngram, higher_count in ngram_data[higher_n].items():
        for sub_len in range(1, higher_n):               
            for sub_ngram in get_subgrams(higher_ngram, sub_len):
                if sub_ngram in ngram_data[sub_len]:
                    ngram_data[sub_len][sub_ngram] -= higher_count

# Clean: Keep only positive counts or alphanumerics even if ≤ 0
for n in list(ngram_data.keys()):
    updated = Counter()
    for k, v in ngram_data[n].items():
        if v > 0 or contains_valid_alphanum(k):  # Protect alphanumeric n-grams even if their count is ≤ 0
            if v <= 0 and contains_valid_alphanum(k):
                v = original_ngram_data[n][k]  # Restore original frequency for alphanumeric n-grams
            updated[k] = v
    ngram_data[n] = updated

# Apply frequency thresholds but keep alphanumerics
for n in list(ngram_data.keys()):
    ngram_data[n] = Counter({
        k: v for k, v in ngram_data[n].items()
        if v >= thresholds[n] or contains_valid_alphanum(k)  # Always keep alphanumeric n-grams
    })

# Write cleaned n-grams to files
txt_path = os.path.join(output_dir, "clean_ngram_2.txt")
csv_path = os.path.join(output_dir, "clean_ngram_2.csv")

with open(txt_path, "w", encoding="utf-8") as txt_out, open(csv_path, "w", newline="", encoding="utf-8") as csv_out:
    writer = csv.writer(csv_out)
    writer.writerow(["N-gram", "Frequency"])
    for n in sorted(ngram_data):
        txt_out.write(f"\n{n}-grams:\n")
        for ngram, count in ngram_data[n].most_common():
            txt_out.write(f"{' '.join(ngram)}: {count}\n")
            writer.writerow([" ".join(ngram), count])

# Save top 1500 n-grams (with alphanumeric protection) per n-level
def save_top_clean_ngrams(filtered_ngrams, filename="top_1500_ngram_2.txt", top_n=1500):
    with open(filename, "w", encoding="utf-8") as f:
        for n in range(1, 5):
            f.write(f"\nTop {top_n} {['Unigrams', 'Bigrams', 'Trigrams', 'Fourgrams'][n-1]}:\n")
            top_ngrams = filtered_ngrams[n].most_common(top_n)
            seen = set(ngram for ngram, _ in top_ngrams)
            alphanum_ngrams = [
                (ngram, count) for ngram, count in filtered_ngrams[n].items()
                if contains_valid_alphanum(ngram) and ngram not in seen
            ]
            combined = top_ngrams + alphanum_ngrams
            for ngram, count in combined:
                f.write(f"{' '.join(ngram)}: {count}\n")

top_ngrams_1500_path = os.path.join(output_dir, "top_1500_ngram_2.txt")
save_top_clean_ngrams(ngram_data, filename=top_ngrams_1500_path, top_n=1500)

print("✅ Cleaned and top n-gram files saved with alphanumeric protection.")


In [3]:
#dfghjkl;kjhugvb nm, kjhugv nmjhgfcv bn,mkjhb nbhgv ccxzcdcyvv

In [None]:
import os
import re
import csv
from collections import defaultdict, Counter    

thresholds = {1: 150, 2: 100, 3: 54, 4: 25}
ngram_data = defaultdict(Counter)
pattern = re.compile(r'^(\d+)-grams.*:$')
current_n = 0   

file_path = "25-3-2025_14_29_3/ngrams.txt"
output_dir = "25-3-2025_14_29_3_5/"
os.makedirs(output_dir, exist_ok=True)   

# Read n-grams from file
with open(file_path, "r", encoding="utf-8") as f:
    lines = f.readlines()  

# Process the n-grams from the file
for line in lines:
    line = line.strip()
    if not line:
        continue
    match = pattern.match(line)
    if match:
        current_n = int(match.group(1))
        continue
    if current_n > 0 and ':' in line:
        try:
            ngram_str, count = line.rsplit(':', 1)
            ngram_tuple = tuple(ngram_str.strip().split())
            ngram_data[current_n][ngram_tuple] += int(count.strip())
        except ValueError:
            continue    

# Function to generate sub-grams
def get_subgrams(ngram, sub_len):
    return [tuple(ngram[i:i+sub_len]) for i in range(len(ngram) - sub_len + 1)]

# Function to detect alphanumerical tokens (like "p53", "H1N1")
def contains_valid_alphanum(ngram):
    return any(re.match(r'^[a-zA-Z]+\d+$', word) or re.match(r'^\d+[a-zA-Z]+$', word) for word in ngram)

# Subtract higher-order n-grams from lower-order ones
for higher_n in range(4, 1, -1):
    for higher_ngram, higher_count in ngram_data[higher_n].items():
        for sub_len in range(1, higher_n):
            for sub_ngram in get_subgrams(higher_ngram, sub_len):
                if sub_ngram in ngram_data[sub_len]:
                    ngram_data[sub_len][sub_ngram] -= higher_count

# Clean: Keep only positive counts or alphanumerics even if ≤ 0
for n in list(ngram_data.keys()):
    updated = Counter()
    for k, v in ngram_data[n].items():
        if v > 0 or contains_valid_alphanum(k):  # Protect alphanumeric n-grams even if their count is ≤ 0
            # Ensure alphanumeric n-grams retain their original frequency if modified
            if v <= 0 and contains_valid_alphanum(k):
                v = ngram_data[n][k]  # Keep original count for alphanumeric n-grams
            updated[k] = v
    ngram_data[n] = updated

# Apply frequency thresholds but keep alphanumerics
for n in list(ngram_data.keys()):
    ngram_data[n] = Counter({
        k: v for k, v in ngram_data[n].items()
        if v >= thresholds[n] or contains_valid_alphanum(k)  # Always keep alphanumeric n-grams
    })

# Write cleaned n-grams to files
txt_path = os.path.join(output_dir, "clean_ngram_2.txt")
csv_path = os.path.join(output_dir, "clean_ngram_2.csv")

with open(txt_path, "w", encoding="utf-8") as txt_out, open(csv_path, "w", newline="", encoding="utf-8") as csv_out:
    writer = csv.writer(csv_out)
    writer.writerow(["N-gram", "Frequency"])
    for n in sorted(ngram_data):
        txt_out.write(f"\n{n}-grams:\n")
        for ngram, count in ngram_data[n].most_common():
            txt_out.write(f"{' '.join(ngram)}: {count}\n")
            writer.writerow([" ".join(ngram), count])

# Save top 1500 n-grams (with alphanumeric protection) per n-level
def save_top_clean_ngrams(filtered_ngrams, filename="top_1500_ngram_2.txt", top_n=1500):
    with open(filename, "w", encoding="utf-8") as f:
        for n in range(1, 5):
            f.write(f"\nTop {top_n} {['Unigrams', 'Bigrams', 'Trigrams', 'Fourgrams'][n-1]}:\n")
            top_ngrams = filtered_ngrams[n].most_common(top_n)
            seen = set(ngram for ngram, _ in top_ngrams)
            alphanum_ngrams = [
                (ngram, count) for ngram, count in filtered_ngrams[n].items()
                if contains_valid_alphanum(ngram) and ngram not in seen
            ]
            combined = top_ngrams + alphanum_ngrams
            for ngram, count in combined:
                f.write(f"{' '.join(ngram)}: {count}\n")

top_ngrams_1500_path = os.path.join(output_dir, "top_1500_ngram_2.txt")
save_top_clean_ngrams(ngram_data, filename=top_ngrams_1500_path, top_n=1500)

print("✅ Cleaned and top n-gram files saved with alphanumeric protection.")   

In [None]:
import os
import re
import csv
from collections import defaultdict, Counter    

thresholds = {1: 150, 2: 100, 3: 54, 4: 25}
ngram_data = defaultdict(Counter)
pattern = re.compile(r'^(\d+)-grams.*:$')
current_n = 0   

file_path = "25-3-2025_14_29_3_5/ngrams.txt"
output_dir = "25-3-2025_14_29_3_5/"
os.makedirs(output_dir, exist_ok=True)   

# Read n-grams from file
with open(file_path, "r", encoding="utf-8") as f:
    lines = f.readlines()  

# Process the n-grams from the file
for line in lines:
    line = line.strip()
    if not line:
        continue
    match = pattern.match(line)
    if match:
        current_n = int(match.group(1))
        continue
    if current_n > 0 and ':' in line:
        try:
            ngram_str, count = line.rsplit(':', 1)
            ngram_tuple = tuple(ngram_str.strip().split())
            ngram_data[current_n][ngram_tuple] += int(count.strip())
        except ValueError:
            continue    

# Function to generate sub-grams
def get_subgrams(ngram, sub_len):
    return [tuple(ngram[i:i+sub_len]) for i in range(len(ngram) - sub_len + 1)]

# Function to detect alphanumerical tokens (like "p53", "H1N1")
def contains_valid_alphanum(ngram):
    return any(re.match(r'^[a-zA-Z]+\d+$', word) or re.match(r'^\d+[a-zA-Z]+$', word) for word in ngram)

# Track alphanumeric n-grams separately to ensure they are never modified or removed
protected_ngrams = defaultdict(Counter)

# Subtract higher-order n-grams from lower-order ones
for higher_n in range(4, 1, -1):
    for higher_ngram, higher_count in ngram_data[higher_n].items():
        for sub_len in range(1, higher_n):
            for sub_ngram in get_subgrams(higher_ngram, sub_len):
                if sub_ngram in ngram_data[sub_len]:
                    ngram_data[sub_len][sub_ngram] -= higher_count

# Store the original counts of alphanumeric n-grams
for n in list(ngram_data.keys()):
    for k, v in ngram_data[n].items():
        if contains_valid_alphanum(k):
            protected_ngrams[n][k] = v  # Save the original count for alphanumeric n-grams

# Clean: Remove negative counts but protect alphanumeric n-grams
for n in list(ngram_data.keys()):
    updated = Counter()
    for k, v in ngram_data[n].items():
        if v > 0 or contains_valid_alphanum(k):  # Protect alphanumeric n-grams even if their count is ≤ 0
            updated[k] = v
    ngram_data[n] = updated

# Apply frequency thresholds but keep alphanumerics intact
for n in list(ngram_data.keys()):
    ngram_data[n] = Counter({
        k: v for k, v in ngram_data[n].items()
        if v >= thresholds[n] or contains_valid_alphanum(k)  # Always keep alphanumeric n-grams
    })

# Reintroduce protected alphanumeric n-grams with their original counts (if not already present)
for n in list(ngram_data.keys()):
    for k, v in protected_ngrams[n].items():
        if k not in ngram_data[n]:
            ngram_data[n][k] = v  # Restore the original count

# Write cleaned n-grams to files
txt_path = os.path.join(output_dir, "clean_ngram_2.txt")
csv_path = os.path.join(output_dir, "clean_ngram_2.csv")

with open(txt_path, "w", encoding="utf-8") as txt_out, open(csv_path, "w", newline="", encoding="utf-8") as csv_out:
    writer = csv.writer(csv_out)
    writer.writerow(["N-gram", "Frequency"])
    for n in sorted(ngram_data):
        txt_out.write(f"\n{n}-grams:\n")
        for ngram, count in ngram_data[n].most_common():
            txt_out.write(f"{' '.join(ngram)}: {count}\n")
            writer.writerow([" ".join(ngram), count])

# Save top 1500 n-grams (with alphanumeric protection) per n-level
def save_top_clean_ngrams(filtered_ngrams, filename="top_1500_ngram_2.txt", top_n=1500):
    with open(filename, "w", encoding="utf-8") as f:
        for n in range(1, 5):
            f.write(f"\nTop {top_n} {['Unigrams', 'Bigrams', 'Trigrams', 'Fourgrams'][n-1]}:\n")
            top_ngrams = filtered_ngrams[n].most_common(top_n)
            seen = set(ngram for ngram, _ in top_ngrams)
            alphanum_ngrams = [
                (ngram, count) for ngram, count in filtered_ngrams[n].items()
                if contains_valid_alphanum(ngram) and ngram not in seen
            ]
            combined = top_ngrams + alphanum_ngrams
            for ngram, count in combined:
                f.write(f"{' '.join(ngram)}: {count}\n")

top_ngrams_1500_path = os.path.join(output_dir, "top_1500_ngram_2.txt")
save_top_clean_ngrams(ngram_data, filename=top_ngrams_1500_path, top_n=1500)

print("✅ Cleaned and top n-gram files saved with alphanumeric protection.")   

In [4]:
import os
import re
import csv
from collections import defaultdict, Counter

# Priority Cancer Keywords
priority_keywords = set([
    "cancer", "tumor", "mutation", "gene", "treatment", "chemotherapy", "radiation", 
    "immunotherapy", "biopsy", "diagnosis", "survival", "metastasis", "cell", "therapy", 
    "drug", "prognosis", "malignancy", "oncologist", "carcinoma"
])

# Helper Functions
def get_subgrams(ngram, sub_len):
    return [tuple(ngram[i:i+sub_len]) for i in range(len(ngram) - sub_len + 1)]

def contains_valid_alphanum(ngram):
    return any(re.match(r'^[a-zA-Z]+\d+$', word) or re.match(r'^\d+[a-zA-Z]+$', word) for word in ngram)

def contains_priority_keyword(ngram):
    return any(word.lower() in priority_keywords for word in ngram)

# Thresholds
thresholds = {1: 150, 2: 100, 3: 54, 4: 25}

# Setup
ngram_data = defaultdict(Counter)
pattern = re.compile(r'^(\d+)-grams.*:$')
current_n = 0

file_path = "25-3-2025_14_29_3_5/ngrams.txt"
output_dir = "25-3-2025_14_29_3_5/"
os.makedirs(output_dir, exist_ok=True)

# Read n-grams from file
with open(file_path, "r", encoding="utf-8") as f:
    lines = f.readlines()

# Process the n-grams
for line in lines:
    line = line.strip()
    if not line:
        continue
    match = pattern.match(line)
    if match:
        current_n = int(match.group(1))
        continue
    if current_n > 0 and ':' in line:
        try:
            ngram_str, count = line.rsplit(':', 1)
            ngram_tuple = tuple(ngram_str.strip().split())
            ngram_data[current_n][ngram_tuple] += int(count.strip())
        except ValueError:
            continue

# Track protected alphanumeric ngrams
protected_ngrams = defaultdict(Counter)

# Subtract higher-order n-grams from lower-order
for higher_n in range(4, 1, -1):
    for higher_ngram, higher_count in ngram_data[higher_n].items():
        for sub_len in range(1, higher_n):
            for sub_ngram in get_subgrams(higher_ngram, sub_len):
                if sub_ngram in ngram_data[sub_len]:
                    ngram_data[sub_len][sub_ngram] -= higher_count

# Save protected alphanumerics
for n in list(ngram_data.keys()):
    for k, v in ngram_data[n].items():
        if contains_valid_alphanum(k):
            protected_ngrams[n][k] = v

# Clean: remove negatives but keep alphanumerics and priority keywords
for n in list(ngram_data.keys()):
    updated = Counter()
    for k, v in ngram_data[n].items():
        if v > 0 or contains_valid_alphanum(k) or contains_priority_keyword(k):
            updated[k] = v
    ngram_data[n] = updated

# Apply thresholds but keep important n-grams
for n in list(ngram_data.keys()):
    ngram_data[n] = Counter({
        k: v for k, v in ngram_data[n].items()
        if v >= thresholds[n] or contains_valid_alphanum(k) or contains_priority_keyword(k)
    })

# Restore protected alphanumerics if missing
for n in list(ngram_data.keys()):
    for k, v in protected_ngrams[n].items():
        if k not in ngram_data[n]:
            ngram_data[n][k] = v

# Write cleaned n-grams
txt_path = os.path.join(output_dir, "clean_ngram_2.txt")
csv_path = os.path.join(output_dir, "clean_ngram_2.csv")

with open(txt_path, "w", encoding="utf-8") as txt_out, open(csv_path, "w", newline="", encoding="utf-8") as csv_out:
    writer = csv.writer(csv_out)
    writer.writerow(["N-gram", "Frequency"])
    for n in sorted(ngram_data):
        txt_out.write(f"\n{n}-grams:\n")
        for ngram, count in ngram_data[n].most_common():
            txt_out.write(f"{' '.join(ngram)}: {count}\n")
            writer.writerow([" ".join(ngram), count])

# Save Top 1500 per ngram level (keeping priority and alphanumerics)
def save_top_clean_ngrams(filtered_ngrams, filename="top_1500_ngram-2_priority.txt", top_n=1500):
    with open(filename, "w", encoding="utf-8") as f:
        for n in range(1, 5):
            f.write(f"\nTop {top_n} {['Unigrams', 'Bigrams', 'Trigrams', 'Fourgrams'][n-1]}:\n")
            top_ngrams = filtered_ngrams[n].most_common(top_n)
            seen = set(ngram for ngram, _ in top_ngrams)
            alphanum_priority_ngrams = [
                (ngram, count) for ngram, count in filtered_ngrams[n].items()
                if (contains_valid_alphanum(ngram) or contains_priority_keyword(ngram)) and ngram not in seen
            ]
            combined = top_ngrams + alphanum_priority_ngrams
            for ngram, count in combined:
                f.write(f"{' '.join(ngram)}: {count}\n")

top_ngrams_priority_path = os.path.join(output_dir, "top_1500_ngram-2_priority.txt")
save_top_clean_ngrams(ngram_data, filename=top_ngrams_priority_path, top_n=1500)

print("✅ Cleaned and Top Priority n-gram files saved!") 

✅ Cleaned and Top Priority n-gram files saved!


In [None]:
import os
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from PyPDF2 import PdfReader
from concurrent.futures import ThreadPoolExecutor

ngram_txt_path = "25-3-2025_14_29_3_5/top_1500_ngram_2.txt"
paper_folder = "Cancer2/"
output_file = "25-3-2025_14_29_3_5/tfidf_ngrams_papers.csv"

# Read the ngrams from the text file
with open(ngram_txt_path, 'r') as file:
    ngram_lines = file.readlines()

# Extract unigrams (ignoring the frequency part)
cleaned_ngrams = [line.split(":")[0].strip() for line in ngram_lines if ":" in line]

# If the cleaned_ngrams is empty, use the raw ngram lines
if not cleaned_ngrams:
    cleaned_ngrams = [line.strip() for line in ngram_lines]

# Function to extract text from a single PDF
def extract_text_from_pdf(pdf_path):
    print(f"Processing {os.path.basename(pdf_path)}...")  # Print which paper is being processed
    reader = PdfReader(pdf_path)
    return " ".join(page.extract_text() or "" for page in reader.pages)

# Get list of PDF files in the folder
pdf_files = [file for file in os.listdir(paper_folder) if file.lower().endswith(".pdf")]
pdf_paths = [os.path.join(paper_folder, file) for file in pdf_files]

# Use ThreadPoolExecutor to extract text in parallel
with ThreadPoolExecutor() as executor:
    documents = list(executor.map(extract_text_from_pdf, pdf_paths))

# Vectorize the text data using the cleaned unigrams
vectorizer = TfidfVectorizer(vocabulary=cleaned_ngrams, ngram_range=(1, 1), lowercase=True)
tfidf_matrix = vectorizer.fit_transform(documents)

# Create DataFrame for easy visualization and saving
df = pd.DataFrame(tfidf_matrix.T.toarray(), index=cleaned_ngrams, columns=pdf_files)

# Save the TF-IDF matrix to a CSV file
df.to_csv(output_file)

print(f"TF-IDF matrix saved at: {output_file}") 

In [None]:
import os
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from PyPDF2 import PdfReader
from concurrent.futures import ThreadPoolExecutor

# === Paths ===
ngram_txt_path = "25-3-2025_14_29_3_5/top_1500_ngram_2.txt"
paper_folder = "Cancer2/"
output_file = "25-3-2025_14_29_3_5/tfidf_ngrams_papers.csv"

# === Read and clean n-grams from the text file ===
with open(ngram_txt_path, 'r', encoding='utf-8') as file:
    ngram_lines = file.readlines()

# Extract n-gram phrases (remove counts and unwanted lines)
cleaned_ngrams = [line.split(":")[0].strip() for line in ngram_lines if ":" in line and line.strip()]

if not cleaned_ngrams:
    cleaned_ngrams = [line.strip() for line in ngram_lines if line.strip()]

# === Function to extract text from a single PDF ===
def extract_text_from_pdf(pdf_path):
    print(f"Processing {os.path.basename(pdf_path)}...")
    try:
        reader = PdfReader(pdf_path)
        return " ".join(page.extract_text() or "" for page in reader.pages)
    except Exception as e:
        print(f"Error reading {pdf_path}: {e}")
        return ""

# === Get all PDF paths ===
pdf_files = [file for file in os.listdir(paper_folder) if file.lower().endswith(".pdf")]
pdf_paths = [os.path.join(paper_folder, file) for file in pdf_files]

# === Extract PDF text using threads for speed ===
with ThreadPoolExecutor() as executor:
    documents = list(executor.map(extract_text_from_pdf, pdf_paths))

# === TF-IDF Vectorization using 1 to 4-gram matching ===
vectorizer = TfidfVectorizer(vocabulary=cleaned_ngrams, ngram_range=(1, 4), lowercase=True)
tfidf_matrix = vectorizer.fit_transform(documents)

# === Create and save DataFrame ===
df = pd.DataFrame(tfidf_matrix.T.toarray(), index=cleaned_ngrams, columns=pdf_files)
df.to_csv(output_file)

print(f"\n✅ TF-IDF matrix saved at: {output_file}") 

In [None]:
import os
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
from PyPDF2 import PdfReader

# === Paths ===
paper_folder = "Cancer2/"  # Folder containing all the PDF papers
ngram_txt_path = "25-3-2025_14_29_3/top_1500_ngram.txt"  # Your n-grams file

# === Function to extract text from a single PDF ===
def extract_text_from_pdf(pdf_path):
    print(f"Processing {os.path.basename(pdf_path)}...")
    try:
        reader = PdfReader(pdf_path)
        return " ".join(page.extract_text() or "" for page in reader.pages)
    except Exception as e:
        print(f"Error reading {pdf_path}: {e}")
        return ""

# === Get all PDF paths ===
pdf_files = [file for file in os.listdir(paper_folder) if file.lower().endswith(".pdf")]
pdf_paths = [os.path.join(paper_folder, file) for file in pdf_files]

# === Extract text from all PDFs ===
documents = [extract_text_from_pdf(pdf_path) for pdf_path in pdf_paths]

# === Read and clean n-grams from the text file ===
with open(ngram_txt_path, 'r', encoding='utf-8') as file:
    ngram_lines = file.readlines()

# Extract n-gram phrases (remove counts and unwanted lines)
cleaned_ngrams = [line.split(":")[0].strip() for line in ngram_lines if ":" in line and line.strip()]

if not cleaned_ngrams:
    cleaned_ngrams = [line.strip() for line in ngram_lines if line.strip()]

# === Create CountVectorizer to count the occurrences of n-grams in the documents ===
count_vectorizer = CountVectorizer(vocabulary=cleaned_ngrams, ngram_range=(1, 4), lowercase=True)

# Transform the documents into the count matrix (n-gram x paper)
count_matrix = count_vectorizer.transform(documents)

# === Create a DataFrame for the n-gram x paper matrix ===
count_df = pd.DataFrame(count_matrix.toarray(), index=cleaned_ngrams, columns=pdf_files)

# === Save the n-gram x paper matrix to CSV ===
output_file = "25-3-2025_14_29_3/ngram_count_per_paper.csv"
count_df.to_csv(output_file)

print(f"\n✅ N-gram count per paper saved at: {output_file}")   

In [None]:
import os
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from PyPDF2 import PdfReader
from concurrent.futures import ThreadPoolExecutor

ngram_txt_path = "25-3-2025_14_29_3_5/top_1500_ngram_2.txt"
paper_folder = "Cancer2/"
output_file = "25-3-2025_14_29_3_5/tfidf_ngrams_papers.csv"

with open(ngram_txt_path, 'r') as file:
    ngram_lines = file.readlines()

cleaned_ngrams = [line.split(":")[0].strip() for line in ngram_lines if ":" in line]
if not cleaned_ngrams:
    cleaned_ngrams = [line.strip() for line in ngram_lines]

alphanum_ngrams = [term for term in cleaned_ngrams if any(c.isdigit() for c in term)]
print("Alphanumerical n-grams found in vocabulary:", alphanum_ngrams)

def extract_text_from_pdf(pdf_path):
    print(f"Processing {os.path.basename(pdf_path)}...")
    reader = PdfReader(pdf_path)
    return " ".join(page.extract_text() or "" for page in reader.pages)

pdf_files = [file for file in os.listdir(paper_folder) if file.lower().endswith(".pdf")]
pdf_paths = [os.path.join(paper_folder, file) for file in pdf_files]

with ThreadPoolExecutor() as executor:
    documents = list(executor.map(extract_text_from_pdf, pdf_paths))

vectorizer = TfidfVectorizer(vocabulary=cleaned_ngrams, ngram_range=(1, 1), lowercase=False)
tfidf_matrix = vectorizer.fit_transform(documents)

df = pd.DataFrame(tfidf_matrix.T.toarray(), index=cleaned_ngrams, columns=pdf_files)
df.to_csv(output_file)

print(f"TF-IDF matrix saved at: {output_file}") 

In [None]:
import os
import pandas as pd
from PyPDF2 import PdfReader
from sklearn.feature_extraction.text import CountVectorizer
import re

paper_folder = "Cancer2/"
ngram_txt_path = "25-3-2025_14_29_3_5/filtered_ngrams_with_frequency.txt"
output_file = "25-3-2025_14_29_3/ngram_count_per_paper.csv"
transposed_output_file = "25-3-2025_14_29_3_5/ngram_count_transposed.csv"

def extract_text_from_pdf(pdf_path):
    print(f"Processing {os.path.basename(pdf_path)}...")
    try:
        reader = PdfReader(pdf_path)
        return " ".join(page.extract_text() or "" for page in reader.pages)
    except Exception as e:
        print(f"Error reading {pdf_path}: {e}")
        return ""

pdf_files = [file for file in os.listdir(paper_folder) if file.lower().endswith(".pdf")]
pdf_paths = [os.path.join(paper_folder, file) for file in pdf_files]
documents = [extract_text_from_pdf(path) for path in pdf_paths]

with open(ngram_txt_path, 'r', encoding='utf-8') as file:
    ngram_lines = file.readlines()

raw_ngrams = [line.split(":")[0].strip() for line in ngram_lines if ":" in line and line.strip()]
if not raw_ngrams:
    raw_ngrams = [line.strip() for line in ngram_lines if line.strip()]

def is_alphanumeric_ngram(ngram):
    return all(re.fullmatch(r"[a-zA-Z0-9]+", token) for token in ngram.split())

cleaned_ngrams = [ng for ng in raw_ngrams if is_alphanumeric_ngram(ng)]

count_vectorizer = CountVectorizer(vocabulary=cleaned_ngrams, ngram_range=(1, 4), lowercase=True)
count_matrix = count_vectorizer.transform(documents)

count_df = pd.DataFrame(count_matrix.toarray(), index=pdf_files, columns=cleaned_ngrams)

count_df.to_csv(output_file)
print(f"\n✅ N-gram count per paper saved at: {output_file}")

count_df.transpose().to_csv(transposed_output_file)
print(f"📄 Transposed matrix saved at: {transposed_output_file}") 

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd

count_vectorizer = CountVectorizer(vocabulary=cleaned_ngrams, ngram_range=(1, 4), lowercase=True)
count_matrix = count_vectorizer.fit_transform(documents)

count_df = pd.DataFrame(count_matrix.toarray().T, index=cleaned_ngrams, columns=pdf_files)

output_file = "25-3-2025_14_29_3_5/ngram_count_per_paper.csv"
count_df.to_csv(output_file)

print(f"\n✅ N-gram count matrix saved at: {output_file}")    

In [None]:
import os
import pandas as pd
from collections import Counter
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from pdfminer.high_level import extract_text

def build_sparse_cooccurrence_csv(input_folder, ngram_file, output_file):
    # Read the n-grams and their frequencies from the ngram_file
    ngrams_dict = {}
    with open(ngram_file, "r", encoding="utf-8") as f:
        lines = f.readlines()
        for line in lines:
            line = line.strip()
            if line and ": " in line:
                ngram, count = line.split(": ")
                ngrams_dict[ngram] = int(count)

    # List all PDF files in the input folder
    paper_files = [f for f in os.listdir(input_folder) if f.endswith(".pdf")]
    cooccurrence_data = []
    stop_words = set(stopwords.words("english"))

    # Process each paper (PDF file) in the input folder
    for file_name in paper_files:
        pdf_path = os.path.join(input_folder, file_name)
        raw_text = extract_text(pdf_path)

        if not raw_text.strip():
            print(f"Skipped (no extractable text): {file_name}")
            continue

        # Tokenize the text and remove stopwords
        tokens = [word for word in word_tokenize(raw_text.lower()) if word.isalpha() and word not in stop_words]
        
        # Count n-grams in the document
        paper_ngrams = Counter()
        for ngram in ngrams_dict:
            ngram_tokens = ngram.split()
            # Check how many times this n-gram appears in the paper
            for i in range(len(tokens) - len(ngram_tokens) + 1):
                if tokens[i:i+len(ngram_tokens)] == ngram_tokens:
                    paper_ngrams[ngram] += 1

        # Add co-occurrence data for each found n-gram
        for ngram, count in paper_ngrams.items():
            if count > 0:
                cooccurrence_data.append((ngram, file_name, count))

        print(f"Processed: {file_name}")

    # Save the co-occurrence matrix to CSV
    df_sparse = pd.DataFrame(cooccurrence_data, columns=["Ngram", "Paper", "Count"])
    df_sparse.to_csv(output_file, index=False)
    print(f"Co-occurrence matrix saved to: {output_file}")

# Example usage:
# Specify the input folder containing PDF files, ngram file, and output file
input_folder = "Cancer2/"
ngram_file = "25-3-2025_14_29_3_5/top_1500_ngram.txt"
output_file = "25-3-2025_14_29_3_5/fourgram_cooccurrence.csv"

# Run the function
build_sparse_cooccurrence_csv(input_folder, ngram_file, output_file) 

In [None]:
import pandas as pd

# Load the existing ngram count per paper matrix
count_df = pd.read_csv("25-3-2025_14_29_3_5/ngram_count_per_paper.csv", index_col=0)

# Convert to binary presence (1 if appears in paper, 0 otherwise)
binary_df = count_df.gt(0).astype(int)

# Get total number of papers in which each n-gram appears (row-wise sum)
ngram_doc_freq = binary_df.sum(axis=1)

# Broadcast to get Jaccard-like matrix: each value = 1 / number of papers where ngram appears (if it appears in that paper)
jaccard_matrix = binary_df.div(ngram_doc_freq, axis=0)

# Save the Jaccard matrix
output_path = "25-3-2025_14_29_3/ngram_jaccard_similarity_ngram_x_paper.csv"
jaccard_matrix.to_csv(output_path)

print(f"✅ Jaccard similarity matrix (ngram x paper) saved at: {output_path}") 

In [None]:
# Load the ngram x paper count matrix (if not already loaded)
count_df = pd.read_csv("25-3-2025_14_29_3_5/ngram_count_per_paper.csv", index_col=0)

# Compute total frequency of each ngram across all papers
ngram_total_freq = count_df.sum(axis=1)

# Save as CSV
ngram_total_freq.to_csv("25-3-2025_14_29_3_5/ngram_total_frequency.csv", header=["total_frequency"])

print("✅ Total frequency per n-gram saved at: 25-3-2025_14_29_3_5/ngram_total_frequency.csv") 

In [None]:
import pandas as pd

# Load the CSV file
df = pd.read_csv("25-3-2025_14_29_3_5/ngram_total_frequency.csv")

# Convert the total_frequency column to numeric, forcing errors to NaN for non-numeric rows
df['total_frequency'] = pd.to_numeric(df['total_frequency'], errors='coerce')

# Drop rows with NaN in total_frequency
df = df.dropna(subset=['total_frequency'])

# Filter rows where total_frequency > 15
filtered_df = df[df['total_frequency'] > 15]

# Save to a new CSV file
filtered_df.to_csv("25-3-2025_14_29_3_5/ngrams_above_15.csv", index=False)   

In [None]:
import os
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
from PyPDF2 import PdfReader

# === Paths ===
paper_folder = "Cancer2/"
ngram_txt_path = "25-3-2025_14_29_3_5/ngrams_above_15.csv"
output_file = "25-3-2025_14_29_3_5/ngram_count_per_paper_hjjnjknmmkmlxyuy.csv"

# === Function to extract text from a single PDF ===
def extract_text_from_pdf(pdf_path):
    print(f"Processing {os.path.basename(pdf_path)}...")
    try:
        reader = PdfReader(pdf_path)
        return " ".join(page.extract_text() or "" for page in reader.pages)
    except Exception as e:
        print(f"Error reading {pdf_path}: {e}")
        return ""

# === Load PDF texts ===
pdf_files = [file for file in os.listdir(paper_folder) if file.lower().endswith(".pdf")]
pdf_paths = [os.path.join(paper_folder, file) for file in pdf_files]
documents = [extract_text_from_pdf(path) for path in pdf_paths]

# === Load and clean n-grams ===
with open(ngram_txt_path, 'r', encoding='utf-8') as f:
    ngram_lines = f.readlines()
cleaned_ngrams = [line.split(":")[0].strip() for line in ngram_lines if ":" in line and line.strip()]
if not cleaned_ngrams:
    cleaned_ngrams = [line.strip() for line in ngram_lines if line.strip()]

# === Count n-gram occurrences ===
vectorizer = CountVectorizer(vocabulary=cleaned_ngrams, ngram_range=(1, 4), lowercase=True)
matrix = vectorizer.transform(documents)  # shape: [num_docs x num_ngrams]

# === Convert to DataFrame: rows=ngrams, columns=pdfs ===
count_df = pd.DataFrame(matrix.toarray().T, index=cleaned_ngrams, columns=pdf_files)

# === Save ===
count_df.to_csv(output_file)
print(f"\n✅ N-gram count matrix saved at: {output_file}")  

In [None]:
import pandas as pd

# Read the CSV file
df = pd.read_csv('25-3-2025_14_29_3/ngram_jaccard_similarity_ngram_x_paper.csv')

# Transpose the DataFrame
df_transposed = df.T

# Save the transposed DataFrame to a new CSV
df_transposed.to_csv('25-3-2025_14_29_3/transposed_file.csv', header=False, index=False)

In [None]:
import pandas as pd

# === Paths ===
jaccard_csv_path = "25-3-2025_14_29_3/ngram_jaccard_similarity_ngram_x_paper.csv"
output_path = "25-3-2025_14_29_3_5/ngram_combined_jaccard.csv"

# === Load Jaccard similarity matrix (n-gram × paper) ===
jaccard_df = pd.read_csv(jaccard_csv_path, index_col=0)

# === Load list of filtered n-grams ===
with open(filtered_ngrams_txt_path, 'r', encoding='utf-8') as f:
    ngrams = [line.strip().split(":")[0] for line in f if line.strip()]

# === Clean index and match only those in both files ===
jaccard_df.index = jaccard_df.index.str.strip()
ngrams_set = set(ngrams)
valid_ngrams = jaccard_df.index.intersection(ngrams_set)
filtered_jaccard_df = jaccard_df.loc[valid_ngrams]

# === Compute average Jaccard per n-gram across all papers ===
filtered_jaccard_df["Combined_Jaccard"] = filtered_jaccard_df.mean(axis=1)

# === Save only the combined column ===
filtered_jaccard_df[["Combined_Jaccard"]].to_csv(output_path)
print(f"✅ Combined Jaccard scores saved at: {output_path}")  

In [None]:
import os
import re
import pandas as pd

# === Paths ===
ngram_txt_path = "25-3-2025_14_29_3_5/cleaned_ngrams.csv"
output_file = "25-3-2025_14_29_3_5/ngram_jaccard_similarity_2.csv"

# === Load and clean n-grams ===
with open(ngram_txt_path, 'r', encoding='utf-8') as f:
    ngram_lines = f.readlines()

# Extract n-grams from the file
raw_ngrams = [line.split(":")[0].strip() for line in ngram_lines if ":" in line and line.strip()]
if not raw_ngrams:
    raw_ngrams = [line.strip() for line in ngram_lines if line.strip()]

# === Filter only alphanumeric ngrams ===
def is_alphanumeric_ngram(ngram):
    return all(re.fullmatch(r"[a-zA-Z0-9]+", token) for token in ngram.split())

cleaned_ngrams = [ng for ng in raw_ngrams if is_alphanumeric_ngram(ng)]

# === Jaccard Similarity Function ===
def jaccard_similarity(set1, set2):
    intersection = len(set1.intersection(set2))
    union = len(set1.union(set2))
    return intersection / union if union != 0 else 0.0

# === Calculate Jaccard similarity matrix ===
ngram_sets = [set(ngram.split()) for ngram in cleaned_ngrams]  # Split each n-gram into a set of tokens
jaccard_matrix = []

for i, set1 in enumerate(ngram_sets):
    row = []
    for j, set2 in enumerate(ngram_sets):
        similarity = jaccard_similarity(set1, set2)
        row.append(similarity)
    jaccard_matrix.append(row)

# === Create DataFrame for Jaccard similarities ===
# The index and columns are both the n-grams themselves
jaccard_df = pd.DataFrame(jaccard_matrix, index=cleaned_ngrams, columns=cleaned_ngrams)

# === Save the Jaccard similarity matrix ===
jaccard_df.to_csv(output_file)
print(f"\n✅ Jaccard similarity matrix saved at: {output_file}") 

In [None]:
import re
import pandas as pd

# === Paths ===
ngram_csv_path = "25-3-2025_14_29_3_5/cleaned_ngrams.csv"
output_file = "25-3-2025_14_29_3_5/ngram_jaccard_similarity_2.csv"

# === Load ngrams from CSV ===
df = pd.read_csv(ngram_csv_path)

# Rename if needed (in case it's still 'Unnamed: 0')
if 'Unnamed: 0' in df.columns:
    df.rename(columns={'Unnamed: 0': 'ngram'}, inplace=True)

# Ensure 'ngram' column exists
if 'ngram' not in df.columns:
    raise KeyError("❌ Column 'ngram' not found in the CSV!")

# === Filter only alphanumeric ngrams ===
def is_alphanumeric_ngram(ngram):
    return all(re.fullmatch(r"[a-zA-Z0-9]+", token) for token in ngram.split())

cleaned_ngrams = df['ngram'].dropna().unique().tolist()
cleaned_ngrams = [ng for ng in cleaned_ngrams if is_alphanumeric_ngram(ng)]

# === Jaccard Similarity Function ===
def jaccard_similarity(set1, set2):
    intersection = len(set1.intersection(set2))
    union = len(set1.union(set2))
    return intersection / union if union != 0 else 0.0

# === Compute Jaccard similarity matrix ===
ngram_sets = [set(ng.split()) for ng in cleaned_ngrams]
jaccard_matrix = [
    [jaccard_similarity(set1, set2) for set2 in ngram_sets]
    for set1 in ngram_sets
]

# === Save to CSV ===
jaccard_df = pd.DataFrame(jaccard_matrix, index=cleaned_ngrams, columns=cleaned_ngrams)
jaccard_df.to_csv(output_file)

print(f"✅ Jaccard similarity matrix saved at: {output_file}") 

In [None]:
import os
import re
import spacy
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from PyPDF2 import PdfReader
from spacy.matcher import Matcher

# === Paths ===
paper_folder = "Cancer2/"
ngram_txt_path = "25-3-2025_14_29_3_5/top_1500_ngram_2.txt"
output_file = "25-3-2025_14_29_3/ngram_jaccard_similarity_ngram_x_paper.csv"
threshold = 15

# === Initialize spaCy model ===
nlp = spacy.load("en_core_web_sm")

# === Function to extract text from a single PDF ===
def extract_text_from_pdf(pdf_path):
    print(f"Processing {os.path.basename(pdf_path)}...")
    try:
        reader = PdfReader(pdf_path)
        return " ".join(page.extract_text() or "" for page in reader.pages)
    except Exception as e:
        print(f"Error reading {pdf_path}: {e}")
        return ""

# === Load PDF texts ===
pdf_files = [file for file in os.listdir(paper_folder) if file.lower().endswith(".pdf")]
pdf_paths = [os.path.join(paper_folder, file) for file in pdf_files]
documents = [extract_text_from_pdf(path) for path in pdf_paths]

# === Load and clean n-grams ===
with open(ngram_txt_path, 'r', encoding='utf-8') as f:
    ngram_lines = f.readlines()
raw_ngrams = [line.split(":")[0].strip() for line in ngram_lines if ":" in line and line.strip()]
if not raw_ngrams:
    raw_ngrams = [line.strip() for line in ngram_lines if line.strip()]

# === Filter only alphanumeric ngrams ===
def is_alphanumeric_ngram(ngram):
    return all(re.fullmatch(r"[a-zA-Z0-9]+", token) for token in ngram.split())

cleaned_ngrams = [ng for ng in raw_ngrams if is_alphanumeric_ngram(ng)]

# === Count n-gram occurrences ===
count_vectorizer = CountVectorizer(vocabulary=cleaned_ngrams, ngram_range=(1, 4), lowercase=True)
count_matrix = count_vectorizer.transform(documents)

# === Create DataFrame for n-gram counts ===
count_df = pd.DataFrame(count_matrix.toarray(), index=pdf_files, columns=cleaned_ngrams)

# === Sum the total frequency of each n-gram across all papers ===
ngram_total_freq = count_df.sum(axis=0)

# === Filter n-grams above the threshold ===
filtered_ngrams = ngram_total_freq[ngram_total_freq >= threshold].index.tolist()

# === Rebuild CountVectorizer and transform documents with filtered n-grams ===
filtered_vectorizer = CountVectorizer(vocabulary=filtered_ngrams, ngram_range=(1, 4), lowercase=True)
filtered_matrix = filtered_vectorizer.transform(documents)

# === Create filtered DataFrame ===
filtered_count_df = pd.DataFrame(filtered_matrix.toarray(), index=pdf_files, columns=filtered_ngrams)

# === Save the filtered n-gram count matrix ===
filtered_count_df.to_csv(output_file)
print(f"\n✅ N-gram count per paper (filtered) saved at: {output_file}")

# === Dependency Parsing and Relationship Extraction ===

# Define patterns for medical relations (can be customized based on domain-specific needs)   
def extract_medical_relations(doc):
    matcher = Matcher(nlp.vocab)

    # Define patterns for medical relationships (e.g., symptom-disease, drug-treatment, etc.)
    patterns = [
        {"label": "SYMPTOM_DISEASE", "pattern": [{"dep": "nsubj"}, {"dep": "ROOT"}, {"dep": "dobj"}]},  # symptom -> disease
        {"label": "DRUG_TREATMENT", "pattern": [{"dep": "nsubj"}, {"dep": "ROOT"}, {"dep": "dobj"}]},  # drug -> treatment
    ]
    
    # Add patterns to the matcher
    for pattern in patterns:
        matcher.add(pattern["label"], [pattern["pattern"]])

    # Apply matcher to the document
    matches = matcher(doc)

    # Extract relationships from the matches
    relations = []
    for match_id, start, end in matches:
        span = doc[start:end]  # Extract the span of the matched pattern
        label = nlp.vocab.strings[match_id]
        relations.append((label, span.text))

    return relations

# Extract medical relationships from each document
medical_relations = []
for doc_text in documents:
    doc = nlp(doc_text)
    relations = extract_medical_relations(doc)
    medical_relations.append(relations)

# === Output Medical Relations ===
medical_relations_df = pd.DataFrame(medical_relations, columns=["Relations"])
medical_relations_df.to_csv("25-3-2025_14_29_3/medical_relations.csv", index=False)
print("✅ Medical relations saved at: 25-3-2025_14_29_3/medical_relations.csv")         

In [None]:
import pandas as pd

# Load the ngram CSV file (replace with your actual file path)
ngram_df = pd.read_csv("25-3-2025_14_29_3_5/ngrams_above_15.csv")  # Replace with your file path

# Check the column names
print(ngram_df.columns)

In [6]:
import pandas as pd

# Load the ngram CSV file (replace with your actual file path)
ngram_df = pd.read_csv("25-3-2025_14_29_3_5/ngram_total_frequency.csv")  # Replace with your file path

# Check the column names
print(ngram_df.columns)

Index(['Unnamed: 0', 'total_frequency'], dtype='object')


In [12]:
import pandas as pd

# Define the priority cancer keywords
priority_keywords = set([
    "cancer", "tumor", "tumour", "metastasis", "oncology", "chemotherapy", "radiation", "immunotherapy", "biopsy",
    "mutation", "gene", "cell", "leukemia", "lymphoma", "sarcoma", "melanoma",
    "therapy", "drug", "diagnosis", "survival", "treatment", "carcinoma", "neoplasm", "malignancy", "genetic", "genomics"
])

# Load the CSV file
df = pd.read_csv("25-3-2025_14_29_3_5/ngram_total_frequency.csv")

# Rename the 'Unnamed: 0' column to 'ngram'
df = df.rename(columns={'Unnamed: 0': 'ngram'})

# Convert the total_frequency column to numeric
df['total_frequency'] = pd.to_numeric(df['total_frequency'], errors='coerce')

# Drop rows with NaN in total_frequency
df = df.dropna(subset=['total_frequency'])

# Define function to check for priority keywords
def contains_priority_keyword(ngram):
    return any(word.lower() in priority_keywords for word in ngram.split())

# Correct filtering:
filtered_df = df[
    (df['total_frequency'] > 15) | 
    ((df['total_frequency'] <= 15) & (df['ngram'].apply(contains_priority_keyword)))
]

# Save to a new CSV file
filtered_df.to_csv("25-3-2025_14_29_3_5/ngrams_above_15_with_priority.csv", index=False)

print("✅ Filtered ngrams saved successfully.")  

✅ Filtered ngrams saved successfully.


In [16]:
import pandas as pd

# Load the CSV
ngram_df = pd.read_csv("25-3-2025_14_29_3_5/ngrams_above_15.csv")

# Rename the 'Unnamed: 0' column to 'ngram' for clarity
ngram_df.rename(columns={"Unnamed: 0": "ngram"}, inplace=True)

# Function to normalize ngrams by sorting their tokens
def normalize_ngram(text):
    tokens = text.lower().split()
    sorted_tokens = sorted(tokens)
    return ' '.join(sorted_tokens)

# Apply normalization
ngram_df["normalized_ngram"] = ngram_df["ngram"].apply(normalize_ngram)

# Drop duplicates based on normalized ngram
ngram_df = ngram_df.drop_duplicates(subset=["normalized_ngram"])

# Drop the helper column if you want
ngram_df = ngram_df.drop(columns=["normalized_ngram"])

# Save the cleaned output
ngram_df.to_csv("25-3-2025_14_29_3_5/cleaned_ngrams.csv", index=False)

print("✅ Duplicates removed and cleaned data saved as 'cleaned_ngrams.csv'")  

✅ Duplicates removed and cleaned data saved as 'cleaned_ngrams.csv'


In [18]:
import pandas as pd

# Load the CSV
ngram_df = pd.read_csv("25-3-2025_14_29_3_5/ngrams_above_15_with_priority.csv")

# Rename the 'Unnamed: 0' column to 'ngram' for clarity
ngram_df.rename(columns={"Unnamed: 0": "ngram"}, inplace=True)

# Function to normalize ngrams by sorting their tokens
def normalize_ngram(text):
    tokens = text.lower().split()
    sorted_tokens = sorted(tokens)
    return ' '.join(sorted_tokens)

# Apply normalization
ngram_df["normalized_ngram"] = ngram_df["ngram"].apply(normalize_ngram)

# Drop duplicates based on normalized ngram
ngram_df = ngram_df.drop_duplicates(subset=["normalized_ngram"])

# Drop the helper column if you want
ngram_df = ngram_df.drop(columns=["normalized_ngram"])

# Save the cleaned output
ngram_df.to_csv("25-3-2025_14_29_3_5/cleaned_ngrams_priority.csv", index=False)

print("✅ Duplicates removed and cleaned data saved as 'cleaned_ngrams_priority.csv'")    

✅ Duplicates removed and cleaned data saved as 'cleaned_ngrams_priority.csv'


In [None]:
import pandas as pd

# Load CSVs
ngrams_freq = pd.read_csv("25-3-2025_14_29_3_5/ngrams_above_15.csv")
ngram_paper = pd.read_csv("25-3-2025_14_29_3_5/output_with_entities.csv", index_col=0)
ngram_jaccard = pd.read_csv("25-3-2025_14_29_3_5/ngram_jaccard_similarity.csv", index_col=0)

# Rename columns for consistency
ngrams_freq.columns = ["ngram", "frequency"]
ngram_frequencies = ngrams_freq.set_index("ngram")["frequency"]

# Filter 1: Ngram-Ngram Jaccard > 0.33
jaccard_threshold = 0.33
ngram_edges = ngram_jaccard.where(ngram_jaccard > jaccard_threshold)
ngram_edges = ngram_edges.stack().reset_index()
ngram_edges.columns = ["source_ngram", "target_ngram", "similarity"]
ngram_edges["source_freq"] = ngram_edges["source_ngram"].map(ngram_frequencies)
ngram_edges["target_freq"] = ngram_edges["target_ngram"].map(ngram_frequencies)

# Save ngram-ngram dependencies
ngram_edges.to_csv("25-3-2025_14_29_3_5/medical_ngram_dependencies.csv", index=False)

# Ensure ngram_paper contains numeric values for relevance scores
ngram_paper = ngram_paper.apply(pd.to_numeric, errors='coerce')

# Filter 2: Ngram-Paper relevance > 0
ngram_paper_long = ngram_paper[ngram_paper > 0].stack().reset_index()
ngram_paper_long.columns = ["ngram", "paper", "score"]
ngram_paper_long["ngram_freq"] = ngram_paper_long["ngram"].map(ngram_frequencies)

# Save ngram-paper links
ngram_paper_long.to_csv("25-3-2025_14_29_3_5/ngram_paper_links.csv", index=False)

print("✅ Files saved: medical_ngram_dependencies.csv & ngram_paper_links.csv") 

In [None]:
import pandas as pd

# === Load CSV files ===
# Load cleaned n-grams (with placeholder frequency if not present)
ngrams_freq = pd.read_csv("25-3-2025_14_29_3_5/cleaned_ngrams.csv", names=["ngram", "frequency"])
ngram_paper = pd.read_csv("25-3-2025_14_29_3_5/output_with_entities.csv", index_col=0)
ngram_jaccard = pd.read_csv("25-3-2025_14_29_3_5/ngram_jaccard_similarity_2.csv", index_col=0)

# === Frequency mapping ===
ngram_frequencies = ngrams_freq.set_index("ngram")["frequency"]

# === Filter 1: Ngram-Ngram Jaccard > threshold ===
jaccard_threshold = 0.33
ngram_edges = ngram_jaccard.where(ngram_jaccard > jaccard_threshold)
ngram_edges = ngram_edges.stack().reset_index()
ngram_edges.columns = ["source_ngram", "target_ngram", "similarity"]
ngram_edges["source_freq"] = ngram_edges["source_ngram"].map(ngram_frequencies)
ngram_edges["target_freq"] = ngram_edges["target_ngram"].map(ngram_frequencies)

# === Save ngram-ngram relationships ===
ngram_edges.to_csv("25-3-2025_14_29_3_5/medical_ngram_dependencies.csv", index=False)

# === Filter 2: Ngram-Paper Relevance > 0 ===
ngram_paper = ngram_paper.apply(pd.to_numeric, errors='coerce')
ngram_paper_long = ngram_paper[ngram_paper > 0].stack().reset_index()
ngram_paper_long.columns = ["ngram", "paper", "score"]
ngram_paper_long["ngram_freq"] = ngram_paper_long["ngram"].map(ngram_frequencies)

# === Save ngram-paper links ===
ngram_paper_long.to_csv("25-3-2025_14_29_3_5/ngram_paper_links.csv", index=False)

print("✅ Files saved: medical_ngram_dependencies.csv & ngram_paper_links.csv")  

In [None]:
import pandas as pd
import spacy
from scispacy.linking import EntityLinker
import scispacy

# === Load SciSpacy's Medical model (sm version) ===
nlp = spacy.load("en_core_sci_sm")  # Using the small version of the model
linker = EntityLinker.from_pretrained("scispacy-linker-md")

# === Load CSV files ===
# Load cleaned n-grams (with placeholder frequency if not present)
ngrams_freq = pd.read_csv("25-3-2025_14_29_3_5/cleaned_ngrams.csv", names=["ngram", "frequency"])
ngram_paper = pd.read_csv("25-3-2025_14_29_3_5/output_with_entities.csv", index_col=0)
ngram_jaccard = pd.read_csv("25-3-2025_14_29_3_5/ngram_jaccard_similarity_2.csv", index_col=0)

# === Frequency mapping ===
ngram_frequencies = ngrams_freq.set_index("ngram")["frequency"]

# === Function to perform medical dependency parsing using SciSpacy ===
def get_medical_dependencies(ngram):
    # Process the ngram with SciSpacy to extract dependencies
    doc = nlp(ngram)
    doc._.linker = linker  # Apply the entity linker for medical entity recognition
    dependencies = []
    
    for token in doc:
        # Store each token's word, its dependency relation, and the head word
        dependencies.append((token.text, token.dep_, token.head.text))
    
    return dependencies

# === Apply dependency parsing to all n-grams ===
ngrams_freq['dependencies'] = ngrams_freq['ngram'].apply(get_medical_dependencies)

# === Filter 1: Ngram-Ngram Jaccard > threshold ===
jaccard_threshold = 0.33
ngram_edges = ngram_jaccard.where(ngram_jaccard > jaccard_threshold)
ngram_edges = ngram_edges.stack().reset_index()
ngram_edges.columns = ["source_ngram", "target_ngram", "similarity"]
ngram_edges["source_freq"] = ngram_edges["source_ngram"].map(ngram_frequencies)
ngram_edges["target_freq"] = ngram_edges["target_ngram"].map(ngram_frequencies)

# === Save ngram-ngram relationships with medical dependencies ===
ngram_edges_with_dependencies = ngram_edges.merge(ngrams_freq[['ngram', 'dependencies']], left_on='source_ngram', right_on='ngram', how='left')
ngram_edges_with_dependencies = ngram_edges_with_dependencies.drop(columns=['ngram_y'])
ngram_edges_with_dependencies = ngram_edges_with_dependencies.rename(columns={'ngram_x': 'source_ngram', 'dependencies': 'source_dependencies'})

ngram_edges_with_dependencies = ngram_edges_with_dependencies.merge(ngrams_freq[['ngram', 'dependencies']], left_on='target_ngram', right_on='ngram', how='left')
ngram_edges_with_dependencies = ngram_edges_with_dependencies.drop(columns=['ngram_y'])
ngram_edges_with_dependencies = ngram_edges_with_dependencies.rename(columns={'ngram_x': 'target_ngram', 'dependencies': 'target_dependencies'})

# Save ngram-ngram dependencies with their parsed medical dependencies
ngram_edges_with_dependencies.to_csv("25-3-2025_14_29_3_5/medical_ngram_dependencies_with_medical_parsing_sm.csv", index=False)

# === Filter 2: Ngram-Paper Relevance > 0 ===
ngram_paper = ngram_paper.apply(pd.to_numeric, errors='coerce')
ngram_paper_long = ngram_paper[ngram_paper > 0].stack().reset_index()
ngram_paper_long.columns = ["ngram", "paper", "score"]
ngram_paper_long["ngram_freq"] = ngram_paper_long["ngram"].map(ngram_frequencies)

# === Save ngram-paper links ===
ngram_paper_long.to_csv("25-3-2025_14_29_3_5/ngram_paper_links.csv", index=False)

print("✅ Files saved: medical_ngram_dependencies_with_medical_parsing_sm.csv & ngram_paper_links.csv")   

In [None]:
import pandas as pd
import spacy
import scispacy
from scispacy.linking import EntityLinker

# === Load SciSpacy's Medical model (sm version) ===
nlp = spacy.load("en_core_sci_sm")  # Using the small version of the model

# Load the entity linker
linker = EntityLinker(resolve_abbreviations=True)

# === Load CSV files ===
# Load cleaned n-grams (with placeholder frequency if not present)
ngrams_freq = pd.read_csv("25-3-2025_14_29_3_5/cleaned_ngrams.csv", names=["ngram", "frequency"])
ngram_paper = pd.read_csv("25-3-2025_14_29_3_5/output_with_entities.csv", index_col=0)
ngram_jaccard = pd.read_csv("25-3-2025_14_29_3_5/ngram_jaccard_similarity_2.csv", index_col=0)

# === Frequency mapping ===
ngram_frequencies = ngrams_freq.set_index("ngram")["frequency"]

# === Function to perform medical dependency parsing using SciSpacy ===
def get_medical_dependencies(ngram):
    # Process the ngram with SciSpacy to extract dependencies
    doc = nlp(ngram)
    doc._.linker = linker  # Apply the entity linker for medical entity recognition
    dependencies = []
    
    for token in doc:
        # Store each token's word, its dependency relation, and the head word
        dependencies.append((token.text, token.dep_, token.head.text))
    
    return dependencies

# === Apply dependency parsing to all n-grams ===
ngrams_freq['dependencies'] = ngrams_freq['ngram'].apply(get_medical_dependencies)

# === Filter 1: Ngram-Ngram Jaccard > threshold ===
jaccard_threshold = 0.33
ngram_edges = ngram_jaccard.where(ngram_jaccard > jaccard_threshold)
ngram_edges = ngram_edges.stack().reset_index()
ngram_edges.columns = ["source_ngram", "target_ngram", "similarity"]
ngram_edges["source_freq"] = ngram_edges["source_ngram"].map(ngram_frequencies)
ngram_edges["target_freq"] = ngram_edges["target_ngram"].map(ngram_frequencies)

# === Save ngram-ngram relationships with medical dependencies ===
ngram_edges_with_dependencies = ngram_edges.merge(ngrams_freq[['ngram', 'dependencies']], left_on='source_ngram', right_on='ngram', how='left')
ngram_edges_with_dependencies = ngram_edges_with_dependencies.drop(columns=['ngram_y'])
ngram_edges_with_dependencies = ngram_edges_with_dependencies.rename(columns={'ngram_x': 'source_ngram', 'dependencies': 'source_dependencies'})

ngram_edges_with_dependencies = ngram_edges_with_dependencies.merge(ngrams_freq[['ngram', 'dependencies']], left_on='target_ngram', right_on='ngram', how='left')
ngram_edges_with_dependencies = ngram_edges_with_dependencies.drop(columns=['ngram_y'])
ngram_edges_with_dependencies = ngram_edges_with_dependencies.rename(columns={'ngram_x': 'target_ngram', 'dependencies': 'target_dependencies'})

# Save ngram-ngram dependencies with their parsed medical dependencies
ngram_edges_with_dependencies.to_csv("25-3-2025_14_29_3_5/medical_ngram_dependencies_with_medical_parsing_sm.csv", index=False)

# === Filter 2: Ngram-Paper Relevance > 0 ===
ngram_paper = ngram_paper.apply(pd.to_numeric, errors='coerce')
ngram_paper_long = ngram_paper[ngram_paper > 0].stack().reset_index()
ngram_paper_long.columns = ["ngram", "paper", "score"]
ngram_paper_long["ngram_freq"] = ngram_paper_long["ngram"].map(ngram_frequencies)

# === Save ngram-paper links ===
ngram_paper_long.to_csv("25-3-2025_14_29_3_5/ngram_paper_links.csv", index=False)

print("✅ Files saved: medical_ngram_dependencies_with_medical_parsing_sm.csv & ngram_paper_links.csv")   

In [None]:
import pandas as pd
import spacy

# === Load SciSpacy's Medical model (small version) ===
nlp = spacy.load("en_core_sci_sm")  # SciSpacy's small model for medical/scientific text

# === Load CSV files ===
# Load cleaned n-grams (with placeholder frequency if not present)
ngrams_freq = pd.read_csv("25-3-2025_14_29_3_5/cleaned_ngrams.csv", names=["ngram", "frequency"])
ngram_paper = pd.read_csv("25-3-2025_14_29_3_5/output_with_entities.csv", index_col=0)
ngram_jaccard = pd.read_csv("25-3-2025_14_29_3_5/ngram_jaccard_similarity_2.csv", index_col=0)

# === Frequency mapping ===
ngram_frequencies = ngrams_freq.set_index("ngram")["frequency"]

# === Function to perform dependency parsing ===
def get_dependencies(ngram):
    # Process the ngram with spaCy to extract dependencies
    doc = nlp(ngram)
    dependencies = []
    for token in doc:
        # Store each token's word, its dependency relation, and the head word
        dependencies.append((token.text, token.dep_, token.head.text))
    return dependencies

# === Apply dependency parsing to all n-grams ===
ngrams_freq['dependencies'] = ngrams_freq['ngram'].apply(get_dependencies)

# === Filter 1: Ngram-Ngram Jaccard > threshold ===
jaccard_threshold = 0.33
ngram_edges = ngram_jaccard.where(ngram_jaccard > jaccard_threshold)
ngram_edges = ngram_edges.stack().reset_index()
ngram_edges.columns = ["source_ngram", "target_ngram", "similarity"]
ngram_edges["source_freq"] = ngram_edges["source_ngram"].map(ngram_frequencies)
ngram_edges["target_freq"] = ngram_edges["target_ngram"].map(ngram_frequencies)

# === Save ngram-ngram relationships with dependencies ===
ngram_edges_with_dependencies = ngram_edges.merge(ngrams_freq[['ngram', 'dependencies']], left_on='source_ngram', right_on='ngram', how='left')
ngram_edges_with_dependencies = ngram_edges_with_dependencies.drop(columns=['ngram_y'])
ngram_edges_with_dependencies = ngram_edges_with_dependencies.rename(columns={'ngram_x': 'source_ngram', 'dependencies': 'source_dependencies'})

ngram_edges_with_dependencies = ngram_edges_with_dependencies.merge(ngrams_freq[['ngram', 'dependencies']], left_on='target_ngram', right_on='ngram', how='left')
ngram_edges_with_dependencies = ngram_edges_with_dependencies.drop(columns=['ngram_y'])
ngram_edges_with_dependencies = ngram_edges_with_dependencies.rename(columns={'ngram_x': 'target_ngram', 'dependencies': 'target_dependencies'})

# Save ngram-ngram dependencies with their parsed dependencies
ngram_edges_with_dependencies.to_csv("25-3-2025_14_29_3_5/medical_ngram_dependencies_with_parsing.csv", index=False)

# === Filter 2: Ngram-Paper Relevance > 0 ===
ngram_paper = ngram_paper.apply(pd.to_numeric, errors='coerce')
ngram_paper_long = ngram_paper[ngram_paper > 0].stack().reset_index()
ngram_paper_long.columns = ["ngram", "paper", "score"]
ngram_paper_long["ngram_freq"] = ngram_paper_long["ngram"].map(ngram_frequencies)

# === Save ngram-paper links ===
ngram_paper_long.to_csv("25-3-2025_14_29_3_5/ngram_paper_links.csv", index=False)

print("✅ Files saved: medical_ngram_dependencies_with_parsing.csv & ngram_paper_links.csv")     

In [None]:
import pandas as pd
import spacy

# === Load SciSpacy's Medical model (small version) ===
nlp = spacy.load("en_core_sci_sm")  # SciSpacy's small model for medical/scientific text

# === Load CSV files ===
# Load cleaned n-grams (with placeholder frequency if not present)
ngrams_freq = pd.read_csv("25-3-2025_14_29_3_5/cleaned_ngrams.csv", names=["ngram", "frequency"])
ngram_paper = pd.read_csv("25-3-2025_14_29_3_5/output_with_entities.csv", index_col=0)
ngram_jaccard = pd.read_csv("25-3-2025_14_29_3_5/ngram_jaccard_similarity_2.csv", index_col=0)

# === Frequency mapping ===
ngram_frequencies = ngrams_freq.set_index("ngram")["frequency"]

# === Function to perform dependency parsing ===
def get_dependencies(ngram):
    # Process the ngram with spaCy to extract dependencies
    doc = nlp(ngram)
    dependencies = []
    for token in doc:
        # Store each token's word, its dependency relation, and the head word
        dependencies.append((token.text, token.dep_, token.head.text))
    return dependencies

# === Apply dependency parsing to all n-grams ===
ngrams_freq['dependencies'] = ngrams_freq['ngram'].apply(get_dependencies)   

# === Filter 1: Ngram-Ngram Jaccard > threshold ===
jaccard_threshold = 0.33
ngram_edges = ngram_jaccard.where(ngram_jaccard > jaccard_threshold)
ngram_edges = ngram_edges.stack().reset_index()
ngram_edges.columns = ["source_ngram", "target_ngram", "similarity"]
ngram_edges["source_freq"] = ngram_edges["source_ngram"].map(ngram_frequencies)
ngram_edges["target_freq"] = ngram_edges["target_ngram"].map(ngram_frequencies)

# === Save ngram-ngram relationships with dependencies ===
ngram_edges_with_dependencies = ngram_edges.merge(ngrams_freq[['ngram', 'dependencies']], left_on='source_ngram', right_on='ngram', how='left')

# Print column names after first merge for debugging
print(ngram_edges_with_dependencies.columns)  # Check column names

# If 'ngram_y' exists, drop it
ngram_edges_with_dependencies = ngram_edges_with_dependencies.drop(columns=['ngram_y'], errors='ignore')
ngram_edges_with_dependencies = ngram_edges_with_dependencies.rename(columns={'ngram_x': 'source_ngram', 'dependencies': 'source_dependencies'})

# Merge for the target_ngram
ngram_edges_with_dependencies = ngram_edges_with_dependencies.merge(ngrams_freq[['ngram', 'dependencies']], left_on='target_ngram', right_on='ngram', how='left')

# Print column names after second merge for debugging
print(ngram_edges_with_dependencies.columns)  # Check column names again

# If 'ngram_y' exists again, drop it
ngram_edges_with_dependencies = ngram_edges_with_dependencies.drop(columns=['ngram_y'], errors='ignore')
ngram_edges_with_dependencies = ngram_edges_with_dependencies.rename(columns={'ngram_x': 'target_ngram', 'dependencies': 'target_dependencies'})

# Save ngram-ngram dependencies with their parsed dependencies
ngram_edges_with_dependencies.to_csv("25-3-2025_14_29_3_5/medical_ngram_dependencies_with_parsing.csv", index=False)

# === Filter 2: Ngram-Paper Relevance > 0 ===
ngram_paper = ngram_paper.apply(pd.to_numeric, errors='coerce')
ngram_paper_long = ngram_paper[ngram_paper > 0].stack().reset_index()
ngram_paper_long.columns = ["ngram", "paper", "score"]
ngram_paper_long["ngram_freq"] = ngram_paper_long["ngram"].map(ngram_frequencies)

# === Save ngram-paper links ===
ngram_paper_long.to_csv("25-3-2025_14_29_3_5/ngram_paper_links.csv", index=False)

print("✅ Files saved: medical_ngram_dependencies_with_parsing.csv & ngram_paper_links.csv") 

In [22]:
import pandas as pd
import spacy

# === Load SciSpacy's Medical model (small version) ===
nlp = spacy.load("en_core_sci_sm")  # SciSpacy's small model for medical/scientific text

# === Load CSV files ===
# Load cleaned n-grams (with placeholder frequency if not present)
ngrams_freq = pd.read_csv("25-3-2025_14_29_3_5/cleaned_ngrams_priority.csv", names=["ngram", "frequency"])
ngram_paper = pd.read_csv("25-3-2025_14_29_3_5/output_with_entities.csv", index_col=0)
ngram_jaccard = pd.read_csv("25-3-2025_14_29_3_5/ngram_jaccard_similarity_2.csv", index_col=0)

# === Frequency mapping ===
ngram_frequencies = ngrams_freq.set_index("ngram")["frequency"]

# === Function to perform dependency parsing ===
def get_dependencies(ngram):
    # Process the ngram with spaCy to extract dependencies
    doc = nlp(ngram)
    dependencies = []
    for token in doc:
        # Store each token's word, its dependency relation, and the head word
        dependencies.append((token.text, token.dep_, token.head.text))
    return dependencies

# === Apply dependency parsing to all n-grams ===
ngrams_freq['dependencies'] = ngrams_freq['ngram'].apply(get_dependencies)

# === Filter 1: Ngram-Ngram Jaccard > threshold ===
#jaccard_threshold = 0.33
ngram_edges = ngram_jaccard.where((ngram_jaccard > 0.33) & (ngram_jaccard < 1.0))

ngram_edges = ngram_edges.stack().reset_index()
ngram_edges.columns = ["source_ngram", "target_ngram", "similarity"]
ngram_edges["source_freq"] = ngram_edges["source_ngram"].map(ngram_frequencies)
ngram_edges["target_freq"] = ngram_edges["target_ngram"].map(ngram_frequencies)

# === Save ngram-ngram relationships with dependencies ===
ngram_edges_with_dependencies = ngram_edges.merge(ngrams_freq[['ngram', 'dependencies']], left_on='source_ngram', right_on='ngram', how='left')

# Print column names after first merge for debugging
print(ngram_edges_with_dependencies.columns)  # Check column names

# If 'ngram_y' exists, drop it
ngram_edges_with_dependencies = ngram_edges_with_dependencies.drop(columns=['ngram_y'], errors='ignore')
ngram_edges_with_dependencies = ngram_edges_with_dependencies.rename(columns={'ngram_x': 'source_ngram', 'dependencies': 'source_dependencies'})

# Merge for the target_ngram
ngram_edges_with_dependencies = ngram_edges_with_dependencies.merge(ngrams_freq[['ngram', 'dependencies']], left_on='target_ngram', right_on='ngram', how='left')

# Print column names after second merge for debugging
print(ngram_edges_with_dependencies.columns)  # Check column names again

# If 'ngram_y' exists again, drop it
ngram_edges_with_dependencies = ngram_edges_with_dependencies.drop(columns=['ngram_y'], errors='ignore')
ngram_edges_with_dependencies = ngram_edges_with_dependencies.rename(columns={'ngram_x': 'target_ngram', 'dependencies': 'target_dependencies'})

# Save ngram-ngram dependencies with their parsed dependencies
ngram_edges_with_dependencies.to_csv("25-3-2025_14_29_3_5/medical_ngram_dependencies_with_parsing_prio.csv", index=False)

# === Filter 2: Ngram-Paper Relevance > 0 ===
ngram_paper = ngram_paper.apply(pd.to_numeric, errors='coerce')
ngram_paper_long = ngram_paper[ngram_paper > 0].stack().reset_index()
ngram_paper_long.columns = ["ngram", "paper", "score"]
ngram_paper_long["ngram_freq"] = ngram_paper_long["ngram"].map(ngram_frequencies)

# === Save ngram-paper links ===
ngram_paper_long.to_csv("25-3-2025_14_29_3_5/ngram_paper_links_prio.csv", index=False)

print("✅ Files saved: medical_ngram_dependencies_with_parsing.csv & ngram_paper_links.csv")       

Index(['source_ngram', 'target_ngram', 'similarity', 'source_freq',
       'target_freq', 'ngram', 'dependencies'],
      dtype='object')
Index(['source_ngram', 'target_ngram', 'similarity', 'source_freq',
       'target_freq', 'ngram_x', 'source_dependencies', 'ngram_y',
       'dependencies'],
      dtype='object')
✅ Files saved: medical_ngram_dependencies_with_parsing.csv & ngram_paper_links.csv


In [23]:
# Step 1: Install dependencies (only run once)
!pip install neo4j pandas

# Step 2: Import libraries
from neo4j import GraphDatabase
import pandas as pd
import ast

# Step 3: Connect to Neo4j
uri = "bolt://localhost:7687"
username = "neo4j"
password = "sourabh@123"  # 🔐 Replace with your Neo4j password

driver = GraphDatabase.driver(uri, auth=(username, password))

# Step 4: Load the dependency CSV
file_path = "25-3-2025_14_29_3_5/medical_ngram_dependencies_with_parsing.csv"
df = pd.read_csv(file_path)

# Step 5: Convert stringified dependencies back to lists of tuples
df['source_dependencies'] = df['source_dependencies'].apply(ast.literal_eval)
df['target_dependencies'] = df['target_dependencies'].apply(ast.literal_eval)

# Step 6: Define upload logic with better relationships
def create_ngram_graph(tx, source, target, similarity, source_deps, target_deps):
    tx.run("""
        MERGE (src:Ngram {text: $source})
        MERGE (tgt:Ngram {text: $target})
        MERGE (src)-[s:SIMILAR_TO]->(tgt)
        SET s.similarity = $similarity
    """, source=source, target=target, similarity=similarity)

    for word, dep, head in source_deps:
        tx.run("""
            MATCH (n:Ngram {text: $ngram})
            MERGE (t:Token {text: $word})
            MERGE (h:Token {text: $head})
            MERGE (t)-[:DEPENDS_ON {type: $dep}]->(h)
            MERGE (n)-[:CONTAINS]->(t)
        """, ngram=source, word=word, dep=dep, head=head)

    for word, dep, head in target_deps:
        tx.run("""
            MATCH (n:Ngram {text: $ngram})
            MERGE (t:Token {text: $word})
            MERGE (h:Token {text: $head})
            MERGE (t)-[:DEPENDS_ON {type: $dep}]->(h)
            MERGE (n)-[:CONTAINS]->(t)
        """, ngram=target, word=word, dep=dep, head=head)

# Step 7: Upload to Neo4j
with driver.session() as session:
    for _, row in df.iterrows():
        session.write_transaction(
            create_ngram_graph,
            row['source_ngram'],
            row['target_ngram'],
            row['similarity'],
            row['source_dependencies'],
            row['target_dependencies']
        )
 
driver.close()  



  session.write_transaction(
