In [16]:
import re
from collections import defaultdict
import pandas as pd

def preprocess_text(text):
    text = re.sub(r'[^a-zA-Z\s]', '', text.lower())
    return text

def find_keyword_frequency(articles, keywords):
    keyword_frequency = defaultdict(lambda: defaultdict(int))
    keywords = [keyword.lower() for keyword in keywords]
    for heading, content in articles.items():
        combined_text = ' '.join(content)
        processed_text = preprocess_text(combined_text)
        for keyword in keywords:
            keyword_count = len(re.findall(rf'\b{keyword}\b', processed_text))
            keyword_frequency[heading][keyword] += keyword_count
    return keyword_frequency

def parse_markdown(markdown_text):
    articles = defaultdict(list)
    current_article = None
    keywords = []
    markdown_text = markdown_text.lower()
    for line in markdown_text.split('\n'):
        if line.startswith('## '):
            current_article = line[3:].strip()
            if current_article:
                articles[current_article] = []
                keywords.extend(extract_keywords(current_article))
        elif line.strip() and current_article:
            articles[current_article].append(line.strip())
    
    return articles, keywords
def extract_keywords(heading):
    keywords = []
    clean_heading = re.sub(r'^\d+(\.\d+)*\s*', '', heading.strip().lower())
    matches = re.findall(r'(.+?)\s*\((.*?)\)', clean_heading)
    for match in matches:
        full_name, acronym = match
        keywords.append(full_name.strip())
        keywords.append(acronym.strip())
    if not matches:
        keywords.append(clean_heading)
    return keywords
def read_markdown_file(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        return file.read()
markdown_file_path = 'output.md' ##markdown text 
markdown_text = read_markdown_file(markdown_file_path)
articles, keywords = parse_markdown(markdown_text)
keyword_frequency = find_keyword_frequency(articles, keywords)
print("Keywords extracted from headings:", keywords)
for heading, frequencies in keyword_frequency.items():
    print(f"\nHeading: {heading}")
    for keyword, count in frequencies.items():
        print(f"{keyword}: {count}")


Keywords extracted from headings: ['pegmatite', 'kimberlite', 'octahedral growth', 'tunnel mining', 'artisanal mining', 'andradite', 'pyrope-spessartine', 'enamel', 'imperial jade', 'rubellite', 'hydrophane', 'nacre', 'trapiche', 'pinctada margaritifera', 'pinctada maxima', 'la-icp-ms', 'laser ablation inductively coupled plasma mass spectrometry', 'laboratory irradiation', 'x-ray microradiography', 'rtx', 'photoluminescence', 'pl', 'diamondview', 'cvd', 'chemical vapor deposition', 'hpht', 'high pressure high temperature', 'nitrogen defects', 'diamonds', 'etch channels', 'gemstone impurities', 'lab-grown diamonds']

Heading: pegmatite
pegmatite: 11
kimberlite: 0
octahedral growth: 0
tunnel mining: 0
artisanal mining: 0
andradite: 0
pyrope-spessartine: 0
enamel: 0
imperial jade: 0
rubellite: 0
hydrophane: 0
nacre: 0
trapiche: 0
pinctada margaritifera: 0
pinctada maxima: 0
la-icp-ms: 0
laser ablation inductively coupled plasma mass spectrometry: 0
laboratory irradiation: 0
x-ray microra

In [18]:
data = {'keyword': list(keyword_frequency.keys())}
for keyword in keywords:
    data[keyword] = [keyword_frequency[heading].get(keyword, 0) for heading in keyword_frequency]

df = pd.DataFrame(data)
df
df_transposed = df.T
df_transposed = df.set_index('keyword').T
df_transposed

keyword,pegmatite,kimberlite,octahedral growth,tunnel mining,artisanal mining,andradite,pyrope-spessartine,enamel,imperial jade,rubellite,...,laboratory irradiation,x-ray microradiography (rtx),photoluminescence (pl) spectroscopy,diamondview,cvd (chemical vapor deposition),hpht (high pressure high temperature),nitrogen defects (diamonds),etch channels,gemstone impurities,lab-grown diamonds
pegmatite,11,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
kimberlite,0,15,2,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
octahedral growth,0,0,13,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
tunnel mining,0,0,0,14,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
artisanal mining,0,0,0,0,6,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
andradite,0,0,0,0,0,16,0,0,0,0,...,0,0,0,0,0,0,0,0,0,3
pyrope-spessartine,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
enamel,0,0,0,0,0,0,0,41,0,0,...,0,0,0,0,0,0,0,0,0,0
imperial jade,0,0,0,0,0,0,0,0,16,0,...,0,0,0,0,0,0,0,0,0,0
rubellite,0,0,0,0,0,0,0,0,0,24,...,0,0,0,0,0,0,0,0,0,0


In [20]:
def calculate_row_scores(df):
    row_scores = {}
    for index, row in df.iterrows():
        row_numeric = pd.to_numeric(row, errors='coerce').fillna(0)
        score = row_numeric.sum()
        diagonal_value = row_numeric[index] if index in row_numeric.index else 0
        adjusted_score = score - diagonal_value
        row_scores[index] = adjusted_score
    return row_scores
row_scores = calculate_row_scores(df_transposed)
for heading, score in row_scores.items():
    print(f"Relevance Score for '{heading}': {score}")

Relevance Score for 'pegmatite': 0
Relevance Score for 'kimberlite': 2
Relevance Score for 'octahedral growth': 0
Relevance Score for 'tunnel mining': 1
Relevance Score for 'artisanal mining': 0
Relevance Score for 'andradite': 3
Relevance Score for 'pyrope-spessartine': 0
Relevance Score for 'enamel': 0
Relevance Score for 'imperial jade': 0
Relevance Score for 'rubellite': 0
Relevance Score for 'hydrophane': 4
Relevance Score for 'nacre': 7
Relevance Score for 'trapiche': 6
Relevance Score for 'pinctada margaritifera': 2
Relevance Score for 'pinctada maxima': 6
Relevance Score for 'la-icp-ms': 0
Relevance Score for 'laser ablation inductively coupled plasma mass spectrometry': 1
Relevance Score for 'laboratory irradiation': 0
Relevance Score for 'x-ray microradiography': 0
Relevance Score for 'rtx': 2
Relevance Score for 'photoluminescence': 9
Relevance Score for 'pl': 15
Relevance Score for 'diamondview': 5
Relevance Score for 'cvd': 26
Relevance Score for 'chemical vapor deposition

In [22]:
import pandas as pd
def calculate_row_scores(df):
    row_scores = {}
    for index, row in df.iterrows():
        heading = index  
        row_numeric = pd.to_numeric(row, errors='coerce').fillna(0)
        diagonal_value = row_numeric[heading] if heading in row_numeric.index else 0
        score = row_numeric.sum()
        adjusted_score = 1 if (score - diagonal_value) >= 1 else 0
        row_scores[heading] = adjusted_score
    return row_scores
row_scores = calculate_row_scores(df_transposed)
for heading, score in row_scores.items():
    print(f"Relevance Score for '{heading}': {score}")


Relevance Score for 'pegmatite': 0
Relevance Score for 'kimberlite': 1
Relevance Score for 'octahedral growth': 0
Relevance Score for 'tunnel mining': 1
Relevance Score for 'artisanal mining': 0
Relevance Score for 'andradite': 1
Relevance Score for 'pyrope-spessartine': 0
Relevance Score for 'enamel': 0
Relevance Score for 'imperial jade': 0
Relevance Score for 'rubellite': 0
Relevance Score for 'hydrophane': 1
Relevance Score for 'nacre': 1
Relevance Score for 'trapiche': 1
Relevance Score for 'pinctada margaritifera': 1
Relevance Score for 'pinctada maxima': 1
Relevance Score for 'la-icp-ms': 0
Relevance Score for 'laser ablation inductively coupled plasma mass spectrometry': 1
Relevance Score for 'laboratory irradiation': 0
Relevance Score for 'x-ray microradiography': 0
Relevance Score for 'rtx': 1
Relevance Score for 'photoluminescence': 1
Relevance Score for 'pl': 1
Relevance Score for 'diamondview': 1
Relevance Score for 'cvd': 1
Relevance Score for 'chemical vapor deposition':