In [1]:
import os
import pandas as pd
from collections import Counter
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.util import ngrams 
import fitz
from nltk.stem import PorterStemmer 

In [2]:
def extract_relevant_text(pdf_file):
    """Return the lower-cased text of the pages captured from a PDF.

    Pages are scanned in order. Capturing is switched on by any page whose
    text contains "abstract" and switched off by any page whose text contains
    "references" (both checks are case-insensitive). The "references" check
    runs second, so a page containing both words is not captured. The decision
    is made per page: a page that mentions "references" is dropped entirely.

    Args:
        pdf_file: Path of the PDF to read (passed straight to ``fitz.open``).

    Returns:
        The captured page texts, each followed by a single space, concatenated
        and lower-cased. An empty string when no page is captured.
    """
    captured_pages = []
    capture = False
    # The context manager closes the document (and its file handle) even if
    # text extraction raises part-way through.
    with fitz.open(pdf_file) as doc:
        for page in doc:
            page_text = page.get_text()
            lowered = page_text.lower()
            if "abstract" in lowered:
                capture = True
            if "references" in lowered:
                capture = False
            if capture:
                captured_pages.append(page_text)
    # Join once instead of growing a string page by page.
    return "".join(page_text + " " for page_text in captured_pages).lower()

In [3]:
# Folder that is scanned for the input PDF files.
input_folder = "m_t_s_n_m/" 
# Folder that receives every generated file (per-paper n-gram lists, CSV tables).
output_folder = "output254" 
# exist_ok=True keeps this cell re-runnable when the folder already exists.
os.makedirs(output_folder , exist_ok=True)  

In [4]:
# PorterStemmer is imported in the notebook's first cell, so it is not re-imported here.

# English stop words from NLTK, held in a set for fast membership tests.
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()
lemm = nltk.WordNetLemmatizer()
# Publisher / licence boilerplate tokens that are dropped in addition to the stop words.
# 'citation' was previously misspelled as 'cititation' and therefore never matched.
unwanted_words = {
    'article', 'et', 'al', 'terms', 'conditions', 'publication', 'citation',
    'open', 'access', 'license', 'cc', 'by', 'creative', 'commons', 'attribution',
}

In [5]:
# Corpus-wide n-gram tallies (one Counter per n-gram size).
global_bigram_counts, global_trigram_counts = Counter(), Counter()
# Per-paper tallies; filled later with one entry per processed file name.
bigram_paper_counts, trigram_paper_counts = dict(), dict()

In [6]:
# Count bigrams and trigrams for every PDF in `input_folder`, accumulate the
# corpus-wide totals and write one "<paper>_n_gram.txt" listing per paper.

# Start from empty accumulators so that re-running this cell does not
# double-count papers that were already processed.
global_bigram_counts.clear()
global_trigram_counts.clear()
bigram_paper_counts.clear()
trigram_paper_counts.clear()

# os.listdir returns entries in arbitrary order; sorting makes the order in
# which papers are counted (and stored in the per-paper dicts) reproducible.
for file_name in sorted(os.listdir(input_folder)):
    # Accept ".pdf" regardless of letter case.
    if not file_name.lower().endswith(".pdf"):
        continue

    pdf_path = os.path.join(input_folder, file_name)
    raw_text = extract_relevant_text(pdf_path)

    # Keep purely alphabetic tokens that are neither stop words nor boilerplate.
    tokens = [
        word
        for word in word_tokenize(raw_text)
        if word.isalpha() and word not in stop_words and word not in unwanted_words
    ]

    bigram_counts = Counter(ngrams(tokens, 2))
    trigram_counts = Counter(ngrams(tokens, 3))

    global_bigram_counts.update(bigram_counts)
    global_trigram_counts.update(trigram_counts)

    bigram_paper_counts[file_name] = bigram_counts
    trigram_paper_counts[file_name] = trigram_counts

    # splitext strips only the final extension, so "paper.v2.pdf" maps to
    # "paper.v2" instead of colliding with "paper.pdf".
    paper_name = os.path.splitext(file_name)[0]
    paper_folder = os.path.join(output_folder, paper_name)
    os.makedirs(paper_folder, exist_ok=True)

    ngram_path = os.path.join(paper_folder, f"{paper_name}_n_gram.txt")
    with open(ngram_path, "w", encoding="utf-8") as ngram_file:
        ngram_file.write("Bigrams:\n")
        for bigram, count in bigram_counts.items():
            ngram_file.write(f"{' '.join(bigram)}: {count}\n")

        ngram_file.write("\nTrigrams:\n")
        for trigram, count in trigram_counts.items():
            ngram_file.write(f"{' '.join(trigram)}: {count}\n")

In [7]:
def _ngram_paper_matrix(top_ngrams, paper_counts):
    """Build an n-gram x paper count table.

    Args:
        top_ngrams: Sequence of ``(ngram_tuple, total_count)`` pairs; only the
            n-gram tuples are used, in the given order, as rows.
        paper_counts: Mapping of paper name to a Counter of n-gram tuples;
            each key becomes one column, in mapping order.

    Returns:
        Integer DataFrame indexed by the space-joined n-grams, holding how
        often each n-gram occurs in each paper (0 when absent).
    """
    labels = [' '.join(ngram) for ngram, _ in top_ngrams]
    columns = {
        paper: [counts.get(ngram, 0) for ngram, _ in top_ngrams]
        for paper, counts in paper_counts.items()
    }
    # One constructor call instead of filling the frame cell by cell with .at
    return pd.DataFrame(columns, index=labels, dtype="int64")


# The 200 most frequent n-grams over the whole corpus, as (ngram, count) pairs.
top_200_bigrams = global_bigram_counts.most_common(200)
top_200_trigrams = global_trigram_counts.most_common(200)

# Human-readable ranking of both lists.
top_ngram_file = os.path.join(output_folder, "top_ngrams.txt")
with open(top_ngram_file, "w", encoding="utf-8") as top_file:
    top_file.write("Top 200 Bigrams:\n")
    top_file.writelines(f"{' '.join(bigram)}: {count}\n" for bigram, count in top_200_bigrams)

    top_file.write("\nTop 200 Trigrams:\n")
    top_file.writelines(f"{' '.join(trigram)}: {count}\n" for trigram, count in top_200_trigrams)

bigram_strings = [' '.join(bigram) for bigram, _ in top_200_bigrams]
trigram_strings = [' '.join(trigram) for trigram, _ in top_200_trigrams]

# Rows are the top n-grams, columns are the papers, values are per-paper counts.
bigram_matrix = _ngram_paper_matrix(top_200_bigrams, bigram_paper_counts)
trigram_matrix = _ngram_paper_matrix(top_200_trigrams, trigram_paper_counts)

bigram_matrix.to_csv(os.path.join(output_folder, "bigram_cooccurrence_matrix.csv"))
trigram_matrix.to_csv(os.path.join(output_folder, "trigram_cooccurrence_matrix.csv"))

In [8]:
# Load the bigram count table from disk; the first CSV column holds the row labels.
bigram_matrix_path = os.path.join(output_folder, "bigram_cooccurrence_matrix.csv")
df_bigrams = pd.read_csv(bigram_matrix_path, index_col=0)
# Keep only the rows whose values summed across all columns exceed 20.
bigram_row_totals = df_bigrams.sum(axis=1)
filtered_bigrams = df_bigrams.loc[bigram_row_totals > 20]

In [9]:
# Load the trigram count table from disk; the first CSV column holds the row labels.
trigram_matrix_path = os.path.join(output_folder, "trigram_cooccurrence_matrix.csv")
df_trigrams = pd.read_csv(trigram_matrix_path, index_col=0)
# Keep only the rows whose values summed across all columns exceed 20.
trigram_row_totals = df_trigrams.sum(axis=1)
filtered_trigrams = df_trigrams.loc[trigram_row_totals > 20]

In [11]:
from scipy.spatial import distance

# Cosine similarity (1 - cosine distance) for every unordered pair of rows in
# filtered_bigrams; each pair is visited once, with the earlier row as 'ngram1'.
bigram_labels = filtered_bigrams.index
n_bigrams = len(filtered_bigrams)
sim_list_bigrams = [
    {
        'ngram1': bigram_labels[first],
        'ngram2': bigram_labels[second],
        'cosine': 1 - distance.cosine(
            filtered_bigrams.iloc[first, :], filtered_bigrams.iloc[second, :]
        ),
    }
    for first in range(n_bigrams)
    for second in range(first + 1, n_bigrams)
]

In [12]:
# Cosine similarity (1 - cosine distance) for every unordered pair of rows in
# filtered_trigrams; each pair is visited once, with the earlier row as 'ngram1'.
trigram_labels = filtered_trigrams.index
n_trigrams = len(filtered_trigrams)
sim_list_trigrams = [
    {
        'ngram1': trigram_labels[first],
        'ngram2': trigram_labels[second],
        'cosine': 1 - distance.cosine(
            filtered_trigrams.iloc[first, :], filtered_trigrams.iloc[second, :]
        ),
    }
    for first in range(n_trigrams)
    for second in range(first + 1, n_trigrams)
]

In [15]:
# Turn the pair records into tables ordered from highest to lowest cosine value.
bigram_cosine_table = pd.DataFrame(sim_list_bigrams)
trigram_cosine_table = pd.DataFrame(sim_list_trigrams)
sorted_bigrams = bigram_cosine_table.sort_values(by='cosine', ascending=False)
sorted_trigrams = trigram_cosine_table.sort_values(by='cosine', ascending=False)

# Persist both rankings without the positional index column.
bigram_cosine_path = os.path.join(output_folder, "cosine_similarity_bigrams.csv")
trigram_cosine_path = os.path.join(output_folder, "cosine_similarity_trigrams.csv")
sorted_bigrams.to_csv(bigram_cosine_path, index=False)
sorted_trigrams.to_csv(trigram_cosine_path, index=False)


In [16]:
# Show the first 50 rows of the current bigram ranking.
top_bigram_pairs = sorted_bigrams.head(50)
print("Top 50 bigram similarities:")
print(top_bigram_pairs)

Top 50 bigram similarities:
                  ngram1                 ngram2  cosine
16289         dcnn model          proposed dcnn     1.0
12586        spot severe      improved cyclegan     1.0
1858           mild corn  feature recombination     1.0
1843           mild corn            rust severe     1.0
12549        spot severe              spot mild     1.0
12550        spot severe            blight mild     1.0
12551        spot severe          blight severe     1.0
12552        spot severe              rust mild     1.0
12553        spot severe            rust severe     1.0
12568        spot severe  feature recombination     1.0
12635          spot mild            blight mild     1.0
12755        blight mild      improved cyclegan     1.0
12636          spot mild          blight severe     1.0
12637          spot mild              rust mild     1.0
12638          spot mild            rust severe     1.0
12653          spot mild  feature recombination     1.0
12671          spot 

In [17]:
# Show the first 50 rows of the current trigram ranking.
top_trigram_pairs = sorted_trigrams.head(50)
print("\nTop 50 trigram similarities:")
print(top_trigram_pairs)


Top 50 trigram similarities:
                    ngram1               ngram2  cosine
978     blight severe corn       mild corn rust     1.0
785       leaf spot severe   blight severe corn     1.0
904       leaf blight mild     corn rust severe     1.0
903       leaf blight mild       mild corn rust     1.0
789       leaf spot severe       mild corn rust     1.0
788       leaf spot severe       rust mild corn     1.0
787       leaf spot severe       corn rust mild     1.0
786       leaf spot severe     severe corn rust     1.0
784       leaf spot severe   leaf blight severe     1.0
780       leaf spot severe       leaf spot mild     1.0
783       leaf spot severe     blight mild corn     1.0
902       leaf blight mild       rust mild corn     1.0
901       leaf blight mild       corn rust mild     1.0
100         corn gray leaf       mild corn leaf     1.0
1071        mild corn rust     rust severe corn     1.0
782       leaf spot severe     leaf blight mild     1.0
790       leaf spo

In [18]:
def jaccard_similarity(set1, set2):
    """Return the Jaccard index |set1 & set2| / |set1 | set2| of two sets.

    Args:
        set1: First set.
        set2: Second set.

    Returns:
        The ratio of shared elements to all distinct elements, or 0 when both
        sets are empty (the union has no elements).
    """
    shared = set1.intersection(set2)
    combined = set1.union(set2)
    if len(combined) == 0:
        return 0
    return len(shared) / len(combined)

In [19]:
# Load the bigram count table from disk; the first CSV column holds the row labels.
bigram_matrix_path = os.path.join(output_folder, "bigram_cooccurrence_matrix.csv")
df_bigrams = pd.read_csv(bigram_matrix_path, index_col=0)
# Keep only the rows whose values summed across all columns exceed 20.
bigram_row_totals = df_bigrams.sum(axis=1)
filtered_bigrams = df_bigrams.loc[bigram_row_totals > 20]

In [20]:
# Load the trigram count table from disk; the first CSV column holds the row labels.
trigram_matrix_path = os.path.join(output_folder, "trigram_cooccurrence_matrix.csv")
df_trigrams = pd.read_csv(trigram_matrix_path, index_col=0)
# Keep only the rows whose values summed across all columns exceed 20.
trigram_row_totals = df_trigrams.sum(axis=1)
filtered_trigrams = df_trigrams.loc[trigram_row_totals > 20]

In [24]:
# Jaccard similarity for every unordered pair of rows in filtered_bigrams,
# comparing the sets of column positions where each row is non-zero.

# Build each row's non-zero column set once, instead of rebuilding it for
# every pair inside the nested loop.
bigram_presence = [set(row.nonzero()[0]) for row in filtered_bigrams.to_numpy()]
bigram_labels = filtered_bigrams.index

jaccard_list_bigrams = []
for i in range(len(bigram_presence)):
    for j in range(i + 1, len(bigram_presence)):
        jaccard_list_bigrams.append({
            'ngram1': bigram_labels[i],
            'ngram2': bigram_labels[j],
            'jaccard': jaccard_similarity(bigram_presence[i], bigram_presence[j]),
        })


In [25]:
# Jaccard similarity for every unordered pair of rows in filtered_trigrams,
# comparing the sets of column positions where each row is non-zero.

# Build each row's non-zero column set once, instead of rebuilding it for
# every pair inside the nested loop.
trigram_presence = [set(row.nonzero()[0]) for row in filtered_trigrams.to_numpy()]
trigram_labels = filtered_trigrams.index

jaccard_list_trigrams = []
for i in range(len(trigram_presence)):
    for j in range(i + 1, len(trigram_presence)):
        jaccard_list_trigrams.append({
            'ngram1': trigram_labels[i],
            'ngram2': trigram_labels[j],
            'jaccard': jaccard_similarity(trigram_presence[i], trigram_presence[j]),
        })

In [26]:
# Turn the pair records into tables ordered from highest to lowest Jaccard value.
# NOTE(review): this rebinds sorted_bigrams / sorted_trigrams, so any value
# those names held before this cell runs is replaced.
bigram_jaccard_table = pd.DataFrame(jaccard_list_bigrams)
trigram_jaccard_table = pd.DataFrame(jaccard_list_trigrams)
sorted_bigrams = bigram_jaccard_table.sort_values(by='jaccard', ascending=False)
sorted_trigrams = trigram_jaccard_table.sort_values(by='jaccard', ascending=False)

# Persist both rankings without the positional index column.
bigram_jaccard_path = os.path.join(output_folder, "jaccard_similarity_bigrams.csv")
trigram_jaccard_path = os.path.join(output_folder, "jaccard_similarity_trigrams.csv")
sorted_bigrams.to_csv(bigram_jaccard_path, index=False)
sorted_trigrams.to_csv(trigram_jaccard_path, index=False)

In [27]:
# Show the first 50 rows of the current bigram ranking.
top_bigram_pairs = sorted_bigrams.head(50)
print("Top 50 bigram Jaccard similarities:")
print(top_bigram_pairs)

Top 50 bigram Jaccard similarities:
                       ngram1                 ngram2  jaccard
16289              dcnn model          proposed dcnn      1.0
10781       original cyclegan      improved cyclegan      1.0
4579                corn gray              rust mild      1.0
4580                corn gray            rust severe      1.0
4595                corn gray  feature recombination      1.0
4613                corn gray      improved cyclegan      1.0
6439            learning rate          loss function      1.0
6459            learning rate            feature map      1.0
6522            learning rate    disease recognition      1.0
7120         generated images         improved model      1.0
7136         generated images         disease images      1.0
7173         generated images         cyclegan model      1.0
7224         generated images       image generation      1.0
8555           improved model         disease images      1.0
8592           improved model     

In [28]:
# Show the first 50 rows of the current trigram ranking.
top_trigram_pairs = sorted_trigrams.head(50)
print("\nTop 50 trigram Jaccard similarities:")
print(top_trigram_pairs)


Top 50 trigram Jaccard similarities:
                    ngram1               ngram2  jaccard
975     blight severe corn     severe corn rust      1.0
116         corn gray leaf     spot severe corn      1.0
118         corn gray leaf       spot mild corn      1.0
119         corn gray leaf     leaf blight mild      1.0
120         corn gray leaf     blight mild corn      1.0
121         corn gray leaf   leaf blight severe      1.0
122         corn gray leaf   blight severe corn      1.0
123         corn gray leaf     severe corn rust      1.0
124         corn gray leaf       corn rust mild      1.0
125         corn gray leaf       rust mild corn      1.0
126         corn gray leaf       mild corn rust      1.0
127         corn gray leaf     corn rust severe      1.0
985     blight severe corn     rust severe corn      1.0
999       severe corn rust       corn rust mild      1.0
133         corn gray leaf     rust severe corn      1.0
1000      severe corn rust       rust mild corn   