# **IMPORT MODEL**

In [1]:
from keras.models import load_model
import pickle

# Locations of the trained artefacts, relative to the notebook's working directory.
model_path = 'model2.keras'
tokenizer_path = 'tokenizer.pickle'
label_encoder_path = 'label_encoder.pickle'

# Load the model through `model_path` so that changing the path above is
# enough to switch models (the literal used to be duplicated here).
model = load_model(model_path)

# SECURITY: pickle.load can execute arbitrary code contained in the file.
# Only load tokenizer / label-encoder pickles that come from a trusted source.

# Load the tokenizer
with open(tokenizer_path, 'rb') as handle:
    tokenizer = pickle.load(handle)

# Load the label encoder
with open(label_encoder_path, 'rb') as handle:
    label_encoder = pickle.load(handle)


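> **Note:** the tokenizer and label encoder above are restored with `pickle`, which can execute arbitrary code when a file is loaded, so only open pickles from a trusted source. See the scikit-learn notes on the security and maintainability limitations of pickled objects: https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations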


# **PDF INPUT**

In [4]:
!pip install PyMuPDF


Collecting PyMuPDF
  Downloading PyMuPDF-1.24.9-cp310-none-manylinux2014_x86_64.whl.metadata (3.4 kB)
Collecting PyMuPDFb==1.24.9 (from PyMuPDF)
  Downloading PyMuPDFb-1.24.9-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (1.4 kB)
Downloading PyMuPDF-1.24.9-cp310-none-manylinux2014_x86_64.whl (3.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.5/3.5 MB[0m [31m13.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading PyMuPDFb-1.24.9-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (15.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m15.9/15.9 MB[0m [31m55.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyMuPDFb, PyMuPDF
Successfully installed PyMuPDF-1.24.9 PyMuPDFb-1.24.9


In [7]:
import fitz
import tensorflow as tf
from keras.preprocessing.sequence import pad_sequences
import re

# PDF to analyse; a relative path, so it is resolved against the current working directory.
PATH_TO_PDF = '515729.pdf'

def remove_non_characters(text):
    """Strip everything from ``text`` that is not a letter or whitespace.

    Characters that are kept: ASCII letters, the Turkish-specific letters
    ``çÇğĞıİöÖşŞüÜ`` and any whitespace. Digits, punctuation and all other
    symbols are removed.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    disallowed = re.compile(r'[^a-zA-ZçÇğĞıİöÖşŞüÜ\s]')
    return disallowed.sub('', text)

def extract_text_from_pdf(pdf_path):
    """Extract the text of every page of a PDF and locate its references page.

    Args:
        pdf_path: Path of the PDF file; passed to ``fitz.open``.

    Returns:
        A ``(text_list, references_index)`` tuple.

        ``text_list`` holds the text of each page, in document order.

        ``references_index`` is the 1-based number of the page closest to the
        end of the document that has a references heading (one of
        ``search_words``) among its first five non-empty lines, or ``0`` when
        no such page exists. The first page is never scanned.
    """
    search_words = ["kaynaklar", "literatür listesi", "kaynaklar dizini", "kaynakça", "referanslar", "bibliyografya"]
    # A heading is only looked for among the first few non-empty lines of a page.
    max_heading_lines = 5

    pdf_document = fitz.open(pdf_path)
    try:
        text_list = [
            pdf_document.load_page(page_num).get_text()
            for page_num in range(len(pdf_document))
        ]
    finally:
        # Release the document even when text extraction raises.
        pdf_document.close()

    # Walk backwards so the match closest to the end wins. The text extracted
    # above is reused instead of loading every page a second time. The range
    # stops before index 0, so the first page is not examined.
    for page_num in range(len(text_list) - 1, 0, -1):
        non_empty_lines = [line for line in text_list[page_num].split('\n') if line.strip()]
        for line in non_empty_lines[:max_heading_lines]:
            heading = remove_non_characters(line).strip()
            # str.lower() turns 'İ' into 'i' followed by U+0307 (combining dot),
            # which never equals the plain 'i' used in search_words; fold the
            # dotted capital explicitly before lower-casing.
            heading = heading.replace('İ', 'i').lower()
            if heading in search_words:
                return text_list, page_num + 1

    return text_list, 0

# Per-page text of the PDF plus the 1-based references page number (0 when none was found).
new_texts, references_index = extract_text_from_pdf(PATH_TO_PDF)
# Encode each page with the loaded tokenizer (presumably a fitted Keras Tokenizer — TODO confirm).
new_sequences = tokenizer.texts_to_sequences(new_texts)
# Pad or cut every page to 200 entries, both at the end of the sequence.
# NOTE(review): 200 presumably matches the sequence length used in training — confirm.
new_padded_sequences = pad_sequences(new_sequences, maxlen=200, padding='post', truncating='post')

# NOTE(review): a sigmoid is applied to the model output, which assumes the model
# emits raw logits; if its last layer already has a sigmoid activation this squashes twice — confirm.
predictions = tf.nn.sigmoid(model.predict(new_padded_sequences)).numpy()
# Threshold at 0.5 to obtain one 0/1 class index per page.
predicted_labels = (predictions > 0.5).astype(int)
# Map the class indices back to the labels known to the label encoder.
predicted_labels = label_encoder.inverse_transform(predicted_labels.flatten())

# Post-process the per-page predictions: every page from the start of the
# back matter up to the end of the document is flagged with label 1.

# Number of pages that were classified.
length = len(predicted_labels)

# A references page reported in the first half of the document is not trusted
# (presumably a false hit such as a table-of-contents entry — TODO confirm).
MIN_REFERENCES_FRACTION = 0.5

if references_index == 0 or references_index < int(length * MIN_REFERENCES_FRACTION):
    # No usable references page: fall back to the model output and flag every
    # page after the last page labelled 1. The scan is bounded at index 0, so
    # it no longer relies on negative-index wraparound to terminate and does
    # not fail on an empty prediction list. When no page is labelled 1, every
    # page ends up flagged.
    last_flagged = -1
    for i in range(length - 1, -1, -1):
        if predicted_labels[i] == 1:
            last_flagged = i
            break
    first_to_flag = last_flagged + 1
else:
    # references_index is a 1-based page number; convert it to a 0-based index.
    first_to_flag = references_index - 1

for i in range(first_to_flag, length):
    predicted_labels[i] = 1

# 1-based page numbers of all flagged pages.
output_list = [i + 1 for i in range(length) if predicted_labels[i] == 1]

print(f"Toplam sayfa sayısı: {length}")
print(output_list)

[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 251ms/step
Toplam sayfa sayısı: 100
[5, 94, 95, 96, 97, 98, 99, 100]
