In [34]:
# !pip install pdfplumber nltk numpy rapidfuzz
# !pip install cloudinary
# !pip install python-dotenv

In [3]:
import pdfplumber
import glob
import string
import nltk
from dotenv import load_dotenv
import cloudinary
from cloudinary import CloudinaryImage
import cloudinary.uploader
import cloudinary.api
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
import regex as re
import base64
import numpy as np
from multiprocessing import Pool, cpu_count
import pickle
import os
from rapidfuzz import fuzz
from rapidfuzz import process
import json
from custom_bm25 import BM25Okapi


In [None]:

# Lines that start with one of these prefixes are dropped from the free-text
# corpus (they are still used below to fill the structured JSON record).
keywords_to_remove = (
    'Score:', 'Episodes', 'Status', 'Aired', 'Premiered', 'Broadcast', 'Licensors',
    'Source', 'Duration', 'Characters', 'Anime Details', 'Japanese', 'Main Supporting',
    'Main Main', 'None', 'Demographic', 'Genres', 'Studios',
    'Data source: MyAnimeList', 'Supporting Supporting', '[Written by MAL Rewrite]'
)

# Folder holding the source PDFs. Set the ANIME_PDF_DIR environment variable to
# run the notebook elsewhere; the default is the original location.
PDF_DIR = os.getenv(
    'ANIME_PDF_DIR',
    r'C:\Users\menno\Desktop\AI-location\animes_search_engine\pdf_transformer\pdfs',
)


def find_value(key, sentences):
    """Return the index of the first sentence starting with ``key``.

    The comparison is case-insensitive. Returns ``None`` when no sentence matches.
    """
    key_lower = key.lower()
    for i, sentence in enumerate(sentences):
        if sentence.lower().startswith(key_lower):
            return i
    return None


def extract_labelled_value(sentences, index, label):
    """Return the text that follows ``label`` on line ``index``.

    Args:
        sentences: All text lines of the document.
        index: Position of the labelled line, or ``None`` if it was not found.
        label: Label text to split the line on (case-sensitive).

    Returns:
        The stripped text after the label; the stripped next line when the
        label does not occur with this exact casing; ``None`` when ``index`` is
        ``None`` or there is no next line to fall back to.
    """
    if index is None:
        return None
    parts = sentences[index].split(label)
    if len(parts) > 1:
        return parts[1].strip()
    # The original indexed index + 1 unconditionally and crashed when the
    # labelled line was the last line of the document.
    if index + 1 < len(sentences):
        return sentences[index + 1].strip()
    return None


def build_anime_record(list_sentences, doc_name, title, image):
    """Build the JSON-serialisable record for one document.

    Args:
        list_sentences: All text lines of the document, in order.
        doc_name: File name of the source PDF.
        title: Title taken from the PDF metadata (may be ``None``).
        image: Base64 text of the first embedded image, or ``None``.

    Returns:
        A dict with the keys ``doc_name``, ``title``, ``image``, ``score``,
        ``description``, ``genres``, ``demographic``, ``studios`` and
        ``premiered``. A field whose line is missing or malformed is ``None``
        instead of raising.
    """
    score_index = find_value('score:', list_sentences)
    genres_index = find_value('Genres', list_sentences)
    description_end_index = find_value('Anime Details', list_sentences)
    demographic_index = find_value('Demographic', list_sentences)
    studios_index = find_value('Studios', list_sentences)
    premiered_index = find_value('Premiered', list_sentences)

    # Score is the second whitespace-separated token of the "Score:" line.
    score_tokens = list_sentences[score_index].split() if score_index is not None else []
    score = score_tokens[1] if len(score_tokens) > 1 else None

    # The description is everything between the score line and "Anime Details".
    description = None
    if score_index is not None and description_end_index is not None:
        description = '\n'.join(list_sentences[score_index + 1:description_end_index])

    # `is not None` rather than truthiness: index 0 is a valid position.
    genres_value = None
    if genres_index is not None:
        genres_parts = list_sentences[genres_index].split('Genres')
        if len(genres_parts) > 1:
            # Other labels can share the line with the genres; cut them off.
            genres_text = genres_parts[1].strip().split('Demographic')[0].split('Duration')[0]
            genres_value = genres_text.split(', ')

    demographic_value = extract_labelled_value(list_sentences, demographic_index, 'Demographic')
    studios_value = extract_labelled_value(list_sentences, studios_index, 'Studios')
    premiered_value = extract_labelled_value(list_sentences, premiered_index, 'Premiered')

    # "Premiered" is expected to read "<season> <year>"; tolerate fewer tokens.
    premiered = None
    if premiered_value is not None:
        premiered_list = premiered_value.split()
        premiered = {
            "season": premiered_list[0] if len(premiered_list) > 0 else None,
            "year": premiered_list[1] if len(premiered_list) > 1 else None
        }

    return {
        'doc_name': doc_name,
        'title': title,
        'image': image,
        'score': score,
        'description': description,
        'genres': genres_value,
        'demographic': demographic_value.split('Duration')[0] if demographic_value is not None else None,
        'studios': studios_value,
        'premiered': premiered
    }


animes_corpus = []
animes_json = []

pdfs = glob.glob(os.path.join(PDF_DIR, '*.pdf'))

for index, pdf in enumerate(pdfs):
    with pdfplumber.open(pdf) as f:

        # Join pages with a newline so the last line of one page is not glued
        # to the first line of the next; `or ''` covers pages with no text.
        txt = '\n'.join(page.extract_text() or '' for page in f.pages)
        list_sentences = txt.split('\n')

        # Filter sentences based on keywords
        filtered_sentences = [
            i for i in list_sentences if not any(i.startswith(keyword) for keyword in keywords_to_remove)
        ]
        animes_corpus.append('\n'.join(filtered_sentences))

        # First embedded image as base64 text; None when the PDF has no image.
        images = f.images
        image_b64 = (
            base64.b64encode(images[0]["stream"].get_data()).decode('utf-8')
            if images else None
        )

        # JSON format output per doc
        animes_json.append(
            build_anime_record(
                list_sentences,
                os.path.basename(pdf),
                f.metadata.get('Title'),
                image_b64,
            )
        )

    print(f'anime {index + 1}')
    

In [37]:
# print(animes_corpus)

In [2]:
# Load variables from a local .env file (python-dotenv) so the credentials
# below can be read with os.getenv instead of being written in the notebook.
load_dotenv()

# Configure the Cloudinary client from the environment.
cloudinary.config(
    secure=True,
    cloud_name=os.getenv("CLOUD_NAME"),
    api_key=os.getenv("API_KEY"),
    api_secret=os.getenv("API_SECRET"),
)


def upload_base64_to_cloudinary(base64_image, folder="animes"):
    """Upload a base64-encoded image to Cloudinary.

    Args:
        base64_image: Base64 text of the image bytes, without a ``data:`` prefix.
        folder: Cloudinary folder the image is uploaded into.

    Returns:
        The value returned by ``cloudinary.uploader.upload`` on success, or
        ``None`` when no image data was given or the upload raised.
    """
    # Without this guard a missing image was sent as the data URI
    # "data:image/png;base64,None" (or an empty payload).
    if not base64_image:
        print("Error uploading image: no image data provided")
        return None

    try:
        # NOTE(review): the data URI always declares image/png, whatever the
        # real format of the embedded image is — confirm Cloudinary accepts this.
        response = cloudinary.uploader.upload(
            f"data:image/png;base64,{base64_image}",
            folder=folder,  # Specify the folder
            resource_type="auto"  # Automatically detect resource type
        )
        return response
    except Exception as e:
        # Best effort: report the failure and let the caller move on.
        print(f"Error uploading image: {str(e)}")
        return None



<cloudinary.Config at 0x24b41704710>

In [39]:
# Upload every extracted image and replace the base64 payload in animes_json
# with the hosted URL.
i = 0  # number of images uploaded so far (was started at 1, so the first success printed "2")
for anime in animes_json:
    try:
        image = anime['image']

        # Re-running this cell: the payload was already swapped for a URL, and
        # uploading the URL text as if it were base64 would only fail.
        if isinstance(image, str) and image.startswith(('http://', 'https://')):
            print(f"Image for {anime['doc_name']} is already uploaded, skipping")
            continue

        # Upload the image and get response
        response = upload_base64_to_cloudinary(image)

        if response:
            i += 1

            print(f"Successfully uploaded image for {anime['doc_name']} anime number {i} ")
            # Store the URL back in the anime dictionary
            anime['image'] = response['secure_url']
        else:
            print(f"Failed to upload image for {anime['doc_name']}")

    except Exception as e:
        # Keep going with the remaining records if one of them is malformed.
        print(f"Error processing anime: {str(e)}")

In [40]:

# Write the parsed records to disk as UTF-8 JSON (non-ASCII characters kept
# as-is, 4-space indentation).
animes_output_path = 'animes_data.json'
with open(animes_output_path, mode='w', encoding='utf-8') as out_file:
    json.dump(animes_json, out_file, indent=4, ensure_ascii=False)

print("animes_json has been saved to 'animes_data.json'")


In [None]:
# Download the NLTK resources used by the text cleaning below.
for resource_name in ('stopwords', 'wordnet'):
    nltk.download(resource_name)

# Shared helpers used by filter_text.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
ps = PorterStemmer()

In [42]:
def filter_text(input_string , string , stop_words):
    """Turn raw text into a list of normalised tokens.

    Steps: delete every character listed in ``string.punctuation``, lower-case,
    split on whitespace, drop tokens found in ``stop_words``, then lemmatize
    and stem each remaining token with the module-level ``lemmatizer`` and ``ps``.

    Args:
        input_string: Text to clean.
        string: Object exposing a ``punctuation`` attribute (callers pass the
            standard ``string`` module).
        stop_words: Container of tokens to discard.

    Returns:
        The cleaned tokens, in their original order.
    """
    punctuation_table = str.maketrans('', '', string.punctuation)
    lowered_text = input_string.translate(punctuation_table).lower()

    normalized_tokens = []
    for token in lowered_text.split():
        if token in stop_words:
            continue
        normalized_tokens.append(ps.stem(lemmatizer.lemmatize(token)))
    return normalized_tokens


In [43]:
# One cleaned token list per document, in the same order as animes_corpus.
tokenized_cleaned_corpus = [
    filter_text(document, string, stop_words) for document in animes_corpus
]


In [44]:
# print(tokenized_cleaned_corpus)

In [45]:
def flatten_and_remove_duplicates(nested_list):
    """Flatten arbitrarily nested lists and drop short and repeated items.

    Args:
        nested_list: A list whose elements are items or further lists of items.

    Returns:
        A flat list of the items longer than two characters, each kept once,
        in order of first appearance.
    """
    # Flatten the list using recursion
    def flatten(lst):
        for item in lst:
            if isinstance(item, list):
                yield from flatten(item)
            else:
                yield item

    # A set gives constant-time duplicate checks; scanning the result list for
    # every token was quadratic in the vocabulary size.
    seen = set()
    unique_items = []
    for item in flatten(nested_list):
        if len(item) <= 2:  # Skip items with length 1 and 2
            continue
        try:
            if item in seen:
                continue
            seen.add(item)
        except TypeError:
            # Unhashable item: fall back to the original linear membership test.
            if item in unique_items:
                continue
        unique_items.append(item)

    return unique_items

# flatten_corpus = flatten_and_remove_duplicates(tokenized_cleaned_corpus)

# Vocabulary for the fuzzy search: documents are only stripped of punctuation,
# lower-cased and split on whitespace (no stop-word removal or stemming).
punctuation_remover = str.maketrans('', '', string.punctuation)
list_to_flatten = [
    document.translate(punctuation_remover).lower().split()
    for document in animes_corpus
]

flatten_corpus = flatten_and_remove_duplicates(list_to_flatten)

In [46]:
# print(flatten_corpus)

In [47]:
# Store the fuzzy-search vocabulary as UTF-8 JSON (non-ASCII characters kept as-is).
vocabulary_output_path = 'flatten_corpus.json'
with open(vocabulary_output_path, mode='w', encoding='utf-8') as out_file:
    json.dump(flatten_corpus, out_file, ensure_ascii=False)

In [48]:
# Build the BM25 model from the cleaned token lists (one list per entry of
# animes_corpus). BM25Okapi is imported from the project-local custom_bm25 module.
bm25 = BM25Okapi(tokenized_cleaned_corpus)

In [49]:
# Save the BM25 model.
# The path is built with os.path.join rather than a literal backslash: outside
# Windows, r"models\model.pkl" is a bare file name, os.path.dirname() of it is
# '' and os.makedirs('') raises. On Windows the resulting path is unchanged.
file_name = os.path.join("models", "model.pkl")

os.makedirs(os.path.dirname(file_name), exist_ok=True)

with open(file_name, 'wb') as file:
    pickle.dump(bm25, file)

In [50]:
# Loading the BM25 model.
# The path is built with os.path.join so it also resolves outside Windows
# (on Windows it is identical to r"models\model.pkl").
# NOTE: pickle.load can execute arbitrary code — only load a model file that
# this notebook produced.
with open(os.path.join("models", "model.pkl"), 'rb') as file:
    model = pickle.load(file)

# loading the flatten corpus for the fuzzy search
with open('flatten_corpus.json', 'r', encoding='utf-8') as json_file:
    flatten_corpus = json.load(json_file)

# loading the pdfs jsons : results
with open('animes_data.json', 'r', encoding='utf-8') as json_file:
    animes_json = json.load(json_file)

# Labels returned by the search, one per document. A PDF without a Title in its
# metadata is stored with a null title; fall back to the file name so results
# never contain None.
doc_names = [anime.get('title') or anime['doc_name'] for anime in animes_json]




In [None]:

query = "one piece"

# A vocabulary word replaces a query token only when its rapidfuzz score is
# strictly greater than this value.
FUZZY_SCORE_THRESHOLD = 79

tokenized_query = query.translate(str.maketrans('', '', string.punctuation)).lower().split()

fuzzy_tokenized_query_list = []

for q in tokenized_query:
    # Tokens of one or two characters are kept as typed: the vocabulary holds
    # no words that short, so a fuzzy match could only distort them.
    if len(q) > 2:
        # print(process.extract(q,flatten_corpus,limit=3))
        fuzzy_query = process.extractOne(q, flatten_corpus)
        # extractOne returns None when there is nothing to match against
        # (e.g. an empty vocabulary); keep the token unchanged in that case.
        if fuzzy_query is not None and fuzzy_query[1] > FUZZY_SCORE_THRESHOLD:
            q = fuzzy_query[0]

    fuzzy_tokenized_query_list.append(q)

fuzzy_tokenized_query = ' '.join(fuzzy_tokenized_query_list)
print(fuzzy_tokenized_query)

# Apply the same cleaning as the indexed corpus before querying the model.
fuzzy_tokenized_cleaned_query = filter_text(fuzzy_tokenized_query , string , stop_words)
print(fuzzy_tokenized_cleaned_query)

result = model.get_top_n(fuzzy_tokenized_cleaned_query , doc_names, n = 100)
print(result)


In [55]:



# query = "one one one one one"
# tokenized_cleaned_query = filter_text(query , string , stop_words)

# fuzzy_tokenized_cleaned_query = []

# for q in tokenized_cleaned_query:
#     if len(q) <= 2:
#         q=q
#     else :
#         q = process.extractOne(q, flatten_corpus)[0]
#         print(process.extract(q,flatten_corpus,limit=2))
#     fuzzy_tokenized_cleaned_query.append(q)

# print(fuzzy_tokenized_cleaned_query)


# result = model.get_top_n(fuzzy_tokenized_cleaned_query , doc_names, n = len(doc_names))
# print(result)
