# This code detects possible product names in a PDF

For now, this code extracts product names by matching the names stored in a background database (an Excel sheet) against the text of the PDF.

The code below relies on pdfplumber (not PyPDF2). If pdfplumber is not installed on the system, use
!pip install pdfplumber

## Now we will write a function that extracts the text of a PDF as a single string

In [None]:
def extract_text_from_PDF(PDF_filename):
    """Return the text of every page of a PDF as one single-line string.

    Args:
        PDF_filename: Path of the PDF file, passed to ``pdfplumber.open``.

    Returns:
        The text of all pages joined with a single space, with every
        newline and tab replaced by a space.
    """
    # Local import: pdfplumber is only needed by this function.
    import pdfplumber

    with pdfplumber.open(PDF_filename) as pdf:
        # extract_text() may return None for a page without extractable
        # text in some pdfplumber versions; fall back to "" so the join
        # below cannot fail on such a page.
        page_texts = [page.extract_text() or "" for page in pdf.pages]
    # Joining with a space keeps the last word of one page from fusing
    # with the first word of the next page.
    return " ".join(page_texts).replace("\n", " ").replace("\t", " ")

### An example of running the function extracting text from PDF

In [None]:
# Example: extract the text of the sample file "SDS_Benzene.pdf".
# If pdfplumber is not installed, run the following line first:
#!pip install pdfplumber
extract_text_from_PDF("SDS_Benzene.pdf")

# This part of the code returns a list of unique words from a string

In [None]:
#This function extracts unique words from text as a list
def unique_words_in_text(enter_string):
    """Return the distinct words of a string in order of first appearance.

    Newlines and the punctuation marks ``.``, ``,``, ``:`` and ``;`` are
    treated as word separators in addition to ordinary whitespace. Words
    are compared case-sensitively, so "Benzene" and "benzene" are both kept.

    Args:
        enter_string: Text to split into words.

    Returns:
        A list containing each word once, ordered by first occurrence.
    """
    # A single translate pass turns every separator character into a space.
    separators_to_space = str.maketrans("\n.,:;", "     ")
    words = enter_string.translate(separators_to_space).split()
    # dict keys preserve insertion order, so this de-duplicates in linear
    # time instead of scanning a growing list for every word.
    return list(dict.fromkeys(words))

## Let's try this on our SDS datasheet

In [None]:
# Example: list the unique words found in the text of "SDS_Benzene.pdf".
unique_words_in_text(extract_text_from_PDF("SDS_Benzene.pdf"))

This works great!

Now we want to read the product names from an Excel sheet (format - https://docs.google.com/spreadsheets/d/1tLXhP7Mz898eXFKOc9N5ORoxysvbtk0B/edit?usp=sharing&ouid=100028279162467888332&rtpof=true&sd=true) and match them against the text of the PDF. For that we write the following function.

In [None]:
import pandas as pd
def unique_product_names_in_excel_file(insert_excel_file_name_with_extension):
    """Return the data rows of an Excel sheet as strings without spaces.

    The sheet is rendered to text with ``DataFrame.to_string`` (no index)
    and that text is re-read as tab-delimited data. The rendering pads
    columns with spaces, not tabs, so a rendered row normally comes back
    as a single field. The first rendered line is consumed as the header
    and is not part of the result.

    Args:
        insert_excel_file_name_with_extension: Path of the Excel file,
            passed to ``pandas.read_excel``.

    Returns:
        A list with one string per re-read row, taken from the first
        column of the re-read data, with every space character removed
        (padding as well as spaces inside a name).
    """
    import io

    # Rendering into an in-memory buffer avoids leaving a scratch file in
    # the working directory, and avoids writing text with the locale
    # encoding only to read it back as UTF-8.
    rendered = io.StringIO()
    pd.read_excel(insert_excel_file_name_with_extension).to_string(rendered, index=False)
    rendered.seek(0)
    df = pd.read_csv(rendered, delimiter="\t")
    # Materialise the column once instead of rebuilding it per row.
    first_column = df[df.columns[0]].tolist()
    # str() guards against cells pandas parsed as numbers, which have no
    # .replace method; string cells are unaffected.
    return [str(cell).replace(" ", "") for cell in first_column]

### Let's see if the function works

In [None]:
# Example: read the product names from the sample Excel file.
unique_product_names_in_excel_file('2022_01_27_product_name_from_PDF.xlsx')

### This works great! So, now all we need to do is match our text from PDF to this new list. For that we write the following function

In [None]:
def match_words_from_a_list(list_of_words,extracted_string):
    """Report which candidate names occur in a text, ignoring case.

    A candidate matches when its lower-cased form is a substring of the
    lower-cased text. Empty candidates are ignored.

    Args:
        list_of_words: Candidate product names (strings).
        extracted_string: Text to search in.

    Returns:
        One of three messages:
        'There is no product name in the list' when ``list_of_words`` is
        empty; 'Product name does not match with existing records.' when
        no candidate occurs in the text; otherwise
        'Possible product can be ' followed by the matching candidates,
        in their original spelling and order, separated by ' or '.
    """
    if len(list_of_words) == 0:
        return 'There is no product name in the list'

    # Lower-case the text once rather than once per candidate.
    searchable_text = extracted_string.lower()
    # An empty name is a substring of every text, so it would always
    # "match" and leave a blank entry in the message; skip it.
    matched_list = [
        word for word in list_of_words
        if word and word.lower() in searchable_text
    ]
    if not matched_list:
        return 'Product name does not match with existing records.'
    return 'Possible product can be ' + ' or '.join(matched_list)

# Finally let us merge everything into a single function

In [None]:
#Matching words from a list of words in a text file
def find_product_names_from_a_PDF(insert_excel_file_name_with_extension,PDF_file_name_w_extension):
    """Match the product names of an Excel file against the text of a PDF.

    Args:
        insert_excel_file_name_with_extension: Path of the Excel file,
            handed to ``unique_product_names_in_excel_file``.
        PDF_file_name_w_extension: Path of the PDF file, handed to
            ``extract_text_from_PDF``.

    Returns:
        The message string produced by ``match_words_from_a_list``.
    """
    # The PDF is read first, then the Excel file.
    pdf_text = extract_text_from_PDF(PDF_file_name_w_extension)
    known_names = unique_product_names_in_excel_file(insert_excel_file_name_with_extension)
    return match_words_from_a_list(known_names, pdf_text)

# Finally, let's see if this works

In [None]:
# Example: match the names in the sample Excel file against "SDS_Benzene.pdf".
find_product_names_from_a_PDF('2022_01_27_product_name_from_PDF.xlsx','SDS_Benzene.pdf')

# Amazing! So, this code works fine and can give us possible product names by matching the PDF text against an Excel file.