In [1]:
import os
import fitz
import pytesseract
from PIL import Image
import pandas as pd
from transformers import BertTokenizer, BertModel
import torch
import numpy as np 

pdf_folder_paths = ['Eyewear','Jewellery']  

pytesseract.pytesseract.tesseract_cmd = 'C:\\Users\\hp\\AppData\\Local\\Programs\\Tesseract-OCR\\tesseract.exe'

def pdf_to_text(pdf_path):
    doc = fitz.open(pdf_path)
    text = ""
    for page_num in range(len(doc)):
        page = doc.load_page(page_num)
        pix = page.get_pixmap()
        img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
        text += pytesseract.image_to_string(img)
    return text 

def process_pdfs(folder_path):
    data = []
    for filename in os.listdir(folder_path):
        if filename.endswith(".pdf"):
            pdf_path = os.path.join(folder_path, filename)
            text = pdf_to_text(pdf_path)
            data.append({'Filename': filename, 'Extracted Text': text, 'Folder': folder_path})
            print(f"OCR completed for {filename} in folder {folder_path}")
    return data 

if __name__ == "__main__":
    all_extracted_data = []
    for folder_path in pdf_folder_paths:
        extracted_data = process_pdfs(folder_path)
        all_extracted_data.extend(extracted_data)  
    
    df = pd.DataFrame(all_extracted_data)
    
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    model = BertModel.from_pretrained('bert-base-uncased')

    def bert_encode(text):
        inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=512)
        outputs = model(**inputs)
        return outputs.last_hidden_state[:, 0, :].detach().numpy()

    bert_embeddings = df['Extracted Text'].apply(bert_encode)
    
    bert_embeddings_2d = np.array(bert_embeddings.tolist()).reshape(len(df), -1)
    bert_embeddings_df = pd.DataFrame(bert_embeddings_2d, index=df.index)

    df_final = pd.concat([df, bert_embeddings_df], axis=1)
    
    df_labels = pd.get_dummies(df['Folder'])
    
    output_excel_path = 'extracted_data_with_bert.xlsx'
    df_final.to_excel(output_excel_path, index=False, encoding="utf-8")
    print(f"All extracted data with BERT embeddings saved to {output_excel_path}")

OCR completed for 002670-Rahul-Bhardwaj.pdf in folder Eyewear
OCR completed for 002671-Alok-Mehta.pdf in folder Eyewear
OCR completed for 002672-Pooja-Agarwal.pdf in folder Eyewear
OCR completed for 002673-Siddharth-Sharma.pdf in folder Eyewear
OCR completed for 002674-Sneha-Kapoor.pdf in folder Eyewear
OCR completed for 002675-Rohan-Deshmukh.pdf in folder Eyewear
OCR completed for 002676-Anushka-Rao.pdf in folder Eyewear
OCR completed for 002677-Harshita-Patel.pdf in folder Eyewear
OCR completed for 002678-Arjun-Das.pdf in folder Eyewear
OCR completed for 002679-Mira-Sethi.pdf in folder Eyewear
OCR completed for 04351-Rohit-Chawla.pdf in folder Eyewear
OCR completed for 04352-Meera-Kapoor.pdf in folder Eyewear
OCR completed for 04353-Aakash-Mehta.pdf in folder Eyewear
OCR completed for 04354-Suman-Mishra.pdf in folder Eyewear
OCR completed for 04355-Poonam-Singh.pdf in folder Eyewear
OCR completed for 04356-Rajesh-Verma.pdf in folder Eyewear
OCR completed for 04357-Sunita-Sharma.pdf i

OCR completed for B-4710-Anjali-Desai.pdf in folder Eyewear
OCR completed for Invoice_2024-06-24_11-24-58_1.pdf in folder Eyewear
OCR completed for Invoice_2024-06-24_11-25-00_2.pdf in folder Eyewear
OCR completed for Invoice_2024-06-24_11-25-01_3.pdf in folder Eyewear
OCR completed for Invoice_2024-06-24_11-25-02_4.pdf in folder Eyewear
OCR completed for Invoice_2024-06-24_11-25-03_5.pdf in folder Eyewear
OCR completed for Invoice_2024-06-24_11-25-04_6.pdf in folder Eyewear
OCR completed for Invoice_2024-06-24_11-25-05_7.pdf in folder Eyewear
OCR completed for Invoice_2024-06-24_11-25-06_8.pdf in folder Eyewear
OCR completed for Invoice_2024-06-24_11-25-07_9.pdf in folder Eyewear
OCR completed for Invoice_2024-06-24_11-25-08_10.pdf in folder Eyewear
OCR completed for Invoice_2024-06-24_11-25-09_11.pdf in folder Eyewear
OCR completed for Invoice_2024-06-24_11-25-10_12.pdf in folder Eyewear
OCR completed for Invoice_2024-06-24_11-25-11_13.pdf in folder Eyewear
OCR completed for Invoice_

OCR completed for Invoice 0000035.pdf in folder Jewellery
OCR completed for Invoice 0000036.pdf in folder Jewellery
OCR completed for Invoice 0000037.pdf in folder Jewellery
OCR completed for Invoice 0000038.pdf in folder Jewellery
OCR completed for Invoice 0000039.pdf in folder Jewellery
OCR completed for Invoice 0000040.pdf in folder Jewellery
OCR completed for Invoice 0000041.pdf in folder Jewellery
OCR completed for Invoice 0000042.pdf in folder Jewellery
OCR completed for Invoice 0000043.pdf in folder Jewellery
OCR completed for Invoice 0000044 (1).pdf in folder Jewellery
OCR completed for Invoice 0000044.pdf in folder Jewellery
OCR completed for Invoice 0000045.pdf in folder Jewellery
OCR completed for Invoice 0000046.pdf in folder Jewellery
OCR completed for Invoice 0000047.pdf in folder Jewellery
OCR completed for Invoice 0000048.pdf in folder Jewellery
OCR completed for Invoice 0000049.pdf in folder Jewellery
OCR completed for Invoice 0000050.pdf in folder Jewellery
OCR comple

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


All extracted data with BERT embeddings saved to extracted_data_with_bert.xlsx


In [2]:
df_labels = pd.get_dummies(df['Folder'])
df_labels

Unnamed: 0,Eyewear,Jewellery
0,1,0
1,1,0
2,1,0
3,1,0
4,1,0
...,...,...
325,0,1
326,0,1
327,0,1
328,0,1


In [3]:
bert_embeddings_2d = np.array(bert_embeddings.tolist()).reshape(len(df), -1)
bert_embeddings_df = pd.DataFrame(bert_embeddings_2d, index=df.index)
bert_embeddings_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,758,759,760,761,762,763,764,765,766,767
0,-0.400837,0.215604,0.418017,-0.043882,0.023066,-0.086297,0.236101,0.143838,-0.001163,-0.236509,...,0.257714,-0.379453,0.163845,-0.508641,0.295606,0.157767,-0.433764,0.012302,0.474184,0.407766
1,-0.291238,0.236411,0.325662,-0.032051,-0.254698,-0.033549,0.19826,0.183174,-0.197276,-0.07753,...,0.134772,-0.29982,0.096772,-0.521734,0.343332,0.158354,-0.343207,-0.059194,0.346474,0.29372
2,-0.173456,0.148845,0.345017,-0.012085,-0.146393,0.116475,0.347428,0.117311,-0.114183,0.050514,...,0.0549,-0.362175,0.219588,-0.510667,0.297454,0.182644,-0.410872,-0.056034,0.392413,0.317324
3,-0.397714,0.332922,0.346208,-0.060357,-0.077247,0.038411,0.150583,0.129823,-0.124087,-0.039386,...,0.150542,-0.141827,0.07519,-0.342704,0.311526,0.15016,-0.26402,-0.036177,0.327253,0.364072
4,-0.263603,0.135638,0.393989,-0.093767,-0.093364,-0.155405,0.332288,0.18712,0.012943,-0.243704,...,0.026532,-0.294774,0.121734,-0.317458,0.340231,0.204337,-0.352027,-0.146192,0.249358,0.30316


In [4]:
df_final = pd.concat([df, bert_embeddings_df], axis=1)
df_final.head()

Unnamed: 0,Filename,Extracted Text,Folder,0,1,2,3,4,5,6,...,758,759,760,761,762,763,764,765,766,767
0,002670-Rahul-Bhardwaj.pdf,"GKB Opticals\n\nNo 20, Ground Floor, Brigade R...",Eyewear,-0.400837,0.215604,0.418017,-0.043882,0.023066,-0.086297,0.236101,...,0.257714,-0.379453,0.163845,-0.508641,0.295606,0.157767,-0.433764,0.012302,0.474184,0.407766
1,002671-Alok-Mehta.pdf,"GKB Opticals\n\nNo 20, Ground Floor, Brigade R...",Eyewear,-0.291238,0.236411,0.325662,-0.032051,-0.254698,-0.033549,0.19826,...,0.134772,-0.29982,0.096772,-0.521734,0.343332,0.158354,-0.343207,-0.059194,0.346474,0.29372
2,002672-Pooja-Agarwal.pdf,"GKB Opticals\n\nNo 20, Ground Floor, Brigade R...",Eyewear,-0.173456,0.148845,0.345017,-0.012085,-0.146393,0.116475,0.347428,...,0.0549,-0.362175,0.219588,-0.510667,0.297454,0.182644,-0.410872,-0.056034,0.392413,0.317324
3,002673-Siddharth-Sharma.pdf,"GKB Opticals\n\nNo 20, Ground Floor, Brigade R...",Eyewear,-0.397714,0.332922,0.346208,-0.060357,-0.077247,0.038411,0.150583,...,0.150542,-0.141827,0.07519,-0.342704,0.311526,0.15016,-0.26402,-0.036177,0.327253,0.364072
4,002674-Sneha-Kapoor.pdf,"GKB Opticals\n\nNo 20, Ground Floor, Brigade R...",Eyewear,-0.263603,0.135638,0.393989,-0.093767,-0.093364,-0.155405,0.332288,...,0.026532,-0.294774,0.121734,-0.317458,0.340231,0.204337,-0.352027,-0.146192,0.249358,0.30316


In [5]:
from sklearn.datasets import make_multilabel_classification
from sklearn.model_selection import train_test_split 
from sklearn.linear_model import LogisticRegression
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import classification_report
from sklearn.preprocessing import MultiLabelBinarizer

# Combine the embeddings with labels
X = df_final[bert_embeddings_df.columns]
y = df_labels

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

logistic_regressor = LogisticRegression(solver='liblinear',penalty='l1' , C=0.5)

multi_target_classifier = MultiOutputClassifier(logistic_regressor, n_jobs=None)
multi_target_classifier.fit(X_train, y_train)

y_pred = multi_target_classifier.predict(X_test)

print("Classification Report:")
print(classification_report(y_test, y_pred, target_names=df_labels.columns))

Classification Report:
              precision    recall  f1-score   support

     Eyewear       1.00      1.00      1.00        75
   Jewellery       1.00      1.00      1.00        24

   micro avg       1.00      1.00      1.00        99
   macro avg       1.00      1.00      1.00        99
weighted avg       1.00      1.00      1.00        99
 samples avg       1.00      1.00      1.00        99



In [6]:
multi_target_classifier.score(X_train, y_train)

0.9986798679867986

In [7]:
multi_target_classifier.score(X_test, y_test)

0.99153194765204

In [10]:
import numpy as np
from sklearn.datasets import make_multilabel_classification
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam

X, y = make_multilabel_classification(n_samples=1000, n_classes=6, n_labels=2, random_state=42)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

model = Sequential()
model.add(Dense(64, input_dim=X_train.shape[1], activation='relu'))
model.add(Dense(32, activation='relu'))
model.add(Dense(y_train.shape[1], activation='softmax'))  

model.compile(optimizer=Adam(), loss='binary_crossentropy', metrics=['accuracy'])

model.fit(X_train, y_train, epochs=20, batch_size=32, validation_split=0.2)

y_pred_prob = model.predict(X_test)
y_pred = (y_pred_prob > 0.5).astype(int)  

print("Classification Report:")
print(classification_report(y_test, y_pred, target_names=df_labels.columns))

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Classification Report:
              precision    recall  f1-score   support

     Eyewear       0.72      0.26      0.38        88
        FMCG       0.85      0.55      0.67       149
    Footwear       0.96      0.40      0.57       134
          IT       0.90      0.22      0.35       127
   Jewellery       0.90      0.20      0.33        44
      Pharma       0.75      0.12      0.21        49

   micro avg       0.87      0.34      0.49       591
   macro avg       0.85      0.29      0.42       591
weighted avg       0.86      0.34      0.47       591
 samples avg       0.67      0.40      0.47       591



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
