In [None]:
!pip install -e git+https://github.com/mindee/doctr.git#egg=python-doctr[torch]

[33mDEPRECATION: git+https://github.com/mindee/doctr.git#egg=python-doctr[torch] contains an egg fragment with a non-PEP 508 name pip 25.0 will enforce this behaviour change. A possible replacement is to use the req @ url syntax, and remove the egg fragment. Discussion can be found at https://github.com/pypa/pip/issues/11617[0m[33m
[0mObtaining python-doctr[torch] from git+https://github.com/mindee/doctr.git#egg=python-doctr[torch] (from python-doctr[torch])
  Cloning https://github.com/mindee/doctr.git to ./src/python-doctr
  Running command git clone --filter=blob:none --quiet https://github.com/mindee/doctr.git /content/src/python-doctr
  Resolved https://github.com/mindee/doctr.git to commit d7f453329f583798c0d2774f343e29ab24450a4d
  Installing build dependencies ... [?25l[?25hdone
  Checking if build backend supports build_editable ... [?25l[?25hdone
  Getting requirements to build editable ... [?25l[?25hdone
  Preparing editable metadata (pyproject.toml) ... [?25l[?25

In [None]:
!pip install mplcursors

Collecting mplcursors
  Downloading mplcursors-0.5.3.tar.gz (88 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/88.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m88.8/88.8 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting matplotlib!=3.7.1,>=3.1 (from mplcursors)
  Downloading matplotlib-3.9.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Downloading matplotlib-3.9.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (8.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.3/8.3 MB[0m [31m63.9 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: mplcursors
  Building wheel for mplcursors (pyproject.toml) ... [?25l[?25hdone
  Created wheel for 

In [None]:
import os
import gdown
import requests
import pandas as pd
import concurrent.futures
import numpy as np
import logging
import torch
import signal
import re
from io import BytesIO

In [None]:
# Ask docTR to use its PyTorch backend. The variable is deliberately set before
# the doctr imports below — presumably doctr reads USE_TORCH when it is imported
# (TODO confirm against the docTR installation docs), so keep this order.
os.environ['USE_TORCH'] = '1'
from doctr.io import DocumentFile
from doctr.models import ocr_predictor

### docTR model for extracting text from PDFs

In [None]:
# Run OCR on the GPU when PyTorch can see one, otherwise fall back to the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Load the pretrained docTR OCR pipeline and move it to the selected device.
# The result of .to() is assigned instead of being left as the cell's last
# expression, so the cell no longer dumps the whole module repr into the output.
model = ocr_predictor(pretrained=True).to(device)
print(f"OCR model loaded on device: {device}")

OCRPredictor(
  (det_predictor): DetectionPredictor(
    (pre_processor): PreProcessor(
      (resize): Resize(output_size=(1024, 1024), interpolation='bilinear', preserve_aspect_ratio=True, symmetric_pad=True)
      (normalize): Normalize(mean=(0.798, 0.785, 0.772), std=(0.264, 0.2749, 0.287))
    )
    (model): FAST()
  )
  (reco_predictor): RecognitionPredictor(
    (pre_processor): PreProcessor(
      (resize): Resize(output_size=(32, 128), interpolation='bilinear', preserve_aspect_ratio=True, symmetric_pad=False)
      (normalize): Normalize(mean=(0.694, 0.695, 0.693), std=(0.299, 0.296, 0.301))
    )
    (model): CRNN(
      (feat_extractor): Sequential(
        (0): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (2): ReLU(inplace=True)
        (3): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        (4): BatchNorm2d(64, eps=1e-05, momentum=

In [None]:
# Open the workbook once and read both sheets from it. The context manager
# closes the underlying file as soon as the sheets are loaded; previously the
# ExcelFile handle stayed open for the rest of the session.
with pd.ExcelFile('/content/data.xlsx') as excel_data:
    # Each sheet is loaded into its own DataFrame
    train_df = pd.read_excel(excel_data, sheet_name='train_data')
    test_df = pd.read_excel(excel_data, sheet_name='test_data')

In [None]:
# Print a heading followed by the column/dtype summary of each DataFrame
for heading, frame in (("Train DataFrame:", train_df), ("\nTest DataFrame:", test_df)):
    print(heading)
    frame.info()

Train DataFrame:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1895 entries, 0 to 1894
Data columns (total 2 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   datasheet_link  1895 non-null   object
 1   target_col      1895 non-null   object
dtypes: object(2)
memory usage: 29.7+ KB

Test DataFrame:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 2 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   datasheet_link  400 non-null    object
 1   target_col      400 non-null    object
dtypes: object(2)
memory usage: 6.4+ KB


#### Dropping duplicates

In [None]:
# Count the rows whose 'datasheet_link' repeats a link seen earlier in the frame
duplicate_count = train_df.duplicated(subset='datasheet_link').sum()
print(f"Number of duplicate values: {duplicate_count}")

Number of duplicate values: 697


In [None]:
# Keep only the first row seen for each datasheet link; train_df is modified in place
train_df.drop_duplicates(subset=['datasheet_link'], keep='first', inplace=True)

In [None]:
# Flag the rows that have no datasheet link, then count the flagged rows
missing_links = train_df['datasheet_link'].isna()
num_missing_links = missing_links.sum()
print(f"Number of missing values in 'datasheet_link' column: {num_missing_links}")

Number of missing values in 'datasheet_link' column: 0


#### Dividing train data into 3 separate DataFrames

In [None]:
# Split the de-duplicated training rows into three consecutive, nearly equal chunks.
# The row positions are split with NumPy and the rows are then selected with .iloc.
# Passing the DataFrame itself to np.array_split goes through a deprecated pandas
# code path; the stray "return bound(*args, **kwds)" line in the recorded output
# looks like the tail of that warning. Chunk sizes are the same as before.
# (numpy is already imported as np in the notebook's import cell.)
row_position_chunks = np.array_split(np.arange(len(train_df)), 3)
train_df1, train_df2, train_df3 = (train_df.iloc[chunk] for chunk in row_position_chunks)

print(f"train_df1: {len(train_df1)} rows")
print(f"train_df2: {len(train_df2)} rows")
print(f"train_df3: {len(train_df3)} rows")

train_df1: 400 rows
train_df2: 399 rows
train_df3: 399 rows


  return bound(*args, **kwds)


In [None]:
# Global counter of extract_text_from_pdf calls. It is only used to label the
# progress and error messages with a "row" number, and it starts again from 0
# every time this cell is executed.
pdf_counter = 0

# Dedicated exception type so a timeout can be told apart from other failures
class TimeoutException(Exception):
    """Raised by the SIGALRM handler when processing a PDF takes too long."""

# Callback registered for SIGALRM: converts the alarm into a Python exception
def timeout_handler(signum, frame):
    """Raise TimeoutException; `signum` and `frame` are accepted but not used."""
    raise TimeoutException

# Install timeout_handler as the handler for SIGALRM, so that an alarm armed
# with signal.alarm(...) ends in a TimeoutException being raised.
# NOTE(review): SIGALRM exists only on Unix, and signal.signal() may only be
# called from the main thread — confirm this cell is never run elsewhere.
signal.signal(signal.SIGALRM, timeout_handler)

def fix_url(url):
    """Return `url` with an ``https://`` scheme added when it has no http(s) scheme.

    Surrounding whitespace is stripped first and the scheme check ignores case,
    so values such as ``" https://host/a.pdf"`` or ``"HTTP://host/a.pdf"`` are
    recognised as already having a scheme instead of being prefixed again.
    Leading slashes (e.g. a protocol-relative ``//host/a.pdf``) are removed
    before the scheme is prepended.

    Args:
        url: The link as a string.

    Returns:
        The stripped URL, prefixed with ``https://`` if it did not already start
        with ``http:`` or ``https:``.
    """
    url = url.strip()
    if not re.match(r'https?:', url, flags=re.IGNORECASE):
        url = 'https://' + url.lstrip('/')
    return url

def _ocr_leading_pages(pdf_url, model, max_pages):
    """Download `pdf_url` to temp.pdf, OCR its first `max_pages` pages and return the words joined by spaces."""
    pdf_path = gdown.download(pdf_url, 'temp.pdf', quiet=True)
    if pdf_path is None:
        # Fail with a clear message instead of passing None on to the PDF reader
        raise ValueError(f"Download failed for {pdf_url}")

    # Only the leading pages are read from the result, so only those are sent
    # through the OCR model. Assumes DocumentFile.from_pdf returns a list of
    # page images (it is passed straight to the model) — TODO confirm.
    pages = DocumentFile.from_pdf(pdf_path)[:max_pages]
    json_response = model(pages).export()

    return " ".join(
        word['value']
        for page in json_response['pages'][:max_pages]
        for block in page['blocks']
        for line in block['lines']
        for word in line['words']
    )


def extract_text_from_pdf(pdf_url, model, timeout_seconds=10, max_pages=2):
    """Download a PDF and return the OCR'd words of its first pages as one string.

    Failures are reported with ``print`` and turned into an empty string, so
    the function can be applied to a whole column without aborting the run.
    The global ``pdf_counter`` is incremented once per call and used as the
    "row" number in the messages.

    Args:
        pdf_url: Link to the PDF; passed through ``fix_url`` before downloading.
        model: Callable OCR predictor whose result has an ``export()`` method.
        timeout_seconds: Whole seconds allowed for download plus OCR before the
            SIGALRM handler raises ``TimeoutException``. The default (10) is the
            value that was previously hard-coded; the old comment claimed two
            minutes, which the code never implemented.
        max_pages: Number of leading pages to OCR (default 2, as before).

    Returns:
        The recognised words joined by single spaces, or ``""`` on timeout or
        any other error.

    Notes:
        * The alarm is cancelled in a ``finally`` clause. Previously it was only
          cancelled on success, so after a failed PDF a pending alarm could fire
          later, outside any ``try`` block, and abort unrelated code.
        * If the error message contains a "Perhaps you meant <url>" suggestion,
          that URL is tried once. The retry no longer recurses, so it cannot
          loop indefinitely and does not advance the row counter.
        * Relies on the SIGALRM handler registered in this notebook.
    """
    global pdf_counter
    pdf_counter += 1
    row_number = pdf_counter  # Use the global counter to get the current row number
    print(f"Processing row {row_number} with URL: {pdf_url}...")

    retried = False
    while True:
        try:
            # Fixing the URL if needed (inside the try so a malformed cell is skipped, not fatal)
            pdf_url = fix_url(pdf_url)

            signal.alarm(timeout_seconds)
            try:
                return _ocr_leading_pages(pdf_url, model, max_pages)
            finally:
                # Always disarm the alarm, whether the work succeeded or failed
                signal.alarm(0)
        except TimeoutException:
            print(f"Timeout reached for row {row_number}. Skipping PDF: {pdf_url}")
            return ""
        except Exception as e:
            suggestion = None
            if not retried and 'Invalid URL' in str(e):
                suggestion = re.search(r'Perhaps you meant (https?:[^\s]+)', str(e))
            if suggestion is None:
                print(f"Error processing PDF at row {row_number}: {e}")
                return ""
            # Retry exactly once with the URL suggested in the error message
            retried = True
            pdf_url = suggestion.group(1)
            print(f"Trying suggested URL: {pdf_url}")

# OCR every remaining datasheet link in order; each row's extracted words
# (or "" when that PDF failed) are stored in a new 'text' column
train_df['text'] = train_df['datasheet_link'].map(lambda link: extract_text_from_pdf(link, model))

# Write all columns, including the extracted text, to CSV without the index
train_df.to_csv('data_extracted.csv', index=False)
print("Processing completed and final result saved.")

Processing row 1 with URL: https://www.waclighting.com/product/emc2/?download=specs3&1676585363...
Processing row 2 with URL: https://www.waclighting.com/product/line/?download=specs3&1676602764...
Processing row 3 with URL: https://www.waclighting.com/product/stack/?download=specs3&1676620368...
Processing row 4 with URL: https://www.waclighting.com/storage/waclighting-images/specsheet_pdf/DC-PD05-CC_SPSHT.pdf...
Processing row 5 with URL: https://www.waclighting.com/storage/waclighting-images/specsheet_pdf/DC-PD05_SPSHT.pdf...
Processing row 6 with URL: https://www.waclighting.com/storage/waclighting-images/specsheet_pdf/DS-PD05_SPSHT.pdf...
Processing row 7 with URL: https://www.waclighting.com/product/loophole/?download=specs3&1676579637...
Processing row 8 with URL: https://www.waclighting.com/storage/waclighting-images/specsheet_pdf/DC-PD06_SPSHT.pdf...
Processing row 9 with URL: https://www.waclighting.com/storage/waclighting-images/specsheet_pdf/DS-PD06_SPSHT.pdf...
Processing 