In [None]:
import spacy
import pdfplumber
import re

# Load spaCy's English pipeline ("en_core_web_sm") into the shared `nlp`
# object; extract_names uses it for entity recognition and
# filter_director_info uses it for sentence splitting.
nlp = spacy.load("en_core_web_sm")


def extract_names(text):
    """Return the set of multi-word PERSON entities spaCy finds in *text*.

    An entity is kept only when it is labelled ``PERSON``, consists of more
    than one whitespace-separated token, and begins with an uppercase
    character.
    """
    return {
        entity.text
        for entity in nlp(text).ents
        if entity.label_ == "PERSON"
        and len(entity.text.split()) > 1
        and entity.text[0].isupper()
    }


# Function to extract text from PDF and apply NER
def extract_names_from_pdf(pdf_path):
    """Extract person names from every page of the PDF at *pdf_path*.

    The text of all pages is concatenated (without a separator) and handed
    to ``extract_names``; that function's set of names is returned.
    """
    with pdfplumber.open(pdf_path) as pdf:
        # extract_text() may yield None for a page with no text layer
        # (depending on the pdfplumber version); treat that as an empty
        # page rather than letting the concatenation raise TypeError.
        text = "".join(page.extract_text() or "" for page in pdf.pages)

    return extract_names(text)


# Function to filter names based on their appearance in sentences containing "director"
def filter_director_info(names, text):
    """Map names to (director type, DIN set) using sentences mentioning "director".

    *text* is split into sentences with spaCy. For each sentence containing
    "director" (case-insensitive), the first entry of *names* (in iteration
    order) found verbatim in the sentence is recorded; any further names in
    that sentence are ignored. A later sentence matching the same name
    overwrites the earlier entry.
    """
    results = {}
    for sent in nlp(text).sents:
        sent_text = sent.text
        if "director" not in sent_text.lower():
            continue

        # Only the first matching name per sentence is considered.
        matched = next((candidate for candidate in names if candidate in sent_text), None)
        if matched is None:
            continue

        kind = classify_director_type(sent_text)
        dins = extract_din_for_director(sent_text, matched)
        results[matched] = (kind, dins)
    return results


# Placeholder function for classification of director types
def classify_director_type(sentence_text):
    """Label a sentence as "Independent" or "Executive".

    Placeholder keyword rule: the result is "Independent" when the word
    "independent" occurs anywhere in the sentence (case-insensitive), and
    "Executive" for everything else.
    """
    mentions_independent = "independent" in sentence_text.lower()
    return "Independent" if mentions_independent else "Executive"


# Function to extract DIN numbers associated with directors
def extract_din_for_director(text, director_name, window=50):
    """Return the set of DINs in *text* that appear close to *director_name*.

    A DIN is assumed to be any standalone 8-digit number. Every occurrence
    is checked on its own: it is accepted when the director's name
    (case-insensitive) appears in the slice of *text* running from
    ``window`` characters before the start of the number to ``window``
    characters after the start of the number.

    Args:
        text: Text to search, typically a single sentence.
        director_name: Name to look for around each candidate number.
        window: Number of characters inspected on each side of the start
            of a candidate number.

    Returns:
        A set of the matching 8-digit strings; empty when nothing matches.
    """
    din_pattern = re.compile(r'\b\d{8}\b')  # Assuming DIN is an 8-digit number
    wanted = director_name.lower()
    director_dins = set()  # Use a set to store unique DIN numbers
    for match in din_pattern.finditer(text):
        position = match.start()
        # Clamp at 0: a negative slice start would wrap around to the end of
        # the string and drop the text that precedes an early DIN.
        nearby = text[max(0, position - window):position + window]
        if wanted in nearby.lower():
            director_dins.add(match.group())
    return director_dins


# Example usage:
pdf_path = "C:\\Users\\User\\Downloads\\IPBL\\pdfextractor\\data\\pdf_files\\equitas.pdf"

# Read the PDF once and reuse the text for both name extraction and director
# filtering. extract_text() may return None for a page without a text layer,
# so fall back to "" to keep the concatenation from raising TypeError.
with pdfplumber.open(pdf_path) as pdf:
    text = "".join(page.extract_text() or "" for page in pdf.pages)

all_names = extract_names(text)
director_info = filter_director_info(all_names, text)

# Print director names, types, and DIN numbers separately
for name, (director_type, director_din) in director_info.items():
    print("Director Name:", name)
    print("Director Type:", director_type)
    print("DIN Numbers:")
    # Sort so the listing is stable from run to run (set order is not).
    for din in sorted(director_din):
        print("-", din)
    print("-------------------")

Director Name: Anil Kumar
Director Type: Executive
DIN Numbers:
-------------------
Director Name: Arun Ramanathan
Director Type: Executive
DIN Numbers:
- 00308848
-------------------


In [None]:
import spacy
import pdfplumber
import re

# Load spaCy's English pipeline ("en_core_web_sm") into the shared `nlp`
# object; extract_names uses it for entity recognition and
# filter_director_info uses it for sentence splitting.
nlp = spacy.load("en_core_web_sm")


def extract_names(text):
    """Return the set of multi-word PERSON entities spaCy finds in *text*.

    An entity is kept only when it is labelled ``PERSON``, consists of more
    than one whitespace-separated token, and begins with an uppercase
    character.
    """
    return {
        entity.text
        for entity in nlp(text).ents
        if entity.label_ == "PERSON"
        and len(entity.text.split()) > 1
        and entity.text[0].isupper()
    }


# Function to extract text from PDF and apply NER
def extract_names_from_pdf(pdf_path):
    """Extract person names from every page of the PDF at *pdf_path*.

    The text of all pages is concatenated (without a separator) and handed
    to ``extract_names``; that function's set of names is returned.
    """
    with pdfplumber.open(pdf_path) as pdf:
        # extract_text() may yield None for a page with no text layer
        # (depending on the pdfplumber version); treat that as an empty
        # page rather than letting the concatenation raise TypeError.
        text = "".join(page.extract_text() or "" for page in pdf.pages)

    return extract_names(text)


# Function to filter names based on their appearance in sentences containing "director"
def filter_director_info(names, text):
    """Map names to (director type, DIN set) using sentences mentioning "director".

    *text* is split into sentences with spaCy. For each sentence containing
    "director" (case-insensitive), the first entry of *names* (in iteration
    order) found verbatim in the sentence is recorded; any further names in
    that sentence are ignored. A later sentence matching the same name
    overwrites the earlier entry.
    """
    results = {}
    for sent in nlp(text).sents:
        sent_text = sent.text
        if "director" not in sent_text.lower():
            continue

        # Only the first matching name per sentence is considered.
        matched = next((candidate for candidate in names if candidate in sent_text), None)
        if matched is None:
            continue

        kind = classify_director_type(sent_text)
        dins = extract_din_for_director(sent_text, matched)
        results[matched] = (kind, dins)
    return results


# Placeholder function for classification of director types
def classify_director_type(sentence_text):
    """Label a sentence as "Independent" or "Executive".

    Placeholder keyword rule: the result is "Independent" when the word
    "independent" occurs anywhere in the sentence (case-insensitive), and
    "Executive" for everything else.
    """
    mentions_independent = "independent" in sentence_text.lower()
    return "Independent" if mentions_independent else "Executive"


# Function to extract DIN numbers associated with directors
def extract_din_for_director(text, director_name, window=50):
    """Return the set of DINs in *text* that appear close to *director_name*.

    A DIN is assumed to be any standalone 8-digit number. Every occurrence
    is checked on its own: it is accepted when the director's name
    (case-insensitive) appears in the slice of *text* running from
    ``window`` characters before the start of the number to ``window``
    characters after the start of the number.

    Args:
        text: Text to search, typically a single sentence.
        director_name: Name to look for around each candidate number.
        window: Number of characters inspected on each side of the start
            of a candidate number.

    Returns:
        A set of the matching 8-digit strings; empty when nothing matches.
    """
    din_pattern = re.compile(r'\b\d{8}\b')  # Assuming DIN is an 8-digit number
    wanted = director_name.lower()
    director_dins = set()  # Use a set to store unique DIN numbers
    for match in din_pattern.finditer(text):
        position = match.start()
        # Clamp at 0: a negative slice start would wrap around to the end of
        # the string and drop the text that precedes an early DIN.
        nearby = text[max(0, position - window):position + window]
        if wanted in nearby.lower():
            director_dins.add(match.group())
    return director_dins


# Example usage:
pdf_path = "C:\\Users\\User\\Downloads\\IPBL\\pdfextractor\\data\\pdf_files\\polyplex.pdf"

# Read the PDF once and reuse the text for both name extraction and director
# filtering. extract_text() may return None for a page without a text layer,
# so fall back to "" to keep the concatenation from raising TypeError.
with pdfplumber.open(pdf_path) as pdf:
    text = "".join(page.extract_text() or "" for page in pdf.pages)

all_names = extract_names(text)
director_info = filter_director_info(all_names, text)

# Print director names, types, and DIN numbers separately
for name, (director_type, director_din) in director_info.items():
    print("Director Name:", name)
    print("Director Type:", director_type)
    print("DIN Numbers:")
    # Sort so the listing is stable from run to run (set order is not).
    for din in sorted(director_din):
        print("-", din)
    print("-------------------")

Director Name: Yogesh Kapur
Director Type: Executive
DIN Numbers:
-------------------


In [None]:
import spacy
import pdfplumber
import re

# Load spaCy's English pipeline ("en_core_web_sm") into the shared `nlp`
# object; extract_names uses it for entity recognition and
# filter_director_info uses it for sentence splitting.
nlp = spacy.load("en_core_web_sm")


def extract_names(text):
    """Return the set of multi-word PERSON entities spaCy finds in *text*.

    An entity is kept only when it is labelled ``PERSON``, consists of more
    than one whitespace-separated token, and begins with an uppercase
    character.
    """
    return {
        entity.text
        for entity in nlp(text).ents
        if entity.label_ == "PERSON"
        and len(entity.text.split()) > 1
        and entity.text[0].isupper()
    }


# Function to extract text from PDF and apply NER
def extract_names_from_pdf(pdf_path):
    """Extract person names from every page of the PDF at *pdf_path*.

    The text of all pages is concatenated (without a separator) and handed
    to ``extract_names``; that function's set of names is returned.
    """
    with pdfplumber.open(pdf_path) as pdf:
        # extract_text() may yield None for a page with no text layer
        # (depending on the pdfplumber version); treat that as an empty
        # page rather than letting the concatenation raise TypeError.
        text = "".join(page.extract_text() or "" for page in pdf.pages)

    return extract_names(text)


# Function to filter names based on their appearance in sentences containing "director"
def filter_director_info(names, text):
    """Map names to (director type, DIN set) using sentences mentioning "director".

    *text* is split into sentences with spaCy. For each sentence containing
    "director" (case-insensitive), the first entry of *names* (in iteration
    order) found verbatim in the sentence is recorded; any further names in
    that sentence are ignored. A later sentence matching the same name
    overwrites the earlier entry.
    """
    results = {}
    for sent in nlp(text).sents:
        sent_text = sent.text
        if "director" not in sent_text.lower():
            continue

        # Only the first matching name per sentence is considered.
        matched = next((candidate for candidate in names if candidate in sent_text), None)
        if matched is None:
            continue

        kind = classify_director_type(sent_text)
        dins = extract_din_for_director(sent_text, matched)
        results[matched] = (kind, dins)
    return results


# Placeholder function for classification of director types
def classify_director_type(sentence_text):
    """Label a sentence as "Independent" or "Executive".

    Placeholder keyword rule: the result is "Independent" when the word
    "independent" occurs anywhere in the sentence (case-insensitive), and
    "Executive" for everything else.
    """
    mentions_independent = "independent" in sentence_text.lower()
    return "Independent" if mentions_independent else "Executive"


# Function to extract DIN numbers associated with directors
def extract_din_for_director(text, director_name, window=50):
    """Return the set of DINs in *text* that appear close to *director_name*.

    A DIN is assumed to be any standalone 8-digit number. Every occurrence
    is checked on its own: it is accepted when the director's name
    (case-insensitive) appears in the slice of *text* running from
    ``window`` characters before the start of the number to ``window``
    characters after the start of the number.

    Args:
        text: Text to search, typically a single sentence.
        director_name: Name to look for around each candidate number.
        window: Number of characters inspected on each side of the start
            of a candidate number.

    Returns:
        A set of the matching 8-digit strings; empty when nothing matches.
    """
    din_pattern = re.compile(r'\b\d{8}\b')  # Assuming DIN is an 8-digit number
    wanted = director_name.lower()
    director_dins = set()  # Use a set to store unique DIN numbers
    for match in din_pattern.finditer(text):
        position = match.start()
        # Clamp at 0: a negative slice start would wrap around to the end of
        # the string and drop the text that precedes an early DIN.
        nearby = text[max(0, position - window):position + window]
        if wanted in nearby.lower():
            director_dins.add(match.group())
    return director_dins


# Example usage:
pdf_path = "C:\\Users\\User\\Downloads\\IPBL\\pdfextractor\\data\\pdf_files\\Triveni1.pdf"

# Read the PDF once and reuse the text for both name extraction and director
# filtering. extract_text() may return None for a page without a text layer,
# so fall back to "" to keep the concatenation from raising TypeError.
with pdfplumber.open(pdf_path) as pdf:
    text = "".join(page.extract_text() or "" for page in pdf.pages)

all_names = extract_names(text)
director_info = filter_director_info(all_names, text)

# Print director names, types, and DIN numbers separately
for name, (director_type, director_din) in director_info.items():
    print("Director Name:", name)
    print("Director Type:", director_type)
    print("DIN Numbers:")
    # Sort so the listing is stable from run to run (set order is not).
    for din in sorted(director_din):
        print("-", din)
    print("-------------------")

Director Name: Amrita
Gangotra
Director Type: Independent
DIN Numbers:
- 08333492
-------------------
Director Name: Sonu Halan Bhasin
Director Type: Independent
DIN Numbers:
-------------------
Director Name: Dhruv M. Sawhney
Director Type: Executive
DIN Numbers:
-------------------
Director Name: Mahindra First Choice Wheels
Director Type: Executive
DIN Numbers:
-------------------
Director Name: Amrita Gangotra
Director Type: Independent
DIN Numbers:
-------------------
Director Name: Nikhil Sawhney
Director Type: Independent
DIN Numbers:
-------------------
Director Name: Hosiery Complex
Director Type: Executive
DIN Numbers:
-------------------


In [None]:
import spacy
import pdfplumber
import re

# Load spaCy's English pipeline ("en_core_web_sm") into the shared `nlp`
# object; extract_names uses it for entity recognition and
# filter_director_info uses it for sentence splitting.
nlp = spacy.load("en_core_web_sm")


def extract_names(text):
    """Return the set of multi-word PERSON entities spaCy finds in *text*.

    An entity is kept only when it is labelled ``PERSON``, consists of more
    than one whitespace-separated token, and begins with an uppercase
    character.
    """
    return {
        entity.text
        for entity in nlp(text).ents
        if entity.label_ == "PERSON"
        and len(entity.text.split()) > 1
        and entity.text[0].isupper()
    }


# Function to extract text from PDF and apply NER
def extract_names_from_pdf(pdf_path):
    """Extract person names from every page of the PDF at *pdf_path*.

    The text of all pages is concatenated (without a separator) and handed
    to ``extract_names``; that function's set of names is returned.
    """
    with pdfplumber.open(pdf_path) as pdf:
        # extract_text() may yield None for a page with no text layer
        # (depending on the pdfplumber version); treat that as an empty
        # page rather than letting the concatenation raise TypeError.
        text = "".join(page.extract_text() or "" for page in pdf.pages)

    return extract_names(text)


# Function to filter names based on their appearance in sentences containing "director"
def filter_director_info(names, text):
    """Map names to (director type, DIN set) using sentences mentioning "director".

    *text* is split into sentences with spaCy. For each sentence containing
    "director" (case-insensitive), the first entry of *names* (in iteration
    order) found verbatim in the sentence is recorded; any further names in
    that sentence are ignored. A later sentence matching the same name
    overwrites the earlier entry.
    """
    results = {}
    for sent in nlp(text).sents:
        sent_text = sent.text
        if "director" not in sent_text.lower():
            continue

        # Only the first matching name per sentence is considered.
        matched = next((candidate for candidate in names if candidate in sent_text), None)
        if matched is None:
            continue

        kind = classify_director_type(sent_text)
        dins = extract_din_for_director(sent_text, matched)
        results[matched] = (kind, dins)
    return results


# Placeholder function for classification of director types
def classify_director_type(sentence_text):
    """Label a sentence as "Independent" or "Executive".

    Placeholder keyword rule: the result is "Independent" when the word
    "independent" occurs anywhere in the sentence (case-insensitive), and
    "Executive" for everything else.
    """
    mentions_independent = "independent" in sentence_text.lower()
    return "Independent" if mentions_independent else "Executive"


# Function to extract DIN numbers associated with directors
def extract_din_for_director(text, director_name, window=50):
    """Return the set of DINs in *text* that appear close to *director_name*.

    A DIN is assumed to be any standalone 8-digit number. Every occurrence
    is checked on its own: it is accepted when the director's name
    (case-insensitive) appears in the slice of *text* running from
    ``window`` characters before the start of the number to ``window``
    characters after the start of the number.

    Args:
        text: Text to search, typically a single sentence.
        director_name: Name to look for around each candidate number.
        window: Number of characters inspected on each side of the start
            of a candidate number.

    Returns:
        A set of the matching 8-digit strings; empty when nothing matches.
    """
    din_pattern = re.compile(r'\b\d{8}\b')  # Assuming DIN is an 8-digit number
    wanted = director_name.lower()
    director_dins = set()  # Use a set to store unique DIN numbers
    for match in din_pattern.finditer(text):
        position = match.start()
        # Clamp at 0: a negative slice start would wrap around to the end of
        # the string and drop the text that precedes an early DIN.
        nearby = text[max(0, position - window):position + window]
        if wanted in nearby.lower():
            director_dins.add(match.group())
    return director_dins


# Example usage:
pdf_path = "C:\\Users\\User\\Downloads\\IPBL\\pdfextractor\\data\\pdf_files\\Triveni2.pdf"

# Read the PDF once and reuse the text for both name extraction and director
# filtering. extract_text() may return None for a page without a text layer,
# so fall back to "" to keep the concatenation from raising TypeError.
with pdfplumber.open(pdf_path) as pdf:
    text = "".join(page.extract_text() or "" for page in pdf.pages)

all_names = extract_names(text)
director_info = filter_director_info(all_names, text)

# Print director names, types, and DIN numbers separately
for name, (director_type, director_din) in director_info.items():
    print("Director Name:", name)
    print("Director Type:", director_type)
    print("DIN Numbers:")
    # Sort so the listing is stable from run to run (set order is not).
    for din in sorted(director_din):
        print("-", din)
    print("-------------------")

Director Name: Bandra-Kurla Complex
Director Type: Independent
DIN Numbers:
-------------------
Director Name: Amrita Gangotra
Director Type: Executive
DIN Numbers:
- 08333492
-------------------
Director Name: Dhruv M.
Director Type: Executive
DIN Numbers:
-------------------
Director Name: NM Joshi Marg
Director Type: Executive
DIN Numbers:
-------------------
Director Name: Dhruv M.
Sawhney
Director Type: Executive
DIN Numbers:
- 00102999
-------------------
Director Name: Dhruv M.
Resolution
Director Type: Independent
DIN Numbers:
-------------------
Director Name: Schedule V
Director Type: Executive
DIN Numbers:
-------------------
Director Name: Noida Rajiv Sawhney
Date
Director Type: Independent
DIN Numbers:
-------------------
Director Name: Pallavi Mhatre
Director Type: Independent
DIN Numbers:
-------------------
Director Name: Aadhar Card
Director Type: Executive
DIN Numbers:
-------------------
Director Name: Homai A. Daruwalla
Director Type: Independent
DIN Numbers:
- 0036

In [None]:
import spacy
import pdfplumber
import re

# Load spaCy's English pipeline ("en_core_web_sm") into the shared `nlp`
# object; extract_names uses it for entity recognition and
# filter_director_info uses it for sentence splitting.
nlp = spacy.load("en_core_web_sm")


def extract_names(text):
    """Return the set of multi-word PERSON entities spaCy finds in *text*.

    An entity is kept only when it is labelled ``PERSON``, consists of more
    than one whitespace-separated token, and begins with an uppercase
    character.
    """
    return {
        entity.text
        for entity in nlp(text).ents
        if entity.label_ == "PERSON"
        and len(entity.text.split()) > 1
        and entity.text[0].isupper()
    }


# Function to extract text from PDF and apply NER
def extract_names_from_pdf(pdf_path):
    """Extract person names from every page of the PDF at *pdf_path*.

    The text of all pages is concatenated (without a separator) and handed
    to ``extract_names``; that function's set of names is returned.
    """
    with pdfplumber.open(pdf_path) as pdf:
        # extract_text() may yield None for a page with no text layer
        # (depending on the pdfplumber version); treat that as an empty
        # page rather than letting the concatenation raise TypeError.
        text = "".join(page.extract_text() or "" for page in pdf.pages)

    return extract_names(text)


# Function to filter names based on their appearance in sentences containing "director"
def filter_director_info(names, text):
    """Map names to (director type, DIN set) using sentences mentioning "director".

    *text* is split into sentences with spaCy. For each sentence containing
    "director" (case-insensitive), the first entry of *names* (in iteration
    order) found verbatim in the sentence is recorded; any further names in
    that sentence are ignored. A later sentence matching the same name
    overwrites the earlier entry.
    """
    results = {}
    for sent in nlp(text).sents:
        sent_text = sent.text
        if "director" not in sent_text.lower():
            continue

        # Only the first matching name per sentence is considered.
        matched = next((candidate for candidate in names if candidate in sent_text), None)
        if matched is None:
            continue

        kind = classify_director_type(sent_text)
        dins = extract_din_for_director(sent_text, matched)
        results[matched] = (kind, dins)
    return results


# Placeholder function for classification of director types
def classify_director_type(sentence_text):
    """Label a sentence as "Independent" or "Executive".

    Placeholder keyword rule: the result is "Independent" when the word
    "independent" occurs anywhere in the sentence (case-insensitive), and
    "Executive" for everything else.
    """
    mentions_independent = "independent" in sentence_text.lower()
    return "Independent" if mentions_independent else "Executive"


# Function to extract DIN numbers associated with directors
def extract_din_for_director(text, director_name, window=50):
    """Return the set of DINs in *text* that appear close to *director_name*.

    A DIN is assumed to be any standalone 8-digit number. Every occurrence
    is checked on its own: it is accepted when the director's name
    (case-insensitive) appears in the slice of *text* running from
    ``window`` characters before the start of the number to ``window``
    characters after the start of the number.

    Args:
        text: Text to search, typically a single sentence.
        director_name: Name to look for around each candidate number.
        window: Number of characters inspected on each side of the start
            of a candidate number.

    Returns:
        A set of the matching 8-digit strings; empty when nothing matches.
    """
    din_pattern = re.compile(r'\b\d{8}\b')  # Assuming DIN is an 8-digit number
    wanted = director_name.lower()
    director_dins = set()  # Use a set to store unique DIN numbers
    for match in din_pattern.finditer(text):
        position = match.start()
        # Clamp at 0: a negative slice start would wrap around to the end of
        # the string and drop the text that precedes an early DIN.
        nearby = text[max(0, position - window):position + window]
        if wanted in nearby.lower():
            director_dins.add(match.group())
    return director_dins


# Example usage:
pdf_path = "C:\\Users\\User\\Downloads\\IPBL\\pdfextractor\\data\\pdf_files\\vinati.pdf"

# Read the PDF once and reuse the text for both name extraction and director
# filtering. extract_text() may return None for a page without a text layer,
# so fall back to "" to keep the concatenation from raising TypeError.
with pdfplumber.open(pdf_path) as pdf:
    text = "".join(page.extract_text() or "" for page in pdf.pages)

all_names = extract_names(text)
director_info = filter_director_info(all_names, text)

# Print director names, types, and DIN numbers separately
for name, (director_type, director_din) in director_info.items():
    print("Director Name:", name)
    print("Director Type:", director_type)
    print("DIN Numbers:")
    # Sort so the listing is stable from run to run (set order is not).
    for din in sorted(director_din):
        print("-", din)
    print("-------------------")

Director Name: Prashant Barve
Director Type: Executive
DIN Numbers:
-------------------
Director Name: M. Lakshmi
Director Type: Executive
DIN Numbers:
-------------------
Director Name: J. C. Laddha
Director Type: Executive
DIN Numbers:
-------------------
Director Name: Item Nos
Director Type: Independent
DIN Numbers:
-------------------
Director Name: Lagnam Spintex Limited
Director Type: Independent
DIN Numbers:
-------------------


In [None]:
import spacy
import pdfplumber
import re

# Load spaCy's English pipeline ("en_core_web_sm") into the shared `nlp`
# object; extract_names uses it for entity recognition and
# filter_director_info uses it for sentence splitting.
nlp = spacy.load("en_core_web_sm")


def extract_names(text):
    """Return the set of multi-word PERSON entities spaCy finds in *text*.

    An entity is kept only when it is labelled ``PERSON``, consists of more
    than one whitespace-separated token, and begins with an uppercase
    character.
    """
    return {
        entity.text
        for entity in nlp(text).ents
        if entity.label_ == "PERSON"
        and len(entity.text.split()) > 1
        and entity.text[0].isupper()
    }


# Function to extract text from PDF and apply NER
def extract_names_from_pdf(pdf_path):
    """Extract person names from every page of the PDF at *pdf_path*.

    The text of all pages is concatenated (without a separator) and handed
    to ``extract_names``; that function's set of names is returned.
    """
    with pdfplumber.open(pdf_path) as pdf:
        # extract_text() may yield None for a page with no text layer
        # (depending on the pdfplumber version); treat that as an empty
        # page rather than letting the concatenation raise TypeError.
        text = "".join(page.extract_text() or "" for page in pdf.pages)

    return extract_names(text)


# Function to filter names based on their appearance in sentences containing "director"
def filter_director_info(names, text):
    """Map names to (director type, DIN set) using sentences mentioning "director".

    *text* is split into sentences with spaCy. For each sentence containing
    "director" (case-insensitive), the first entry of *names* (in iteration
    order) found verbatim in the sentence is recorded; any further names in
    that sentence are ignored. A later sentence matching the same name
    overwrites the earlier entry.
    """
    results = {}
    for sent in nlp(text).sents:
        sent_text = sent.text
        if "director" not in sent_text.lower():
            continue

        # Only the first matching name per sentence is considered.
        matched = next((candidate for candidate in names if candidate in sent_text), None)
        if matched is None:
            continue

        kind = classify_director_type(sent_text)
        dins = extract_din_for_director(sent_text, matched)
        results[matched] = (kind, dins)
    return results


# Placeholder function for classification of director types
def classify_director_type(sentence_text):
    """Label a sentence as "Independent" or "Executive".

    Placeholder keyword rule: the result is "Independent" when the word
    "independent" occurs anywhere in the sentence (case-insensitive), and
    "Executive" for everything else.
    """
    mentions_independent = "independent" in sentence_text.lower()
    return "Independent" if mentions_independent else "Executive"


# Function to extract DIN numbers associated with directors
def extract_din_for_director(text, director_name, window=50):
    """Return the set of DINs in *text* that appear close to *director_name*.

    A DIN is assumed to be any standalone 8-digit number. Every occurrence
    is checked on its own: it is accepted when the director's name
    (case-insensitive) appears in the slice of *text* running from
    ``window`` characters before the start of the number to ``window``
    characters after the start of the number.

    Args:
        text: Text to search, typically a single sentence.
        director_name: Name to look for around each candidate number.
        window: Number of characters inspected on each side of the start
            of a candidate number.

    Returns:
        A set of the matching 8-digit strings; empty when nothing matches.
    """
    din_pattern = re.compile(r'\b\d{8}\b')  # Assuming DIN is an 8-digit number
    wanted = director_name.lower()
    director_dins = set()  # Use a set to store unique DIN numbers
    for match in din_pattern.finditer(text):
        position = match.start()
        # Clamp at 0: a negative slice start would wrap around to the end of
        # the string and drop the text that precedes an early DIN.
        nearby = text[max(0, position - window):position + window]
        if wanted in nearby.lower():
            director_dins.add(match.group())
    return director_dins


# Example usage:
pdf_path = "C:\\Users\\User\\Downloads\\IPBL\\pdfextractor\\data\\pdf_files\\varun.pdf"

# Read the PDF once and reuse the text for both name extraction and director
# filtering. extract_text() may return None for a page without a text layer,
# so fall back to "" to keep the concatenation from raising TypeError.
with pdfplumber.open(pdf_path) as pdf:
    text = "".join(page.extract_text() or "" for page in pdf.pages)

all_names = extract_names(text)
director_info = filter_director_info(all_names, text)

# Print director names, types, and DIN numbers separately
for name, (director_type, director_din) in director_info.items():
    print("Director Name:", name)
    print("Director Type:", director_type)
    print("DIN Numbers:")
    # Sort so the listing is stable from run to run (set order is not).
    for din in sorted(director_din):
        print("-", din)
    print("-------------------")

Director Name: Raj Gandhi
Director Type: Executive
DIN Numbers:
- 00003649
-------------------
Director Name: Varun Jaipuria
Director Type: Executive
DIN Numbers:
-------------------
Director Name: Rajinder Jeet Singh Bagga
Director Type: Executive
DIN Numbers:
- 08440479
-------------------
Director Name: Naresh Trehan
Director Type: Independent
DIN Numbers:
-------------------
Director Name: Bagga Ravi Batra
Director Type: Executive
DIN Numbers:
-------------------
