In [24]:
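# Dependencies (uncomment to install into the kernel's environment):
# %pip install pywin32        # drives Microsoft Word over COM for the .doc conversion
# %pip install pdf2docx
# %pip install python-docx pandas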

In [None]:
# Convert all the doc files to docx files
import os
import win32com.client

def convert_doc_to_docx(doc_path):
    """Convert a legacy Word ``.doc`` file to ``.docx`` in the same directory.

    The conversion is done by Microsoft Word, driven over COM via ``win32com``.
    The source ``.doc`` is deleted only after the ``.docx`` has actually been
    written; if the conversion fails the original file is left in place. When
    the target ``.docx`` already exists nothing is converted or deleted.

    Args:
        doc_path: Path of the ``.doc`` file to convert.

    Returns:
        None. Progress and errors are reported with ``print``.
    """
    # splitext (instead of appending "x") also handles upper-case ".DOC" names.
    docx_path = os.path.splitext(doc_path)[0] + ".docx"
    if os.path.exists(docx_path):
        # Checked before Word is started so a skip cannot leave a Word process behind.
        print(f"Skipped (already exists): {docx_path}")
        return

    word = None
    doc = None
    converted = False
    try:
        word = win32com.client.Dispatch("Word.Application")
        word.Visible = False  # Run in the background

        print(f"Converting: {doc_path} -> {docx_path}")
        # Word resolves relative paths against its own working directory, not
        # the notebook's, so always hand it absolute paths.
        doc = word.Documents.Open(os.path.abspath(doc_path))
        doc.SaveAs(os.path.abspath(docx_path), FileFormat=16)  # FileFormat=16 = .docx
        converted = True
    except Exception as e:
        print(f"Error converting {doc_path}: {e}")
    finally:
        # Release the document and the Word process on every path, including errors.
        if doc is not None:
            try:
                doc.Close(False)  # False = do not prompt for / save changes
            except Exception as e:
                print(f"Warning: could not close {doc_path}: {e}")
        if word is not None:
            try:
                word.Quit()
            except Exception as e:
                print(f"Warning: could not quit Word: {e}")

    # Never delete the source unless the converted file really exists.
    if converted and os.path.exists(docx_path):
        print(f"Converted successfully: {docx_path}")
        os.remove(doc_path)


def process_folders(root_folder):
    """Walk ``root_folder`` recursively and convert every ``.doc`` file found.

    The extension check is case-insensitive; a name ending in ``.doc`` can
    never end in ``.docx``, so already-converted files are not picked up.
    """
    for current_dir, _subdirs, names in os.walk(root_folder):
        legacy_docs = (name for name in names if name.lower().endswith(".doc"))
        for name in legacy_docs:
            convert_doc_to_docx(os.path.join(current_dir, name))

# Example usage: point root_folder at the directory whose subfolders hold the .doc resumes.
# convert_doc_to_docx removes each source .doc, so run this on a copy of the data.
# NOTE(review): the recorded output of this cell shows an "E:\Other Projects\..." path,
# so this value appears to have been edited after the last run - confirm before re-running.
root_folder = r"C:\Resumes_Docx - Copy"  # Update this path
process_folders(root_folder)

Converting: E:\Other Projects\Resumes_Docx - Copy\Peoplesoft Resume\Peoplesoft Admin_Vinod Akkala.doc -> E:\Other Projects\Resumes_Docx - Copy\Peoplesoft Resume\Peoplesoft Admin_Vinod Akkala.docx


In [3]:
# Convert all the pdfs into docx
import os
from pdf2docx import Converter

def convert_pdf_to_docx(pdf_path):
    """Convert a ``.pdf`` file to ``.docx`` in the same directory.

    The source PDF is deleted only after the ``.docx`` has been produced; if
    the conversion fails the PDF is kept and any partial output is removed so
    that a later run retries it. When the target ``.docx`` already exists
    nothing is converted or deleted.

    Args:
        pdf_path: Path of the ``.pdf`` file to convert.

    Returns:
        None. Progress and errors are reported with ``print``.
    """
    docx_path = os.path.splitext(pdf_path)[0] + ".docx"  # Change extension to .docx

    if os.path.exists(docx_path):
        print(f"Skipped (already exists): {docx_path}")
        return

    print(f"Converting: {pdf_path} -> {docx_path}")
    cv = None
    converted = False
    try:
        cv = Converter(pdf_path)
        cv.convert(docx_path, start=0, end=None)
        converted = True
    except Exception as e:
        print(f"Error converting {pdf_path}: {e}")
    finally:
        # Close the converter on every path so the PDF handle is released
        # before the file is deleted below.
        if cv is not None:
            try:
                cv.close()
            except Exception as e:
                print(f"Warning: could not close converter for {pdf_path}: {e}")

    if not (converted and os.path.exists(docx_path)):
        # The target did not exist before this call, so anything at docx_path
        # is a partial result of the failed attempt; drop it so the
        # "already exists" check does not skip this file next time.
        if os.path.exists(docx_path):
            try:
                os.remove(docx_path)
            except OSError as e:
                print(f"Warning: could not remove partial output {docx_path}: {e}")
        return

    print(f"Converted successfully: {docx_path}")
    os.remove(pdf_path)

def process_folders(root_folder):
    """Walk ``root_folder`` recursively and convert every ``.pdf`` file found.

    The extension check is case-insensitive. Note that this definition reuses
    the name of the ``.doc`` variant defined in an earlier cell and replaces it
    once this cell has run.
    """
    for current_dir, _subdirs, names in os.walk(root_folder):
        pdf_names = (name for name in names if name.lower().endswith(".pdf"))
        for name in pdf_names:
            convert_pdf_to_docx(os.path.join(current_dir, name))

# Example usage: point root_folder at the directory whose subfolders hold the PDF resumes.
# convert_pdf_to_docx removes each source .pdf, so run this on a copy of the data.
root_folder = r"E:\Other Projects\Resumes_Docx_2"  # Update this path
process_folders(root_folder)

[INFO] Start to convert E:\Other Projects\Resumes_Docx_2\React Developer\Reactjs Developer_Prabakaran_Musquare Technologies.pdf
[INFO] [1;36m[1/4] Opening document...[0m
[INFO] [1;36m[2/4] Analyzing document...[0m


Converting: E:\Other Projects\Resumes_Docx_2\React Developer\Reactjs Developer_Prabakaran_Musquare Technologies.pdf -> E:\Other Projects\Resumes_Docx_2\React Developer\Reactjs Developer_Prabakaran_Musquare Technologies.docx


[INFO] [1;36m[3/4] Parsing pages...[0m
[INFO] (1/2) Page 1
[INFO] (2/2) Page 2
[INFO] [1;36m[4/4] Creating pages...[0m
[INFO] (1/2) Page 1
[INFO] (2/2) Page 2
[INFO] Terminated in 1.91s.


Converted successfully: E:\Other Projects\Resumes_Docx_2\React Developer\Reactjs Developer_Prabakaran_Musquare Technologies.docx


In [5]:
# Build a DataFrame listing the path of every file under the dataset folder
import os
import pandas as pd

def get_all_file_paths(folder_path):
    """Collect the path of every file below ``folder_path`` into a DataFrame.

    Subdirectories are searched recursively. The result has a single
    ``file_path`` column with one row per file, in ``os.walk`` order.
    """
    collected = [
        os.path.join(current_dir, name)  # full path of the file
        for current_dir, _subdirs, names in os.walk(folder_path)
        for name in names
    ]
    return pd.DataFrame(collected, columns=["file_path"])

# Example usage
# Raw string literal: "\O" and "\R" are not valid escape sequences, so the
# plain literal produced an invalid-escape warning. The resulting value is
# exactly the same path.
folder_path = r"E:\Other Projects\Resumes_Docx_2"  # Replace with your folder path
df = get_all_file_paths(folder_path)

# Display DataFrame
df

  folder_path = "E:\Other Projects\Resumes_Docx_2"  # Replace with your folder path


Unnamed: 0,file_path
0,E:\Other Projects\Resumes_Docx_2\Peoplesoft Re...
1,E:\Other Projects\Resumes_Docx_2\Peoplesoft Re...
2,E:\Other Projects\Resumes_Docx_2\Peoplesoft Re...
3,E:\Other Projects\Resumes_Docx_2\Peoplesoft Re...
4,E:\Other Projects\Resumes_Docx_2\Peoplesoft Re...
...,...
74,E:\Other Projects\Resumes_Docx_2\workday\Sri K...
75,E:\Other Projects\Resumes_Docx_2\workday\Srika...
76,E:\Other Projects\Resumes_Docx_2\workday\SSKum...
77,E:\Other Projects\Resumes_Docx_2\workday\Venka...


In [45]:
import re
from docx import Document

def extract_text_from_docx(docx_path):
    """Extract the text of a ``.docx`` file as one whitespace-normalised string.

    Paragraph text comes first, followed by table text with the cells of each
    row joined by ``" | "``. Empty paragraphs, cells and rows are dropped, and
    every run of spaces/newlines/tabs is collapsed to a single space.

    Args:
        docx_path: Path of the ``.docx`` file to read.

    Returns:
        The cleaned text. If the file cannot be read, the string
        ``"Error reading DOCX: <reason>"`` is returned instead of raising, so
        callers that need to tell failures apart must check for that prefix.
    """
    try:
        doc = Document(docx_path)

        # Extract text from paragraphs
        pieces = [para.text.strip() for para in doc.paragraphs if para.text.strip()]

        # Extract text from tables
        for table in doc.tables:
            # python-docx reports a merged cell once for every grid position it
            # spans (across columns and across rows), which would repeat its
            # text. Remember the underlying cell element and emit each one
            # once. Keeping the element itself as the dict value holds a
            # reference, so its id() cannot be reused while we iterate.
            seen_cells = {}
            for row in table.rows:
                row_text = []
                for cell in row.cells:
                    element = getattr(cell, "_tc", cell)
                    if id(element) in seen_cells:
                        continue
                    seen_cells[id(element)] = element
                    content = cell.text.strip()
                    if content:
                        row_text.append(content)
                if row_text:
                    pieces.append(" | ".join(row_text))  # Join table cells with "|"

        # Join with spaces, then collapse multiple spaces/newlines/tabs into one space
        cleaned_text = re.sub(r'\s+', ' ', " ".join(pieces))

        return cleaned_text.strip()

    except Exception as e:
        return f"Error reading DOCX: {e}"


In [47]:
# Extract the cleaned text of every listed file into a new "text" column.
df['text'] = df['file_path'].map(extract_text_from_docx)

In [49]:
# Label = first folder below the dataset root (the resume category). Derived
# relative to folder_path, the root df was built from, instead of a fixed
# position in the absolute path, so it still works if the dataset is moved.
df['target'] = df['file_path'].apply(lambda x: os.path.relpath(x, folder_path).split(os.sep)[0])

In [51]:
# Preview the first rows: file path, extracted text and target label.
df.head()

Unnamed: 0,file_path,text,target
0,E:\Other Projects\Resumes_Docx_2\Peoplesoft Re...,Anubhav Kumar Singh To work in a globally comp...,Peoplesoft Resume
1,E:\Other Projects\Resumes_Docx_2\Peoplesoft Re...,Profile Summary: 7+ years of experience in imp...,Peoplesoft Resume
2,E:\Other Projects\Resumes_Docx_2\Peoplesoft Re...,PeopleSoft Database Administrator Gangareddy P...,Peoplesoft Resume
3,E:\Other Projects\Resumes_Docx_2\Peoplesoft Re...,Murali Experience Summary I have 6 years of ex...,Peoplesoft Resume
4,E:\Other Projects\Resumes_Docx_2\Peoplesoft Re...,"Priyanka Ramadoss 61/46, MountPleasant, Coonoo...",Peoplesoft Resume


In [57]:
# Save the labelled dataset (file_path, text, target) to CSV without the index column.
df.to_csv('data_set_csv.csv', index=False)