The data for each person in the sample was collected iteratively and stored in zip files. Before any analysis, the data needs to be transformed into a DataFrame.

In [None]:
def collect_zip_files(source_folder, destination_folder):
    """Copy every zip archive found one level below ``source_folder``.

    Only the immediate subfolders of ``source_folder`` are scanned (no
    recursion; files lying directly in ``source_folder`` are ignored). The
    ``.zip`` suffix is matched case-insensitively. When a file with the same
    name already exists in the destination, ``_1``, ``_2``, ... is appended
    to the stem so that nothing is overwritten.

    Args:
        source_folder: Folder whose subfolders contain the zip files.
        destination_folder: Folder receiving the copies; it is created
            (together with missing parents) when absent.
    """
    source_path = Path(source_folder)
    destination_path = Path(destination_folder)
    destination_path.mkdir(parents=True, exist_ok=True)
    resolved_destination = destination_path.resolve()

    total_files_collected = 0

    # Sorted traversal keeps the "_<n>" suffixes reproducible between runs.
    for subfolder in sorted(source_path.iterdir()):
        if not subfolder.is_dir():
            continue
        # Never re-collect from the destination if it lives inside the source.
        if subfolder.resolve() == resolved_destination:
            continue

        # A single pass with a case-insensitive suffix test lists each file
        # exactly once and also catches spellings such as ".Zip".
        zip_files = sorted(
            entry for entry in subfolder.iterdir()
            if entry.is_file() and entry.suffix.lower() == '.zip'
        )

        for zip_file in zip_files:
            destination_file = destination_path / zip_file.name

            # Find the first free name instead of overwriting an earlier copy.
            counter = 1
            while destination_file.exists():
                new_name = f"{zip_file.stem}_{counter}{zip_file.suffix}"
                destination_file = destination_path / new_name
                counter += 1

            shutil.copy2(zip_file, destination_file)
            total_files_collected += 1

    print(f"Total zip files collected: {total_files_collected}")

# Gather the zip exports from every subfolder of `source_folder` into one place.
# NOTE: both paths are absolute and machine-specific; adjust before running elsewhere.
# NOTE(review): "Coolection" looks like a typo for "Collection" - left untouched
# because it is the folder name used at runtime.
source_folder = "/Users/shantanusharma/Desktop/untitled folder 2"
destination_folder = "/Users/shantanusharma/Desktop/Data Coolection Docs"
collect_zip_files(source_folder, destination_folder)

In [None]:
def read_docx_file(file_path: str) -> str:
    """Return the text of a .docx file, one non-blank paragraph per line.

    Any failure while opening or reading the document is reported on stdout
    and an empty string is returned instead of raising.
    """
    try:
        non_blank = [
            para.text
            for para in Document(file_path).paragraphs
            if para.text.strip()
        ]
    except Exception as e:
        print(f"Error reading {file_path}: {e}")
        return ""
    return '\n'.join(non_blank)
    
def extract_document_components(document_text: str) -> Dict[str, object]:
    """Extract source name, body text, and year from document.

    Args:
        document_text: Plain text of one article, lines separated by ``\\n``.

    Returns:
        A dict with three keys:
            ``source_name``: the first non-empty line that is directly
                followed by a line containing a month name and a four-digit
                number, skipping lines that start with ``#`` or ``!`` or are
                fully upper-case; ``None`` when no such line exists.
            ``body_text``: whitespace-normalised text between the "Body"
                marker and the "Load-Date" marker (``""`` when not found).
            ``year``: first four-digit number after a Load-Date marker (or
                right before "End of Document") as ``int``, else ``None``.
    """
    if not document_text:
        return {'source_name': None, 'body_text': "", 'year': None}

    # Extract source name.
    # Word boundaries keep "May" from matching inside words such as "Mayor".
    date_line = re.compile(
        r'\b(?:January|February|March|April|May|June|July|August|September'
        r'|October|November|December)\b.*\d{4}'
    )
    lines = document_text.split('\n')
    source_name = None
    for line, next_line in zip(lines, lines[1:]):
        line = line.strip()
        if not line or not date_line.search(next_line.strip()):
            continue
        if not line.startswith(('#', '!')) and not line.isupper():
            source_name = line
            break

    # Extract body text
    body_end_pattern = r'(?:Load-Date:|Load Date:|__Load-Date__|Load-Date)'
    flags = re.DOTALL | re.IGNORECASE
    body_patterns = [
        # 1. "Body" standing alone on its line. Tried first because the loose
        #    pattern below also fires on "body" inside ordinary words
        #    ("Nobody", "somebody"), which can start the capture in a headline.
        (rf'^[ \t]*(?:Section:\s*)?(?:\*\*|__)?Body(?:\*\*|__)?[ \t\r]*$'
         rf'(.*?){body_end_pattern}', flags | re.MULTILINE),
        # 2. Original loose marker, kept for documents with an inline marker.
        (rf'(?:Body|\*\*Body\*\*|__Body__|Section:\s*Body)'
         rf'(.*?){body_end_pattern}', flags),
        # 3. Last resort: everything after the byline line.
        (r'Byline:.*?\n(.*?)(?:Load-Date|Load Date)', flags),
    ]
    body_text = ""
    for pattern, pattern_flags in body_patterns:
        body_match = re.search(pattern, document_text, pattern_flags)
        if body_match:
            body_text = body_match.group(1).strip()
            break

    # Clean body text: drop leftover markup tokens, then collapse whitespace
    body_text = re.sub(r'\*\*|\[\]{\.underline}|\{\.underline\}', '', body_text)
    body_text = re.sub(r'\s+', ' ', body_text).strip()

    # Extract year: patterns are tried in order, first hit wins
    year_patterns = [
        r'Load-Date:.*?(\d{4})',
        r'Load Date:.*?(\d{4})',
        r'__Load-Date__.*?(\d{4})',
        r'(\d{4})\s*End of Document'
    ]

    year = None
    for pattern in year_patterns:
        year_match = re.search(pattern, document_text, re.IGNORECASE)
        if year_match:
            year = int(year_match.group(1))
            break

    return {
        'source_name': source_name,
        'body_text': body_text,
        'year': year
    }

def extract_person_sentences_with_context(text: str, last_name: str, nlp) -> str:
    """Extract sentences containing the person's name with context.

    Every sentence that mentions ``last_name`` as a whole word
    (case-insensitive) is kept together with the sentence before and after
    it. Sentences are emitted once, in document order, joined by a space.

    Args:
        text: Article body; empty, ``None`` or NaN yields ``""``.
        last_name: Surname to look for. Honorific forms ("Mr. X", "Dr. X")
            and possessives ("X's") are covered because they contain the
            surname as a whole word.
        nlp: Callable returning an object whose ``sents`` iterable yields
            items with a ``text`` attribute.

    Returns:
        The selected sentences as one string, or ``""`` when nothing matches.
    """
    if not text or pd.isna(text) or not last_name:
        return ""

    # Whole-word match: a plain substring test would also select sentences
    # that merely contain the surname inside another word ("Cook" in "cooking").
    name_pattern = re.compile(
        rf'(?<!\w){re.escape(last_name)}(?!\w)', re.IGNORECASE
    )

    sentences = [sent.text.strip() for sent in nlp(text).sents]

    selected = []
    for i, sentence in enumerate(sentences):
        if name_pattern.search(sentence):
            # previous, current and next sentence, clipped at the boundaries
            selected.extend(sentences[max(i - 1, 0):i + 2])

    # dict.fromkeys removes repeated sentences while keeping first-seen order
    return ' '.join(dict.fromkeys(selected))

# ===== TEXT PREPROCESSING =====

def preprocess_text(text: str) -> str:
    """Preprocess text according to specified steps.

    Steps, in order: fold accented characters to ASCII, remove URLs, remove
    e-mail addresses, remove @mentions / #hashtags, replace remaining
    non-ASCII characters with a space, collapse whitespace, lowercase, and
    collapse runs of ``!``, ``?`` or ``.`` to a single character.

    Args:
        text: Raw text; ``None``, NaN or an empty string yields ``""``.

    Returns:
        The cleaned, lower-cased text.
    """
    if pd.isna(text) or not text:
        return ""

    # 1. Convert special characters to ASCII equivalents
    text = unicodedata.normalize('NFKD', text)
    text = ''.join([c for c in text if not unicodedata.combining(c)])

    # 2. Remove URLs
    text = re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', text)
    text = re.sub(r'www\.(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', text)

    # 3. Remove email addresses. This must run before the @/# step: that step
    #    would strip "@example" out of "john@example.com", leaving "john.com"
    #    behind with no "@" left for this pattern to find.
    text = re.sub(r'\S+@\S+', '', text)

    # 4. Remove words starting with @ or #
    text = re.sub(r'[@#]\w+', '', text)

    # 5. Remove non-English characters
    text = re.sub(r'[^\x00-\x7F]+', ' ', text)

    # 6. Remove extra whitespace
    text = ' '.join(text.split())

    # 7. Convert to lowercase
    text = text.lower()

    # 8. Remove extra punctuation
    text = re.sub(r'([!?.]){2,}', r'\1', text)

    return text.strip()

def collect_all_ceo_documents(main_folder_path: str) -> pd.DataFrame:
    """
    Collect all documents from CEO subfolders and create initial dataframe.

    Each non-hidden subfolder of ``main_folder_path`` is treated as one CEO;
    the CEO name is the folder name with underscores replaced by spaces.
    Every ``.docx`` file in it (names starting with ``~`` are skipped) is
    read and parsed; files that yield no text are ignored.

    Args:
        main_folder_path: Path to main folder containing CEO subfolders

    Returns:
        DataFrame with columns: CEO_Name, Source, Year, Body_Text, File_Path,
        Filename. The columns are present even when no document was found.
    """
    columns = ['CEO_Name', 'Source', 'Year', 'Body_Text', 'File_Path', 'Filename']

    # Get all subfolders (CEO folders); sorted for a reproducible row order.
    # Hidden folders (e.g. ".ipynb_checkpoints") are never CEO folders.
    ceo_folders = sorted(
        f for f in os.listdir(main_folder_path)
        if not f.startswith('.')
        and os.path.isdir(os.path.join(main_folder_path, f))
    )

    print(f"Found {len(ceo_folders)} CEO folders")

    if not ceo_folders:
        # Return a frame that still has the expected schema so that callers
        # indexing by column name do not fail on an empty input folder.
        return pd.DataFrame(columns=columns)

    all_documents = []

    # Process each CEO folder
    for ceo_folder in tqdm(ceo_folders, desc="Processing CEO folders"):
        # Extract CEO name from folder name
        ceo_name = ceo_folder.replace('_', ' ')

        ceo_folder_path = os.path.join(main_folder_path, ceo_folder)

        # Get all .docx files in the folder ('~' prefix is skipped)
        docx_files = sorted(
            f for f in os.listdir(ceo_folder_path)
            if f.lower().endswith('.docx') and not f.startswith('~')
        )

        print(f"\n{ceo_name}: Found {len(docx_files)} documents")

        # Process each document
        for docx_file in docx_files:
            file_path = os.path.join(ceo_folder_path, docx_file)

            doc_text = read_docx_file(file_path)
            if not doc_text:
                # Unreadable or empty document: nothing to extract
                continue

            components = extract_document_components(doc_text)

            all_documents.append({
                'CEO_Name': ceo_name,
                'Source': components['source_name'],
                'Year': components['year'],
                'Body_Text': components['body_text'],
                'File_Path': file_path,
                'Filename': docx_file
            })

    # Explicit columns keep the schema stable even when the list is empty
    df_raw = pd.DataFrame(all_documents, columns=columns)

    print(f"\nTotal documents collected: {len(df_raw)}")
    if df_raw.empty:
        # The remaining statistics are meaningless without any rows
        return df_raw

    print(f"CEOs with data: {df_raw['CEO_Name'].nunique()}")
    print(f"Sources found: {df_raw['Source'].nunique()}")
    print(f"Year range: {df_raw['Year'].min()} - {df_raw['Year'].max()}")

    return df_raw

def create_ner_extracted_dataframe(df_raw: pd.DataFrame) -> pd.DataFrame:
    """
    Apply NER extraction with context to create extracted text dataframe.

    For every row, the last whitespace-separated token of ``CEO_Name`` is
    used as the surname passed to ``extract_person_sentences_with_context``.
    Rows for which no text is extracted are dropped. The language pipeline
    is taken from the module-level name ``nlp``.

    Args:
        df_raw: Raw dataframe with body text (columns CEO_Name, Source, Year,
            Body_Text and Filename are read)

    Returns:
        DataFrame with NER-extracted text; columns CEO_Name, Source, Year,
        Extracted_Text, Filename (present even when the result is empty)
    """
    columns = ['CEO_Name', 'Source', 'Year', 'Extracted_Text', 'Filename']

    print("\nApplying NER extraction with context...")

    extracted_data = []

    # An empty frame may carry no columns at all, in which case groupby would
    # raise KeyError; there is nothing to extract from it anyway.
    if not df_raw.empty:
        # Group by CEO so the surname is derived once per CEO
        for ceo_name, group in tqdm(df_raw.groupby('CEO_Name'), desc="Processing CEOs"):
            name_parts = str(ceo_name).split()
            if not name_parts:
                # Blank CEO name: no surname to search for
                continue
            last_name = name_parts[-1]

            for _, row in group.iterrows():
                # Extract relevant sentences with context
                extracted_text = extract_person_sentences_with_context(
                    row['Body_Text'], last_name, nlp
                )

                if extracted_text:  # Only add if text was found
                    extracted_data.append({
                        'CEO_Name': ceo_name,
                        'Source': row['Source'],
                        'Year': row['Year'],
                        'Extracted_Text': extracted_text,
                        'Filename': row['Filename']
                    })

    # Explicit columns keep the schema stable when nothing matched
    df_extracted = pd.DataFrame(extracted_data, columns=columns)

    print("\nNER extraction complete!")
    print(f"Documents with extracted text: {len(df_extracted)}")
    print(f"Documents without matches: {len(df_raw) - len(df_extracted)}")

    return df_extracted

def create_preprocessed_dataframe(df_extracted: pd.DataFrame, min_words: int = 50) -> pd.DataFrame:
    """
    Create preprocessed version of extracted text dataframe.

    Adds a ``Preprocessed_Text`` column (``preprocess_text`` applied to
    ``Extracted_Text``) and keeps only rows whose preprocessed text has at
    least ``min_words`` whitespace-separated words. The input is not modified.

    Args:
        df_extracted: DataFrame with NER-extracted text
        min_words: Minimum word count threshold

    Returns:
        DataFrame with preprocessed text; an empty input yields an empty
        frame that still carries the ``Preprocessed_Text`` column
    """
    print("\nPreprocessing extracted text...")

    df_preprocessed = df_extracted.copy()

    if df_preprocessed.empty:
        # An empty frame may have no 'Extracted_Text' column at all; return
        # the expected schema instead of failing with KeyError.
        df_preprocessed['Preprocessed_Text'] = pd.Series(dtype=object)
        print("\nPreprocessing complete!")
        print("Documents before filtering: 0")
        print(f"Documents after {min_words}-word filter: 0")
        return df_preprocessed

    # Apply preprocessing
    df_preprocessed['Preprocessed_Text'] = df_preprocessed['Extracted_Text'].apply(preprocess_text)

    # Calculate word count (temporary helper column, dropped before returning)
    df_preprocessed['Word_Count'] = df_preprocessed['Preprocessed_Text'].apply(
        lambda x: len(str(x).split()) if x else 0
    )

    # Filter by word count
    initial_count = len(df_preprocessed)
    df_preprocessed = df_preprocessed[df_preprocessed['Word_Count'] >= min_words].copy()

    print("\nPreprocessing complete!")
    print(f"Documents before filtering: {initial_count}")
    print(f"Documents after {min_words}-word filter: {len(df_preprocessed)}")
    print(f"Average word count: {df_preprocessed['Word_Count'].mean():.1f}")

    return df_preprocessed.drop('Word_Count', axis=1)

def run_data_collection_pipeline(main_folder_path: str, output_folder: str = "output") -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """
    Execute collection, NER extraction and preprocessing end to end.

    After each stage the resulting dataframe is written (without its index)
    to a CSV file inside ``output_folder``; a short summary of the final
    dataset is printed at the end.

    Args:
        main_folder_path: Path to main folder containing CEO subfolders
        output_folder: Folder to save output files (created if missing)

    Returns:
        Tuple of (raw_df, extracted_df, preprocessed_df)
    """
    banner = "="*80

    # Destinations of the three stage outputs
    raw_file = os.path.join(output_folder, "ceo_articles_raw.csv")
    extracted_file = os.path.join(output_folder, "ceo_articles_extracted.csv")
    preprocessed_file = os.path.join(output_folder, "ceo_articles_preprocessed.csv")

    # The output folder must exist before the first CSV is written
    os.makedirs(output_folder, exist_ok=True)

    print(banner)
    print("STARTING CEO DATA COLLECTION PIPELINE")
    print(banner)

    # Stage 1: read and parse every document, then persist the raw table
    print("\nSTEP 1: Collecting all documents...")
    df_raw = collect_all_ceo_documents(main_folder_path)
    df_raw.to_csv(raw_file, index=False)
    print(f"Raw data saved to: {raw_file}")

    # Stage 2: keep only the sentences around CEO mentions
    print("\nSTEP 2: Applying NER extraction...")
    df_extracted = create_ner_extracted_dataframe(df_raw)
    df_extracted.to_csv(extracted_file, index=False)
    print(f"Extracted data saved to: {extracted_file}")

    # Stage 3: clean the extracted text (default word-count threshold)
    print("\nSTEP 3: Preprocessing text...")
    df_preprocessed = create_preprocessed_dataframe(df_extracted)
    df_preprocessed.to_csv(preprocessed_file, index=False)
    print(f"Preprocessed data saved to: {preprocessed_file}")

    # Closing report on the final dataset
    print("\n" + banner)
    print("PIPELINE COMPLETE - SUMMARY")
    print(banner)
    print(f"Total CEOs processed: {df_preprocessed['CEO_Name'].nunique()}")
    print(f"Total articles in final dataset: {len(df_preprocessed)}")
    print(f"Sources in dataset: {df_preprocessed['Source'].nunique()}")
    print(f"Year range: {df_preprocessed['Year'].min()} - {df_preprocessed['Year'].max()}")

    # Ten CEOs with the most articles
    print("\nArticles per CEO (top 10):")
    for ceo, count in df_preprocessed['CEO_Name'].value_counts().head(10).items():
        print(f"  {ceo}: {count}")

    return df_raw, df_extracted, df_preprocessed

# ===== USAGE =====

if __name__ == "__main__":
    # Root folder holding one subfolder per CEO; each subfolder is scanned for .docx articles.
    # NOTE(review): presumably this is where the collected zip archives were extracted - confirm.
    # The path is absolute and machine-specific; adjust it before running elsewhere.
    main_folder_path = "/Users/shantanusharma/Desktop/Docs_for_analysis"
    
    # Run the pipeline; the three stage CSVs are written to "ceo_analysis_output"
    # (a relative path, so it is resolved against the current working directory).
    df_raw, df_extracted, df_preprocessed = run_data_collection_pipeline(
        main_folder_path,
        output_folder= "ceo_analysis_output"
    )
