Preprocessing Documents

In [None]:
import pandas as pd
import spacy
import re
import string
from typing import List, Dict, Tuple, Set
from nltk.corpus import stopwords
import nltk # Added import for NLTK download check

# --- Setup: Load spaCy Model and Stopwords ---
# The pipeline below cannot run without the spaCy model, so a missing model
# aborts with a non-zero status instead of continuing.
try:
    nlp = spacy.load("en_core_web_lg")
except OSError:
    print("SpaCy model 'en_core_web_lg' not found. Please run: python -m spacy download en_core_web_lg")
    # `exit()` is an interactive helper injected by the `site` module and
    # reports success (status 0); raise SystemExit explicitly with a failure
    # status instead.
    raise SystemExit(1)

# NLTK raises LookupError when the stopwords corpus has not been downloaded
# yet; fetch it once and retry.
try:
    nltk_stopwords = set(stopwords.words('english'))
except LookupError:
    print("NLTK stopwords not found. Downloading...")
    nltk.download('stopwords')
    nltk_stopwords = set(stopwords.words('english'))


# --- Helper Functions ---
def _is_token_related_to_target_person_as_subject_or_object(
    token, sentence_person_entities: List[spacy.tokens.Span], subject_deps: Set[str], object_deps: Set[str]
) -> bool:
    """Tell whether a target person is a direct subject or object of ``token``.

    A direct child of ``token`` counts when its dependency label is in
    ``subject_deps`` or ``object_deps`` and its index lies inside one of the
    spans in ``sentence_person_entities``.

    Returns:
        True if at least one such child exists, otherwise False.
    """

    def _falls_in_person_span(position: int) -> bool:
        # Span bounds are half-open: start is included, end is excluded.
        return any(span.start <= position < span.end for span in sentence_person_entities)

    return any(
        (dependent.dep_ in subject_deps or dependent.dep_ in object_deps)
        and _falls_in_person_span(dependent.i)
        for dependent in token.children
    )

def _is_adjective_modifying_target_person(
    adj_token, sentence_person_entities: List[spacy.tokens.Span], adjectival_deps: Set[str]
) -> bool:
    """Tell whether ``adj_token`` directly modifies a token of a target person.

    The adjective qualifies when its dependency label is in
    ``adjectival_deps`` and the index of its head lies inside one of the spans
    in ``sentence_person_entities``.

    Returns:
        True if the adjective's head belongs to a target-person span.
    """
    # Guard: only the listed adjectival relations are considered at all.
    if adj_token.dep_ not in adjectival_deps:
        return False

    modified_index = adj_token.head.i
    return any(
        span.start <= modified_index < span.end
        for span in sentence_person_entities
    )

def _is_noun_describing_target_person(
    noun_token, sentence_person_entities: "List[spacy.tokens.Span]"
) -> bool:
    """Tell whether a noun renames or characterises a target person.

    Two constructions are recognised:

    * apposition: the noun has dependency ``appos`` and its head lies inside
      a target-person span ("Woods, the chairman, ...");
    * predicate nominal: the noun has dependency ``attr`` and the governing
      verb or copula has an ``nsubj``/``nsubjpass`` child inside a
      target-person span ("Woods became chairman", "Woods is a leader").

    Args:
        noun_token: Token to test; only ``dep_`` and ``head`` are read.
        sentence_person_entities: Entity spans of the target people; only
            ``start`` and ``end`` are read. The annotation is quoted so that
            defining this function does not require ``spacy`` to be resolved.

    Returns:
        True if either construction links the noun to a target person.
    """

    def _falls_in_person_span(position: int) -> bool:
        # Span bounds are half-open: start is included, end is excluded.
        return any(span.start <= position < span.end for span in sentence_person_entities)

    governor = noun_token.head

    if noun_token.dep_ == "appos" and _falls_in_person_span(governor.i):
        return True

    # spaCy's English models tag copular "be" as AUX rather than VERB, so a
    # VERB-only check would miss the most common "X is a <noun>" pattern.
    if noun_token.dep_ == "attr" and governor.pos_ in {"VERB", "AUX"}:
        return any(
            dependent.dep_ in {"nsubj", "nsubjpass"} and _falls_in_person_span(dependent.i)
            for dependent in governor.children
        )

    return False


# --- Main Preprocessing Function ---
# Takes a DataFrame (not a list of strings) so each sentence keeps its article's year.
# Compiled once: removes every character that is not a lowercase ASCII letter.
_NON_LOWERCASE_ASCII = re.compile(r'[^a-z]')

# Column order of the DataFrame returned by run_preprocessing_pipeline; also
# used so that an empty result still exposes the expected columns.
_RESULT_COLUMNS = ['person_name', 'original_sentence', 'preprocessed_tokens_str', 'Year']


def _build_last_name_lookup(target_people) -> Dict[str, str]:
    """Map each target's lowercased last word to the full name as given.

    Entries that are not non-blank strings (for example NaN coming from a
    DataFrame column) are skipped instead of raising. If two names share a
    last word, the later one wins.
    """
    lookup: Dict[str, str] = {}
    for name in target_people:
        if not isinstance(name, str) or not name.strip():
            continue
        lookup[name.split()[-1].lower()] = name
    return lookup


def _sentence_fingerprint(sent_text: str) -> str:
    """Return an order-insensitive key built from the sentence's words.

    Only tokenization and lexical flags are needed here, so the tokenizer is
    used on its own (``nlp.make_doc``) rather than the full pipeline.
    """
    words = sorted(
        token.text.lower()
        for token in nlp.make_doc(sent_text)
        if not token.is_punct and not token.is_digit and token.text.strip() != ""
    )
    return " ".join(words)


def _collect_person_descriptor_lemmas(doc, person_entities, stopwords) -> List[str]:
    """Collect cleaned lemmas of words syntactically tied to a target person.

    Kept tokens are: adjectives modifying a person, verbs with a person as
    subject/object, adverbs attached to such a verb or adjective, and nouns
    describing a person. Lemmas are lowercased and stripped to ``a-z``.
    """
    adjectival_deps = {"amod"}
    subject_deps = {"nsubj", "nsubjpass"}
    object_deps = {"dobj", "pobj"}
    adverbial_deps = {"advmod"}

    kept_lemmas: List[str] = []
    for token in doc:
        clean_lemma = _NON_LOWERCASE_ASCII.sub('', token.lemma_.lower())
        if not clean_lemma or clean_lemma in stopwords or token.is_punct or token.is_digit:
            continue

        pos = token.pos_
        if pos == "ADJ":
            keep = _is_adjective_modifying_target_person(token, person_entities, adjectival_deps)
        elif pos == "VERB":
            keep = _is_token_related_to_target_person_as_subject_or_object(
                token, person_entities, subject_deps, object_deps
            )
        elif pos == "ADV":
            keep = False
            if token.dep_ in adverbial_deps:
                head_token = token.head
                if head_token.pos_ == "VERB":
                    keep = _is_token_related_to_target_person_as_subject_or_object(
                        head_token, person_entities, subject_deps, object_deps
                    )
                elif head_token.pos_ == "ADJ":
                    keep = _is_adjective_modifying_target_person(
                        head_token, person_entities, adjectival_deps
                    )
        elif pos in {"NOUN", "PROPN"}:
            keep = _is_noun_describing_target_person(token, person_entities)
        else:
            keep = False

        if keep:
            kept_lemmas.append(clean_lemma)
    return kept_lemmas


def run_preprocessing_pipeline(
    articles_df: pd.DataFrame,
    text_column: str,
    year_column: str,
    my_target_people: List[str],
    my_stopwords: Set[str],
    min_sentence_length: int = 1,
) -> pd.DataFrame:
    """Extract person-centric sentences from articles and reduce them to key lemmas.

    Steps:
      1. Split every article into sentences, keeping the article's year.
      2. Drop duplicate sentences using a sorted-bag-of-words fingerprint.
      3. Keep sentences with a PERSON entity whose last word matches the last
         word of a target name, and reduce each one to the lemmas that are
         syntactically tied to that person.

    Args:
        articles_df: One row per article.
        text_column: Column holding the article text (values pass through ``str``).
        year_column: Column holding the value copied into the ``Year`` output column.
        my_target_people: Full names of the people of interest. Entries that
            are not non-blank strings (e.g. NaN) are ignored.
        my_stopwords: Cleaned lemmas to discard.
        min_sentence_length: Minimum number of kept lemmas for a sentence to
            be included.

    Returns:
        DataFrame with columns ``person_name``, ``original_sentence``,
        ``preprocessed_tokens_str`` and ``Year``; the columns are present
        even when no sentence qualifies. Each original sentence appears at
        most once, attributed to the first target person mentioned in it.
    """
    last_name_lookup = _build_last_name_lookup(my_target_people)

    print("--- Initial Sentence Extraction and Robust Deduplication ---")

    if articles_df.empty:
        # Nothing to parse; still hand back a frame with the expected columns.
        print("\nPreprocessing complete. Data prepared for export.")
        return pd.DataFrame(columns=_RESULT_COLUMNS)

    # Read the two columns directly and let spaCy batch the articles.
    article_texts = (str(text) for text in articles_df[text_column])
    article_years = articles_df[year_column]

    total_sentences = 0
    unique_sentences = []  # (stripped sentence text, year), first occurrence wins
    seen_fingerprints = set()
    for article_doc, year in zip(nlp.pipe(article_texts), article_years):
        for sent in article_doc.sents:
            total_sentences += 1
            sent_text = sent.text.strip()
            if not sent_text:
                # Whitespace-only sentences carry no content and would yield
                # an empty document when re-parsed below.
                continue
            fingerprint = _sentence_fingerprint(sent_text)
            if fingerprint in seen_fingerprints:
                continue
            seen_fingerprints.add(fingerprint)
            unique_sentences.append((sent_text, year))

    print(f"Total sentences extracted: {total_sentences}")
    print(f"Total unique sentences after deduplication: {len(unique_sentences)}")

    print("\nStarting detailed preprocessing with dependency parsing...")

    person_rows = []
    sentence_docs = nlp.pipe(text for text, _ in unique_sentences)
    for (raw_sentence_text, year), doc in zip(unique_sentences, sentence_docs):
        # The whole re-parsed document is used: taking only the first
        # re-detected sentence would silently drop text whenever the parser
        # splits the standalone sentence again.
        person_entities = []
        persons_in_order = {}  # dict keeps first-mention order and uniqueness
        for ent in doc.ents:
            if ent.label_ != "PERSON":
                continue
            name_parts = ent.text.split()
            if not name_parts:
                continue
            canonical_name = last_name_lookup.get(name_parts[-1].lower())
            if canonical_name is None:
                continue
            persons_in_order.setdefault(canonical_name, None)
            person_entities.append(ent)

        if not persons_in_order:
            continue

        kept_lemmas = _collect_person_descriptor_lemmas(doc, person_entities, my_stopwords)
        if len(kept_lemmas) < min_sentence_length:
            continue

        preprocessed_sentence_str = " ".join(kept_lemmas)
        for person_name in persons_in_order:
            person_rows.append({
                'person_name': person_name,
                'original_sentence': raw_sentence_text,
                'preprocessed_tokens_str': preprocessed_sentence_str,
                'Year': year,
            })

    # Keep one row per original sentence. Because people are listed in order
    # of first mention, the surviving row is always the first-mentioned
    # person rather than an arbitrary one.
    # NOTE(review): if one row per (person, sentence) is wanted instead, key
    # this set on both fields.
    final_rows = []
    seen_original_sentences = set()
    for row in person_rows:
        if row['original_sentence'] in seen_original_sentences:
            continue
        seen_original_sentences.add(row['original_sentence'])
        final_rows.append(row)

    df = pd.DataFrame(final_rows, columns=_RESULT_COLUMNS)
    print("\nPreprocessing complete. Data prepared for export.")
    return df

if __name__ == "__main__":
    # --- Load your main DataFrame here ---
    # `df_filtered` and `custom_stopwords` are NOT created in this cell; both
    # must already exist (e.g. from an earlier notebook cell) before it runs.
    # Example: df_filtered = pd.read_csv('your_data.csv')

    # --- Configuration ---
    # Drop missing names and convert to a plain list of strings, so the
    # pipeline never receives NaN where it expects a name.
    my_target_people = df_filtered['CEO_Name'].dropna().unique().tolist()
    my_stopwords = nltk_stopwords.union(custom_stopwords)

    # Run the pipeline on the DataFrame, naming the text and year columns.
    preprocessed_df = run_preprocessing_pipeline(
        articles_df=df_filtered,
        text_column='extracted_text_unicode',
        year_column='Year',
        my_target_people=my_target_people,
        my_stopwords=my_stopwords,
    )

    # The resulting DataFrame includes the 'Year' column.
    print("\nPreview of the final preprocessed DataFrame:")
    print(preprocessed_df.head())

    # Save the DataFrame for use in other code blocks
    preprocessed_df.to_csv('preprocessed_sentences_final.csv', index=False)
    print("\nPreprocessed sentences saved to 'preprocessed_sentences_final.csv'")

In [None]:
# 1. Import necessary libraries
import spacy
from spacy import displacy
import threading
import time

# Pipeline to load: the large English model, which ships with word vectors.
SPACY_MODEL = "en_core_web_lg"

# Sample sentence whose parse and entities are visualised below.
text = "The pandemic forced Mr. Woods to change direction."

# Load the pipeline; if spaCy cannot find it (OSError), download it through
# spaCy's CLI helper and load again.
print(f"Loading spaCy model '{SPACY_MODEL}'...")
try:
    nlp = spacy.load(SPACY_MODEL)
except OSError:
    print(f"Model '{SPACY_MODEL}' not found. Downloading...")
    spacy.cli.download(SPACY_MODEL)
    nlp = spacy.load(SPACY_MODEL)
print("Model loaded successfully.")

# Run the pipeline on the sample sentence. The resulting `Doc` holds the
# tokens together with their annotations and is what both servers render.
doc = nlp(text)

# 5. Define functions to start the servers
# We run each server in a separate thread so they can run at the same time.
def serve_dependency_parse():
    """Announce and start the displaCy server for the dependency view of ``doc``.

    Reads the module-level ``doc``. Intended to be used as a thread target so
    it can run alongside the entity server.
    """
    print("\nStarting server for Dependency Parse (dep)...")
    # "dep" selects the dependency-parse visualisation.
    displacy.serve(doc, auto_select_port=True, style="dep")

def serve_ner():
    """Announce and start the displaCy server for the entity view of ``doc``.

    Reads the module-level ``doc``. Intended to be used as a thread target so
    it can run alongside the dependency-parse server.
    """
    print("\nStarting server for Named Entity Recognition (ent)...")
    # "ent" selects the named-entity visualisation.
    displacy.serve(doc, auto_select_port=True, style="ent")

# 6. Launch one background thread per server.
# daemon=True: the threads will not keep the process alive once the main
# program exits.
dep_thread = threading.Thread(target=serve_dependency_parse, daemon=True)
ner_thread = threading.Thread(target=serve_ner, daemon=True)

for server_thread in (dep_thread, ner_thread):
    server_thread.start()

# 7. Tell the user what to do, then idle in the main thread until Ctrl+C.
for message in (
    "\nTwo servers are starting on different ports.",
    "Copy the URLs from the console and open them in your browser.",
    "Stop the script with Ctrl+C in your terminal to shut down the servers.",
):
    print(message)

try:
    while True:
        time.sleep(1)
except KeyboardInterrupt:
    print("\nShutting down servers.")
