# Named Entity Recognition (NER) Pipeline
This notebook provides a complete pipeline for Named Entity Recognition (NER) on a collection of documents.
It includes steps for data preprocessing, entity extraction using spaCy, and basic visualization.

## Step 1: Import Libraries
We'll use spaCy for NER, pandas for data handling, and matplotlib for visualization. Later cells also import python-docx (to read Word files) and networkx (to draw entity graphs).

In [None]:
import spacy
import pandas as pd
import matplotlib.pyplot as plt
from collections import Counter

# Load spaCy's small English pipeline; the extraction functions below call it as nlp(text)
nlp = spacy.load("en_core_web_sm")

## Step 2: Load Word Documents
Read the text of one or more Word (.docx) files into a pandas DataFrame.

In [None]:
import os
from docx import Document
import pandas as pd

# Location of the single Word document loaded by this cell.
# The path is machine-specific: update it to a .docx file that exists on your system.
file_path = (
    r"C:\Users\Rasmus\Downloads\Interview Generate Draft-20240229_163335-Emrah.docx"
)

def load_docx(file_path):
    """Return the text of a .docx file as a single string.

    The text of each entry in ``Document(file_path).paragraphs`` is collected
    and the pieces are joined with newline characters.
    """
    paragraph_texts = []
    for paragraph in Document(file_path).paragraphs:
        paragraph_texts.append(paragraph.text)
    return "\n".join(paragraph_texts)

# Read the document text and wrap it in a one-row DataFrame with
# the columns "filename" (base name of the path) and "text".
text = load_docx(file_path)
df = pd.DataFrame([{"filename": os.path.basename(file_path), "text": text}])
df.head()

In [None]:
import os
import pandas as pd
from docx import Document

# Function to load text from a .docx file
def load_docx(file_path):
    """
    Read a Word document (.docx) and return its paragraph texts as one string.

    The paragraph texts are joined with newline characters, in the order
    in which ``doc.paragraphs`` yields them.
    """
    document = Document(file_path)
    return "\n".join(paragraph.text for paragraph in document.paragraphs)

# Directory containing Word documents
directory_path = r"C:\Users\Rasmus\Projects\LitTools\docs"  # Update this path to your directory

# Load all .docx files from the directory into a list of records.
# - sorted(): os.listdir returns names in arbitrary order; sorting gives a stable row order.
# - "~$" prefix: Word keeps "~$name.docx" lock files next to open documents; they are not
#   real documents and loading them is expected to fail, so they are skipped.
# - lower(): also accept upper-case extensions such as ".DOCX".
documents = []
for filename in sorted(os.listdir(directory_path)):
    if filename.startswith("~$") or not filename.lower().endswith(".docx"):
        continue
    file_path = os.path.join(directory_path, filename)
    text = load_docx(file_path)
    documents.append({"filename": filename, "text": text})

# Convert to DataFrame; the explicit column list keeps "filename" and "text" present
# even when the directory contains no documents, so later cells do not hit a KeyError.
df = pd.DataFrame(documents, columns=["filename", "text"])

# Display the DataFrame with all documents
pd.set_option('display.max_rows', None)  # Optional: to display all rows
pd.set_option('display.max_colwidth', None)  # Optional: to display full text
display(df)



## Step 3: Text Cleaning and Preprocessing
Clean and preprocess the text, removing unnecessary characters or whitespace.

In [None]:
def clean_text(text):
    """Normalise whitespace in ``text`` without gluing words together.

    Non-breaking spaces are treated as ordinary spaces, runs of spaces and
    tabs are collapsed to a single space, and leading/trailing whitespace is
    stripped. Line breaks are kept so paragraph boundaries survive.

    Args:
        text: Raw document text.

    Returns:
        The cleaned text.
    """
    import re  # local import: keeps this cell independent of the notebook's import cell

    # The earlier version called text.replace(' ', ''), which removed every space and
    # merged all words together, so entity recognition on the result could not work.
    return re.sub(r"[ \t\u00a0]+", " ", text).strip()

# Add a 'cleaned_text' column by running clean_text over the 'text' column
# (change 'text' here if your documents live in a differently named column).
df['cleaned_text'] = (
    df['text']
    .apply(clean_text)
)
df.head()

## Step 4: Named Entity Recognition
Extract named entities using spaCy's NER model.

In [None]:
def extract_entities(text):
    """Run the spaCy pipeline on ``text`` and return its named entities.

    Returns:
        A list of ``(entity_text, entity_label)`` tuples, one per item in ``doc.ents``.
    """
    return [(entity.text, entity.label_) for entity in nlp(text).ents]

# Run NER on the cleaned text of every document and store the result per row
df['entities'] = df['cleaned_text'].apply(extract_entities)

# Preview the cleaned text next to its extracted entities
df.loc[:, ['cleaned_text', 'entities']].head()

## Step 5: Analyze and Visualize Extracted Entities
Analyze the extracted entities, for example by counting how often each entity type occurs.

In [None]:
# Collect the entity tuples from every row of 'entities' into one flat list
all_entities = []
for entities in df['entities']:
    all_entities.extend(entities)

# Split each tuple into its first element (text) and second element (label)
entity_texts = [pair[0] for pair in all_entities]
entity_labels = [pair[1] for pair in all_entities]

# Tally how often each label occurs
entity_counts = Counter(entity_labels)
print(entity_counts)

### Visualization: Entity Frequency
Visualize the frequency of entity types extracted.

In [None]:
# Bar chart: one bar per entity type, bar height = number of occurrences
fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(entity_counts.keys(), entity_counts.values())
ax.set_title('Entity Frequency')
ax.set_xlabel('Entity Type')
ax.set_ylabel('Count')
plt.show()

## Step 6: Save Extracted Entities
Save the extracted entities for further analysis or use.

In [None]:
# Write the 'entities' column (without the index) to a CSV file
entities_csv = 'extracted_entities.csv'
df["entities"].to_csv(entities_csv, index=False)

In [None]:
# Function to extract entities, excluding CARDINAL
def extract_entities_excluding_cardinals(text):
    """Return ``(text, label)`` tuples for every entity whose label is not CARDINAL."""
    kept = []
    for ent in nlp(text).ents:
        if ent.label_ == "CARDINAL":
            continue  # CARDINAL entities are dropped
        kept.append((ent.text, ent.label_))
    return kept

# Run the CARDINAL-excluding extraction on the raw 'text' column of every row
df['filtered_entities'] = df['text'].apply(extract_entities_excluding_cardinals)

# Show only the newly added column
df.loc[:, ['filtered_entities']]

In [None]:
# Collect the entity tuples from every row of 'filtered_entities' into one flat list
all_entities = []
for entities in df['filtered_entities']:
    all_entities.extend(entities)

# Split each tuple into its first element (text) and second element (label)
entity_texts = [pair[0] for pair in all_entities]
entity_labels = [pair[1] for pair in all_entities]

# Tally how often each label occurs
entity_counts = Counter(entity_labels)
print(entity_counts)

In [None]:
# Bar chart: one bar per entity type, bar height = number of occurrences
fig, ax = plt.subplots(figsize=(15, 6))
ax.bar(entity_counts.keys(), entity_counts.values())
ax.set_title('Entity Frequency')
ax.set_xlabel('Entity Type')
ax.set_ylabel('Count')
plt.show()

In [None]:
# Write the 'filtered_entities' column (index included) to a CSV file
filtered_csv = "filtered.csv"
df["filtered_entities"].to_csv(filtered_csv)

In [None]:
# Entity labels that extract_specific_entities keeps
entity_types_to_keep = {"PERSON"}


def extract_specific_entities(text):
    """Return ``(text, label)`` tuples for entities whose label is in ``entity_types_to_keep``."""
    matches = []
    for ent in nlp(text).ents:
        if ent.label_ in entity_types_to_keep:
            matches.append((ent.text, ent.label_))
    return matches

# Recompute 'filtered_entities' so it holds only entities of the kept types.
# NOTE(review): this overwrites the column produced earlier by the CARDINAL-excluding cell.
df['filtered_entities'] = (
    df['text']
    .apply(extract_specific_entities)
)

# Write the column (index included) to person.csv
df['filtered_entities'].to_csv("person.csv")

In [None]:
# Show the 'cleaned_text' column
df.loc[:, "cleaned_text"]

In [None]:
import pandas as pd
import spacy
import networkx as nx
import matplotlib.pyplot as plt
from itertools import combinations

# Load spaCy's English NER model (the same model name the first cell loads)
nlp = spacy.load("en_core_web_sm")

# This cell expects an existing DataFrame named `df` with a "text" column.
# df = pd.read_csv("your_documents.csv") # Uncomment if you are reading from a CSV

# Accumulator that extract_entities_and_pairs appends co-occurring entity pairs to
entity_pairs = list()

def extract_entities_and_pairs(text):
    """Collect pairs of entities that co-occur in the same sentence of ``text``.

    Only PERSON, ORG, GPE and PRODUCT entities are considered. Every pair of
    distinct entity texts found within one sentence is appended to the
    module-level ``entity_pairs`` list; the function itself returns None.

    Args:
        text: Document text to run through the spaCy pipeline.
    """
    relevant_labels = {"PERSON", "ORG", "GPE", "PRODUCT"}
    doc = nlp(text)

    # Create pairs of entities that co-occur in the same sentence
    for sent in doc.sents:
        sent_entities = [ent.text for ent in sent.ents if ent.label_ in relevant_labels]
        # Skip (X, X) pairs: an entity mentioned twice in one sentence would otherwise
        # become a self-loop in the graph built from these pairs.
        entity_pairs.extend(
            (first, second)
            for first, second in combinations(sent_entities, 2)
            if first != second
        )

# Run the pair extraction on every document. The function appends to entity_pairs and
# returns None, so the Series produced by apply is intentionally not kept.
df["text"].apply(extract_entities_and_pairs)

# Create a DataFrame from entity pairs to store relationships
edges_df = pd.DataFrame(entity_pairs, columns=["Entity1", "Entity2"])

# Create an undirected graph and add all edges in one call
# (avoids the slow row-by-row iterrows loop).
G = nx.Graph()
G.add_edges_from(zip(edges_df["Entity1"], edges_df["Entity2"]))

# Draw the social network graph.
# spring_layout starts from random positions; a fixed seed keeps the figure
# identical across runs so the notebook output is reproducible.
plt.figure(figsize=(15, 10))
pos = nx.spring_layout(G, k=0.5, seed=42)  # Positioning the nodes for visualization
nx.draw(G, pos, with_labels=True, node_color='skyblue', node_size=1500, font_size=10, font_weight='bold', edge_color='gray')
plt.title('Social Network Graph of Entities')
plt.show()


In [None]:
# Load spaCy's English NER model (the same model name the earlier cells load)
nlp = spacy.load("en_core_web_sm")

# This cell expects an existing DataFrame named `df` with a "text" column.
# df = pd.read_csv("your_documents.csv") # Uncomment if you are reading from a CSV

# NOTE(review): document_entities is initialised here but not used in the visible cells.
document_entities = list()

def extract_entities(text):
    """Return the texts of all PERSON and PRODUCT entities found in ``text``.

    NOTE(review): this redefines the ``extract_entities`` from the NER step, which
    returned ``(text, label)`` tuples; once this cell has run, the name refers to
    this text-only version.
    """
    wanted_labels = {"PERSON", "PRODUCT"}
    return [ent.text for ent in nlp(text).ents if ent.label_ in wanted_labels]

# Apply entity extraction to each document.
# NOTE: this replaces the 'entities' column with plain lists of entity texts.
df['entities'] = df['text'].apply(extract_entities)

# Pairs of distinct entities that appear within the same document.
# (X, X) pairs are skipped, matching the cross-document rule below: an entity
# mentioned twice must not be linked to itself.
entity_pairs = [
    (first, second)
    for entities in df['entities']
    for first, second in combinations(entities, 2)
    if first != second
]

# Pairs of distinct entities taken from two different documents.
# combinations() visits every unordered pair of documents exactly once, which
# replaces the double loop that scanned all (i, j) and kept only i < j.
for entities_i, entities_j in combinations(df['entities'], 2):
    entity_pairs.extend(
        (entity_i, entity_j)
        for entity_i in entities_i
        for entity_j in entities_j
        if entity_i != entity_j
    )

# Create a DataFrame from entity pairs to store relationships
edges_df = pd.DataFrame(entity_pairs, columns=["Entity1", "Entity2"])

# Create an undirected graph and add all edges in one call
# (avoids the slow row-by-row iterrows loop).
G = nx.Graph()
G.add_edges_from(zip(edges_df["Entity1"], edges_df["Entity2"]))

# Draw the social network graph.
# spring_layout starts from random positions; a fixed seed keeps the figure
# identical across runs so the notebook output is reproducible.
plt.figure(figsize=(15, 10))
pos = nx.spring_layout(G, k=0.5, seed=42)  # Positioning the nodes for visualization
nx.draw(G, pos, with_labels=True, node_color='skyblue', node_size=1500, font_size=10, font_weight='bold', edge_color='gray')
plt.title('Social Network Graph of Entities (Within and Across Documents)')
plt.show()