In [None]:
import pandas as pd
import spacy
from spacy.tokens import Span
from spacy.language import Language

# Load data
df = pd.read_csv('OR Crook 3-9.99.xlsx - Sheet1.csv')

# Data cleaning: drop rows missing any mailing field or the owner name, then
# keep one row per distinct mailing address. The trailing .copy() gives an
# independent frame so the column write below does not raise pandas'
# SettingWithCopyWarning.
df = (
    df.dropna(subset=['MAIL_ADDR','MAIL_CITY','MAIL_STATE','MAIL_ZIP','OWNER_NAME_1'])
      .drop_duplicates(['MAIL_ADDR','MAIL_CITY','MAIL_STATE','MAIL_ZIP'])
      .copy()
)

# Remove commas and strip whitespace (regex=False: the comma is a literal)
df['OWNER_NAME_1'] = df['OWNER_NAME_1'].str.replace(',', '', regex=False).str.strip()

# List of keywords indicating company names
Suspected_Words = [
    'OF', 'SERVICES', 'COUNTY', 'DISTRIBUTOR', 'PRODUCTS', 'COUNTRY CLUB', 'PLLC', 'PRIVATE',
    'COMPANY', 'INC', 'RETIREMENT', 'DEVELOPMENT', 'HOA', 'AUTHORITY', 'CONF', 'CONFERENCE',
    'CONSTRUCTION', 'AFFORDABLE', 'HOUSING', 'MID COAST', 'ESTATE', 'REHABILITATION', 'GARDENS',
    'WELLNESS', 'LLC', 'STATE', 'LAND', 'MEMBERSHIP', 'COOPERATIVE', 'CORP', 'CORPORATION',
    'INDEPENDENT', 'CHURCH', 'ENTERPRISES', 'ACCOUNTING', 'INVESTMENTS', 'OWNERS', 'AIRPORT',
    'MAINTENANCE', 'ASSOCIATION', 'REALTY', 'FOUNDATION', 'CONSULTANTS', 'ASSOCIATES',
    'CORPORATE', 'DISTRICT', 'LTD', 'LIMITED', 'INCORPORATED', 'PROPERTIES', 'INVESTMENT',
    'NORTHEAST', 'PLUMBING', 'HEATING'
]

# Pattern to detect suspected company names: any keyword as a whole word.
# The group is non-capturing because str.contains warns about patterns that
# contain capture groups (it only tests for a match and never extracts them).
pattern = r'\b(?:' + '|'.join(Suspected_Words) + r')\b'

# Filter out suspected company names (case-insensitive). Copy the result so
# later column assignments act on an independent frame.
df = df[~df['OWNER_NAME_1'].str.contains(pattern, case=False, na=False)].copy()

# Load spaCy's pre-trained model for English
nlp = spacy.load("en_core_web_sm")

# Define custom NER component
@Language.component("custom_ner")
def custom_ner(doc):
    """Relabel every ORG entity on ``doc`` as COMPANY.

    Entities with any other label are kept as they are. The rebuilt entity
    list is written back to ``doc.ents`` and the same ``doc`` is returned.
    """
    doc.ents = [
        Span(doc, found.start, found.end, label="COMPANY") if found.label_ == "ORG" else found
        for found in doc.ents
    ]
    return doc

# Add custom NER component to the pipeline (as the last component)
nlp.add_pipe("custom_ner", last=True)

# Define a function to classify names
def classify_name(name):
    """Classify ``name`` from the first PERSON or COMPANY entity found by ``nlp``.

    Args:
        name: The text handed to the module-level ``nlp`` pipeline.

    Returns:
        str: "Human Name" or "Company Name" according to the first entity
        carrying one of those two labels; "Unknown" when there is none or
        when processing raised (the error is printed, not re-raised).
    """
    verdict_by_label = {"PERSON": "Human Name", "COMPANY": "Company Name"}
    try:
        for entity in nlp(name).ents:
            verdict = verdict_by_label.get(entity.label_)
            if verdict is not None:
                return verdict
    except Exception as e:
        # Best effort: report the failure and fall through to "Unknown".
        print(f"Error processing name '{name}': {e}")
    return "Unknown"

# Apply the classification function to the "OWNER_NAME_1" column.
# `assign` returns a new frame rather than writing into `df`, which may be a
# filtered selection of the loaded data; that keeps pandas from raising
# SettingWithCopyWarning on this step.
df = df.assign(**{"Name Type": df["OWNER_NAME_1"].apply(classify_name)})

# Save the results to a new CSV (without the index column)
df.to_csv('Removed_keywords_datasheet.csv', index=False)
print("Classification complete. Results saved to 'Removed_keywords_datasheet.csv'.")


# NOTE: The rest of this notebook is unfinished and ***cannot be used yet***

In [None]:
# Owner names whose "Name Type" is "Company Name". A single .loc call selects
# rows and column together instead of chaining two indexing operations.
df.loc[df['Name Type'] == 'Company Name', 'OWNER_NAME_1']

In [None]:
# Load data, drop rows missing any mailing field or the owner name, and keep
# one row per distinct mailing address. The trailing .copy() gives an
# independent frame so the column write below does not raise
# SettingWithCopyWarning.
df = pd.read_csv('OR Crook 3-9.99.xlsx - Sheet1.csv')
df = (
    df.dropna(subset=['MAIL_ADDR','MAIL_CITY','MAIL_STATE','MAIL_ZIP','OWNER_NAME_1'])
      .drop_duplicates(['MAIL_ADDR','MAIL_CITY','MAIL_STATE','MAIL_ZIP'])
      .copy()
)

# Remove commas inside the names and strip surrounding whitespace.
# Series.replace(',', '') only swaps cells whose ENTIRE value is ',', so it
# left the names untouched; the .str accessor does the substring replacement.
df['OWNER_NAME_1'] = df['OWNER_NAME_1'].str.replace(',', '', regex=False).str.strip()

In [None]:
# Preview the owner names with commas removed (the result is displayed, not
# stored). The vectorised .str accessor replaces the per-row lambda and passes
# missing values through instead of raising on non-string cells.
df['OWNER_NAME_1'].str.replace(',', '', regex=False)

In [None]:
import pandas as pd
import spacy
from spacy.training.example import Example
from spacy.util import minibatch, compounding
from sklearn.model_selection import train_test_split
import random
import os

def load_training_data(file_path, sheet_name='Training Data'):
    """
    Load training data from an Excel file and prepare it for training.

    Each usable row becomes ``(text, {"entities": [(0, len(text), label)]})``,
    i.e. the whole name is annotated as a single entity. Rows are skipped when
    'Name Type' is neither "Human Name" nor "Company Name", or when
    'Full Name' is missing, empty, or not a string. Previously such a row
    either raised because ``entities`` was unbound or silently reused the
    previous row's label.

    Args:
        file_path (str): Path to the Excel file.
        sheet_name (str): Sheet name in the Excel file containing the training data.

    Returns:
        list: A list of training examples formatted for spaCy.
    """
    # Spreadsheet label -> entity label used in the annotations.
    label_to_entity = {"Human Name": "PERSON", "Company Name": "ORG"}

    df = pd.read_excel(file_path, sheet_name=sheet_name)
    train_data = []
    for text, label in zip(df['Full Name'], df['Name Type']):
        entity_label = label_to_entity.get(label)
        if entity_label is None:
            # Unknown or missing label: there is nothing valid to annotate.
            continue
        if not isinstance(text, str) or not text:
            # Missing/empty name: len(text) would fail or yield an empty span.
            continue
        train_data.append((text, {"entities": [(0, len(text), entity_label)]}))
    return train_data

def train_model(nlp, train_data, val_data, n_iter=10, dropout=0.5):
    """
    Train the NER model.

    Only the "ner" component is updated; every other pipeline component is
    disabled for the duration of training. After each iteration the
    accumulated losses are printed and ``evaluate_model`` is run on
    ``val_data``.

    Args:
        nlp (Language): The spaCy language model.
        train_data (list): List of training data as ``(text, annotations)``
            pairs. It is not modified; shuffling happens on a private copy.
        val_data (list): List of validation data.
        n_iter (int): Number of training iterations.
        dropout (float): Dropout rate.

    """
    # Every component other than NER stays disabled while training.
    frozen_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"]

    # Shuffle a copy so the caller's list keeps its original order.
    shuffled = list(train_data)

    # select_pipes is the spaCy v3 replacement for the deprecated disable_pipes.
    with nlp.select_pipes(disable=frozen_pipes):  # only train NER
        optimizer = nlp.resume_training()
        for itn in range(n_iter):
            random.shuffle(shuffled)
            losses = {}
            batches = minibatch(shuffled, size=compounding(4.0, 32.0, 1.001))
            for batch in batches:
                examples = [
                    Example.from_dict(nlp.make_doc(text), annotations)
                    for text, annotations in batch
                ]
                # Pass the optimizer from resume_training explicitly.
                nlp.update(examples, drop=dropout, losses=losses, sgd=optimizer)
            print(f"Iteration {itn + 1}/{n_iter}, Losses: {losses}")
            evaluate_model(nlp, val_data)

def evaluate_model(nlp, val_data):
    """
    Evaluate the NER model on validation data.

    An example counts as correct only when the predicted entities, taken as
    ``(start_char, end_char, label)`` tuples, equal the annotated entities
    exactly. The accuracy is printed; nothing is returned. With no validation
    examples a notice is printed instead of dividing by zero.

    Args:
        nlp (Language): The spaCy language model.
        val_data (list): List of validation data as ``(text, annotations)``
            pairs, where ``annotations['entities']`` holds the gold spans.
    """
    correct = 0
    total = 0
    for text, annotations in val_data:
        predicted = [(ent.start_char, ent.end_char, ent.label_) for ent in nlp(text).ents]
        # Gold spans may be lists rather than tuples; normalise before comparing.
        expected = [tuple(span) for span in annotations['entities']]
        if predicted == expected:
            correct += 1
        total += 1

    if total == 0:
        # Nothing to score: avoid ZeroDivisionError.
        print("Validation Accuracy: n/a (no validation examples)")
        return

    accuracy = correct / total
    print(f"Validation Accuracy: {accuracy * 100:.2f}%")

# Main script
if __name__ == "__main__":
    # Load and prepare training data
    raw_examples = load_training_data('Training Data Scrubbing Names.xlsx')

    # Remove commas and strip whitespace from each training text, then rebuild
    # the whole-text entity span so its offsets match the cleaned text.
    # (The earlier version cleaned a 'Full Name' column on the unrelated
    # global `df`, after the examples were already built, so the cleanup never
    # reached the training data.)
    train_data = []
    for text, annotations in raw_examples:
        cleaned = text.replace(',', '').strip()
        if not cleaned:
            # Nothing left to annotate once commas and whitespace are removed.
            continue
        entity_label = annotations["entities"][0][2]
        train_data.append((cleaned, {"entities": [(0, len(cleaned), entity_label)]}))

    train_data, val_data = train_test_split(train_data, test_size=0.2, random_state=42)

    # Load pre-trained spaCy model
    nlp = spacy.load("en_core_web_sm")

    # Add NER pipe if not present
    if "ner" not in nlp.pipe_names:
        ner = nlp.add_pipe("ner")
    else:
        ner = nlp.get_pipe("ner")

    # Add new labels
    ner.add_label("PERSON")
    ner.add_label("ORG")

    # Train the model
    train_model(nlp, train_data, val_data, n_iter=20)

    # Save the fine-tuned model
    model_directory = "path/to/save/custom_ner_model"
    os.makedirs(model_directory, exist_ok=True)
    nlp.to_disk(model_directory)


- Saving the model


In [None]:
import spacy
from spacy.training.example import Example
from spacy.util import minibatch, compounding
from sklearn.model_selection import train_test_split
import random
import os

# Your training and model preparation code here...
# NOTE: this cell defines none of `nlp`, `train_data`, `val_data` or
# `train_model`; they must already exist in the kernel from an earlier cell.

# Directory to save the model
# NOTE(review): the loading cell reads from "path/to/save/custom_ner_model",
# not from this directory — confirm which location is intended.
model_directory = "custom_ner_model"

# Train the model (assuming the train_model function from previous example)
train_model(nlp, train_data, val_data, n_iter=20)

# Ensure the directory exists
os.makedirs(model_directory, exist_ok=True)

# Save the fine-tuned model
nlp.to_disk(model_directory)


- Loading the model

In [None]:
import spacy

# Load the fine-tuned model
# The path must be the directory a model was previously written to with
# nlp.to_disk; "path/to/save/..." looks like a placeholder — TODO confirm.
model_directory = "path/to/save/custom_ner_model"
nlp = spacy.load(model_directory)

# Define a function to classify names
def classify_name(name):
    """Classify ``name`` from the first PERSON or ORG entity found by ``nlp``.

    Args:
        name: The text handed to the module-level ``nlp`` pipeline.

    Returns:
        str: "Human Name" for PERSON, "Company Name" for ORG, taken from the
        first entity carrying one of those labels; "Unknown" if there is none.
    """
    result_for_label = {"PERSON": "Human Name", "ORG": "Company Name"}
    matches = (
        result_for_label[entity.label_]
        for entity in nlp(name).ents
        if entity.label_ in result_for_label
    )
    return next(matches, "Unknown")

# Test the model with new data
# Prints one "<name>: <classification>" line per sample so the loaded model's
# output can be checked by eye.
test_names = ["John Doe", "Tech Innovators LLC", "Acme Corporation"]
for name in test_names:
    name_type = classify_name(name)
    print(f"{name}: {name_type}")


In [1]:
import pandas as pd

# Read the training workbook to inspect class balance.
# NOTE(review): no sheet_name is passed here, while load_training_data
# defaults to sheet_name='Training Data' — confirm both read the same sheet.
df = pd.read_excel("Training Data Scrubbing Names.xlsx")

# Share of rows per 'Name Type' label (proportions rather than raw counts).
df['Name Type'].value_counts(normalize=True)

Name Type
Human Name      0.87619
Company Name    0.12381
Name: proportion, dtype: float64