In [None]:
import pandas as pd
import spacy
from spacy.training.example import Example
from spacy.util import minibatch, compounding
from sklearn.model_selection import train_test_split
import random
import os

def load_training_data(file_path, sheet_name=0):
    """
    Load training data from an Excel file and prepare it for training.

    Every usable row becomes one example whose single entity spans the whole
    cleaned name. Rows with a missing/empty name or with a ``Name Type`` other
    than "Human Name" / "Company Name" are skipped (and counted), because no
    valid entity can be built for them.

    Args:
        file_path (str): Path to the Excel file.
        sheet_name (str | int): Sheet in the Excel file containing the
            training data. Defaults to the first sheet.

    Returns:
        list: A list of ``(text, {"entities": [(start, end, label)]})``
        training examples formatted for spaCy.
    """
    label_to_entity = {"Human Name": "PERSON", "Company Name": "ORG"}

    df = pd.read_excel(file_path, sheet_name=sheet_name)
    # Remove commas and strip whitespace (non-string cells become NaN here)
    names = df['Full Name'].str.replace(',', '').str.strip()

    train_data = []
    skipped = 0
    for text, label in zip(names, df['Name Type']):
        entity_label = label_to_entity.get(label)
        # A NaN or empty name cannot carry a span; an unknown label has no
        # entity type. Previously such rows crashed or reused the entities
        # of the preceding row.
        if entity_label is None or not isinstance(text, str) or not text:
            skipped += 1
            continue
        train_data.append((text, {"entities": [(0, len(text), entity_label)]}))

    if skipped:
        print(f"Skipped {skipped} row(s) with a missing name or unknown name type")
    return train_data

def train_model(nlp, train_data, val_data, n_iter=10, dropout=0.5):
    """
    Train the NER model.

    Prints the accumulated losses after every iteration and then calls
    ``evaluate_model`` on the validation data.

    Args:
        nlp (Language): The spaCy language model.
        train_data (list): List of ``(text, annotations)`` training data.
            The list itself is not modified.
        val_data (list): List of validation data.
        n_iter (int): Number of training iterations.
        dropout (float): Dropout rate.

    """
    pipe_exceptions = ["ner"]
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]

    # Shuffle a private copy so the caller's list keeps its order.
    train_data = list(train_data)

    with nlp.disable_pipes(*other_pipes):  # only train NER
        optimizer = nlp.resume_training()
        for itn in range(n_iter):
            random.shuffle(train_data)
            losses = {}
            batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001))
            for batch in batches:
                examples = [
                    Example.from_dict(nlp.make_doc(text), annotations)
                    for text, annotations in batch
                ]
                # Pass the optimizer explicitly instead of leaving it unused.
                nlp.update(examples, drop=dropout, losses=losses, sgd=optimizer)
            print(f"Iteration {itn + 1}/{n_iter}, Losses: {losses}")
            evaluate_model(nlp, val_data)

def evaluate_model(nlp, val_data):
    """
    Evaluate the NER model on validation data.

    An example counts as correct only when the predicted entity list
    (start, end, label) is exactly equal to the annotated one.

    Args:
        nlp (Language): The spaCy language model.
        val_data (list): List of ``(text, annotations)`` validation data.

    Returns:
        float: Fraction of exactly-matching examples (0.0 when ``val_data``
        is empty). The accuracy is also printed.
    """
    correct = 0
    total = 0
    for text, annotations in val_data:
        doc = nlp(text)
        ents = [(ent.start_char, ent.end_char, ent.label_) for ent in doc.ents]
        # Normalise gold spans to tuples so list-shaped annotations
        # (e.g. loaded from JSON) still compare equal.
        true_ents = [tuple(span) for span in annotations['entities']]
        if ents == true_ents:
            correct += 1
        total += 1

    # Guard against an empty validation set instead of dividing by zero.
    accuracy = correct / total if total else 0.0
    print(f"Validation Accuracy: {accuracy * 100:.2f}%")
    return accuracy

# Main script
if __name__ == "__main__":
    # Build the example list from the spreadsheet and hold out 20% for validation
    all_examples = load_training_data('Training Data Scrubbing Names.xlsx')
    train_data, val_data = train_test_split(all_examples, test_size=0.2, random_state=42)

    # Start from the pre-trained English pipeline
    nlp = spacy.load("en_core_web_sm")

    # Reuse the pipeline's NER component; create one only when it is missing
    ner = nlp.get_pipe("ner") if "ner" in nlp.pipe_names else nlp.add_pipe("ner")

    # Register the two entity labels used by the training data
    for entity_label in ("PERSON", "ORG"):
        ner.add_label(entity_label)

    # Fine-tune for 20 iterations (losses and validation accuracy are printed)
    train_model(nlp, train_data, val_data, n_iter=20)

    # Persist the fine-tuned pipeline
    model_directory = "custom_ner_model"
    os.makedirs(model_directory, exist_ok=True)
    nlp.to_disk(model_directory)


- Handling Imbalanced Data Using Under-Sampling

In [None]:
import pandas as pd
import spacy
from spacy.training.example import Example
from spacy.util import minibatch, compounding
from spacy.language import Language
from sklearn.model_selection import train_test_split
from sklearn.utils import resample
import random

# Read the labelled names from the spreadsheet
df = pd.read_excel('Training Data Scrubbing Names.xlsx')
# Drop commas and trim, then drop ampersands and trim again
df['Full Name'] = (
    df['Full Name']
    .str.replace(',', '').str.strip()
    .str.replace('&', '').str.strip()
)

# # Separate majority and minority classes
# df_human = df[df['Name Type'] == 'Human Name']
# df_company = df[df['Name Type'] == 'Company Name']

# # Under-sample the majority class
# df_human_under = resample(df_human,
#                           replace=False,  # sample without replacement
#                           n_samples=len(df_company),  # match minority class
#                           random_state=42)  # reproducible results

# # Combine minority class with downsampled majority class
# df_balanced = pd.concat([df_human_under, df_company])

# # Shuffle the dataset
# df_balanced = df_balanced.sample(frac=1).reset_index(drop=True)

# Prepare training data
# Each usable row becomes (text, {"entities": [(0, len(text), label)]}), i.e.
# the whole name is one entity.
LABEL_TO_ENTITY = {"Human Name": "PERSON", "Company Name": "ORG"}
TRAIN_DATA = []
skipped_rows = 0
for _, row in df.iterrows():
    text = row['Full Name']
    label = row['Name Type']
    entity_label = LABEL_TO_ENTITY.get(label)
    # Skip rows that cannot yield a valid span: NaN/empty names or an
    # unrecognised name type. Previously such rows raised, or silently reused
    # the `entities` list left over from the preceding row.
    if entity_label is None or not isinstance(text, str) or not text:
        skipped_rows += 1
        continue
    entities = [(0, len(text), entity_label)]
    TRAIN_DATA.append((text, {"entities": entities}))
if skipped_rows:
    print(f"Skipped {skipped_rows} row(s) with a missing name or unknown name type")

# Hold out 30% of the examples for validation
train_data, val_data = train_test_split(TRAIN_DATA, test_size=0.3, random_state=42, shuffle=True)

# Start from the pre-trained English pipeline
nlp = spacy.load("en_core_web_sm")

# Reuse the pipeline's NER component; create one only when it is missing
ner = nlp.get_pipe("ner") if "ner" in nlp.pipe_names else nlp.add_pipe("ner")

# Register the entity labels used by the training data
for entity_label in ("PERSON", "ORG"):
    ner.add_label(entity_label)

# Everything except NER is switched off while training
pipe_exceptions = ["ner"]
other_pipes = [name for name in nlp.pipe_names if name not in pipe_exceptions]

def train_model(nlp, train_data, val_data, n_iter=10, dropout=0.3):
    """Fine-tune the NER component, reporting losses and validation accuracy each iteration."""
    # The module-level `other_pipes` list names every component except NER.
    with nlp.disable_pipes(*other_pipes):
        optimizer = nlp.resume_training()
        for itn in range(n_iter):
            # Fresh example order and a fresh loss accumulator per iteration
            random.shuffle(train_data)
            losses = {}
            for batch in minibatch(train_data, size=compounding(4.0, 64.0, 1.001)):
                examples = [
                    Example.from_dict(nlp.make_doc(text), annotations)
                    for text, annotations in batch
                ]
                nlp.update(examples, drop=dropout, losses=losses)
            print(f"Iteration {itn + 1}/{n_iter}, Losses: {losses}")
            # Validation accuracy after this iteration
            evaluate_model(nlp, val_data)

def evaluate_model(nlp, val_data):
    """Evaluate the model on validation data.

    An example is correct only when the predicted (start, end, label) list
    equals the annotated one exactly. Prints the accuracy and returns it as a
    fraction; an empty ``val_data`` yields 0.0 instead of a division error.
    """
    correct = 0
    total = 0
    for text, annotations in val_data:
        doc = nlp(text)
        ents = [(ent.start_char, ent.end_char, ent.label_) for ent in doc.ents]
        # Tuples on both sides, so list-shaped gold spans still compare equal
        true_ents = [tuple(span) for span in annotations['entities']]
        if ents == true_ents:
            correct += 1
        total += 1
    accuracy = correct / total if total else 0.0
    print(f"Validation Accuracy: {accuracy * 100:.2f}%")
    return accuracy

# Train the model
# Runs 20 iterations; train_model prints the losses and the validation
# accuracy after each one.
train_model(nlp, train_data, val_data, n_iter=20)

# Save the fine-tuned model
# The "Simple Testing" cell below loads the pipeline back from this directory.
nlp.to_disk("custom_ner_model")


- Simple Testing

In [None]:
import spacy

# Load the fine-tuned model
# Must match the directory the training cells wrote with nlp.to_disk(...).
model_directory = "custom_ner_model"
nlp = spacy.load(model_directory)

# Define a function to classify names
def classify_name(name):
    """Classify *name* as "Human Name", "Company Name" or "Unknown".

    Runs the module-level ``nlp`` pipeline on the text and maps the first
    PERSON or ORG entity it finds; entities with other labels are ignored.
    """
    label_to_type = {"PERSON": "Human Name", "ORG": "Company Name"}
    for ent in nlp(name).ents:
        if ent.label_ in label_to_type:
            return label_to_type[ent.label_]
    return "Unknown"

# Test the model with new data
# Hand-picked samples: a surname followed by a company-style keyword, plus
# plain company names. Each is printed with the type classify_name returns.
test_names = ["Brown ASSOC", "Tech Innovators LLC", "Acme Corporation", "Miller FOR", 'Martinez  COUNTRY CLUB']
for name in test_names:
    name_type = classify_name(name)
    print(f"{name}: {name_type}")


- Enhanced Model

In [None]:
import pandas as pd
import spacy
from spacy.training.example import Example
from spacy.util import minibatch, compounding
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, f1_score
import random

# Load the training data
df = pd.read_excel('Training Data Scrubbing Names.xlsx')
# Remove commas and strip whitespace
df['Full Name'] = df['Full Name'].str.replace(',', '').str.strip()
df['Full Name'] = df['Full Name'].str.replace('&', '').str.strip()

# Prepare training data
# Each usable row becomes (text, {"entities": [(0, len(text), label)]}).
LABEL_TO_ENTITY = {"Human Name": "PERSON", "Company Name": "ORG"}
TRAIN_DATA = []
skipped_rows = 0
for _, row in df.iterrows():
    text = row['Full Name']
    label = row['Name Type']
    entity_label = LABEL_TO_ENTITY.get(label)
    # NaN/empty names and unrecognised name types cannot yield a valid span.
    # Previously they raised, or reused the preceding row's `entities`.
    if entity_label is None or not isinstance(text, str) or not text:
        skipped_rows += 1
        continue
    entities = [(0, len(text), entity_label)]
    TRAIN_DATA.append((text, {"entities": entities}))
if skipped_rows:
    print(f"Skipped {skipped_rows} row(s) with a missing name or unknown name type")

# Handle class imbalance with under-sampling
def balance_classes(data, ratio=1.0):
    """Under-sample the larger class so PERSON : ORG approaches ``ratio`` : 1.

    Args:
        data: Sequence of ``(text, {"entities": [(start, end, label), ...]})``
            items; the label of the first entity decides the class. Items
            without entities, or with another label, are dropped.
        ratio: Desired number of PERSON items per ORG item. Must be > 0.

    Returns:
        list: The PERSON items followed by the ORG items after sampling.
        The requested sample size is capped at the number of available items,
        so a ratio the data cannot satisfy no longer raises from
        ``random.sample``.

    Raises:
        ValueError: If ``ratio`` is not positive.
    """
    if ratio <= 0:
        raise ValueError("ratio must be positive")

    def first_label(item):
        entities = item[1]['entities']
        return entities[0][2] if entities else None

    human_names = [item for item in data if first_label(item) == "PERSON"]
    company_names = [item for item in data if first_label(item) == "ORG"]

    if len(human_names) > len(company_names):
        wanted = int(len(company_names) * ratio)
        human_names = random.sample(human_names, min(wanted, len(human_names)))
    else:
        wanted = int(len(human_names) / ratio)
        company_names = random.sample(company_names, min(wanted, len(company_names)))
    return human_names + company_names

# Balance and shuffle the training data
# TRAIN_DATA = balance_classes(TRAIN_DATA)
random.shuffle(TRAIN_DATA)

# Hold out 30% of the shuffled examples for validation
train_data, val_data = train_test_split(TRAIN_DATA, test_size=0.3, random_state=42)

# Start from the pre-trained English pipeline
nlp = spacy.load("en_core_web_sm")

# Reuse the pipeline's NER component; create one only when it is missing
ner = nlp.get_pipe("ner") if "ner" in nlp.pipe_names else nlp.add_pipe("ner")

# Register the entity labels used by the training data
for entity_label in ("PERSON", "ORG"):
    ner.add_label(entity_label)

# Everything except NER is switched off while training
pipe_exceptions = ["ner"]
other_pipes = [name for name in nlp.pipe_names if name not in pipe_exceptions]

# Data augmentation function
def augment_data(data, multiplier=1.2):
    """Return ``data`` plus case-changed variants, shuffled.

    PERSON examples get a lower-cased variant and ORG examples an upper-cased
    one. Every original example is kept; variants are then added at random
    until the result holds ``int(len(data) * multiplier)`` items, or until the
    variants run out.

    Previously the multiplier was applied to the already-doubled list, so the
    default of 1.2 had no effect and values below 1 could discard originals.

    Args:
        data: Sequence of ``(text, {"entities": [(start, end, label), ...]})``.
        multiplier: Target size of the result relative to ``len(data)``.

    Returns:
        list: Shuffled list of original and augmented examples.
    """
    originals = list(data)
    variants = []
    for text, annotations in originals:
        entities = annotations['entities']
        label = entities[0][2] if entities else None
        if label == "PERSON":
            variant = text.lower()
        elif label == "ORG":
            variant = text.upper()
        else:
            continue
        # Skip no-op variants, and variants whose length changed (some Unicode
        # case mappings do this), since the character offsets are reused.
        if variant != text and len(variant) == len(text):
            variants.append((variant, annotations))

    wanted = int(len(originals) * multiplier) - len(originals)
    wanted = max(0, min(wanted, len(variants)))
    augmented_data = originals + random.sample(variants, wanted)
    random.shuffle(augmented_data)
    return augmented_data

# Augment the training data
# Only the training split is augmented; val_data keeps the original texts.
train_data = augment_data(train_data)

# Training function with logging and early stopping
def train_model(nlp, train_data, val_data, n_iter=30, dropout=0.5, patience=5):
    """Fine-tune the NER component with per-epoch logging and early stopping.

    Args:
        nlp: spaCy pipeline whose "ner" component is trained. Uses the
            module-level ``other_pipes`` list to disable everything else.
        train_data: List of ``(text, annotations)`` examples; shuffled in
            place every epoch.
        val_data: Validation examples passed to ``evaluate_model``.
        n_iter: Maximum number of epochs.
        dropout: Dropout rate passed to ``nlp.update``.
        patience: Stop after this many consecutive epochs without a new best
            validation F1. The previous hard-coded check (``> 5``) actually
            waited for six such epochs.
    """
    best_f1 = 0
    no_improve_epochs = 0
    with nlp.disable_pipes(*other_pipes):  # only train NER
        optimizer = nlp.resume_training()
        for itn in range(n_iter):
            random.shuffle(train_data)
            losses = {}
            batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001))
            for batch in batches:
                examples = [
                    Example.from_dict(nlp.make_doc(text), annotations)
                    for text, annotations in batch
                ]
                nlp.update(examples, drop=dropout, losses=losses, sgd=optimizer)
            val_f1 = evaluate_model(nlp, val_data)
            print(f"Iteration {itn + 1}/{n_iter}, Losses: {losses}, Validation F1-Score: {val_f1:.2f}")

            if val_f1 > best_f1:
                best_f1 = val_f1
                no_improve_epochs = 0
                continue

            no_improve_epochs += 1
            if no_improve_epochs >= patience:
                print("Early stopping triggered")
                break

# Evaluation function using F1 score
def evaluate_model(nlp, val_data):
    """Score first-entity label predictions and return the macro F1.

    For every validation text the label of the first predicted entity ("O"
    when nothing is predicted) is compared with the label of the first gold
    entity. Both the returned macro F1 and the printed classification report
    are restricted to PERSON and ORG, so the "O" sentinel no longer adds a
    third, always-zero class to the average.

    Args:
        nlp: spaCy pipeline to evaluate.
        val_data: Iterable of ``(text, {"entities": [...]})`` examples;
            examples without gold entities are ignored.

    Returns:
        float: Macro-averaged F1 over PERSON and ORG (0.0 if nothing to score).
    """
    labels = ['PERSON', 'ORG']
    y_true = []
    y_pred = []
    for text, annotations in val_data:
        true_ents = annotations['entities']
        if not true_ents:
            continue  # no gold label to score against
        doc = nlp(text)
        y_true.append(true_ents[0][2])
        y_pred.append(doc.ents[0].label_ if doc.ents else "O")

    if not y_true:
        print("Classification Report:\n(no validation examples)")
        return 0.0

    f1 = f1_score(y_true, y_pred, labels=labels, average='macro', zero_division=0)
    print(f"Classification Report:\n{classification_report(y_true, y_pred, labels=labels, zero_division=0)}")
    return f1

# Train the model
# Explicit arguments override train_model's defaults (n_iter=30, dropout=0.5).
train_model(nlp, train_data, val_data, n_iter=20, dropout=0.3)

# Save the fine-tuned model
# Same directory name as the other training cells, so the latest run wins.
nlp.to_disk("custom_ner_model")


- Testing the Model

In [None]:
import random

# Generating synthetic data for testing
def generate_synthetic_data(num_records=200):
    """Generate labelled synthetic names for testing the classifier.

    Roughly half of the records are human names ("<first> <last>"); the rest
    are company names built as "<last name> <company keyword>".

    Args:
        num_records (int): Number of records to generate.

    Returns:
        list: Dicts with the keys "Full Name" and "Name Type"
        ("Human Name" or "Company Name").
    """
    # Lists of common human first and last names
    human_first_names = ["John", "Jane", "Alex", "Emily", "Michael", "Sarah", "David", "Laura", "James", "Emma"]
    human_last_names = ["Smith", "Johnson", "Williams", "Brown", "Jones", "Garcia", "Miller", "Davis", "Rodriguez", "Martinez"]

    # Keywords indicating company names. One entry per element: the previous
    # literal lacked four commas, which silently fused neighbours into
    # 'PRIVATECOMPANY', 'WELLNESSLLC', 'COOPERATIVECORP' and 'REALCO'.
    # 'COUNTRY CLUB' also lost its stray leading space, which produced a
    # double space in the generated name.
    company_keywords = [
        'OF', 'SERVICES', 'COUNTY', 'DISTRIBUTOR', 'PRODUCTS', 'COUNTRY CLUB',
        'PLLC', 'PRIVATE', 'COMPANY', 'INC', 'RETIREMENT', 'DEVELOPMENT',
        'HOA', 'AUTHORITY', 'CONF', 'CONFERENCE', 'CONSTRCTN', 'AFFORDABLE',
        'HSNG', 'MID COAST', 'ESTATE', 'REHABILITATION', 'GARDENS', 'WELLNESS',
        'LLC', 'STATE', 'LAND', 'MEMBERSHIP', 'COOPERATIVE', 'CORP',
        'CORPORATION', 'INDEPENDENT', 'ARCARE', 'CHURCH', 'ENTERPRISES',
        'ACCOUNTING', 'INVESTMENTS', 'OWNERS', 'AIRPORT', 'MAINTENANCE',
        'MOTORCYCLE', 'ASSOC', 'EXCAVATING', 'EXCAVATION', 'PROPS',
        'RECREATIONAL', 'VILLAGE', 'FOR', 'GROUND', 'REAL',
        'CO', 'REALTY', 'CONSTRUCTION', 'FOUNDATION', 'CONSULTANTS',
        'ASSOCIATES', 'CORPORATE', 'DISTRICT', 'LTD', 'LIMITED',
        'INCORPORATED', 'PROPERTIES', 'INVESTMENT', 'ASSN', 'TRAILS',
        'NORTHEAST', 'ESTS', 'RIVERGATE', 'PLUMBING', 'HEATING',
    ]

    data = []

    for _ in range(num_records):
        # Randomly decide to create a human name or company name
        if random.random() < 0.5:
            first_name = random.choice(human_first_names)
            last_name = random.choice(human_last_names)
            full_name = f"{first_name} {last_name}"
            data.append({"Full Name": full_name, "Name Type": "Human Name"})
        else:
            company_name = f"{random.choice(human_last_names)} {random.choice(company_keywords)}"
            data.append({"Full Name": company_name, "Name Type": "Company Name"})

    return data

# Generate 200 records
# No random seed is set in this cell, so the sample differs from run to run.
synthetic_data = generate_synthetic_data(200)


import spacy

# Load the fine-tuned model
# Reads the directory the training cells wrote with nlp.to_disk("custom_ner_model").
nlp = spacy.load("custom_ner_model")

# Function to classify names using the model
def classify_name(name):
    """Classify *name* as "Human Name", "Company Name" or "Unknown".

    Runs the module-level ``nlp`` pipeline on the text and maps the first
    PERSON or ORG entity it finds. Companies are recognised through the "ORG"
    label, which is what the training cells register and train; the former
    check for "COMPANY" could never match, so every company was reported as
    "Unknown".
    """
    doc = nlp(name)
    for ent in doc.ents:
        if ent.label_ == "PERSON":
            return "Human Name"
        elif ent.label_ == "ORG":
            return "Company Name"
    return "Unknown"

# Score the classifier against the synthetic labels
total = len(synthetic_data)
correct = sum(
    1
    for record in synthetic_data
    if classify_name(record['Full Name']) == record['Name Type']
)

# Report the share of exact label matches as a percentage
accuracy = correct / total * 100
print(f"Model Accuracy on Synthetic Data: {accuracy:.2f}%")


In [None]:
import pandas as pd
import spacy
from spacy.training.example import Example
from spacy.util import minibatch, compounding
from sklearn.model_selection import train_test_split
import random
from sklearn.metrics import classification_report

import warnings
# NOTE(review): this silences every warning for the rest of the session,
# which would also hide any data-alignment warnings raised during training
# — consider narrowing the filter.
warnings.filterwarnings("ignore")

# Load the training data
df = pd.read_excel('Training Data Scrubbing Names 2.xlsx')

# Prepare training data
# Each usable row becomes (text, {"entities": [(0, len(text), label)]}).
LABEL_TO_ENTITY = {"Human Name": "PERSON", "Company Name": "ORG"}
TRAIN_DATA = []
skipped_rows = 0
for _, row in df.iterrows():
    text = row['Full Name']
    label = row['Name Type']
    entity_label = LABEL_TO_ENTITY.get(label)
    # Names are used as read (no cleaning in this cell), so a NaN or numeric
    # cell would break len(); an unknown name type has no entity label.
    # Previously such rows raised, or reused the preceding row's `entities`.
    if entity_label is None or not isinstance(text, str) or not text:
        skipped_rows += 1
        continue
    entities = [(0, len(text), entity_label)]
    TRAIN_DATA.append((text, {"entities": entities}))
if skipped_rows:
    print(f"Skipped {skipped_rows} row(s) with a missing name or unknown name type")

# Hold out 20% of the examples for validation
train_data, val_data = train_test_split(TRAIN_DATA, test_size=0.2, random_state=42)

# Start from a pre-trained spaCy model
nlp = spacy.load("en_core_web_md")  # Use 'en_core_web_lg' for more robust embeddings if needed

# Reuse the pipeline's NER component; create one only when it is missing
ner = nlp.get_pipe("ner") if "ner" in nlp.pipe_names else nlp.add_pipe("ner")

# Register the entity labels used by the training data
for entity_label in ("PERSON", "ORG"):
    ner.add_label(entity_label)

# Everything except NER is switched off while training
pipe_exceptions = ["ner"]
other_pipes = [name for name in nlp.pipe_names if name not in pipe_exceptions]

# Augmentation Function
def augment_data(data):
    """Return ``data`` followed by simple text variants of each example.

    PERSON examples get upper-case, lower-case and reversed variants (the
    reversal is the original author's deliberately naive augmentation). ORG
    examples get punctuation variants of the standalone words "Inc" -> "Inc."
    and "LLC" -> "L.L.C.".

    Fixes over the previous version:
      * each variant gets its own annotation whose span covers the whole
        variant text; the original offsets were reused even when the text
        grew, leaving the span short of the new text;
      * only standalone "Inc"/"LLC" words are rewritten, so "Inc." no longer
        becomes "Inc.." and "Incorporated" is left alone;
      * variants identical to the original (or to each other) are not added;
      * examples without entities or with another label are passed through
        without variants instead of raising or reusing stale variants.

    Only examples whose first entity spans the entire text are augmented,
    because offsets of partial spans cannot be remapped here.

    Args:
        data: List of ``(text, {"entities": [(start, end, label), ...]})``.

    Returns:
        list: The original examples followed by the augmented ones.
    """
    def swap_word(text, word, replacement):
        # Split on single spaces so the original spacing is preserved.
        return " ".join(replacement if tok == word else tok for tok in text.split(" "))

    augmented_data = []
    for text, annotations in data:
        entities = annotations['entities']
        if not entities:
            continue
        start, end, label = entities[0]
        if (start, end) != (0, len(text)):
            continue

        # Simple augmentation: case variations and minor spelling changes
        if label == "PERSON":
            candidates = [text.upper(), text.lower(), text[::-1]]
        elif label == "ORG":
            candidates = [swap_word(text, "Inc", "Inc."), swap_word(text, "LLC", "L.L.C.")]
        else:
            continue

        seen = {text}
        for aug_text in candidates:
            if aug_text in seen:
                continue
            seen.add(aug_text)
            aug_annotations = dict(annotations)
            aug_annotations['entities'] = [(0, len(aug_text), label)]
            augmented_data.append((aug_text, aug_annotations))
    return list(data) + augmented_data

# Augment the training data
# Only the training split is augmented; val_data keeps the original texts.
train_data = augment_data(train_data)

# Custom training function with additional strategies
def train_model(nlp, train_data, val_data, n_iter=20, dropout=0.3):
    """Fine-tune NER with a fixed learning rate, evaluating after every iteration.

    Uses the module-level ``other_pipes`` list to disable all other components
    and shuffles ``train_data`` in place each iteration.
    """
    with nlp.disable_pipes(*other_pipes):
        optimizer = nlp.resume_training()
        optimizer.learn_rate = 0.001  # fixed learning rate for fine-tuning
        for itn in range(n_iter):
            random.shuffle(train_data)
            losses = {}
            for batch in minibatch(train_data, size=compounding(4.0, 32.0, 1.001)):
                examples = [
                    Example.from_dict(nlp.make_doc(text), annotations)
                    for text, annotations in batch
                ]
                nlp.update(examples, drop=dropout, losses=losses, sgd=optimizer)
            print(f"Iteration {itn + 1}/{n_iter}, Losses: {losses}")
            # Classification report on the validation split
            evaluate_model(nlp, val_data)

# Evaluation function with classification report
def evaluate_model(nlp, val_data):
    """Print a classification report comparing first-entity labels.

    For each validation text, the label of the first gold entity is compared
    with the label of the first predicted entity; "None" stands in when
    either side has no entity.
    """
    y_true, y_pred = [], []
    for text, annotations in val_data:
        predicted = [(ent.start_char, ent.end_char, ent.label_) for ent in nlp(text).ents]
        gold = annotations['entities']
        if gold:
            y_true.append(gold[0][2])
        else:
            y_true.append("None")
        if predicted:
            y_pred.append(predicted[0][2])
        else:
            y_pred.append("None")
    print(classification_report(y_true, y_pred))

# Train the model
# Prints the losses and a classification report after each of the 20 iterations.
train_model(nlp, train_data, val_data, n_iter=20)

# Save the fine-tuned model
# Same directory name as the earlier training cells, so the latest run wins.
nlp.to_disk("custom_ner_model")


In [None]:
# Ensemble Method: Example of using the same model with different initializations (optional)
def ensemble_predict(text, models):
    """Return the majority first-entity label predicted by ``models``.

    Each model votes with the label of the first entity it finds in ``text``,
    or "None" when it finds no entity.

    Args:
        text: Text to classify.
        models: Iterable of callables (loaded spaCy pipelines) that return an
            object with an ``ents`` sequence.

    Returns:
        str: The most frequent vote. Ties go to the label that was voted
        first (in ``models`` order), so the result is deterministic; it used
        to depend on set iteration order. "None" is returned when ``models``
        is empty, which previously raised ValueError.
    """
    votes = {}
    for model in models:
        ents = model(text).ents
        pred = ents[0].label_ if ents else "None"
        votes[pred] = votes.get(pred, 0) + 1
    if not votes:
        return "None"
    # dicts keep insertion order and max() returns the first maximum,
    # so the earliest-seen label wins a tie.
    return max(votes, key=votes.get)

# Load models and test ensemble (if multiple models are available)
# `nlp` is whichever pipeline the most recently executed cell left in scope.
models = [nlp]  # Extend this list with other loaded pipelines (ensemble_predict calls each entry, so paths will not work)
text = "John Steve"
predicted_label = ensemble_predict(text, models)
print(f"Ensemble predicted label: {predicted_label}")

- Ensemble Method

In [None]:
import pandas as pd
import spacy
from spacy.training.example import Example
from spacy.util import minibatch, compounding
from sklearn.model_selection import train_test_split
import random
from sklearn.metrics import classification_report

# Load the training data
df = pd.read_excel('Training Data Scrubbing Names 2.xlsx')

# Prepare training data
# Each usable row becomes (text, {"entities": [(0, len(text), label)]}).
LABEL_TO_ENTITY = {"Human Name": "PERSON", "Company Name": "ORG"}
TRAIN_DATA = []
skipped_rows = 0
for _, row in df.iterrows():
    text = row['Full Name']
    label = row['Name Type']
    entity_label = LABEL_TO_ENTITY.get(label)
    # A NaN or numeric name would break len(); an unknown name type has no
    # entity label. Previously such rows raised, or reused the preceding
    # row's `entities`.
    if entity_label is None or not isinstance(text, str) or not text:
        skipped_rows += 1
        continue
    entities = [(0, len(text), entity_label)]
    TRAIN_DATA.append((text, {"entities": entities}))
if skipped_rows:
    print(f"Skipped {skipped_rows} row(s) with a missing name or unknown name type")

# Hold out 20% of the examples for validation
train_data, val_data = train_test_split(TRAIN_DATA, test_size=0.2, random_state=42)

# One pre-trained pipeline per model size
models = {
    "sm": spacy.load("en_core_web_sm"),
    "md": spacy.load("en_core_web_md"),
    "lg": spacy.load("en_core_web_lg")
}

# Make sure every pipeline has an NER component that knows both labels
for model_key in models:
    nlp = models[model_key]
    ner = nlp.get_pipe("ner") if "ner" in nlp.pipe_names else nlp.add_pipe("ner")
    for entity_label in ("PERSON", "ORG"):
        ner.add_label(entity_label)

# Custom training function with additional strategies
def train_model(nlp, train_data, val_data, n_iter=20, dropout=0.3):
    """Fine-tune the NER component of ``nlp`` and evaluate after every iteration.

    All other components are disabled for the duration of training, and
    ``train_data`` is shuffled in place each iteration.
    """
    def to_examples(batch):
        # Pair a freshly tokenized doc with its gold annotations
        return [Example.from_dict(nlp.make_doc(text), annotations) for text, annotations in batch]

    non_ner = [name for name in nlp.pipe_names if name != "ner"]
    with nlp.disable_pipes(*non_ner):
        optimizer = nlp.resume_training()
        optimizer.learn_rate = 0.001  # fixed learning rate for fine-tuning
        for itn in range(n_iter):
            random.shuffle(train_data)
            losses = {}
            for batch in minibatch(train_data, size=compounding(4.0, 32.0, 1.001)):
                nlp.update(to_examples(batch), drop=dropout, losses=losses, sgd=optimizer)
            print(f"Iteration {itn + 1}/{n_iter}, Losses: {losses}")
            # Classification report on the validation split
            evaluate_model(nlp, val_data)

# Evaluation function with classification report
def evaluate_model(nlp, val_data):
    """Print a classification report of first-entity labels on ``val_data``.

    "None" is used as the label whenever the gold annotation or the
    prediction contains no entity.
    """
    def first_label(spans):
        # Label of the first (start, end, label) triple, or the "None" sentinel
        return spans[0][2] if spans else "None"

    y_true = []
    y_pred = []
    for text, annotations in val_data:
        doc = nlp(text)
        predicted_spans = [(ent.start_char, ent.end_char, ent.label_) for ent in doc.ents]
        y_true.append(first_label(annotations['entities']))
        y_pred.append(first_label(predicted_spans))
    print(classification_report(y_true, y_pred))

# Fine-tune every pipeline on the same split
for model_key, model in models.items():
    print(f"Training model: {model_key}")
    train_model(model, train_data, val_data, n_iter=20)

# Persist each fine-tuned pipeline in its own directory
for model_key in models:
    models[model_key].to_disk(f"{model_key}_ner_model")

# Ensemble prediction method
def ensemble_predict(text, models):
    """Return the majority first-entity label across the pipelines in ``models``.

    Args:
        text: Text to classify.
        models: Mapping of name -> callable pipeline; only the values are
            used. Each votes with the label of the first entity it finds,
            or "None" when it finds none.

    Returns:
        str: The most frequent vote. Ties go to the label voted first (in
        mapping order), so the result is deterministic; it used to depend on
        set iteration order. "None" is returned for an empty mapping, which
        previously raised ValueError.
    """
    votes = {}
    for model in models.values():
        ents = model(text).ents
        pred = ents[0].label_ if ents else "None"
        votes[pred] = votes.get(pred, 0) + 1
    if not votes:
        return "None"
    # dicts keep insertion order and max() returns the first maximum,
    # so the earliest-seen label wins a tie.
    final_pred = max(votes, key=votes.get)
    return final_pred

# Test ensemble method
# Each pipeline in `models` votes with the label of its first entity;
# the majority label is printed.
test_text = "John Doe"
predicted_label = ensemble_predict(test_text, models)
print(f"Ensemble predicted label: {predicted_label}")


In [None]:
import spacy
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import numpy as np

import pandas as pd

# Load the dataset
# sheet_name=None makes read_excel return every sheet, keyed by sheet name.
file_path = 'Training Data Scrubbing Names 2.xlsx'
data = pd.read_excel(file_path, sheet_name=None)

# Convert dict_keys to a list to access the first sheet
sheet_names = list(data.keys())
# Notebook-style preview; this bare expression is not the cell's last line,
# so nothing is displayed for it.
sheet_names, data[sheet_names[0]].head()

# Load spaCy models
nlp_lg = spacy.load('en_core_web_lg')
nlp_md = spacy.load('en_core_web_md')
nlp_sm = spacy.load('en_core_web_sm')

# Function to extract features using spaCy models
def extract_features(text, model):
    """Run ``model`` on ``text`` and return the resulting document's ``vector``."""
    return model(text).vector

# Extracting features for each spaCy model
# From here on `data` is the first sheet's DataFrame, not the dict of sheets.
data = data[sheet_names[0]]
names = data['Full Name']
X_lg = np.array([extract_features(name, nlp_lg) for name in names])
X_md = np.array([extract_features(name, nlp_md) for name in names])
X_sm = np.array([extract_features(name, nlp_sm) for name in names])

# Target variable: 1 for human names, 0 for everything else
y = data['Name Type'].apply(lambda x: 1 if x == 'Human Name' else 0).values

# Train-test split
# One call splits all three feature matrices and the target with the same
# row indices, so each X_train_* / X_test_* stays aligned with y_train /
# y_test. The previous three separate calls relied on matching random_state
# and row counts to line up implicitly.
(X_train_lg, X_test_lg,
 X_train_md, X_test_md,
 X_train_sm, X_test_sm,
 y_train, y_test) = train_test_split(X_lg, X_md, X_sm, y, test_size=0.2, random_state=42)

# Model training: one random forest per embedding size
clf_lg = RandomForestClassifier(random_state=42)
clf_md = RandomForestClassifier(random_state=42)
clf_sm = RandomForestClassifier(random_state=42)

clf_lg.fit(X_train_lg, y_train)
clf_md.fit(X_train_md, y_train)
clf_sm.fit(X_train_sm, y_train)

# Predictions
pred_lg = clf_lg.predict(X_test_lg)
pred_md = clf_md.predict(X_test_md)
pred_sm = clf_sm.predict(X_test_sm)

# Ensemble: averaging the three 0/1 predictions and rounding is a majority vote
ensemble_pred = (pred_lg + pred_md + pred_sm) / 3
ensemble_pred = np.round(ensemble_pred).astype(int)

# Accuracy
accuracy_lg = accuracy_score(y_test, pred_lg)
accuracy_md = accuracy_score(y_test, pred_md)
accuracy_sm = accuracy_score(y_test, pred_sm)
accuracy_ensemble = accuracy_score(y_test, ensemble_pred)

# Notebook-style display of the four accuracies
accuracy_lg, accuracy_md, accuracy_sm, accuracy_ensemble



In [25]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

# Read the Excel workbook into a DataFrame
df = pd.read_excel('Training Data Scrubbing Names 2.xlsx')

# Display the first 5 rows
print(df.head().to_markdown(index=False, numalign="left", stralign="left"))

# Print the column names and their data types.
# DataFrame.info() writes its report itself and returns None, so wrapping it
# in print() only added a stray "None" line to the output.
df.info()

# Drop null values in `Full Name`
df.dropna(subset=['Full Name'], inplace=True)

# Convert to categorical
df['Name Type'] = df['Name Type'].astype('category')

# Create a copy of the dataframe for feature engineering
df_model = df.copy()

# Create new features
df_model['name_length'] = df_model['Full Name'].apply(len)
# Raw strings keep the regex escape `\.` from being read as an (invalid)
# string escape sequence; the patterns themselves are unchanged.
# NOTE(review): both patterns are case-sensitive while the sample rows shown
# by df.head() are upper-case, so 'Mr.'/'Inc.'/'Corp' may rarely match —
# confirm whether case-insensitive matching was intended.
df_model['has_title'] = df_model['Full Name'].str.contains(r'Mr\.|Ms\.|Dr\.', regex=True).astype(int)
df_model['has_suffix'] = df_model['Full Name'].str.contains(r'Inc\.|LLC|Corp', regex=True).astype(int)
df_model['num_words'] = df_model['Full Name'].str.split().apply(len)

# Split into training and testing sets (80/20, fixed seed)
X = df_model[['name_length', 'has_title', 'has_suffix', 'num_words']]
y = df_model['Name Type']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Candidate classifiers, keyed by display name
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'Random Forest': RandomForestClassifier(random_state=42),
    'SVC': SVC(random_state=42)
}

# Cross-validate each candidate on the training split, then fit and test it
for name, model in models.items():
    cv_scores = cross_val_score(model, X_train, y_train, cv=5, scoring='accuracy')
    print(f"{name}: Mean CV Score: {np.mean(cv_scores):.4f}, Std Dev: {np.std(cv_scores):.4f}")

    # Fit on the whole training split and predict the held-out rows
    y_pred = model.fit(X_train, y_train).predict(X_test)

    # Share of held-out rows predicted correctly
    test_accuracy = (y_pred == y_test).mean()
    print(f"{name}: Test Accuracy: {test_accuracy:.4f}\n")

# Score every fitted model once, then pick the first one with the top score
test_scores = {name: fitted.score(X_test, y_test) for name, fitted in models.items()}
best_model = max(test_scores, key=test_scores.get)
best_accuracy = test_scores[best_model]

# Identical scores across the board mean there is no winner to report
if len(set(test_scores.values())) == 1:
    print("No clear best model. Further analysis or model tuning may be needed.")
else:
    print(f"The best model is {best_model} with a test accuracy of {best_accuracy:.4f}")


| Full Name      | Name Type   |
|:---------------|:------------|
| MAIZE, MIKE    | Human Name  |
| ANDREWS JR     | Human Name  |
| REEL, STEVEN   | Human Name  |
| GOFF, CODY     | Human Name  |
| BOMAR, MICHAEL | Human Name  |
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30203 entries, 0 to 30202
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Full Name  30203 non-null  object
 1   Name Type  30203 non-null  object
dtypes: object(2)
memory usage: 472.1+ KB
None
Logistic Regression: Mean CV Score: 0.7521, Std Dev: 0.0055
Logistic Regression: Test Accuracy: 0.7560

Random Forest: Mean CV Score: 0.8027, Std Dev: 0.0048
Random Forest: Test Accuracy: 0.8091

SVC: Mean CV Score: 0.7930, Std Dev: 0.0045
SVC: Test Accuracy: 0.7927

The best model is Random Forest with a test accuracy of 0.8091
