In [None]:
import pandas as pd
import spacy
from spacy.tokens import Span
from spacy.language import Language

# Load data
df = pd.read_csv('OR Crook 3-9.99.xlsx - Sheet1.csv')

# Data cleaning
df.dropna(subset=['MAIL_ADDR','MAIL_CITY','MAIL_STATE','MAIL_ZIP','OWNER_NAME_1'], inplace=True)
df = df.drop_duplicates(['MAIL_ADDR','MAIL_CITY','MAIL_STATE','MAIL_ZIP'])

# Remove commas and strip whitespace
df['OWNER_NAME_1'] = df['OWNER_NAME_1'].str.replace(',', '').str.strip()

# List of keywords indicating company names
Suspected_Words = [
    'OF', 'SERVICES', 'COUNTY', 'DISTRIBUTOR', 'PRODUCTS', 'COUNTRY CLUB', 'PLLC', 'PRIVATE',
    'COMPANY', 'INC', 'RETIREMENT', 'DEVELOPMENT', 'HOA', 'AUTHORITY', 'CONF', 'CONFERENCE',
    'CONSTRUCTION', 'AFFORDABLE', 'HOUSING', 'MID COAST', 'ESTATE', 'REHABILITATION', 'GARDENS',
    'WELLNESS', 'LLC', 'STATE', 'LAND', 'MEMBERSHIP', 'COOPERATIVE', 'CORP', 'CORPORATION',
    'INDEPENDENT', 'CHURCH', 'ENTERPRISES', 'ACCOUNTING', 'INVESTMENTS', 'OWNERS', 'AIRPORT',
    'MAINTENANCE', 'ASSOCIATION', 'REALTY', 'FOUNDATION', 'CONSULTANTS', 'ASSOCIATES',
    'CORPORATE', 'DISTRICT', 'LTD', 'LIMITED', 'INCORPORATED', 'PROPERTIES', 'INVESTMENT',
    'NORTHEAST', 'PLUMBING', 'HEATING', 'TRUST'
]

# Pattern to detect suspected company names
pattern = r'\b(' + '|'.join(Suspected_Words) + r')\b'

# Filter out suspected company names
df = df[~df['OWNER_NAME_1'].str.contains(pattern, case=False, na=False)]

# Load spaCy's pre-trained model for English
nlp = spacy.load("en_core_web_sm")

# Define custom NER component
@Language.component("custom_ner")
def custom_ner(doc):
    new_ents = []
    for ent in doc.ents:
        if ent.label_ == "ORG":
            new_ents.append(Span(doc, ent.start, ent.end, label="COMPANY"))
        else:
            new_ents.append(ent)
    doc.ents = new_ents
    return doc

# Add custom NER component to the pipeline
nlp.add_pipe("custom_ner", last=True)

# Define a function to classify names
def classify_name(name):
    try:
        doc = nlp(name)
        for ent in doc.ents:
            if ent.label_ == "PERSON":
                return "Human Name"
            elif ent.label_ == "COMPANY":
                return "Company Name"
    except Exception as e:
        print(f"Error processing name '{name}': {e}")
    return "Unknown"

# Apply the classification function to the "OWNER_NAME_1" column
df["Name Type"] = df["OWNER_NAME_1"].apply(classify_name)

# Save the results to a new CSV
df.to_csv('Removed_keywords_datasheet.csv', index=False)
print("Classification complete. Results saved to 'Removed_keywords_datasheet.csv'.")


# The rest of this notebook is still not done and ***Can't be Used***

In [71]:
import random
random.seed(42)
# Generating synthetic data for testing
def generate_synthetic_data(num_records=500):
    # Lists of common human first and last names and company keywords
    human_first_names = ["John", "Jane", "Alex", "Emily", "Michael", "Sarah", "David", "Laura", "James", "Emma"]
    human_last_names = ["Smith", "Johnson", "Williams", "Brown", "Jones", "Garcia", "Miller", "Davis", "Rodriguez", "Martinez"]
    
    # List of keywords indicating company names
    company_keywords = ['OF', 'SERVICES','COUNTY','DISTRIBUTOR','PRODUCTS',' COUNTRY CLUB','PLLC','PRIVATE''COMPANY', 'INC','RETIREMENT','DEVELOPMENT','HOA','AUTHORITY','CONF','CONFERENCE','CONSTRCTN','AFFORDABLE','HSNG','MID COAST','ESTATE','REHABILITATION','GARDENS','WELLNESS'
                    'LLC', 'STATE', 'LAND','MEMBERSHIP','COOPERATIVE' 'CORP','CORPORATION','INDEPENDENT','ARCARE','CHURCH','ENTERPRISES','ACCOUNTING','INVESTMENTS','OWNERS','AIRPORT','MAINTENANCE','MOTORCYCLE','ASSOC','EXCAVATING','EXCAVATION','PROPS','RECREATIONAL','VILLAGE','FOR','GROUND','REAL'
                    'CO', 'REALTY', 'CONSTRUCTION', 'FOUNDATION', 'CONSULTANTS', 'ASSOCIATES', 'CORPORATE','DISTRICT','LTD','LIMITED','INCORPORATED','PROPERTIES','INVESTMENT','ASSN','TRAILS','NORTHEAST','ESTS','RIVERGATE','PLUMBING','HEATING']

    data = []

    for _ in range(num_records):
        # Randomly decide to create a human name or company name
        if random.random() < 0.5:
            # Create a human name
            first_name = random.choice(human_first_names)
            last_name = random.choice(human_last_names)
            full_name = f"{first_name} {last_name}"
            data.append({"Full Name": full_name, "Name Type": "Human Name"})
        else:
            # Create a company name
            company_name = f"{random.choice(human_last_names)} {random.choice(company_keywords)}"
            data.append({"Full Name": company_name, "Name Type": "Company Name"})

    return data

# Interpretation:
- High Precision for Human Names: The model is very precise in identifying human names, with few false positives.
- High Recall for Company Names: The model correctly identifies most company names, missing only a small percentage.
- Lower Recall for Human Names: The model is less effective at identifying all human names, as indicated by the lower recall.

- ***Overall, the model performs well with a balanced F1-score of 0.92 for both classes. However, it is slightly better at classifying company names (higher recall) than human names. Further tuning, such as optimizing the decision threshold or incorporating more features, could improve performance.***

In [20]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import pickle
import random
import numpy as np

# Ensure necessary NLTK data files are downloaded
nltk.download('punkt')

# Generate synthetic data for testing
def generate_synthetic_data(num_records=500):
    human_first_names = ["John", "Jane", "Alex", "Emily", "Michael", "Sarah", "David", "Laura", "James", "Emma"]
    human_last_names = ["Smith", "Johnson", "Williams", "Brown", "Jones", "Garcia", "Miller", "Davis", "Rodriguez", "Martinez"]
    company_keywords = ['OF', 'SERVICES', 'COUNTY', 'DISTRIBUTOR', 'PRODUCTS', 'COUNTRY CLUB', 'PLLC', 'PRIVATE', 
                        'COMPANY', 'INC', 'RETIREMENT', 'DEVELOPMENT', 'HOA', 'AUTHORITY', 'CONF', 'CONFERENCE', 
                        'CONSTRUCTION', 'AFFORDABLE', 'HOUSING', 'MID COAST', 'ESTATE', 'REHABILITATION', 'GARDENS', 
                        'WELLNESS', 'LLC', 'STATE', 'LAND', 'MEMBERSHIP', 'COOPERATIVE', 'CORP', 'CORPORATION', 
                        'INDEPENDENT', 'CHURCH', 'ENTERPRISES', 'ACCOUNTING', 'INVESTMENTS', 'OWNERS', 'AIRPORT', 
                        'MAINTENANCE', 'ASSOCIATION', 'REALTY', 'FOUNDATION', 'CONSULTANTS', 'ASSOCIATES', 
                        'CORPORATE', 'DISTRICT', 'LTD', 'LIMITED', 'INCORPORATED', 'PROPERTIES', 'INVESTMENT', 
                        'NORTHEAST', 'PLUMBING', 'HEATING', 'TRUST']

    data = []
    for _ in range(num_records):
        if random.random() < 0.5:
            first_name = random.choice(human_first_names)
            last_name = random.choice(human_last_names)
            full_name = f"{first_name} {last_name}"
            data.append({"Full Name": full_name, "Name Type": "Human Name"})
        else:
            company_name = f"{random.choice(human_last_names)} {random.choice(company_keywords)}"
            data.append({"Full Name": company_name, "Name Type": "Company Name"})
    return data

# Load and preprocess data
df = pd.read_excel("Training Data Scrubbing Names 2.xlsx")

# Vectorization using TF-IDF
vectorizer = TfidfVectorizer(ngram_range=(1, 3), analyzer='char')
X = vectorizer.fit_transform(df['Full Name'])
y = df['Name Type']

# Split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a Logistic Regression model
model = LogisticRegression(max_iter=1000, penalty='l2', C=1.0, solver='liblinear')
model.fit(X_train, y_train)

# Evaluate the model
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy on Test Set: {accuracy * 100:.2f}%")
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))



# Generate and test with synthetic data
synthetic_data = generate_synthetic_data(500)
synthetic_X = vectorizer.transform([row['Full Name'] for row in synthetic_data])
synthetic_y = [row['Name Type'] for row in synthetic_data]

# Evaluate on synthetic data
synthetic_pred = model.predict(synthetic_X)
synthetic_accuracy = accuracy_score(synthetic_y, synthetic_pred)
print(f"Accuracy on Synthetic Data: {synthetic_accuracy * 100:.2f}%")
print(classification_report(synthetic_y, synthetic_pred))
print(confusion_matrix(synthetic_y, synthetic_pred))


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ak758\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Model Accuracy on Test Set: 98.99%
              precision    recall  f1-score   support

Company Name       0.99      0.99      0.99      2873
  Human Name       0.99      0.99      0.99      3168

    accuracy                           0.99      6041
   macro avg       0.99      0.99      0.99      6041
weighted avg       0.99      0.99      0.99      6041

[[2835   38]
 [  23 3145]]
Accuracy on Synthetic Data: 92.20%
              precision    recall  f1-score   support

Company Name       0.89      0.96      0.92       249
  Human Name       0.96      0.88      0.92       251

    accuracy                           0.92       500
   macro avg       0.92      0.92      0.92       500
weighted avg       0.93      0.92      0.92       500

[[240   9]
 [ 30 221]]


- Applying Shuffling

In [64]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import pickle
import random
import numpy as np
random.seed(42)

# Ensure necessary NLTK data files are downloaded
nltk.download('punkt')

# Generate synthetic data for testing
def generate_synthetic_data(num_records=1000):
    human_first_names = ["John", "Jane", "Alex", "Emily", "Michael", "Sarah", "David", "Laura", "James", "Emma"]
    human_last_names = ["Smith", "Johnson", "Williams", "Brown", "Jones", "Garcia", "Miller", "Davis", "Rodriguez", "Martinez"]
    company_keywords = ['OF', 'SERVICES', 'COUNTY', 'DISTRIBUTOR', 'PRODUCTS', 'COUNTRY CLUB', 'PLLC', 'PRIVATE', 
                        'COMPANY', 'INC', 'RETIREMENT', 'DEVELOPMENT', 'HOA', 'AUTHORITY', 'CONF', 'CONFERENCE', 
                        'CONSTRUCTION', 'AFFORDABLE', 'HOUSING', 'MID COAST', 'ESTATE', 'REHABILITATION', 'GARDENS', 
                        'WELLNESS', 'LLC', 'STATE', 'LAND', 'MEMBERSHIP', 'COOPERATIVE', 'CORP', 'CORPORATION', 
                        'INDEPENDENT', 'CHURCH', 'ENTERPRISES', 'ACCOUNTING', 'INVESTMENTS', 'OWNERS', 'AIRPORT', 
                        'MAINTENANCE', 'ASSOCIATION', 'REALTY', 'FOUNDATION', 'CONSULTANTS', 'ASSOCIATES', 
                        'CORPORATE', 'DISTRICT', 'LTD', 'LIMITED', 'INCORPORATED', 'PROPERTIES', 'INVESTMENT', 
                        'NORTHEAST', 'PLUMBING', 'HEATING', 'TRUST']

    data = []
    for _ in range(num_records):
        if random.random() < 0.5:
            first_name = random.choice(human_first_names)
            last_name = random.choice(human_last_names)
            full_name = f"{first_name} {last_name}"
            data.append({"Full Name": full_name, "Name Type": "Human Name"})
        else:
            company_name = f"{random.choice(human_last_names)} {random.choice(company_keywords)}"
            data.append({"Full Name": company_name, "Name Type": "Company Name"})
    return data

# Load and preprocess data
df = pd.read_excel("Training Data Scrubbing Names 2.xlsx")

# Vectorization using TF-IDF
vectorizer = TfidfVectorizer(ngram_range=(1, 3), analyzer='char')
X = vectorizer.fit_transform(df['Full Name'])
y = df['Name Type']

# Shuffle and split the data into training and test sets
indices = np.arange(X.shape[0])
np.random.shuffle(indices)
X, y = X[indices], y.iloc[indices]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True, random_state=42)

# Train a Logistic Regression model
model = LogisticRegression(max_iter=2000, penalty='l2', C=0.35, solver='newton-cholesky')
model.fit(X_train, y_train)

# Evaluate the model
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy on Test Set: {accuracy * 100:.2f}%")
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

# Generate and test with synthetic data
synthetic_data = generate_synthetic_data(2000)
synthetic_X = vectorizer.transform([row['Full Name'] for row in synthetic_data])
synthetic_y = [row['Name Type'] for row in synthetic_data]

# Evaluate on synthetic data
synthetic_pred = model.predict(synthetic_X)
synthetic_accuracy = accuracy_score(synthetic_y, synthetic_pred)
print(f"Accuracy on Synthetic Data: {synthetic_accuracy * 100:.2f}%")
print(classification_report(synthetic_y, synthetic_pred))
print(confusion_matrix(synthetic_y, synthetic_pred))


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ak758\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Model Accuracy on Test Set: 98.46%
              precision    recall  f1-score   support

Company Name       0.99      0.98      0.98      2899
  Human Name       0.98      0.99      0.99      3142

    accuracy                           0.98      6041
   macro avg       0.98      0.98      0.98      6041
weighted avg       0.98      0.98      0.98      6041

[[2834   65]
 [  28 3114]]
Accuracy on Synthetic Data: 93.70%
              precision    recall  f1-score   support

Company Name       0.95      0.92      0.94       989
  Human Name       0.92      0.95      0.94      1011

    accuracy                           0.94      2000
   macro avg       0.94      0.94      0.94      2000
weighted avg       0.94      0.94      0.94      2000

[[909  80]
 [ 46 965]]


- Saving the model


In [None]:
# Save the model
with open("name_classifier_logreg.pickle", "wb") as f:
    pickle.dump(model, f)

- Loading the model

In [70]:
# load the model 
model = pickle.load(open('name_classifier_logreg.pickle', 'rb'))


# Generate and test with synthetic data
synthetic_data = generate_synthetic_data(10000)
synthetic_X = vectorizer.transform([row['Full Name'] for row in synthetic_data])
synthetic_y = [row['Name Type'] for row in synthetic_data]
# Evaluate on synthetic data
synthetic_pred = model.predict(synthetic_X)
synthetic_accuracy = accuracy_score(synthetic_y, synthetic_pred)
print(f"Accuracy on Synthetic Data: {synthetic_accuracy * 100:.2f}%")
print(classification_report(synthetic_y, synthetic_pred))
print(confusion_matrix(synthetic_y, synthetic_pred))

Accuracy on Synthetic Data: 93.48%
              precision    recall  f1-score   support

Company Name       0.97      0.90      0.93      5031
  Human Name       0.91      0.97      0.94      4969

    accuracy                           0.93     10000
   macro avg       0.94      0.94      0.93     10000
weighted avg       0.94      0.93      0.93     10000

[[4534  497]
 [ 155 4814]]
