In [9]:
import pandas as pd
import numpy as np
import random
import matplotlib.pyplot as plt
import pickle

from scipy.stats import randint, uniform

from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, KFold, cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import VotingClassifier, StackingClassifier, RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Set random seed for reproducibility
random.seed(42)
np.random.seed(42)


TRAINING_FILE_PATH = "Training Data Scrubbing Names 3.xlsx"

- Enhancing the model with States, Cities and company abbreviations

In [10]:

# List of U.S. states and their abbreviations
us_states = [
    'Alabama', 'AL', 'Alaska', 'AK', 'Arizona', 'AZ', 'Arkansas', 'AR', 'California', 'CA', 'Colorado', 'CO',
    'Connecticut', 'CT', 'Delaware', 'DE', 'Florida', 'FL', 'Georgia', 'GA', 'Hawaii', 'HI', 'Idaho', 'ID',
    'Illinois', 'IL', 'Indiana', 'IN', 'Iowa', 'IA', 'Kansas', 'KS', 'Kentucky', 'KY', 'Louisiana', 'LA',
    'Maine', 'ME', 'Maryland', 'MD', 'Massachusetts', 'MA', 'Michigan', 'MI', 'Minnesota', 'MN', 'Mississippi', 'MS',
    'Missouri', 'MO', 'Montana', 'MT', 'Nebraska', 'NE', 'Nevada', 'NV', 'New Hampshire', 'NH', 'New Jersey', 'NJ',
    'New Mexico', 'NM', 'New York', 'NY', 'North Carolina', 'NC', 'North Dakota', 'ND', 'Ohio', 'OH', 'Oklahoma', 'OK',
    'Oregon', 'OR', 'Pennsylvania', 'PA', 'Rhode Island', 'RI', 'South Carolina', 'SC', 'South Dakota', 'SD', 'Tennessee', 'TN',
    'Texas', 'TX', 'Utah', 'UT', 'Vermont', 'VT', 'Virginia', 'VA', 'Washington', 'WA', 'West Virginia', 'WV', 'Wisconsin', 'WI', 'Wyoming', 'WY'
]

# List of major U.S. cities
us_cities = [
    'New York', 'Los Angeles', 'Chicago', 'Houston', 'Phoenix', 'Philadelphia', 'San Antonio', 'San Diego', 'Dallas',
    'San Jose', 'Austin', 'Jacksonville', 'Fort Worth', 'Columbus', 'San Francisco', 'Charlotte', 'Indianapolis', 'Seattle',
    'Denver', 'Washington', 'Boston', 'El Paso', 'Nashville', 'Detroit', 'Oklahoma City', 'Portland', 'Las Vegas',
    'Memphis', 'Louisville', 'Baltimore', 'Milwaukee', 'Albuquerque', 'Tucson', 'Fresno', 'Sacramento', 'Kansas City',
    'Atlanta', 'Miami', 'Colorado Springs', 'Raleigh', 'Omaha', 'Long Beach', 'Virginia Beach', 'Oakland', 'Minneapolis',
    'Tulsa', 'Arlington', 'Tampa', 'New Orleans', 'Wichita'
]

# List of common company-related abbreviations
company_abbr = [
    'Co', 'Inc', 'LLC', 'Ltd', 'Corp', 'Pty', 'PLC', 'GmbH', 'S.A.', 'S.A.S.', 'AG', 'N.V.', 'B.V.', 'K.K.', 'S.R.L.', 'P.C.',
    'C.A.', 'd.o.o.', 'P.L.C.', 'S.p.A.', 'A.G.', 'a.s.', 'OÜ', 'Oy', 'ApS', 's.r.o.', 'S.A.B.', 'S.L.', 'AB', 'CO', 'BK'
]

# Combine all keywords
company_keywords = [
    'OF', 'SERVICES', 'COUNTY', 'DISTRIBUTOR', 'PRODUCTS', 'COUNTRY CLUB', 'PLLC', 'PRIVATE',
    'COMPANY', 'RETIREMENT', 'DEVELOPMENT', 'HOA', 'AUTHORITY', 'CONF', 'CONFERENCE',
    'CONSTRUCTION', 'AFFORDABLE', 'HOUSING', 'MID COAST', 'ESTATE', 'REHABILITATION', 'GARDENS',
    'WELLNESS', 'STATE', 'LAND', 'MEMBERSHIP', 'COOPERATIVE', 'CORPORATION',
    'INDEPENDENT', 'CHURCH', 'ENTERPRISES', 'ACCOUNTING', 'INVESTMENTS', 'OWNERS', 'AIRPORT',
    'MAINTENANCE', 'ASSOCIATION', 'REALTY', 'FOUNDATION', 'CONSULTANTS', 'ASSOCIATES',
    'CORPORATE', 'DISTRICT', 'LIMITED', 'INCORPORATED', 'PROPERTIES', 'INVESTMENT',
    'NORTHEAST', 'PLUMBING', 'HEATING', "T V A", 'PARK', 'DETROIT', 'TOWNSHIP', 'CASS',
    'ILE', 'BRANCH', 'ST', 'HCMA', 'ITC', 'TRANSMISSION', 'ENTERPRISES', 'TWP', 'FARMS', 'AUTHORITY',
    'BERRIEN', 'CALHOUN', 'KALAMAZOO', 'CENTER', 'NATURE', 'KALAMAZOO', 'NORTHWESTERN', 'COUNTY', 'WASHTENAW'
    ] + us_states + us_cities + company_abbr

# Function to generate variations of company keywords
def generate_variations(keyword):
    variations = [
        keyword,
        keyword.lower(),
        # keyword.upper(),
        # keyword.capitalize(),
    #     keyword.replace(" ", ""),
    #     keyword.replace(" ", "."),
    #     keyword.replace(" ", "_"),
    #     f"{keyword}.",
    #     f".{keyword}",
    #     f"{keyword}.com",
    #     f"_{keyword}_",
    #     f"{keyword} Co",
    #     f"{keyword} Inc.",
    #     f"{keyword} LLC",
    #     f"{keyword}, Inc.",
    #     f"{keyword}, LLC",
    #     f"{keyword}, Ltd.",
    #     f"{keyword} LTD"
    ]
    return variations

# Augment company names using variations of keywords
def augment_company_names(keywords, num_samples):
    augmented_data = []
    keyword_variations = [variation for keyword in keywords for variation in generate_variations(keyword)]
    for _ in range(num_samples):
        name_parts = random.sample(keyword_variations, k=1)  # Combine 3 random variations
        company_name = " ".join(name_parts)
        augmented_data.append({"Full Name": company_name, "Name Type": "Company Name"})
    return augmented_data

- A systematic way to test the model

In [11]:
import random

# Generate synthetic data for testing
def generate_synthetic_data(num_records=500):
    human_first_names = ["John", "Jane", "Alex", "Emily", "Michael", "Sarah", "David", "Laura", "James", "Emma"]
    human_last_names = ["Smith", "Johnson", "Williams", "Brown", "Jones", "Garcia", "Miller", "Davis", "Rodriguez", "Martinez"]
    company_keywords = ['OF', 'SERVICES', 'COUNTY', 'DISTRIBUTOR', 'PRODUCTS', 'COUNTRY CLUB', 'PLLC', 'PRIVATE',
                        'COMPANY', 'INC', 'RETIREMENT', 'DEVELOPMENT', 'HOA', 'AUTHORITY', 'CONF', 'CONFERENCE',
                        'CONSTRUCTION', 'AFFORDABLE', 'HOUSING', 'MID COAST', 'ESTATE', 'REHABILITATION', 'GARDENS',
                        'WELLNESS', 'LLC', 'STATE', 'LAND', 'MEMBERSHIP', 'COOPERATIVE', 'CORP', 'CORPORATION',
                        'INDEPENDENT', 'CHURCH', 'ENTERPRISES', 'ACCOUNTING', 'INVESTMENTS', 'OWNERS', 'AIRPORT',
                        'MAINTENANCE', 'ASSOCIATION', 'REALTY', 'FOUNDATION', 'CONSULTANTS', 'ASSOCIATES',
                        'CORPORATE', 'DISTRICT', 'LTD', 'LIMITED', 'INCORPORATED', 'PROPERTIES', 'INVESTMENT',
                        'NORTHEAST', 'PLUMBING', 'HEATING', "T V A",     'New York', 'Los Angeles', 'Chicago', 'Houston', 'Phoenix', 'Philadelphia', 'San Antonio', 'San Diego', 'Dallas',
                        'San Jose', 'Austin', 'Jacksonville', 'Fort Worth', 'Columbus', 'San Francisco', 'Charlotte', 'Indianapolis', 'Seattle',
                        'Denver', 'Washington', 'Boston', 'El Paso', 'Nashville', 'Detroit', 'Oklahoma City', 'Portland', 'Las Vegas',
                        'Memphis', 'Louisville', 'Baltimore', 'Milwaukee', 'Albuquerque', 'Tucson', 'Fresno', 'Sacramento', 'Kansas City',
                        'Atlanta', 'Miami', 'Colorado Springs', 'Raleigh', 'Omaha', 'Long Beach', 'Virginia Beach', 'Oakland', 'Minneapolis',
                        'Tulsa', 'Arlington', 'Tampa', 'New Orleans', 'Wichita',
                        'Alabama', 'AL', 'Alaska', 'AK', 'Arizona', 'AZ', 'Arkansas', 'AR', 'California', 'CA', 'Colorado', 'CO',
                        'Connecticut', 'CT', 'Delaware', 'DE', 'Florida', 'FL', 'Georgia', 'GA', 'Hawaii', 'HI', 'Idaho', 'ID',
                        'Illinois', 'IL', 'Indiana', 'IN', 'Iowa', 'IA', 'Kansas', 'KS', 'Kentucky', 'KY', 'Louisiana', 'LA',
                        'Maine', 'ME', 'Maryland', 'MD', 'Massachusetts', 'MA', 'Michigan', 'MI', 'Minnesota', 'MN', 'Mississippi', 'MS',
                        'Missouri', 'MO', 'Montana', 'MT', 'Nebraska', 'NE', 'Nevada', 'NV', 'New Hampshire', 'NH', 'New Jersey', 'NJ',
                        'New Mexico', 'NM', 'New York', 'NY', 'North Carolina', 'NC', 'North Dakota', 'ND', 'Ohio', 'OH', 'Oklahoma', 'OK',
                        'Oregon', 'OR', 'Pennsylvania', 'PA', 'Rhode Island', 'RI', 'South Carolina', 'SC', 'South Dakota', 'SD', 'Tennessee', 'TN',
                        'Texas', 'TX', 'Utah', 'UT', 'Vermont', 'VT', 'Virginia', 'VA', 'Washington', 'WA', 'West Virginia', 'WV', 'Wisconsin', 'WI', 'Wyoming', 'WY',
                        'Co', 'Inc', 'LLC', 'Ltd', 'Corp', 'Pty', 'PLC', 'GmbH', 'S.A.', 'S.A.S.', 'AG', 'N.V.', 'B.V.', 'K.K.', 'S.R.L.', 'P.C.',
                        'C.A.', 'd.o.o.', 'P.L.C.', 'S.p.A.', 'A.G.', 'a.s.', 'OÜ', 'Oy', 'ApS', 's.r.o.', 'S.A.B.', 'S.L.', 'AB',
                        'CASS', 'ILE', 'BRANCH', 'ST', 'HCMA', 'ITC', 'TRANSMISSION', 'ENTERPRISES']

    data = []
    for _ in range(num_records):
        if random.random() < 0.5:
            first_name = random.choice(human_first_names)
            last_name = random.choice(human_last_names)
            full_name = f"{first_name} {last_name}"
            data.append({"Full Name": full_name, "Name Type": "Human Name"})
        else:
            company_name = f"{random.choice(human_last_names)} {random.choice(company_keywords)}"
            data.append({"Full Name": company_name, "Name Type": "Company Name"})
    return data


### Interpretation of the Results:

#### Logistic Regression on Test Set:

**Accuracy on Test Set: 99.86%**

**Precision, Recall, and F1-Score for Each Class:**

1. **Company Name:**
   - **Precision:** 1.00
   - **Recall:** 1.00
   - **F1-Score:** 1.00
   - **Support:** 1,007,868

2. **Human Name:**
   - **Precision:** 0.98
   - **Recall:** 0.90
   - **F1-Score:** 0.94
   - **Support:** 11,878

**Overall Metrics:**
   - **Accuracy:** 99.86%
   - **Macro Average Precision:** 0.99
   - **Macro Average Recall:** 0.95
   - **Macro Average F1-Score:** 0.97
   - **Weighted Average Precision:** 1.00
   - **Weighted Average Recall:** 1.00
   - **Weighted Average F1-Score:** 1.00

**Confusion Matrix:**
```
[[1007662     206]
 [   1219   10659]]
```
- **True Negatives (Company Name correctly identified):** 1,007,662
- **False Positives (Human Name incorrectly identified as Company Name):** 206
- **False Negatives (Company Name incorrectly identified as Human Name):** 1,219
- **True Positives (Human Name correctly identified):** 10,659

**Analysis:**
- The model performs exceptionally well on the test set, with an accuracy of 99.86%.
- Precision for both classes is very high, indicating that the model is good at correctly identifying positive samples.
- Recall for "Human Name" is slightly lower at 0.90, suggesting some instances of human names are missed (i.e., classified as company names).
- F1-score is high for both classes, indicating a good balance between precision and recall.

#### Logistic Regression on Synthetic Data:

**Accuracy on Synthetic Data: 96.18%**

**Precision, Recall, and F1-Score for Each Class:**

1. **Company Name:**
   - **Precision:** 0.99
   - **Recall:** 0.93
   - **F1-Score:** 0.96
   - **Support:** 24,972

2. **Human Name:**
   - **Precision:** 0.94
   - **Recall:** 0.99
   - **F1-Score:** 0.96
   - **Support:** 25,028

**Overall Metrics:**
   - **Accuracy:** 96.18%
   - **Macro Average Precision:** 0.96
   - **Macro Average Recall:** 0.96
   - **Macro Average F1-Score:** 0.96
   - **Weighted Average Precision:** 0.96
   - **Weighted Average Recall:** 0.96
   - **Weighted Average F1-Score:** 0.96

**Confusion Matrix:**
```
[[23302  1670]
 [  242 24786]]
```
- **True Negatives (Company Name correctly identified):** 23,302
- **False Positives (Human Name incorrectly identified as Company Name):** 1,670
- **False Negatives (Company Name incorrectly identified as Human Name):** 242
- **True Positives (Human Name correctly identified):** 24,786

**Analysis:**
- The model performs well on the synthetic data with an accuracy of 96.18%.
- Precision and recall are high for both classes, indicating the model's effectiveness in distinguishing between human and company names.
- The slight decrease in performance compared to the test set might be due to differences between the real and synthetic data or the complexity of the synthetic data.

### Conclusion:
- The logistic regression model is highly effective at distinguishing between human and company names, particularly on the test set with an accuracy of 99.86%.
- On synthetic data, the model's accuracy drops slightly to 96.18%, but still maintains high precision, recall, and F1-scores.
- While the recall for "Human Name" on the test set is slightly lower, the overall performance is strong, making the model robust for practical use in classifying human names from company names.

## Models Creation

In [12]:
# Load and preprocess data
df = pd.read_excel(TRAINING_FILE_PATH)
# df.drop_duplicates(inplace=True)
df['Full Name'] = df['Full Name'].replace(',', '')
df['Full Name'] = df['Full Name'].replace('&', '')

# Separate the DataFrame into two parts
company_df = df[df['Name Type'] == 'Company Name']
human_df = df[df['Name Type'] == 'Human Name']

# Drop duplicates in the company names part
company_df = company_df.drop_duplicates(subset='Full Name')

# Concatenate the filtered DataFrames back together
df = pd.concat([company_df, human_df], ignore_index=True)

augmented_data = augment_company_names(company_keywords, num_samples=5_000_000)

# Convert augmented data to DataFrame
augmented_df = pd.DataFrame(augmented_data)

# Concatenate the original and augmented data
df = pd.concat([df, augmented_df], ignore_index=True)

synthetic_data = generate_synthetic_data(50000)

# Vectorization using an improved TF-IDF approach
vectorizer = TfidfVectorizer(
    ngram_range=(1, 3),  # Experiment with different n-gram ranges
    analyzer='char_wb',  # Character-level analyzer with word boundaries
    min_df=2,  # Ignore terms that appear in less than 3 documents
    max_df=0.65,  # Ignore terms that appear in more than 65% of the documents
    sublinear_tf=True  # Apply sublinear term frequency scaling
)
X = vectorizer.fit_transform(df['Full Name'])
y = df['Name Type']

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True, random_state=42)

# Define individual models with hyperparameter tuning
logreg = LogisticRegression(max_iter=10000, penalty='l2', C=0.35, solver='newton-cg')

# Train the ensemble model
logreg.fit(X_train, y_train)

# Evaluate the model
y_pred = logreg.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"logreg Accuracy on Test Set: {accuracy * 100:.2f}%")
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

# Generate and test with synthetic data
synthetic_data = generate_synthetic_data(50000)
synthetic_X = vectorizer.transform([row['Full Name'] for row in synthetic_data])
synthetic_y = [row['Name Type'] for row in synthetic_data]

# Evaluate on synthetic data
synthetic_pred = logreg.predict(synthetic_X)
synthetic_accuracy = accuracy_score(synthetic_y, synthetic_pred)
print(f"Accuracy on Synthetic Data: {synthetic_accuracy * 100:.2f}%")
print(classification_report(synthetic_y, synthetic_pred))
print(confusion_matrix(synthetic_y, synthetic_pred))

# Save the ensemble model and vectorizer
with open("logreg_classifier.pickle", "wb") as model_file:
    pickle.dump(logreg, model_file)

with open("logreg_vectorizer.pickle", "wb") as vec_file:
    pickle.dump(vectorizer, vec_file)

logreg Accuracy on Test Set: 99.86%
              precision    recall  f1-score   support

Company Name       1.00      1.00      1.00   1007868
  Human Name       0.98      0.90      0.94     11878

    accuracy                           1.00   1019746
   macro avg       0.99      0.95      0.97   1019746
weighted avg       1.00      1.00      1.00   1019746

[[1007662     206]
 [   1219   10659]]
Accuracy on Synthetic Data: 96.18%
              precision    recall  f1-score   support

Company Name       0.99      0.93      0.96     24972
  Human Name       0.94      0.99      0.96     25028

    accuracy                           0.96     50000
   macro avg       0.96      0.96      0.96     50000
weighted avg       0.96      0.96      0.96     50000

[[23302  1670]
 [  242 24786]]
