In [11]:
import pandas as pd
import numpy as np
import random
import matplotlib.pyplot as plt

from scipy.stats import randint, uniform

from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, KFold, cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import VotingClassifier, StackingClassifier, RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Set random seed for reproducibility
random.seed(42)
np.random.seed(42)


TRAINING_FILE_PATH = "Training Data Scrubbing Names 3.xlsx"

In [12]:

# List of U.S. states and their abbreviations
us_states = [
    'Alabama', 'AL', 'Alaska', 'AK', 'Arizona', 'AZ', 'Arkansas', 'AR', 'California', 'CA', 'Colorado', 'CO',
    'Connecticut', 'CT', 'Delaware', 'DE', 'Florida', 'FL', 'Georgia', 'GA', 'Hawaii', 'HI', 'Idaho', 'ID',
    'Illinois', 'IL', 'Indiana', 'IN', 'Iowa', 'IA', 'Kansas', 'KS', 'Kentucky', 'KY', 'Louisiana', 'LA',
    'Maine', 'ME', 'Maryland', 'MD', 'Massachusetts', 'MA', 'Michigan', 'MI', 'Minnesota', 'MN', 'Mississippi', 'MS',
    'Missouri', 'MO', 'Montana', 'MT', 'Nebraska', 'NE', 'Nevada', 'NV', 'New Hampshire', 'NH', 'New Jersey', 'NJ',
    'New Mexico', 'NM', 'New York', 'NY', 'North Carolina', 'NC', 'North Dakota', 'ND', 'Ohio', 'OH', 'Oklahoma', 'OK',
    'Oregon', 'OR', 'Pennsylvania', 'PA', 'Rhode Island', 'RI', 'South Carolina', 'SC', 'South Dakota', 'SD', 'Tennessee', 'TN',
    'Texas', 'TX', 'Utah', 'UT', 'Vermont', 'VT', 'Virginia', 'VA', 'Washington', 'WA', 'West Virginia', 'WV', 'Wisconsin', 'WI', 'Wyoming', 'WY'
]

# List of major U.S. cities
us_cities = [
    'New York', 'Los Angeles', 'Chicago', 'Houston', 'Phoenix', 'Philadelphia', 'San Antonio', 'San Diego', 'Dallas',
    'San Jose', 'Austin', 'Jacksonville', 'Fort Worth', 'Columbus', 'San Francisco', 'Charlotte', 'Indianapolis', 'Seattle',
    'Denver', 'Washington', 'Boston', 'El Paso', 'Nashville', 'Detroit', 'Oklahoma City', 'Portland', 'Las Vegas',
    'Memphis', 'Louisville', 'Baltimore', 'Milwaukee', 'Albuquerque', 'Tucson', 'Fresno', 'Sacramento', 'Kansas City',
    'Atlanta', 'Miami', 'Colorado Springs', 'Raleigh', 'Omaha', 'Long Beach', 'Virginia Beach', 'Oakland', 'Minneapolis',
    'Tulsa', 'Arlington', 'Tampa', 'New Orleans', 'Wichita'
]

# List of common company-related abbreviations
company_abbr = [
    'Co', 'Inc', 'LLC', 'Ltd', 'Corp', 'Pty', 'PLC', 'GmbH', 'S.A.', 'S.A.S.', 'AG', 'N.V.', 'B.V.', 'K.K.', 'S.R.L.', 'P.C.',
    'C.A.', 'd.o.o.', 'P.L.C.', 'S.p.A.', 'A.G.', 'a.s.', 'OÜ', 'Oy', 'ApS', 's.r.o.', 'S.A.B.', 'S.L.', 'AB'
]

# Combine all keywords
company_keywords = [
    'OF', 'SERVICES', 'COUNTY', 'DISTRIBUTOR', 'PRODUCTS', 'COUNTRY CLUB', 'PLLC', 'PRIVATE',
    'COMPANY', 'RETIREMENT', 'DEVELOPMENT', 'HOA', 'AUTHORITY', 'CONF', 'CONFERENCE',
    'CONSTRUCTION', 'AFFORDABLE', 'HOUSING', 'MID COAST', 'ESTATE', 'REHABILITATION', 'GARDENS',
    'WELLNESS', 'STATE', 'LAND', 'MEMBERSHIP', 'COOPERATIVE', 'CORPORATION',
    'INDEPENDENT', 'CHURCH', 'ENTERPRISES', 'ACCOUNTING', 'INVESTMENTS', 'OWNERS', 'AIRPORT',
    'MAINTENANCE', 'ASSOCIATION', 'REALTY', 'FOUNDATION', 'CONSULTANTS', 'ASSOCIATES',
    'CORPORATE', 'DISTRICT', 'LIMITED', 'INCORPORATED', 'PROPERTIES', 'INVESTMENT',
    'NORTHEAST', 'PLUMBING', 'HEATING', "T V A"
] + us_states + us_cities + company_abbr

# Function to generate variations of company keywords
def generate_variations(keyword):
    variations = [
        keyword,
        keyword.lower(),
        keyword.upper(),
        keyword.capitalize(),
        keyword.replace(" ", ""),
        keyword.replace(" ", "."),
        keyword.replace(" ", "_"),
        f"{keyword}.",
        f".{keyword}",
        f"{keyword}.com",
        f"_{keyword}_",
        f"{keyword} Co",
        f"{keyword} Inc.",
        f"{keyword} LLC",
        f"{keyword}, Inc.",
        f"{keyword}, LLC",
        f"{keyword}, Ltd.",
        f"{keyword} LTD"
    ]
    return variations

# Augment company names using variations of keywords
def augment_company_names(keywords, num_samples):
    augmented_data = []
    keyword_variations = [variation for keyword in keywords for variation in generate_variations(keyword)]
    for _ in range(num_samples):
        name_parts = random.sample(keyword_variations, k=3)  # Combine 3 random variations
        company_name = " ".join(name_parts)
        augmented_data.append({"Full Name": company_name, "Name Type": "Company Name"})
    return augmented_data

# Interpretation:
- High Precision for Human Names: The model is very precise in identifying human names, with few false positives.
- High Recall for Company Names: The model correctly identifies most company names, missing only a small percentage.
- Lower Recall for Human Names: The model is less effective at identifying all human names, as indicated by the lower recall.

- ***Overall, the model performs well with a balanced F1-score of 0.94 for both classes. However, it is slightly better at classifying company names (higher recall) than human names. Further tuning, such as optimizing the decision threshold or incorporating more features, could improve performance.***

- A systematic way to test the model

In [13]:
import random

# Generate synthetic data for testing
def generate_synthetic_data(num_records=500):
    human_first_names = ["John", "Jane", "Alex", "Emily", "Michael", "Sarah", "David", "Laura", "James", "Emma"]
    human_last_names = ["Smith", "Johnson", "Williams", "Brown", "Jones", "Garcia", "Miller", "Davis", "Rodriguez", "Martinez"]
    company_keywords = ['OF', 'SERVICES', 'COUNTY', 'DISTRIBUTOR', 'PRODUCTS', 'COUNTRY CLUB', 'PLLC', 'PRIVATE', 
                        'COMPANY', 'INC', 'RETIREMENT', 'DEVELOPMENT', 'HOA', 'AUTHORITY', 'CONF', 'CONFERENCE', 
                        'CONSTRUCTION', 'AFFORDABLE', 'HOUSING', 'MID COAST', 'ESTATE', 'REHABILITATION', 'GARDENS', 
                        'WELLNESS', 'LLC', 'STATE', 'LAND', 'MEMBERSHIP', 'COOPERATIVE', 'CORP', 'CORPORATION', 
                        'INDEPENDENT', 'CHURCH', 'ENTERPRISES', 'ACCOUNTING', 'INVESTMENTS', 'OWNERS', 'AIRPORT', 
                        'MAINTENANCE', 'ASSOCIATION', 'REALTY', 'FOUNDATION', 'CONSULTANTS', 'ASSOCIATES', 
                        'CORPORATE', 'DISTRICT', 'LTD', 'LIMITED', 'INCORPORATED', 'PROPERTIES', 'INVESTMENT', 
                        'NORTHEAST', 'PLUMBING', 'HEATING', "T V A"]

    data = []
    for _ in range(num_records):
        if random.random() < 0.5:
            first_name = random.choice(human_first_names)
            last_name = random.choice(human_last_names)
            full_name = f"{first_name} {last_name}"
            data.append({"Full Name": full_name, "Name Type": "Human Name"})
        else:
            company_name = f"{random.choice(human_last_names)} {random.choice(company_keywords)}"
            data.append({"Full Name": company_name, "Name Type": "Company Name"})
    return data


- Models Comparison to see which models are best performing

In [14]:
# Load and preprocess data
df = pd.read_excel(TRAINING_FILE_PATH)
df.drop_duplicates(inplace=True)
df['Full Name'] = df['Full Name'].replace(',', '')
df['Full Name'] = df['Full Name'].replace('&', '')


augmented_data = augment_company_names(company_keywords, num_samples=20000)

# Convert augmented data to DataFrame
augmented_df = pd.DataFrame(augmented_data)

# Concatenate the original and augmented data
df = pd.concat([df, augmented_df], ignore_index=True)

synthetic_data = generate_synthetic_data(10000)

# Vectorization using an improved TF-IDF approach
vectorizer = TfidfVectorizer(
    ngram_range=(1, 3),  # Experiment with different n-gram ranges
    analyzer='char_wb',  # Character-level analyzer with word boundaries
    min_df=3,  # Ignore terms that appear in less than 3 documents
    max_df=0.7,  # Ignore terms that appear in more than 70% of the documents
    sublinear_tf=True  # Apply sublinear term frequency scaling
)

X = vectorizer.fit_transform(df['Full Name'])
y = df['Name Type']

# Prepare synthetic data for evaluation
synthetic_X = vectorizer.transform([row['Full Name'] for row in synthetic_data])
synthetic_y = [row['Name Type'] for row in synthetic_data]

# Models to evaluate
models = [
    ('KNN', KNeighborsClassifier()),
    ('SVC', SVC()),
    ('DecisionTree', DecisionTreeClassifier()),
    ('RandomForest', RandomForestClassifier()),
    ('AdaBoost', AdaBoostClassifier()),
    ('GradientBoost', GradientBoostingClassifier()),
    ('GaussianNB', GaussianNB()),
    ('LDA', LinearDiscriminantAnalysis()),
    ('LogisticRegression', LogisticRegression(max_iter=5000)),
    ('MLP', MLPClassifier(max_iter=500))
]

# Evaluate each model with KFold cross-validation
results = []
names = []
scoring = 'accuracy'

for name, model in models:
    kfold = KFold(n_splits=5, random_state=0, shuffle=True)

    # Convert to dense array for models that require it
    if name in ['GaussianNB', 'LDA']:
        X_dense = X.toarray()
        synthetic_X_dense = synthetic_X.toarray()
        cv_results = cross_val_score(model, X_dense, y, cv=kfold, scoring=scoring)
    else:
        cv_results = cross_val_score(model, X, y, cv=kfold, scoring=scoring)

    results.append(cv_results)
    names.append(name)
    print(f"{name}: CV Accuracy: {np.round(cv_results.mean(), 3)} (+/- {np.round(cv_results.std(), 3)})")

    # Train model on the entire dataset and evaluate on synthetic data
    model.fit(X_dense if name in ['GaussianNB', 'LDA'] else X, y)
    synthetic_pred = model.predict(synthetic_X_dense if name in ['GaussianNB', 'LDA'] else synthetic_X)
    synthetic_accuracy = accuracy_score(synthetic_y, synthetic_pred)
    print(f"Accuracy on Synthetic Data: {synthetic_accuracy * 100:.2f}%")
    print(classification_report(synthetic_y, synthetic_pred))
    print(confusion_matrix(synthetic_y, synthetic_pred))
    print("\n" + "-"*40 + "\n")

# Boxplot algorithm comparison
fig = plt.figure(figsize=(15, 10))
fig.suptitle('Algorithm Comparison')
ax = fig.add_subplot(111)
plt.boxplot(results)
ax.set_xticklabels(names)
plt.savefig('Models_Comparison.png', format='png')
plt.show()


KNN: CV Accuracy: 0.98 (+/- 0.001)
Accuracy on Synthetic Data: 88.35%
              precision    recall  f1-score   support

Company Name       0.85      0.93      0.89      5029
  Human Name       0.92      0.84      0.88      4971

    accuracy                           0.88     10000
   macro avg       0.89      0.88      0.88     10000
weighted avg       0.89      0.88      0.88     10000

[[4669  360]
 [ 805 4166]]

----------------------------------------



- Using Gridsearch to get the best parameters for each model

In [None]:

param_grid_svc = {
    'C': [0.1, 0.5, 1, 5],
    'kernel': ['linear', 'rbf'],
    'gamma': ['scale', 'auto']
}

grid_svc = GridSearchCV(SVC(), param_grid_svc, cv=5, scoring='accuracy')
grid_svc.fit(X_train, y_train)
best_svc = grid_svc.best_estimator_
print("Best parameters for SVC:", grid_svc.best_params_)


Best parameters for SVC: {'C': 5, 'gamma': 'scale', 'kernel': 'rbf'}


In [None]:

param_grid_rf = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False]
}

grid_rf = GridSearchCV(RandomForestClassifier(), param_grid_rf, cv=5, scoring='accuracy')
grid_rf.fit(X_train, y_train)
best_rf = grid_rf.best_estimator_
print("Best parameters for Random Forest:", grid_rf.best_params_)


Best parameters for Random Forest: {'bootstrap': False, 'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 200}


- Enhancing the model with States, Cities and company abbreviations

In [None]:


# Load and preprocess data
df = pd.read_excel(TRAINING_FILE_PATH)
df.drop_duplicates(inplace=True)
df['Full Name'] = df['Full Name'].replace(',', '')
df['Full Name'] = df['Full Name'].replace('&', '')

augmented_data = augment_company_names(company_keywords, num_samples=20000)

# Convert augmented data to DataFrame
augmented_df = pd.DataFrame(augmented_data)

# Concatenate the original and augmented data
df = pd.concat([df, augmented_df], ignore_index=True)

# Vectorization using an improved TF-IDF approach
vectorizer = TfidfVectorizer(
    ngram_range=(1, 3),  # Experiment with different n-gram ranges
    analyzer='char_wb',  # Character-level analyzer with word boundaries
    min_df=3,  # Ignore terms that appear in less than 3 documents
    max_df=0.7,  # Ignore terms that appear in more than 70% of the documents
    sublinear_tf=True  # Apply sublinear term frequency scaling
)
X = vectorizer.fit_transform(df['Full Name'])
y = df['Name Type']

# Shuffle and split the data into training and test sets
indices = np.arange(X.shape[0])
np.random.shuffle(indices)
X, y = X[indices], y.iloc[indices]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True, random_state=42)

# Define individual models with hyperparameter tuning
logreg = LogisticRegression(max_iter=5000, penalty='l2', C=0.35, solver='newton-cg')
mlp = MLPClassifier(hidden_layer_sizes=(500,), max_iter=2000, alpha=0.001, solver='adam')
svc = SVC(C=5, gamma='scale', kernel='rbf', probability=True)
rf = RandomForestClassifier(bootstrap=False, max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=200)
gb = GradientBoostingClassifier(n_estimators=300, random_state=42)

# Create an ensemble model using StackingClassifier for advanced blending
ensemble_model = StackingClassifier(
    estimators=[
        ('logreg', logreg),
        ('mlp', mlp),
        ('svc', svc),
        ('rf', rf)
    ],
    final_estimator=gb,
    cv=5
)

# Train the ensemble model
ensemble_model.fit(X_train, y_train)

# Evaluate the model
y_pred = ensemble_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Ensemble Model Accuracy on Test Set: {accuracy * 100:.2f}%")
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

# Generate and test with synthetic data
synthetic_data = generate_synthetic_data(10000)
synthetic_X = vectorizer.transform([row['Full Name'] for row in synthetic_data])
synthetic_y = [row['Name Type'] for row in synthetic_data]

# Evaluate on synthetic data
synthetic_pred = ensemble_model.predict(synthetic_X)
synthetic_accuracy = accuracy_score(synthetic_y, synthetic_pred)
print(f"Accuracy on Synthetic Data: {synthetic_accuracy * 100:.2f}%")
print(classification_report(synthetic_y, synthetic_pred))
print(confusion_matrix(synthetic_y, synthetic_pred))

# Save the ensemble model and vectorizer
with open("enhanced_ensemble_name_classifier.pickle", "wb") as model_file:
    pickle.dump(ensemble_model, model_file)

with open("vectorizer.pickle", "wb") as vec_file:
    pickle.dump(vectorizer, vec_file)


Model Accuracy on Test Set: 98.12%
              precision    recall  f1-score   support

Company Name       0.99      0.97      0.98      8945
  Human Name       0.97      0.99      0.98      7295

    accuracy                           0.98     16240
   macro avg       0.98      0.98      0.98     16240
weighted avg       0.98      0.98      0.98     16240

[[8689  256]
 [  49 7246]]
Accuracy on Synthetic Data: 85.60%
              precision    recall  f1-score   support

Company Name       1.00      0.71      0.83      4893
  Human Name       0.78      1.00      0.88      5107

    accuracy                           0.86     10000
   macro avg       0.89      0.85      0.85     10000
weighted avg       0.89      0.86      0.85     10000

[[3453 1440]
 [   0 5107]]


- Saving the model


In [None]:
# Save the vectorizer
with open("tfidf_vectorizer.pickle", "wb") as f:
    pickle.dump(vectorizer, f)

# Save the model
with open("ensemble_model.pickle", "wb") as f:
    pickle.dump(model, f)

- Loading the model

In [None]:
# load the model 
model = pickle.load(open('name_classifier_logreg1.pickle', 'rb'))


# Generate and test with synthetic data
synthetic_data = generate_synthetic_data(10000)
synthetic_X = vectorizer.transform([row['Full Name'] for row in synthetic_data])
synthetic_y = [row['Name Type'] for row in synthetic_data]
# Evaluate on synthetic data
synthetic_pred = model.predict(synthetic_X)
synthetic_accuracy = accuracy_score(synthetic_y, synthetic_pred)
print(f"Accuracy on Synthetic Data: {synthetic_accuracy * 100:.2f}%")
print(classification_report(synthetic_y, synthetic_pred))
print(confusion_matrix(synthetic_y, synthetic_pred))

# ----------------------------------------------------------------------------------------------------------------------------------------

- Different ways of to enhnce the bagging strategy

In [None]:


# Load and preprocess data
df = pd.read_excel(TRAINING_FILE_PATH)
df.drop_duplicates(inplace=True)
df['Full Name'] = df['Full Name'].replace(',', '')
df['Full Name'] = df['Full Name'].replace('&', '')

augmented_data = augment_company_names(company_keywords, num_samples=20000)

# Convert augmented data to DataFrame
augmented_df = pd.DataFrame(augmented_data)

# Concatenate the original and augmented data
df = pd.concat([df, augmented_df], ignore_index=True)

# Vectoxrization using TF-IDF
vectorxizer = TfidfVectorizer(ngram_range=(1, 3), analyzer='char')
X = vectorizer.fit_transform(df['Full Name'])
y = df['Name Type']

# Shuffle and split the data into training and test sets
indices = np.arange(X.shape[0])
np.random.shuffle(indices)
X, y = X[indices], y.iloc[indices]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True, random_state=42)

# Define individual models
logreg = LogisticRegression(max_iter=5000, penalty='l2', C=0.35, solver='newton-cg')
mlp = MLPClassifier(hidden_layer_sizes=(100,), max_iter=500, alpha=0.001, solver='adam', random_state=42)
svc = SVC(probability=True, kernel='linear', C=1.0, random_state=42)
rf = RandomForestClassifier(n_estimators=100, random_state=42)

# Create an ensemble model
ensemble_model = VotingClassifier(
    estimators=[
        ('logreg', logreg),
        ('mlp', mlp),
        ('svc', svc),
        ('rf', rf)
    ],
    voting='soft'  # Use 'soft' voting to consider probabilities
)

# Train the ensemble model
ensemble_model.fit(X_train, y_train)

# Evaluate the model
y_pred = ensemble_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Ensemble Model Accuracy on Test Set: {accuracy * 100:.2f}%")
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

# Generate and test with synthetic data
synthetic_data = generate_synthetic_data(10000)
synthetic_X = vectorizer.transform([row['Full Name'] for row in synthetic_data])
synthetic_y = [row['Name Type'] for row in synthetic_data]

# Evaluate on synthetic data
synthetic_pred = ensemble_model.predict(synthetic_X)
synthetic_accuracy = accuracy_score(synthetic_y, synthetic_pred)
print(f"Accuracy on Synthetic Data: {synthetic_accuracy * 100:.2f}%")
print(classification_report(synthetic_y, synthetic_pred))
print(confusion_matrix(synthetic_y, synthetic_pred))

# Save the ensemble model and vectorizer
with open("ensemble_name_classifier.pickle", "wb") as model_file:
    pickle.dump(ensemble_model, model_file)

with open("vectorizer.pickle", "wb") as vec_file:
    pickle.dump(vectorizer, vec_file)


Ensemble Model Accuracy on Test Set: 99.05%
              precision    recall  f1-score   support

Company Name       1.00      0.99      0.99      8958
  Human Name       0.98      1.00      0.99      7282

    accuracy                           0.99     16240
   macro avg       0.99      0.99      0.99     16240
weighted avg       0.99      0.99      0.99     16240

[[8824  134]
 [  21 7261]]
Accuracy on Synthetic Data: 84.97%
              precision    recall  f1-score   support

Company Name       1.00      0.69      0.82      4893
  Human Name       0.77      1.00      0.87      5107

    accuracy                           0.85     10000
   macro avg       0.89      0.85      0.85     10000
weighted avg       0.88      0.85      0.85     10000

[[3390 1503]
 [   0 5107]]


In [None]:

# Load and preprocess data
df = pd.read_excel(TRAINING_FILE_PATH)
df.drop_duplicates(inplace=True)
df['Full Name'] = df['Full Name'].replace(',', '')
df['Full Name'] = df['Full Name'].replace('&', '')

augmented_data = augment_company_names(company_keywords, num_samples=20000)

# Convert augmented data to DataFrame
augmented_df = pd.DataFrame(augmented_data)

# Concatenate the original and augmented data
df = pd.concat([df, augmented_df], ignore_index=True)

# Vectorization using TF-IDF
vectorizer = TfidfVectorizer(ngram_range=(1, 3), analyzer='char')
X = vectorizer.fit_transform(df['Full Name'])
y = df['Name Type']

# Shuffle and split the data into training and test sets
indices = np.arange(X.shape[0])
np.random.shuffle(indices)
X, y = X[indices], y.iloc[indices]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True, random_state=42)

# Define individual models with hyperparameter tuning
logreg = LogisticRegression(max_iter=5000, penalty='l2', C=0.35, solver='newton-cg')
mlp = MLPClassifier(hidden_layer_sizes=(500,), max_iter=2000, alpha=0.001, solver='adam')
svc = SVC(C=5, gamma='scale', kernel='rbf', probability=True)
rf = RandomForestClassifier(bootstrap=False, max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=200)
gb = GradientBoostingClassifier(n_estimators=300, random_state=42)

# Create an ensemble model using StackingClassifier for advanced blending
ensemble_model = StackingClassifier(
    estimators=[
        ('logreg', logreg),
        ('mlp', mlp),
        ('svc', svc),
        ('rf', rf)
    ],
    final_estimator=gb,
    cv=5
)

# Train the ensemble model
ensemble_model.fit(X_train, y_train)

# Evaluate the model
y_pred = ensemble_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Ensemble Model Accuracy on Test Set: {accuracy * 100:.2f}%")
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

# Generate and test with synthetic data
synthetic_data = generate_synthetic_data(10000)
synthetic_X = vectorizer.transform([row['Full Name'] for row in synthetic_data])
synthetic_y = [row['Name Type'] for row in synthetic_data]

# Evaluate on synthetic data
synthetic_pred = ensemble_model.predict(synthetic_X)
synthetic_accuracy = accuracy_score(synthetic_y, synthetic_pred)
print(f"Accuracy on Synthetic Data: {synthetic_accuracy * 100:.2f}%")
print(classification_report(synthetic_y, synthetic_pred))
print(confusion_matrix(synthetic_y, synthetic_pred))

# Save the ensemble model and vectorizer
with open("enhanced_ensemble_name_classifier.pickle", "wb") as model_file:
    pickle.dump(ensemble_model, model_file)

with open("vectorizer.pickle", "wb") as vec_file:
    pickle.dump(vectorizer, vec_file)


