## Importing the Libraries

In [1]:
import pandas as pd
import random
from nltk import NaiveBayesClassifier
from nltk.classify import accuracy
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import spacy
import numpy as np

## Reading the dataset

In [2]:
# Load the name/gender dataset from a local CSV file.
# NOTE(review): this is an absolute, machine-specific Windows path, so the
# notebook only runs where the file exists at exactly this location.
df = pd.read_csv(r"C:\Users\admin\Downloads\Gender_Data.csv")

# Preview the frame to confirm the Name and Gender columns used below are present
df.head()  # rich display of the first five rows

Unnamed: 0,Name,Gender
0,Aaban,0
1,Aabharan,0
2,Aabhas,0
3,Aabhat,0
4,Aabheer,0


In [3]:
# (rows, columns) of the loaded frame
df.shape

(53982, 2)

## Data Preprocessing

In [4]:
# Pair every name with its gender label as (name, gender) tuples
names_list = [
    (name, gender)
    for name, gender in zip(df['Name'], df['Gender'])
]

In [5]:
# Feature extraction functions
def gender_features_last_letter(name):
    """Build the NLTK feature dict holding a name's lower-cased final character.

    Args:
        name: The name to featurize (a string).

    Returns:
        A dict ``{'last_letter': <char>}``. The value is ``''`` when ``name``
        is empty.
    """
    # Slice rather than index: name[-1] raises IndexError on an empty string
    # (e.g. the user presses Enter at the prompt), while name[-1:] yields ''.
    return {'last_letter': name[-1:].lower()}

def gender_features_last_two_letters(name):
    """Build the NLTK feature dict holding a name's lower-cased two-character suffix.

    Args:
        name: The name to featurize (a string).

    Returns:
        A dict ``{'last_two_letters': <suffix>}``. Names shorter than two
        characters contribute whatever characters they have.
    """
    suffix = name[-2:]
    return {'last_two_letters': suffix.lower()}

# Number of shuffled examples held out for evaluation, and a fixed seed so the
# split (and therefore the reported accuracy) is the same on every fresh run.
TEST_SIZE = 2000
SPLIT_SEED = 42
split_rng = random.Random(SPLIT_SEED)  # private RNG: leaves the global random state alone

# Prepare features and labels for variation 1
featuresets1 = [(gender_features_last_letter(name), gender) for (name, gender) in names_list]
split_rng.shuffle(featuresets1)
# The first TEST_SIZE examples are held out and everything after them trains
# the model. The training slice previously started at index 8000, which
# silently discarded examples 2000-7999.
train_set1, test_set1 = featuresets1[TEST_SIZE:], featuresets1[:TEST_SIZE]

# Prepare features and labels for variation 2
featuresets2 = [(gender_features_last_two_letters(name), gender) for (name, gender) in names_list]
split_rng.shuffle(featuresets2)
train_set2, test_set2 = featuresets2[TEST_SIZE:], featuresets2[:TEST_SIZE]

## Training the Model (Naive Bayes Classifier) and Evaluating

In [6]:
# Variation 1: fit Naive Bayes on the last-letter features, then score it on test_set1
classifier1 = NaiveBayesClassifier.train(train_set1)
accuracy1 = accuracy(classifier1, test_set1)
print("Naive Bayes Accuracy (Last Letter):", np.round(accuracy1, 2))

# Variation 2: same procedure using the last-two-letters features
classifier2 = NaiveBayesClassifier.train(train_set2)
accuracy2 = accuracy(classifier2, test_set2)
print("Naive Bayes Accuracy (Last Two Letters):", np.round(accuracy2, 2))

Naive Bayes Accuracy (Last Letter): 0.89
Naive Bayes Accuracy (Last Two Letters): 0.91


In [7]:
# Assuming classifier1 and classifier2 return 0 for Male and 1 for Female

def gender_name_map(prediction):
    """Translate a predicted label into display text.

    Args:
        prediction: The label returned by a classifier.

    Returns:
        "Male" when ``prediction`` equals 0, otherwise "Female".
    """
    if prediction == 0:
        return "Male"
    return "Female"

# Strip surrounding whitespace: without it a trailing space would be used as
# the "last letter" feature instead of the name's real final character.
custom_name = input("Enter a name: ").strip()

if custom_name:
    # Get the predicted results
    prediction_last_letter = classifier1.classify(gender_features_last_letter(custom_name))
    prediction_last_two_letters = classifier2.classify(gender_features_last_two_letters(custom_name))

    # Print the mapped results
    print("Predicted gender (Last Letter):", gender_name_map(prediction_last_letter))
    print("Predicted gender (Last Two Letters):", gender_name_map(prediction_last_two_letters))
else:
    # There is no character to featurize, so report that instead of predicting
    print("No name entered; skipping prediction.")

Enter a name: Upama
Predicted gender (Last Letter): Female
Predicted gender (Last Two Letters): Female


## Training the Model (Sklearn Classifier) and Evaluating

In [8]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import spacy

# Assuming names_list and df are already defined, else they need to be defined
# df['Name'] contains names, df['Gender'] contains gender labels, names_list contains name and gender tuples

# Feature extraction functions for last letter and last two letters
def last_letter(name):
    """Return the lower-cased final character of ``name``.

    Args:
        name: The name to featurize (a string).

    Returns:
        A one-character string, or ``''`` when ``name`` is empty.
    """
    # Slice rather than index so an empty name yields '' instead of raising
    # IndexError.
    return name[-1:].lower()

def last_two_letters(name):
    """Return the lower-cased two-character suffix of ``name``.

    Args:
        name: The name to featurize (a string).

    Returns:
        The last two characters in lower case; shorter names return whatever
        characters they have.
    """
    suffix = name[-2:]
    return suffix.lower()

# Raw character features and labels for every (name, gender) pair
all_last_letter = [last_letter(name) for name, _ in names_list]
all_last_two_letters = [last_two_letters(name) for name, _ in names_list]
all_labels = [gender for _, gender in names_list]

# Hold out 20% of the rows for evaluation. The "test" features used to be
# built from df['Name'] -- the very rows the models were trained on -- so the
# reported figures were training accuracy. A single train_test_split call
# applies the same row selection to all three lists; stratify keeps the
# class balance equal in both partitions.
(X_train1, X_test1,
 X_train2, X_test2,
 y_train, y_test) = train_test_split(
    all_last_letter,
    all_last_two_letters,
    all_labels,
    test_size=0.2,
    random_state=42,
    stratify=all_labels,
)

# One encoder per feature variation so each fitted encoder stays usable
# (refitting a shared encoder discarded the last-letter vocabulary).
# handle_unknown='ignore' encodes suffixes absent from the training split as
# all-zero rows instead of raising during transform.
encoder1 = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
encoder2 = OneHotEncoder(sparse_output=False, handle_unknown='ignore')

# Transforming last letters (fit on the training split only)
X_train1_encoded = encoder1.fit_transform(pd.DataFrame(X_train1))
X_test1_encoded = encoder1.transform(pd.DataFrame(X_test1))

# Transforming last two letters (fit on the training split only)
X_train2_encoded = encoder2.fit_transform(pd.DataFrame(X_train2))
X_test2_encoded = encoder2.transform(pd.DataFrame(X_test2))

# Backward-compatible alias: this cell previously left `encoder` fitted on
# the last-two-letters features.
encoder = encoder2

In [9]:
# Seed for the random forests so their accuracy is reproducible between runs
RF_SEED = 42


def _fit_and_report(model, X_train, X_test, label, train_labels, test_labels):
    """Fit ``model``, predict on ``X_test`` and print the rounded accuracy.

    Args:
        model: An unfitted estimator exposing ``fit`` and ``predict``.
        X_train: Encoded training features.
        X_test: Encoded evaluation features.
        label: Text printed in front of the accuracy value.
        train_labels: Labels matching ``X_train``.
        test_labels: Labels matching ``X_test``.

    Returns:
        The predictions made for ``X_test``.
    """
    model.fit(X_train, train_labels)
    predictions = model.predict(X_test)
    print(label, np.round(accuracy_score(test_labels, predictions), 2))
    return predictions


# Logistic Regression for Last Letter
log_reg1 = LogisticRegression(max_iter=200)
y_pred_log1 = _fit_and_report(
    log_reg1, X_train1_encoded, X_test1_encoded,
    "Logistic Regression Accuracy (Last Letter):", y_train, y_test,
)

# Logistic Regression for Last Two Letters
log_reg2 = LogisticRegression(max_iter=200)
y_pred_log2 = _fit_and_report(
    log_reg2, X_train2_encoded, X_test2_encoded,
    "Logistic Regression Accuracy (Last Two Letters):", y_train, y_test,
)

# Random Forest for Last Letter
rf_clf1 = RandomForestClassifier(random_state=RF_SEED)
y_pred_rf1 = _fit_and_report(
    rf_clf1, X_train1_encoded, X_test1_encoded,
    "Random Forest Accuracy (Last Letter):", y_train, y_test,
)

# Random Forest for Last Two Letters
rf_clf2 = RandomForestClassifier(random_state=RF_SEED)
y_pred_rf2 = _fit_and_report(
    rf_clf2, X_train2_encoded, X_test2_encoded,
    "Random Forest Accuracy (Last Two Letters):", y_train, y_test,
)

Logistic Regression Accuracy (Last Letter): 0.9
Logistic Regression Accuracy (Last Two Letters): 0.91
Random Forest Accuracy (Last Letter): 0.9
Random Forest Accuracy (Last Two Letters): 0.91


## Spacy Model

In [10]:
# Load spaCy's "en_core_web_sm" pipeline. In this notebook the resulting `nlp`
# object is only called from spacy_predict, which reads the text of the first
# token and applies a last-letter rule to it.
nlp = spacy.load("en_core_web_sm")

def spacy_predict(name):
    """Guess a gender from the final character of the first spaCy token.

    Args:
        name: The text to run through the ``nlp`` pipeline.

    Returns:
        "Female" when the first token ends in 'a', 'e' or 'i'
        (case-insensitive), otherwise "Male".
    """
    first_token = nlp(name)[0]
    final_char = first_token.text[-1].lower()
    # Simple heuristic based on last letter
    return "Female" if final_char in ('a', 'e', 'i') else "Male"

# Custom input prediction with SpaCy.
# Strip surrounding whitespace and skip empty input: spacy_predict indexes the
# first token and its last character, which raises IndexError when the text
# produces no tokens.
custom_name = input("Enter a name: ").strip()

if custom_name:
    spacy_gender = spacy_predict(custom_name)
    print("Predicted gender (SpaCy):", spacy_gender)
else:
    print("No name entered; skipping prediction.")

Enter a name: Ayush
Predicted gender (SpaCy): Male


Summary of Observations:

Naive Bayes Classifier: 
Naive Bayes Accuracy (Last Letter): 0.89; Naive Bayes Accuracy (Last Two Letters): 0.91

SK Learn Classifier:
Logistic Regression Accuracy (Last Letter): 0.9; Logistic Regression Accuracy (Last Two Letters): 0.91
Random Forest Accuracy (Last Letter): 0.9; Random Forest Accuracy (Last Two Letters): 0.91

We can see that considering the last two letters of a name provides better accuracy than considering only the last letter.