In [2]:
import torch
import torch.nn as nn
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the data from the CSV file
data = pd.read_csv("data/1999.csv")

# Preprocess the data
name_lengths = data["Name"].str.len()
max_length = name_lengths.max()

# Convert names to character-level sequences
char_to_idx = {char: idx for idx, char in enumerate(set(''.join(data["Name"])))}
idx_to_char = {idx: char for char, idx in char_to_idx.items()}
X = torch.zeros((len(data), max_length), dtype=torch.long)
for i, name in enumerate(data["Name"]):
    for j, char in enumerate(name):
        X[i, j] = char_to_idx[char]




In [3]:
# Convert gender to binary labels
gender_to_idx = {"m": 0, "f": 1}
y = torch.tensor([gender_to_idx[gender] for gender in data["Gender"]])

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the model
class GenderPredictor(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(GenderPredictor, self).__init__()
        self.embedding = nn.Embedding(input_size, hidden_size)
        self.lstm = nn.LSTM(hidden_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        x = self.embedding(x)
        _, (hidden, _) = self.lstm(x)
        x = self.fc(hidden.squeeze(0))
        return x

# Set the hyperparameters
input_size = len(char_to_idx)
hidden_size = 128
output_size = 2
learning_rate = 0.001
num_epochs = 3
batch_size = 32

# Initialize the model
model = GenderPredictor(input_size, hidden_size, output_size)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

In [4]:
# Train the model
for epoch in range(num_epochs):
    for i in range(0, len(X_train), batch_size):
        batch_X = X_train[i:i+batch_size]
        batch_y = y_train[i:i+batch_size]

        optimizer.zero_grad()
        outputs = model(batch_X)
        loss = criterion(outputs, batch_y)
        loss.backward()
        optimizer.step()

    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}")

# Evaluate the model
with torch.no_grad():
    outputs = model(X_test)
    _, predicted = torch.max(outputs.data, 1)
    accuracy = (predicted == y_test).sum().item() / len(y_test)
    print(f"Test Accuracy: {accuracy:.4f}")

Epoch [1/3], Loss: 0.0070
Epoch [2/3], Loss: 0.0060
Epoch [3/3], Loss: 0.0062
Test Accuracy: 0.9913


In [16]:
import torch
import pandas as pd

# Function to preprocess a single name
def preprocess_name(name):
    # Initialize tensor with zeros
    name_tensor = torch.zeros(1, len(name), dtype=torch.long)

    # Iterate through each character and assign its index to the tensor
    for j, char in enumerate(name):
        if char in char_to_idx:
            name_tensor[0, j] = char_to_idx[char]
    return name_tensor

# Load the file with names and genders
# test_data = pd.read_csv("data/1999.csv")  # Replace "test_data.csv" with your file path
test_data = pd.read_csv("data/2014.csv")  # Replace "test_data.csv" with your file path
# test_data = pd.read_csv("generated/2014_output.csv")  # Replace "test_data.csv" with your file path
# test_data = pd.read_csv("generated/1999_output.csv")  # Replace "test_data.csv" with your file path
# test_data = pd.read_csv("samples/usa.csv")  # Replace "test_data.csv" with your file path



# Set the model to evaluation mode
model.eval()

# Initialize variables to keep track of predictions
total_predictions = 0
correct_predictions = 0
accuracy = 1

f_predict = 0
f_correct = 0
f_accuracy = 1



# Iterate over each row in the test data
for _, row in test_data.iterrows():
    name = row["Name"]
    actual_gender = row["Gender"]

    # Preprocess the name
    name_tensor = preprocess_name(name)

    # Make the prediction
    with torch.no_grad():
        output = model(name_tensor)
        _, predicted = torch.max(output.data, 1)
        predicted_gender = "m" if predicted.item() == 0 else "f"




    if total_predictions == 1000000:
         break
    
    if total_predictions % 1000 == 0:
        print(total_predictions, f_accuracy, accuracy)

    # Update prediction counts
    total_predictions += 1
    if predicted_gender == actual_gender:
        correct_predictions += 1
    
    if actual_gender == "f":
        f_predict += 1
        if predicted_gender == "f":
            f_correct += 1

    accuracy = correct_predictions / total_predictions
    if f_predict:
        f_accuracy = f_correct / f_predict  


m_predict = total_predictions - f_predict
m_correct = correct_predictions - f_correct

# Calculate and print the accuracy
accuracy = correct_predictions / total_predictions
m_accuracy = m_correct / m_predict
f_accuracy = f_correct / f_predict

print(f"\nAccuracy: {accuracy:.4f}")
print(f"\nM_Accuracy: {m_accuracy:.4f}")
print(f"\nF_Accuracy: {f_accuracy:.4f}")


0 1 1
1000 0.97 0.97
2000 0.982 0.982
3000 0.986 0.986
4000 0.97275 0.97275
5000 0.9782 0.9782
6000 0.9811666666666666 0.9811666666666666
7000 0.9838571428571429 0.9838571428571429
8000 0.985875 0.985875
9000 0.9865555555555555 0.9865555555555555
10000 0.9879 0.9879
11000 0.989 0.989
12000 0.9899166666666667 0.9899166666666667
13000 0.9835384615384616 0.9835384615384616
14000 0.9811428571428571 0.9811428571428571
15000 0.9808 0.9808
16000 0.9766875 0.9766875
17000 0.9640588235294117 0.9640588235294117
18000 0.9659444444444445 0.9659444444444445
19000 0.9673157894736842 0.9673157894736842
20000 0.9686 0.9686
21000 0.9700952380952381 0.9700952380952381
22000 0.9714545454545455 0.9714545454545455
23000 0.9726086956521739 0.9726086956521739
24000 0.9735833333333334 0.9735833333333334
25000 0.97348 0.97348
26000 0.9734615384615385 0.9734615384615385
27000 0.9744444444444444 0.9744444444444444
28000 0.9687142857142857 0.9687142857142857
29000 0.9697931034482759 0.9697931034482759
30000 0.967