In [None]:
# @title loading dataset
!git clone https://huggingface.co/datasets/sameernotes/indian-gender-identification

In [None]:
# @title model training and sample prediction
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt
from tqdm import tqdm

# Set random seed for reproducibility
torch.manual_seed(42)
np.random.seed(42)

# Load the datasets and print a few examples to verify content
female_df = pd.read_csv('/content/indian-gender-identification/Indian-Female-Names.csv')
male_df = pd.read_csv('/content/indian-gender-identification/Indian-Male-Names.csv')

print("Sample female names:")
print(female_df.head())
print("\nSample male names:")
print(male_df.head())

# Combine the datasets
df = pd.concat([female_df, male_df], ignore_index=True)

# Check the data
print(f"\nTotal samples: {len(df)}")
print(f"Gender distribution:\n{df['gender'].value_counts()}")

# Clean the data: Convert all names to strings and fill NaN values
df['name'] = df['name'].fillna('unknown').astype(str)

# Verify class balance
print(f"\nPercentage of female names: {df[df['gender'] == 'f'].shape[0] / df.shape[0] * 100:.2f}%")
print(f"Percentage of male names: {df[df['gender'] == 'm'].shape[0] / df.shape[0] * 100:.2f}%")

# Convert gender to numerical labels
gender_map = {'f': 0, 'm': 1}
df['gender_label'] = df['gender'].map(gender_map)

# Create character mapping (index 0 is reserved for padding, real characters start at 1)
all_chars = set(''.join(df['name'].str.lower()))
char_to_idx = {char: idx+1 for idx, char in enumerate(sorted(all_chars))}
char_to_idx['<PAD>'] = 0  # Add padding token

# Display character set
print(f"\nTotal unique characters: {len(all_chars)}")
print(f"Characters: {''.join(sorted(all_chars))}")

# Get maximum name length to standardize input size.
# Cast to a built-in int: pandas returns a numpy.int64 here, and that value is
# later written into the saved checkpoint. torch.load's default safe unpickler
# (weights_only=True, the default since PyTorch 2.6) rejects numpy scalars.
max_name_length = int(df['name'].str.len().max())
print(f"Maximum name length: {max_name_length}")

# Character-level tokenization and padding
def tokenize_name(name, max_length=max_name_length):
    """Encode a name as a fixed-length list of character ids.

    The name is stringified and lower-cased, then every character is looked up
    in the module-level ``char_to_idx`` table. Characters missing from the table
    fall back to the id of ' ' (or 1 when ' ' is not in the table either).
    The id list is truncated to ``max_length`` and right-padded with 0 (the
    padding id) up to ``max_length``.
    """
    fallback_id = char_to_idx.get(' ', 1)
    lowered = str(name).lower()  # tolerate non-string input
    ids = [char_to_idx.get(ch, fallback_id) for ch in lowered][:max_length]

    # Right-pad with the padding id; a non-positive count adds nothing
    ids.extend([0] * (max_length - len(ids)))
    return ids

# Create custom dataset
class NameDataset(Dataset):
    """Map-style dataset pairing name strings with their numeric gender labels.

    ``names`` and ``labels`` are parallel indexable sequences. Items are
    tokenized lazily on access via ``tokenize_name``.
    """

    def __init__(self, names, labels):
        self.names, self.labels = names, labels

    def __len__(self):
        return len(self.names)

    def __getitem__(self, idx):
        # Token ids must be int64 for nn.Embedding; the label is float32 for BCE loss
        encoded = torch.tensor(tokenize_name(self.names[idx]), dtype=torch.long)
        target = torch.tensor(self.labels[idx], dtype=torch.float32)
        return encoded, target

# Improved model architecture with CNN layers
class NameGenderClassifierCNN(nn.Module):
    """Character-level CNN that outputs one probability per input name.

    Architecture: embedding -> parallel Conv1d branches (one per filter size)
    -> ReLU -> global max-pool -> concat -> dropout -> Linear(100) -> ReLU
    -> dropout -> Linear(1) -> sigmoid.

    Args:
        vocab_size: Number of rows in the embedding table (index 0 is padding).
        embedding_dim: Size of each character embedding.
        num_filters: Output channels of every convolution branch.
        filter_sizes: Iterable of kernel widths, one convolution branch each.
        dropout: Dropout probability applied before each fully connected layer.

    Note:
        The input sequence length must be at least ``max(filter_sizes)``;
        a shorter sequence cannot be convolved by the widest kernel.
    """

    def __init__(self, vocab_size, embedding_dim, num_filters=64, filter_sizes=(2, 3, 4), dropout=0.5):
        super().__init__()

        # Immutable default plus a local copy: no shared mutable default argument
        filter_sizes = tuple(filter_sizes)

        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)

        # Convolutional layers with different filter sizes
        self.convs = nn.ModuleList([
            nn.Conv1d(in_channels=embedding_dim, out_channels=num_filters, kernel_size=fs)
            for fs in filter_sizes
        ])

        # Fully connected layers
        self.fc1 = nn.Linear(len(filter_sizes) * num_filters, 100)
        self.fc2 = nn.Linear(100, 1)

        self.dropout = nn.Dropout(dropout)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Return probabilities of shape ``(batch_size,)`` for token ids ``x``.

        Args:
            x: Long tensor of shape ``(batch_size, sequence_length)``.
        """
        # Embedding layer
        x = self.embedding(x)  # (batch_size, sequence_length, embedding_dim)

        # Transpose for convolution
        x = x.transpose(1, 2)  # (batch_size, embedding_dim, sequence_length)

        # Apply convolutions and max-pooling
        conv_outputs = []
        for conv in self.convs:
            conv_out = torch.relu(conv(x))  # (batch_size, num_filters, seq_len - filter_size + 1)
            pool_out = torch.max_pool1d(conv_out, conv_out.shape[2])  # (batch_size, num_filters, 1)
            conv_outputs.append(pool_out.squeeze(2))  # (batch_size, num_filters)

        # Concatenate outputs from different filter sizes
        x = torch.cat(conv_outputs, dim=1)  # (batch_size, num_filters * len(filter_sizes))

        # Fully connected layers with dropout
        x = self.dropout(x)
        x = torch.relu(self.fc1(x))
        x = self.dropout(x)
        x = self.fc2(x)  # (batch_size, 1)

        # Drop only the trailing feature dimension. A bare squeeze() would also
        # remove the batch dimension for a batch of one, producing a 0-d tensor
        # whose shape no longer matches the (1,) label tensor expected by the loss.
        return self.sigmoid(x).squeeze(-1)

# Split the data
X_train, X_test, y_train, y_test = train_test_split(
    df['name'].values,
    df['gender_label'].values,
    test_size=0.2,
    random_state=42,
    stratify=df['gender_label']
)

# Further split training data to get a validation set
X_train, X_val, y_train, y_val = train_test_split(
    X_train,
    y_train,
    test_size=0.1,
    random_state=42,
    stratify=y_train
)

print(f"\nTraining set size: {len(X_train)}")
print(f"Validation set size: {len(X_val)}")
print(f"Test set size: {len(X_test)}")

# Create datasets and dataloaders
train_dataset = NameDataset(X_train, y_train)
val_dataset = NameDataset(X_val, y_val)
test_dataset = NameDataset(X_test, y_test)

# Smaller batch size for better learning
batch_size = 64
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

# Initialize model and move to GPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# Model hyperparameters
# char_to_idx already contains the '<PAD>' entry (index 0), so its length is the
# full vocabulary size. Adding 1 would allocate an embedding row that no token
# ever maps to.
vocab_size = len(char_to_idx)
embedding_dim = 100
num_filters = 64
filter_sizes = [2, 3, 4, 5]  # Multiple filter sizes to capture different n-gram patterns

# Initialize the CNN model
model = NameGenderClassifierCNN(
    vocab_size=vocab_size,
    embedding_dim=embedding_dim,
    num_filters=num_filters,
    filter_sizes=filter_sizes,
    dropout=0.5
)
model.to(device)

# Count model parameters
total_params = sum(p.numel() for p in model.parameters())
print(f"Total model parameters: {total_params}")

# Loss function and optimizer
criterion = nn.BCELoss()
# Lower learning rate for more stable learning
optimizer = optim.Adam(model.parameters(), lr=0.0005, weight_decay=1e-5)
# Learning rate scheduler: halve the LR after 2 epochs without a lower validation loss.
# The `verbose` argument is intentionally not passed: it is deprecated in
# PyTorch's LR schedulers (current LR is available via optimizer.param_groups).
scheduler = optim.lr_scheduler.ReduceLROnPlateau(
    optimizer, mode='min', factor=0.5, patience=2
)

# Training function with validation
def train_model(model, train_loader, val_loader, criterion, optimizer, scheduler, epochs=15,
                checkpoint_path='best_model.pt', patience=3):
    """Train ``model`` with per-epoch validation, checkpointing and early stopping.

    Args:
        model: Module returning one probability per sample.
        train_loader: DataLoader yielding ``(inputs, labels)`` training batches.
        val_loader: DataLoader yielding ``(inputs, labels)`` validation batches.
        criterion: Loss taking ``(probabilities, labels)``.
        optimizer: Optimizer over ``model.parameters()``.
        scheduler: Scheduler stepped once per epoch with the validation loss.
        epochs: Maximum number of epochs.
        checkpoint_path: File that receives the state dict of the best epoch.
        patience: Epochs without a new best validation accuracy that trigger
            early stopping (only checked once ``epoch > 5``, as before).

    Returns:
        ``(history, model)`` where ``history`` maps 'train_loss', 'train_acc',
        'val_loss' and 'val_acc' to per-epoch lists, and ``model`` carries the
        weights of the best validation epoch (if any checkpoint was written).

    Relies on the module-level ``device`` for tensor placement.
    """
    history = {
        'train_loss': [], 'train_acc': [],
        'val_loss': [], 'val_acc': []
    }

    # Start below any reachable accuracy so the first epoch always produces a
    # checkpoint, even when its validation accuracy is exactly 0.
    best_val_acc = float('-inf')
    checkpoint_written = False
    epochs_since_best = 0

    for epoch in range(epochs):
        # Training phase
        model.train()
        train_loss = 0.0
        train_correct = 0
        train_total = 0

        progress_bar = tqdm(train_loader, desc=f'Epoch {epoch+1}/{epochs} [Train]')
        for inputs, labels in progress_bar:
            inputs, labels = inputs.to(device), labels.to(device)

            # Zero the parameter gradients
            optimizer.zero_grad()

            # Forward pass; flatten so a batch of one still has shape (1,)
            # and matches the label tensor expected by the loss.
            outputs = model(inputs).reshape(-1)
            loss = criterion(outputs, labels)

            # Backward pass and optimize
            loss.backward()
            # Gradient clipping to prevent exploding gradients
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            optimizer.step()

            # Track statistics (loss is a batch mean, so weight it by batch size)
            train_loss += loss.item() * inputs.size(0)
            predicted = (outputs >= 0.5).float()
            train_total += labels.size(0)
            train_correct += (predicted == labels).sum().item()

            # Update progress bar
            progress_bar.set_postfix({
                'loss': loss.item(),
                'acc': train_correct/train_total
            })

        # Calculate epoch training statistics
        epoch_train_loss = train_loss / len(train_loader.dataset)
        epoch_train_acc = train_correct / train_total

        # Validation phase
        model.eval()
        val_loss = 0.0
        val_correct = 0
        val_total = 0

        with torch.no_grad():
            for inputs, labels in tqdm(val_loader, desc=f'Epoch {epoch+1}/{epochs} [Val]'):
                inputs, labels = inputs.to(device), labels.to(device)

                outputs = model(inputs).reshape(-1)
                loss = criterion(outputs, labels)

                val_loss += loss.item() * inputs.size(0)
                predicted = (outputs >= 0.5).float()
                val_total += labels.size(0)
                val_correct += (predicted == labels).sum().item()

        # Calculate epoch validation statistics
        epoch_val_loss = val_loss / len(val_loader.dataset)
        epoch_val_acc = val_correct / val_total

        # Update learning rate based on validation loss and report any change
        lr_before = optimizer.param_groups[0]['lr']
        scheduler.step(epoch_val_loss)
        lr_after = optimizer.param_groups[0]['lr']
        if lr_after != lr_before:
            print(f"  Learning rate changed: {lr_before:.6g} -> {lr_after:.6g}")

        # Save best model
        if epoch_val_acc > best_val_acc:
            best_val_acc = epoch_val_acc
            epochs_since_best = 0
            torch.save(model.state_dict(), checkpoint_path)
            checkpoint_written = True
            print(f"  --> New best model saved with validation accuracy: {best_val_acc:.4f}")
        else:
            epochs_since_best += 1

        # Update history
        history['train_loss'].append(epoch_train_loss)
        history['train_acc'].append(epoch_train_acc)
        history['val_loss'].append(epoch_val_loss)
        history['val_acc'].append(epoch_val_acc)

        # Print epoch statistics
        print(f'Epoch {epoch+1}/{epochs}:')
        print(f'  Train Loss: {epoch_train_loss:.4f}, Train Acc: {epoch_train_acc:.4f}')
        print(f'  Val Loss: {epoch_val_loss:.4f}, Val Acc: {epoch_val_acc:.4f}')

        # Early stopping check: measured against the best accuracy so far.
        # Comparing each epoch only with its predecessor never fires when the
        # accuracy oscillates below the best value.
        if epoch > 5 and epochs_since_best >= patience:
            print(f"Early stopping: Validation accuracy hasn't improved for {patience} epochs")
            break

    # Load best model. Skipped when this run never wrote a checkpoint (e.g.
    # epochs == 0), so a stale file from an earlier run is not loaded by mistake.
    if checkpoint_written:
        model.load_state_dict(torch.load(checkpoint_path, map_location=device))
    return history, model

# Evaluation function with more metrics
def evaluate_model(model, test_loader, thresholds=(0.5, 0.6, 0.7, 0.8, 0.9)):
    """Evaluate ``model`` on ``test_loader`` and collect summary metrics.

    Args:
        model: Module returning one male-probability per sample.
        test_loader: DataLoader yielding ``(inputs, labels)`` batches with
            labels 0 (female) / 1 (male).
        thresholds: Decision thresholds at which accuracy is additionally
            reported.

    Returns:
        Tuple ``(accuracy, report, conf_matrix, all_probs, all_labels,
        threshold_metrics)``: accuracy at threshold 0.5, the sklearn text
        report, the 2x2 confusion matrix, the per-sample probabilities and
        true labels as lists, and a dict mapping threshold -> accuracy.

    Relies on the module-level ``device`` for tensor placement.
    """
    model.eval()
    all_preds = []
    all_labels = []
    all_probs = []

    with torch.no_grad():
        for inputs, labels in tqdm(test_loader, desc='Evaluating'):
            inputs = inputs.to(device)
            # Flatten so a final batch of one sample is still iterable;
            # a 0-d result cannot be passed to list.extend().
            probs = model(inputs).reshape(-1).cpu()

            all_probs.extend(probs.tolist())
            all_preds.extend((probs >= 0.5).long().tolist())
            all_labels.extend(labels.reshape(-1).long().tolist())

    # Pin the class ids so the matrix stays 2x2 and the report keeps both
    # target names even if one class is absent from the evaluated data.
    class_ids = [0, 1]
    accuracy = accuracy_score(all_labels, all_preds)
    conf_matrix = confusion_matrix(all_labels, all_preds, labels=class_ids)
    report = classification_report(
        all_labels, all_preds, labels=class_ids, target_names=['Female', 'Male']
    )

    # Calculate threshold metrics
    threshold_metrics = {}
    for threshold in thresholds:
        thresh_preds = [1 if p >= threshold else 0 for p in all_probs]
        threshold_metrics[threshold] = accuracy_score(all_labels, thresh_preds)

    return accuracy, report, conf_matrix, all_probs, all_labels, threshold_metrics

# Print a few examples of names and their expected labels for verification
print("\nVerification of dataset:")
male_examples = X_train[y_train == 1]
female_examples = X_train[y_train == 0]
# Bounded by the smaller class so a tiny dataset does not raise IndexError
for i in range(min(5, len(male_examples), len(female_examples))):
    print(f"Male example {i+1}: {male_examples[i]}")
    print(f"Female example {i+1}: {female_examples[i]}")

# Train the model
print("\nStarting training...")
epochs = 15
history, model = train_model(model, train_loader, val_loader, criterion, optimizer, scheduler, epochs)

# Evaluate the model
print("\nEvaluating model on test data...")
accuracy, report, conf_matrix, probabilities, true_labels, threshold_metrics = evaluate_model(model, test_loader)
print(f"Test Accuracy: {accuracy:.4f}")
print("\nConfusion Matrix:")
print(conf_matrix)
print("\nClassification Report:")
print(report)
print("\nAccuracy at different thresholds:")
for threshold, acc in threshold_metrics.items():
    print(f"Threshold {threshold}: {acc:.4f}")

# Plot training and validation history
plt.figure(figsize=(12, 8))

# Loss plot
plt.subplot(2, 1, 1)
plt.plot(history['train_loss'], label='Training Loss')
plt.plot(history['val_loss'], label='Validation Loss')
plt.title('Loss over epochs')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()

# Accuracy plot
plt.subplot(2, 1, 2)
plt.plot(history['train_acc'], label='Training Accuracy')
plt.plot(history['val_acc'], label='Validation Accuracy')
plt.title('Accuracy over epochs')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend()

plt.tight_layout()
plt.savefig('training_history.png')
plt.show()

# Plot probability distribution.
# Both groups show the model's male probability, matching the x-axis label:
# female names should pile up near 0 and male names near 1. Plotting 1-p for
# female names would put two different quantities on one axis.
plt.figure(figsize=(10, 6))
female_probs = [p for p, l in zip(probabilities, true_labels) if l == 0]
male_probs = [p for p, l in zip(probabilities, true_labels) if l == 1]

# Shared range so both histograms use identical bin edges
plt.hist(female_probs, bins=20, range=(0, 1), alpha=0.5, label='Female names')
plt.hist(male_probs, bins=20, range=(0, 1), alpha=0.5, label='Male names')
plt.title('Probability Distribution by Gender')
plt.xlabel('Probability of being male')
plt.ylabel('Count')
plt.legend()
plt.savefig('probability_distribution.png')
plt.show()

# Function to predict gender for new names
def predict_gender(name, model, char_to_idx, max_length, threshold=0.5):
    """Predict the gender of a single name.

    Args:
        name: Name to classify (stringified and lower-cased before encoding).
        model: Module mapping a ``(1, max_length)`` long tensor to one
            male-probability. It is switched to eval mode.
        char_to_idx: Character -> id table used for encoding. Unknown
            characters fall back to the id of ' ' (or 1 if ' ' is absent).
        max_length: Encoded length; longer names are truncated, shorter ones
            are right-padded with 0.
        threshold: Male probability at or above which 'Male' is returned.

    Returns:
        ``(predicted_gender, probability, confidence)`` where ``probability``
        is the male probability, and ``confidence`` is that probability for a
        'Male' prediction and ``1 - probability`` for a 'Female' prediction.
    """
    model.eval()

    # Encode with the vocabulary that was passed in (not a module-level table),
    # so a model restored from a checkpoint is used with its own character ids.
    fallback_id = char_to_idx.get(' ', 1)
    tokens = [char_to_idx.get(char, fallback_id) for char in str(name).lower()]
    if len(tokens) < max_length:
        tokens = tokens + [0] * (max_length - len(tokens))
    else:
        tokens = tokens[:max_length]

    # Run on whatever device the model lives on; parameter-free models use CPU
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device('cpu')
    input_tensor = torch.tensor([tokens], dtype=torch.long).to(target_device)

    with torch.no_grad():
        output = model(input_tensor)
        probability = output.item()
        predicted_gender = 'Male' if probability >= threshold else 'Female'
        confidence = probability if probability >= threshold else 1 - probability

    return predicted_gender, probability, confidence

# Load the best model for predictions (map_location keeps the weights on the active device)
model.load_state_dict(torch.load('best_model.pt', map_location=device))

# Example predictions with detailed output
test_names = ['Priya', 'Rahul', 'Anjali', 'Vikram', 'Aishwarya', 'Raj', 'Neha', 'Sanjay', 'Pooja']
print("\nPredictions on sample names:")
print("Name\t\tPrediction\tMale Prob\tConfidence")
print("-" * 60)
for name in test_names:
    gender, male_prob, confidence = predict_gender(name, model, char_to_idx, max_name_length)
    print(f"{name:<12}\t{gender:<10}\t{male_prob:.4f}\t\t{confidence:.4f}")

# Save the model and necessary data.
# Only tensors and built-in Python types go into the checkpoint:
# max_name_length comes from pandas as a numpy.int64, which torch.load's
# default safe unpickler (weights_only=True since PyTorch 2.6) refuses to load.
torch.save({
    'model_state_dict': model.state_dict(),
    'char_to_idx': char_to_idx,
    'max_name_length': int(max_name_length),
    'model_config': {
        'vocab_size': vocab_size,
        'embedding_dim': embedding_dim,
        'num_filters': num_filters,
        'filter_sizes': filter_sizes,
    }
}, 'indian_name_gender_model.pt')

print("\nModel saved to 'indian_name_gender_model.pt'")

# Function to load the model and make predictions
def load_model_and_predict(model_path, names):
    """Restore a saved classifier and predict the gender of each name.

    Args:
        model_path: Path to a checkpoint holding 'model_state_dict',
            'char_to_idx', 'max_name_length' and 'model_config'.
        names: Iterable of names to classify.

    Returns:
        List of dicts with keys 'name', 'predicted_gender',
        'male_probability' and 'confidence', one per input name.

    Relies on the module-level ``device`` for tensor placement.
    """
    # map_location lets a checkpoint written on a GPU runtime be restored on a
    # CPU-only runtime (and vice versa) instead of failing during deserialization.
    checkpoint = torch.load(model_path, map_location=device)

    # Get model configuration and data
    char_to_idx = checkpoint['char_to_idx']
    max_name_length = int(checkpoint['max_name_length'])
    config = checkpoint['model_config']

    # Initialize the model
    model = NameGenderClassifierCNN(
        vocab_size=config['vocab_size'],
        embedding_dim=config['embedding_dim'],
        num_filters=config['num_filters'],
        filter_sizes=config['filter_sizes']
    )

    # Load the state dictionary
    model.load_state_dict(checkpoint['model_state_dict'])
    model.to(device)
    model.eval()  # inference only: disable dropout

    # Make predictions
    results = []
    for name in names:
        gender, male_prob, confidence = predict_gender(name, model, char_to_idx, max_name_length)
        results.append({
            'name': name,
            'predicted_gender': gender,
            'male_probability': male_prob,
            'confidence': confidence
        })

    return results

# Round-trip demo: restore the checkpoint from disk and classify a few names
print("\nExample of loading the model and making predictions:")
sample_names = ['Deepika', 'Arjun', 'Meera', 'Rajesh']
predictions = load_model_and_predict(
    model_path='indian_name_gender_model.pt',
    names=sample_names,
)
# One summary line per name
for pred in predictions:
    print(f"{pred['name']}: {pred['predicted_gender']} (male prob: {pred['male_probability']:.4f}, confidence: {pred['confidence']:.4f})")

In [None]:
# @title testing the model
# Classify a second batch of names (mixed capitalisation) with the saved checkpoint
print("\nExample of loading the model and making predictions:")
sample_names = ['Deepika', 'sakshi', 'Meera', 'sameer']
predictions = load_model_and_predict(
    model_path='indian_name_gender_model.pt',
    names=sample_names,
)
# One summary line per name
for pred in predictions:
    print(f"{pred['name']}: {pred['predicted_gender']} (male prob: {pred['male_probability']:.4f}, confidence: {pred['confidence']:.4f})")