## Import Required Libraries

In [2]:

import pandas as pd
import openpyxl
from sklearn.model_selection import train_test_split

## Load dataset

In [4]:
data= pd.read_excel("GenderPrediction (1) (1).xlsx")

In [5]:
data.head(2)

Unnamed: 0,Name,Gender,LastLetter,Unnamed: 4,.1,.2,.3,.4,.5,.6,...,.13,.14,.15,.16,.17,.18,.19,.20,.21,.22
0,Ashutosh,Male,h,,,,,,,,...,,,,,,,,,,
1,Meghamala,Female,a,,,,,,,,...,,,,,,,,,,


## Keep only relevant columns

In [6]:
data=data[['Name','Gender','LastLetter']]

In [7]:
data=data.drop(data.tail(2).index)

In [13]:

data.isna().sum()

Name          0
Gender        0
LastLetter    0
dtype: int64

## Split the Dataset (Stratified Sampling)

In [9]:
train_data,test_data= train_test_split(data,test_size=0.3,random_state =42,stratify=data['Gender'])

## Check the distribution in both sets

In [11]:
print(train_data['Gender'].value_counts())

Gender
Male      1124
Female     975
Name: count, dtype: int64


In [15]:
print(test_data['Gender'].value_counts())

Gender
Male      482
Female    418
Name: count, dtype: int64


## Apply Bayes' Theorem on Training Data

In [20]:
# Prior probability: P(Gender)
P_Gender= train_data["Gender"].value_counts(normalize=True)

# Likelihood: P(LastLetter | Gender)
likelihood=train_data.groupby(['LastLetter','Gender']).size().unstack(fill_value=0)
likelihood=likelihood.div(likelihood.sum(axis=1),axis=0)


## Gender Prediction Function

In [39]:
def predict_gender(name):
    last_letter = name[-1].lower()  # Extract last letter
    
    # Check if last letter exists in the index of likelihood DataFrame
    if last_letter not in likelihood.index.tolist():
        return "Unknown"  # Handle unseen letters
    
    # Apply Bayes' theorem
    posterior = likelihood.loc[last_letter] * P_Gender
    
    return posterior.idxmax()  # Predict gender with highest probability


In [43]:
# Test the function
print(predict_gender("Sonia"))
print(predict_gender("John"))

Female
Male


## Evaluate the Model

In [46]:
# Predict on test set
test_data["Predicted_Gender"] = test_data["Name"].apply(predict_gender)

# Accuracy calculation
accuracy=(test_data['Gender']==test_data['Predicted_Gender']).mean()
print(f'Accuracy : {accuracy*100:.2f}%')


Accuracy : 82.89%
