In [2]:
import torch
import torch.nn as nn
from torchtext.vocab import vocab
from torchvision import transforms, utils
from torch.utils.data import Dataset, DataLoader

import pandas as pd 

from collections import Counter

from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import  confusion_matrix


# Collect Data

In [3]:
males_names_data =pd.read_csv(r'https://gist.githubusercontent.com/mbejda/7f86ca901fe41bc14a63/raw/38adb475c14a3f44df9999c1541f3a72f472b30d/Indian-Male-Names.csv')

In [4]:
males_names_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14845 entries, 0 to 14844
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   name    14821 non-null  object
 1   gender  14845 non-null  object
 2   race    14845 non-null  object
dtypes: object(3)
memory usage: 348.1+ KB


In [5]:
males_names_data.head()

Unnamed: 0,name,gender,race
0,barjraj,m,indian
1,ramdin verma,m,indian
2,sharat chandran,m,indian
3,birender mandal,m,indian
4,amit,m,indian


In [8]:
#males_names_data = males_names_data.drop_duplicates()

In [9]:
#males_names_data = males_names_data.reset_index()

In [10]:
#males_names_data = males_names_data.drop('index', axis=1)

In [11]:
#males_names_data.head()

In [12]:
#males_names_data.info()

In [6]:
def firstName(full_name):
    stop_names = ['smt', 'smt.', 'kumari','kumari.','mohd', 'mohd.','km', 'km.','ku','ku.','md','md.','mr','mr.','miss',',miss.','mrs','mrs.']
    try:
        name = full_name.split(' ')[0]
        if name in stop_names:
            return full_name.split(' ')[1]
        else:
            return full_name.split(' ')[0]
    except:
        return full_name

#males_names_data['name'] = males_names_data.apply(lambda row: firstName(row['name']), axis=1)

In [14]:
#males_names_data.head()

In [15]:
#males_names_data.info()

In [7]:
# Loading females names data:
females_names_data =pd.read_csv(r'https://gist.githubusercontent.com/mbejda/9b93c7545c9dd93060bd/raw/b582593330765df3ccaae6f641f8cddc16f1e879/Indian-Female-Names.csv')

In [8]:
females_names_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15382 entries, 0 to 15381
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   name    15351 non-null  object
 1   gender  15382 non-null  object
 2   race    15382 non-null  object
dtypes: object(3)
memory usage: 360.6+ KB


In [9]:
females_names_data.head()

Unnamed: 0,name,gender,race
0,shivani,f,indian
1,isha,f,indian
2,smt shyani devi,f,indian
3,divya,f,indian
4,mansi,f,indian


In [19]:
#females_names_data = females_names_data.drop_duplicates()

In [20]:
#females_names_data['name'] = females_names_data.apply(lambda row: firstName(row['name']), axis=1)

In [21]:
#females_names_data.info()

In [10]:
# Lets merge both names and create new dataframe:
data = males_names_data.append(females_names_data)

In [11]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 30227 entries, 0 to 15381
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   name    30172 non-null  object
 1   gender  30227 non-null  object
 2   race    30227 non-null  object
dtypes: object(3)
memory usage: 944.6+ KB


In [12]:
data.head()

Unnamed: 0,name,gender,race
0,barjraj,m,indian
1,ramdin verma,m,indian
2,sharat chandran,m,indian
3,birender mandal,m,indian
4,amit,m,indian


In [13]:
data.tail()

Unnamed: 0,name,gender,race
15377,saroj devi,f,indian
15378,naina @ geeta,f,indian
15379,manju d/0 baboo lal jatav,f,indian
15380,shivani,f,indian
15381,nayna,f,indian


In [14]:
data.isnull().sum()

name      55
gender     0
race       0
dtype: int64

In [15]:
# Removing null values:
data.dropna(inplace=True)

In [16]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 30172 entries, 0 to 15381
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   name    30172 non-null  object
 1   gender  30172 non-null  object
 2   race    30172 non-null  object
dtypes: object(3)
memory usage: 942.9+ KB


**Data Analysis**

In [17]:
import plotly.express as px
gender = data.gender.value_counts()
fig = px.pie(data, values=gender.values, names=gender.index, title='Distribution of Gender')
fig.show()

In [18]:
name = data.name.value_counts()
import plotly.graph_objects as go
fig = go.Figure([go.Bar(x=name.index[:20], y=name.values[:20])])
fig.update_layout(title_text="Top 50 Repeated Names and their count")
fig.show()

In [19]:
# Splitting data into training and testing:
data=data.sample(frac=1)
train_size = int(len(data)*.8)
train_data = data[:train_size]
test_data = data[train_size:]

In [20]:
# # Creating a vocabulary of the characters from all the given names:
all_chars = [t for text  in data['name'] for t in text if text is not None] 
char_count = Counter(all_chars)
name_char_vocab = vocab(char_count)

In [21]:
class NamesDataset(Dataset):
    
    def __init__(self,data,name_char_vocab):
        self.data=data
        self.name_char_vocab=name_char_vocab
        self.gender_dict = {'m':0, 'f':1}
        self.rev_gender_dict = {v:k for k,v in self.gender_dict.items()}
        
        
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        item = self.data.iloc[idx, :]
        label = torch.zeros(2)
        label[self.gender_dict[item['gender']]] = 1
        name = self.get_names_tensor(item['name'])
        return name, torch.tensor(self.gender_dict[item['gender']])
    
    def get_names_tensor(self, name ):
        name_ids = self.name_char_vocab.lookup_indices([t for t in  name])
        name_tensor = torch.as_tensor(name_ids, dtype = int)
        return name_tensor
    
    def get_category_from_idx(self, idx ):
        
        return self.rev_gender_dict[idx]
    
train_ds = NamesDataset(train_data,name_char_vocab)

# Demo:
#train_ds.get_names_tensor('meet')

In [22]:
class NamesClassifier(nn.Module):
    
    def __init__(self, size):
        super(NamesClassifier, self).__init__()
        self.embedding = nn.Embedding(size,128)
        self.rnn = nn.LSTM(128,256)
        self.linear1 = nn.Linear(256,256)
        self.relu1= nn.ReLU()
        self.linear2 = nn.Linear(256,2)
    
    def forward(self, ip):
        op= self.embedding(ip)
        op, hi = self.rnn(op)
        output = self.linear1(hi[0])
        output = self.relu1(output)
        output = self.linear2(output)
        return output

In [23]:
def predict(name, model1):
    try:
        names_tensor = train_ds.get_names_tensor(name)
        output = model1(names_tensor)
        category_idx = output.topk(1)[1].item()
        category = train_ds.get_category_from_idx(category_idx)
        return category
    except:
        pass
model = NamesClassifier(len(train_ds.name_char_vocab))

In [24]:
# We need a loss function as criteria and an optimizer to train our model:

criteria = nn.CrossEntropyLoss()
optimizer =  torch.optim.Adam(model.parameters())
num_step= len(train_ds)
step =0
total_loss=0
for  i in range(0, train_size):
    try:
        name_ip, label = train_ds[i]
        step=step+1
        optimizer.zero_grad()
        op= model(name_ip)
        loss = criteria(op.squeeze(), label)
        loss.backward()
        optimizer.step()
        total_loss=loss+total_loss
        if step%1000==0:
            print(total_loss)
            total_loss=0
    except:
        pass

tensor(522.4214, grad_fn=<AddBackward0>)
tensor(462.0615, grad_fn=<AddBackward0>)
tensor(432.1827, grad_fn=<AddBackward0>)
tensor(363.9021, grad_fn=<AddBackward0>)
tensor(320.5592, grad_fn=<AddBackward0>)
tensor(329.1528, grad_fn=<AddBackward0>)
tensor(296.4897, grad_fn=<AddBackward0>)
tensor(281.1823, grad_fn=<AddBackward0>)
tensor(288.5247, grad_fn=<AddBackward0>)
tensor(249.6131, grad_fn=<AddBackward0>)
tensor(265.1039, grad_fn=<AddBackward0>)
tensor(244.7411, grad_fn=<AddBackward0>)
tensor(206.1167, grad_fn=<AddBackward0>)
tensor(234.3512, grad_fn=<AddBackward0>)
tensor(238.8071, grad_fn=<AddBackward0>)
tensor(250.4168, grad_fn=<AddBackward0>)
tensor(293.8722, grad_fn=<AddBackward0>)
tensor(257.8728, grad_fn=<AddBackward0>)
tensor(248.0378, grad_fn=<AddBackward0>)
tensor(228.6299, grad_fn=<AddBackward0>)
tensor(221.7943, grad_fn=<AddBackward0>)
tensor(230.4982, grad_fn=<AddBackward0>)
tensor(241.8092, grad_fn=<AddBackward0>)
tensor(214.5818, grad_fn=<AddBackward0>)


In [25]:
# Saving Model for later use:
# Specify a path
PATH = "state_dict_model.pt"

# Save
torch.save(model.state_dict(), PATH)

In [26]:
predicted = [predict(n, model) for n in test_data.name]

In [27]:
test_data['predicted'] = test_data.name.apply(lambda x: predict(x, model))



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [28]:
test_data['predicted'] = test_data.name.apply(lambda x: predict(x, model))



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [29]:
train_data['predicted'] = train_data.name.apply(lambda x: predict(x, model))



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [30]:
train_data['predicted'] = train_data.name.apply(lambda x: predict(x, model))



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [31]:
confusion_matrix(test_data.predicted, test_data.gender)

array([[2770,  176],
       [ 251, 2838]])

In [32]:
accuracy_score( test_data.predicted, test_data.gender )

0.9292460646230323

In [34]:
#input should be in small case
predict('lina', model)

'f'