In [31]:
!pip install tensorflow keras faker



In [None]:
import os
import re

import numpy as np
import pandas as pd
import tensorflow as tf
from faker import Faker
from keras.callbacks import Callback
from keras.layers import LSTM, Dense, Embedding, Input
from keras.models import Sequential
from keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow import keras
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.preprocessing.text import Tokenizer

In [33]:
# Directory holding the two name CSVs. Defaults to the Colab upload location
# and can be redirected by setting the NAMES_DATA_DIR environment variable.
DATA_DIR = os.environ.get('NAMES_DATA_DIR', '/content')

female_data = pd.read_csv(os.path.join(DATA_DIR, 'Indian-Female-Names.csv'))
male_data = pd.read_csv(os.path.join(DATA_DIR, 'Indian-Male-Names.csv'))

In [34]:
# Relationship markers and separators that are replaced by a single space.
repl_list = ['s/o','d/o','w/o','/','&',',','-']

def clean_data(name):
    """Normalise a raw name for modelling.

    Steps: lowercase, drop non-ASCII characters, replace every entry of
    ``repl_list`` with a space, discard everything from the first '@' onward,
    and collapse all runs of whitespace to single spaces.

    Args:
        name: Raw name value. Anything that is not a string is converted with
            ``str``; ``None`` and float NaN are treated as missing.

    Returns:
        The cleaned name as a string with no leading, trailing or repeated
        whitespace. Missing input yields the empty string.
    """
    # Missing values would otherwise be stringified into the bogus names
    # "none" / "nan" and survive as one-word records.
    if name is None or (isinstance(name, float) and name != name):
        return ""

    name = str(name).lower()
    name = ''.join(ch for ch in name if ord(ch) < 128)
    for repl in repl_list:
        name = name.replace(repl, " ")

    # Keep only the part before the first '@'.
    name = name.partition('@')[0]

    # split() with no argument drops empty pieces, so the gaps left behind by
    # the replacements above collapse to exactly one space.
    return " ".join(name.split())

def remove_records(merged_data):
    """Return the rows of ``merged_data`` that look like usable names.

    A row is dropped when any of the following holds:
      * the name contains the substring 'with' (plain substring match, so a
        name that merely contains those letters is dropped too; a missing
        name is also dropped),
      * ``count_words`` is 5 or more,
      * ``count_words`` is 0,
      * the name contains a digit.

    Args:
        merged_data: DataFrame with a string column 'name' and a numeric
            column 'count_words'. It is not modified.

    Returns:
        A new DataFrame with the surviving rows and a 'delete' column that is
        0 for every row (kept for compatibility with earlier output).
    """
    names = merged_data['name']
    word_counts = merged_data['count_words']

    # Build the drop mask without writing a helper column into the caller's
    # frame, so running this twice on the same input gives the same result.
    drop_mask = (
        names.str.contains('with', regex=False, na=True)
        | (word_counts >= 5)
        | (word_counts == 0)
        | names.str.contains(r'\d', regex=True, na=False)
    )
    return merged_data.loc[~drop_mask].assign(delete=0)

# Stack the male and female sheets into one frame (male rows first).
merged_data = pd.concat([male_data, female_data], axis=0)

# Normalise every name, then record how many whitespace-separated words remain.
merged_data['name'] = merged_data['name'].map(clean_data)
merged_data['count_words'] = merged_data['name'].str.split().map(len)

# Filter out the rows rejected by remove_records.
cleaned_data = remove_records(merged_data)

# One row per distinct name (first occurrence wins), tagged with the class label.
indian_cleaned_data = (
    cleaned_data.loc[:, ['name', 'count_words']]
    .drop_duplicates(subset='name', keep='first')
    .assign(label='indian')
)

len(indian_cleaned_data)

13754

In [35]:
# Seed Faker's shared random generator so the synthetic names (and therefore
# the dataset, the split and the reported metrics) are the same on every run.
Faker.seed(42)
fake = Faker("en_us")

# Generate the synthetic non-Indian names, then build one record per name with
# the same columns as the Indian frame.
non_indian_names = [fake.name() for _ in range(14000)]
non_indian_data_list = [
    {'name': generated, 'count_words': len(generated.split()), 'label': 'non_indian'}
    for generated in non_indian_names
]

non_indian_data = pd.DataFrame(non_indian_data_list)
len(non_indian_data)

14000

In [36]:
# Combine both classes under a fresh 0..n-1 index.
all_names = pd.concat(
    (indian_cleaned_data, non_indian_data),
    ignore_index=True,
)

# Hold out 20% of the rows for testing; the fixed random_state keeps the split repeatable.
split_frames = train_test_split(all_names, test_size=0.2, random_state=42)
train_data, test_data = split_frames

print(f"Training data shape: {train_data.shape}")
print(f"Testing data shape: {test_data.shape}")

Training data shape: (22203, 3)
Testing data shape: (5551, 3)


In [37]:
# Vocabulary cap for the tokenizer and fixed width of every encoded name.
max_words = 10000
max_sequence_length = 10

# The vocabulary is learned from the training split only; words not seen
# there are mapped to the "<OOV>" token.
tokenizer = Tokenizer(num_words=max_words, oov_token="<OOV>")
tokenizer.fit_on_texts(train_data['name'])

# Encode both splits with the same fitted tokenizer.
train_sequences, test_sequences = (
    tokenizer.texts_to_sequences(split['name'])
    for split in (train_data, test_data)
)

# Pad or truncate at the end so every row has max_sequence_length entries.
train_padded, test_padded = (
    sequence.pad_sequences(
        encoded,
        maxlen=max_sequence_length,
        padding='post',
        truncating='post',
    )
    for encoded in (train_sequences, test_sequences)
)

print(f"Shape of training sequences: {train_padded.shape}")
print(f"Shape of testing sequences: {test_padded.shape}")

Shape of training sequences: (22203, 10)
Shape of testing sequences: (5551, 10)


In [38]:
# Learn the label -> integer mapping from the training labels only.
label_encoder = LabelEncoder().fit(train_data['label'])

train_labels_encoded = label_encoder.transform(train_data['label'])
test_labels_encoded = label_encoder.transform(test_data['label'])

# Expand the integer codes to one-hot rows for the softmax output.
train_labels_one_hot, test_labels_one_hot = (
    to_categorical(encoded)
    for encoded in (train_labels_encoded, test_labels_encoded)
)

print(f"Shape of training labels: {train_labels_one_hot.shape}")
print(f"Shape of testing labels: {test_labels_one_hot.shape}")

Shape of training labels: (22203, 2)
Shape of testing labels: (5551, 2)


In [39]:
# Seed Python, NumPy and TensorFlow so weight initialisation is repeatable
# (GPU kernels may still introduce small run-to-run differences).
keras.utils.set_random_seed(42)

# Word-embedding -> LSTM -> dense classifier over the padded name sequences.
# The input width is declared with an explicit Input layer instead of the
# deprecated `input_length` argument of Embedding; this also builds the model
# immediately so that summary() can report shapes.
model = Sequential([
    Input(shape=(max_sequence_length,)),
    Embedding(input_dim=max_words, output_dim=128),
    LSTM(128),
    Dense(64, activation='relu'),
    # One output unit per label class.
    Dense(train_labels_one_hot.shape[1], activation='softmax'),
])
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

model.summary()



In [40]:
# Train on the padded training sequences; 20% of them are held out by Keras
# for the per-epoch validation metrics stored in `history`.
history = model.fit(
    x=train_padded,
    y=train_labels_one_hot,
    batch_size=32,
    epochs=10,
    validation_split=0.2,
)

Epoch 1/10
[1m556/556[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 9ms/step - accuracy: 0.8423 - loss: 0.2694 - val_accuracy: 0.9973 - val_loss: 0.0130
Epoch 2/10
[1m556/556[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 8ms/step - accuracy: 0.9994 - loss: 0.0025 - val_accuracy: 0.9964 - val_loss: 0.0131
Epoch 3/10
[1m556/556[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 8ms/step - accuracy: 0.9996 - loss: 0.0023 - val_accuracy: 0.9944 - val_loss: 0.0173
Epoch 4/10
[1m556/556[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 7ms/step - accuracy: 0.9999 - loss: 8.0640e-04 - val_accuracy: 0.9975 - val_loss: 0.0114
Epoch 5/10
[1m556/556[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 7ms/step - accuracy: 0.9997 - loss: 0.0019 - val_accuracy: 0.9968 - val_loss: 0.0231
Epoch 6/10
[1m556/556[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 9ms/step - accuracy: 1.0000 - loss: 1.1401e-04 - val_accuracy: 0.9959 - val_loss: 0.0180
Epoch 7/10
[1m556/5

In [41]:
# Score the trained model on the held-out test split (no progress bar).
test_metrics = model.evaluate(test_padded, test_labels_one_hot, verbose=0)
loss, accuracy = test_metrics

print(f"Test Loss: {loss:.4f}")
print(f"Test Accuracy: {accuracy:.4f}")

Test Loss: 0.0206
Test Accuracy: 0.9982


In [42]:
def predict_name_origin(names):
    """Predict the origin label for each name.

    Each name is passed through ``clean_data``, encoded with the fitted
    ``tokenizer``, padded to ``max_sequence_length`` and classified by
    ``model``; the arg-max class is mapped back to its label with
    ``label_encoder``.

    Args:
        names: Iterable of raw name strings. A single string is treated as
            one name rather than being iterated character by character.

    Returns:
        NumPy array of predicted labels, one per input name, in input order.
        An empty input yields an empty array.
    """
    if isinstance(names, str):
        names = [names]

    cleaned_names = [clean_data(name) for name in names]
    if not cleaned_names:
        # Nothing to classify; avoid calling the model on a zero-row batch.
        return np.array([], dtype=label_encoder.classes_.dtype)

    name_sequences = tokenizer.texts_to_sequences(cleaned_names)
    name_padded = sequence.pad_sequences(
        name_sequences,
        maxlen=max_sequence_length,
        padding='post',
        truncating='post',
    )
    # verbose=0 keeps the Keras progress bar out of the interactive output.
    predictions = model.predict(name_padded, verbose=0)
    predicted_labels_encoded = np.argmax(predictions, axis=1)
    return label_encoder.inverse_transform(predicted_labels_encoded)

# Interactive demo: read comma-separated names until the user types "exit".
# NOTE: input() blocks, so this cell stalls an unattended "Run All".
while True:
    input_names_string = input('Enter a list of names separated by commas (or type "exit" to quit): ')
    # Ignore surrounding whitespace so "exit " or " Exit" also quits.
    if input_names_string.strip().lower() == 'exit':
        break

    # Trim each entry and drop blanks (e.g. from "a,,b" or a trailing comma)
    # so empty strings are never sent to the model or echoed back.
    new_names = [part.strip() for part in input_names_string.split(',') if part.strip()]
    if not new_names:
        continue

    predicted_origins = predict_name_origin(new_names)

    for name, origin in zip(new_names, predicted_origins):
        print(f"The name '{name}' is predicted as: {origin}")

Enter a list of names separated by commas (or type "exit" to quit): nilesh
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 120ms/step
The name 'nilesh' is predicted as: indian
Enter a list of names separated by commas (or type "exit" to quit): exit
