In [1]:
import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction import DictVectorizer

In [2]:
df = pd.read_csv("/content/Names_dataset.csv")

In [3]:
df.head()

Unnamed: 0,name,gender
0,alfiya,f
1,ardwin,m
2,henryka,f
3,preeti,f
4,jamaro,m


In [4]:
df.size

250462

In [5]:
df.columns

Index(['name', 'gender'], dtype='object')

In [6]:
df.dtypes

Unnamed: 0,0
name,object
gender,object


In [7]:
# Checking for missing value
df.isnull().isnull().sum()

Unnamed: 0,0
name,0
gender,0


In [8]:
# Number of female names
df[df.gender == 'f'].size

151942

In [9]:
# Number of male names
df[df.gender == 'm'].size

98520

In [10]:
df_names = df

In [11]:
# Replacing all f to 0 and m to 1
df_names.gender.replace({'f': 0, 'm': 1}, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_names.gender.replace({'f': 0, 'm': 1}, inplace=True)
  df_names.gender.replace({'f': 0, 'm': 1}, inplace=True)


In [12]:
df_names.head()

Unnamed: 0,name,gender
0,alfiya,0
1,ardwin,1
2,henryka,0
3,preeti,0
4,jamaro,1


In [13]:
df_names.gender.unique()

array([0, 1])

In [14]:
df_names['gender'].unique()

array([0, 1])

In [15]:
df_names.dtypes

Unnamed: 0,0
name,object
gender,int64


In [16]:
Xfeatures = df_names['name']

In [17]:
cv = CountVectorizer()
X = cv.fit_transform(Xfeatures.values.astype('U'))

In [18]:
X.shape

(125231, 101785)

In [19]:
gender_vectorizer = open("gender_vectorizer.pkl", "wb")
joblib.dump(cv, gender_vectorizer)

In [20]:
gender_vectorizer.close()

In [21]:
cv.get_feature_names_out()

array(['aaban', 'aabha', 'aabid', ..., 'सर', 'सलम', 'हन'], dtype=object)

In [22]:
y=df_names.gender

In [23]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [24]:
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB()
clf.fit(X_train, y_train)
clf.score(X_test, y_test)

0.7104643270651175

In [25]:
print("Accuracy of Model-> {} %".format(clf.score(X_test, y_test)*100))

Accuracy of Model-> 71.04643270651175 %


In [26]:
print("Accuracy of Model-> {} %".format(clf.score(X_train, y_train)*100))

Accuracy of Model-> 99.13758684021401 %


In [27]:
# Prediction

In [28]:
sample_names = ["Suchitra"]
vect = cv.transform(sample_names).toarray()
clf.predict(vect)

array([0])

In [29]:
sample_names2 = ["Natasha"]
vect = cv.transform(sample_names2).toarray()
clf.predict(vect)

array([0])

In [30]:
sample_names3 = ['Chandu','Suchitra','Nasha','Puja','Kabir','Joseph','Virat']
vect = cv.transform(sample_names3).toarray()
clf.predict(vect)

array([1, 0, 0, 0, 1, 1, 1])

In [None]:
def gender_predictor(name):
    test_name = [name]
    vector = cv.transform(test_name).toarray()
    if clf.predict(vector) == 0:
        return "Your Gender is Female"
    else:
        return "Your Gender is Male"

In [32]:
for i in sample_names3:
    print(i, "->", gender_predictor(i))

Chandu -> Male
Suchitra -> Female
Nasha -> Female
Puja -> Female
Kabir -> Male
Joseph -> Male
Virat -> Male


In [33]:
# Custom Feature Analysis
# By analogy most female names ends in A or I or has the sound of A

def features(name):
    name = str(name)
    name = name.lower()
    return {
        'first-letter': name[0],
        'first2-letter': name[0:2],
        'first3-letter': name[0:3],
        'last-letter':name[-1],
        'last2-letter':name[-2:],
        'last3-letter':name[-3:],
    }

In [34]:
features = np.vectorize(features)
features(['Chandu','Suchitra','Nasha','Puja','Kabir','Joseph','Virat'])

array([{'first-letter': 'c', 'first2-letter': 'ch', 'first3-letter': 'cha', 'last-letter': 'u', 'last2-letter': 'du', 'last3-letter': 'ndu'},
       {'first-letter': 's', 'first2-letter': 'su', 'first3-letter': 'suc', 'last-letter': 'a', 'last2-letter': 'ra', 'last3-letter': 'tra'},
       {'first-letter': 'n', 'first2-letter': 'na', 'first3-letter': 'nas', 'last-letter': 'a', 'last2-letter': 'ha', 'last3-letter': 'sha'},
       {'first-letter': 'p', 'first2-letter': 'pu', 'first3-letter': 'puj', 'last-letter': 'a', 'last2-letter': 'ja', 'last3-letter': 'uja'},
       {'first-letter': 'k', 'first2-letter': 'ka', 'first3-letter': 'kab', 'last-letter': 'r', 'last2-letter': 'ir', 'last3-letter': 'bir'},
       {'first-letter': 'j', 'first2-letter': 'jo', 'first3-letter': 'jos', 'last-letter': 'h', 'last2-letter': 'ph', 'last3-letter': 'eph'},
       {'first-letter': 'v', 'first2-letter': 'vi', 'first3-letter': 'vir', 'last-letter': 't', 'last2-letter': 'at', 'last3-letter': 'rat'}],
     

In [35]:
df_X = features(df_names['name'])

In [36]:
df_y = df_names['gender']

In [37]:
corpus = features(["Chandu", "Julia"])
dv = DictVectorizer()
dv.fit(corpus)
tranformed  = dv.transform(corpus)

In [38]:
print(tranformed)

  (0, 0)	1.0
  (0, 2)	1.0
  (0, 4)	1.0
  (0, 7)	1.0
  (0, 8)	1.0
  (0, 11)	1.0
  (1, 1)	1.0
  (1, 3)	1.0
  (1, 5)	1.0
  (1, 6)	1.0
  (1, 9)	1.0
  (1, 10)	1.0


In [39]:
dv.get_feature_names_out()

array(['first-letter=c', 'first-letter=j', 'first2-letter=ch',
       'first2-letter=ju', 'first3-letter=cha', 'first3-letter=jul',
       'last-letter=a', 'last-letter=u', 'last2-letter=du',
       'last2-letter=ia', 'last3-letter=lia', 'last3-letter=ndu'],
      dtype=object)

In [40]:
dfX_train, dfX_test, dfy_train ,dfy_test = train_test_split(df_X, df_y, test_size = 0.2, random_state=42)

In [41]:
dfX_train

array([{'first-letter': 't', 'first2-letter': 'te', 'first3-letter': 'tem', 'last-letter': 'a', 'last2-letter': 'ca', 'last3-letter': 'eca'},
       {'first-letter': 'm', 'first2-letter': 'ma', 'first3-letter': 'mar', 'last-letter': 'd', 'last2-letter': 'id', 'last3-letter': 'rid'},
       {'first-letter': 'a', 'first2-letter': 'av', 'first3-letter': 'ava', 'last-letter': 'e', 'last2-letter': 'se', 'last3-letter': 'ose'},
       ...,
       {'first-letter': 'n', 'first2-letter': 'ny', 'first3-letter': 'nya', 'last-letter': 'l', 'last2-letter': 'al', 'last3-letter': 'yal'},
       {'first-letter': 't', 'first2-letter': 'ti', 'first3-letter': 'tin', 'last-letter': 'a', 'last2-letter': 'ya', 'last3-letter': 'iya'},
       {'first-letter': 'a', 'first2-letter': 'ab', 'first3-letter': 'abb', 'last-letter': 'l', 'last2-letter': 'el', 'last3-letter': 'ael'}],
      dtype=object)

In [42]:
dv = DictVectorizer()
dv.fit_transform(dfX_train)

<100184x9284 sparse matrix of type '<class 'numpy.float64'>'
	with 601104 stored elements in Compressed Sparse Row format>

In [43]:
from sklearn.tree import DecisionTreeClassifier
dclf = DecisionTreeClassifier()
x_features = dv.transform(dfX_train)
dclf.fit(x_features, dfy_train)

In [48]:
sample_name_eg = ["Esha"]
transform_dv = dv.transform(features(sample_name_eg)).toarray()
dclf.predict(transform_dv)

array([0])

In [50]:
name_eg1 = ["Ibrahim"]
transform_dv_1 = dv.transform(features(name_eg1)).toarray()
vect_1 = dclf.predict(transform_dv_1)
if vect_1==0:
    print("Female")
else:
    print("Male")

Male


In [51]:
decistion_tree = open("decisiontree.pkl","wb")

In [52]:
joblib.dump(dclf, decistion_tree)

In [53]:
import pickle
decistion_tree_01 = open("name_detector_model.pkl","wb")
pickle.dump(dclf, decistion_tree_01)
decistion_tree_01.close()

In [54]:
naive_bayes =  open("naivebayes.pkl","wb")
joblib.dump(clf, naive_bayes)
naive_bayes.close()