<a href="https://colab.research.google.com/github/roannarum/Data-Practice/blob/main/Gender_Classification_of_Names_With_Machine_Learning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Klasifikasi Jenis Kelamin Berdasarkan Nama
### Menggunakan Machine Learning untuk memprediksi jenis kelamin berdasarkan nama
+ Sklearn
+ Pandas
+ Text Extraction

In [None]:
# EDA packages
import pandas as pd
import numpy as np


In [None]:
# ML Packages
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction import DictVectorizer
#from sklearn.feature_extraction.text import TfidfVectorizer


In [None]:
# Load data
df = pd.read_csv('names_dataset.csv')

In [None]:
df.head()

Unnamed: 0,index,name,sex
0,0,Mary,F
1,1,Anna,F
2,2,Emma,F
3,3,Elizabeth,F
4,4,Minnie,F


In [None]:
df.size

285075

In [None]:
# Data Cleaning
# Checking for column name consistency
df.columns

Index(['index', 'name', 'sex'], dtype='object')

In [None]:
# Data Types
df.dtypes

index     int64
name     object
sex      object
dtype: object

In [None]:
# Checking for Missing Values
df.isnull().isnull().sum()

index    0
name     0
sex      0
dtype: int64

In [None]:
# Number of Female Names
df[df.sex == 'F'].size

181800

In [None]:
# Number of Male Names
df[df.sex == 'M'].size

103275

In [None]:
df_names = df

In [None]:
# Replacing All F and M with 0 and 1 respectively
df_names.sex.replace({'F':0,'M':1},inplace=True)

In [None]:
df_names.sex.unique()

array([0, 1])

In [None]:
df_names.dtypes

index     int64
name     object
sex       int64
dtype: object

In [None]:
Xfeatures =df_names['name']

In [None]:
# Feature Extraction
cv = CountVectorizer()
X = cv.fit_transform(Xfeatures)

In [None]:
# Save Vectorizer
from sklearn.externals import joblib

In [None]:
gender_vectorizer = open("gender_vectorizer.pkl","wb")
joblib.dump(cv,gender_vectorizer)

In [None]:
gender_vectorizer.close()

In [None]:
cv.get_feature_names()

['aaban',
 'aabha',
 'aabid',
 'aabriella',
 'aada',
 'aadam',
 'aadan',
 'aadarsh',
 'aaden',
 'aadesh',
 'aadhav',
 'aadhavan',
 'aadhi',
 'aadhira',
 'aadhvik',
 'aadhya',
 'aadhyan',
 'aadi',
 'aadian',
 'aadil',
 'aadin',
 'aadish',
 'aadison',
 'aadit',
 'aadith',
 'aadithya',
 'aaditri',
 'aaditya',
 'aadiv',
 'aadon',
 'aadrian',
 'aadrika',
 'aadrit',
 'aadvik',
 'aadvika',
 'aadya',
 'aadyn',
 'aafia',
 'aafreen',
 'aagam',
 'aage',
 'aagot',
 'aahaan',
 'aahan',
 'aahana',
 'aahil',
 'aahir',
 'aahliyah',
 'aahna',
 'aahron',
 'aaidan',
 'aaiden',
 'aaidyn',
 'aaila',
 'aailiyah',
 'aailyah',
 'aaima',
 'aaira',
 'aairah',
 'aaisha',
 'aaishah',
 'aaiyana',
 'aaiza',
 'aaja',
 'aajah',
 'aajaylah',
 'aajon',
 'aakanksha',
 'aakarsh',
 'aakash',
 'aakeem',
 'aakilah',
 'aakira',
 'aakiyah',
 'aakriti',
 'aala',
 'aalaiya',
 'aalaiyah',
 'aalana',
 'aalanah',
 'aalani',
 'aalap',
 'aalaya',
 'aalayah',
 'aalayiah',
 'aalayjah',
 'aalayna',
 'aalaysha',
 'aalaysia',
 'aalea',
 

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Features
X
# Labels
y = df_names.sex

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [None]:
# Naive Bayes Classifier
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB()
clf.fit(X_train,y_train)
clf.score(X_test,y_test)


0.6398163206734908

In [None]:
# Accuracy of our Model
print("Accuracy of Model",clf.score(X_test,y_test)*100,"%")

Accuracy of Model 63.98163206734908 %


In [None]:
# Accuracy of our Model
print("Accuracy of Model",clf.score(X_train,y_train)*100,"%")

Accuracy of Model 100.0 %


### Sample Prediction

In [None]:
# Sample1 Prediction
sample_name = ["Mary"]
vect = cv.transform(sample_name).toarray()

In [None]:
vect

array([[0, 0, 0, ..., 0, 0, 0]])

In [None]:
# Female is 0, Male is 1
clf.predict(vect)

array([0])

In [None]:
# Sample2 Prediction
sample_name1 = ["Mark"]
vect1 = cv.transform(sample_name1).toarray()

In [None]:
clf.predict(vect1)

array([1])

In [None]:
# Sample3 Prediction of Russian Names
sample_name2 = ["Natasha"]
vect2 = cv.transform(sample_name2).toarray()

In [None]:
clf.predict(vect2)

array([0])

In [None]:
# Sample3 Prediction of Random Names
sample_name3 = ["Nefertiti","Nasha","Ama","Ayo","Xhavier","Ovetta","Tathiana","Xia","Joseph","Xianliang"]
vect3 = cv.transform(sample_name3).toarray()

In [None]:
clf.predict(vect3)

array([0, 0, 0, 0, 1, 0, 0, 0, 0, 0])

In [None]:
# A function to do it
def genderpredictor(a):
    test_name = [a]
    vector = cv.transform(test_name).toarray()
    if clf.predict(vector) == 0:
        print("Female")
    else:
        print("Male")


In [None]:
genderpredictor("Martha")

Female


- Features fxn
- apply the fxn
- vectorizer
- fit
- transform
- classifier
- fit
- predict


In [None]:
namelist = ["Yaa","Yaw","Femi","Masha"]
for i in namelist:
    print(genderpredictor(i))

Female
None
Male
None
Female
None
Female
None


### Saving Our Model

##### Save Multinomial NB Model

In [None]:
NaiveBayesModel = open("naivebayesgendermodel.pkl","wb")

In [None]:
joblib.dump(clf,NaiveBayesModel)

In [None]:
NaiveBayesModel.close()