In [1]:
# Importing the Libraries

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
% matplotlib inline

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Import ML Packages
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

In [3]:
# Importing the Dataset

df = pd.read_csv('names_dataset.csv')

In [4]:
df.head()

Unnamed: 0,index,name,sex
0,0,Mary,F
1,1,Anna,F
2,2,Emma,F
3,3,Elizabeth,F
4,4,Minnie,F


In [5]:
# Data Types
df.dtypes

index     int64
name     object
sex      object
dtype: object

In [6]:
# Checking for Missing Values
df.isnull().isnull().sum()

index    0
name     0
sex      0
dtype: int64

In [7]:
# Number of Female Names
df[df.sex == 'F'].size

181800

In [8]:
# Number of Male Names
df[df.sex == 'M'].size

103275

In [9]:
data = df[:]

In [10]:
# Replacing All F and M with 0 and 1 respectively
data.sex.replace({'F':0,'M':1},inplace=True)

In [11]:
X = data.name
y = data.sex

In [12]:
# Feature Extraction 
cv = CountVectorizer()
X = cv.fit_transform(X)

In [13]:
# Save GenderVectorizer

from sklearn.externals import joblib
gender_vectorizer = open("gender_vectorizer.pkl","wb")
joblib.dump(cv,gender_vectorizer)
gender_vectorizer.close()

In [14]:
# Splitting the Model

X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = .3, random_state = 42)

In [15]:
# Naive Bayes Classifier

from sklearn.naive_bayes import MultinomialNB
nv_clf = MultinomialNB()
nv_clf.fit(X_train,y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [16]:
nv_clf.score(X_test,y_test)

0.6406622702399326

In [17]:
# Sample1 Prediction
sample_name = ["Mary"]
vect = cv.transform(sample_name).toarray()
# Female is 0, Male is 1
nv_clf.predict(vect)

array([0], dtype=int64)

In [18]:
# Dump the Model
gender_nv_model = open("gender_nv_model.pkl","wb")
joblib.dump(nv_clf,gender_nv_model)
gender_nv_model.close()

In [35]:
# Sample1 Prediction
sample_name = ["Ravi"]
vect = cv.transform(sample_name).toarray()
# Female is 0, Male is 1
nv_clf.predict(vect)

array([1], dtype=int64)

In [37]:
# Logistic Regression Model

from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()
lr.fit(X_train, y_train)
lr.score(X_test,y_test)

0.6406622702399326

In [39]:
# Dump the Model

logitModel = open("gender_logit_model.pkl","wb")
joblib.dump(lr,logitModel)
logitModel.close()