### Naive Bayes classifier

In [71]:
import pathlib 

import numpy as np
import pandas as pd
from sklearn.metrics import f1_score
from ipywidgets import interact_manual
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, HashingVectorizer

In [81]:
# Load every per-state file under data/ into one frame.
# The frames are collected first and concatenated once: DataFrame.append was
# removed in pandas 2.0, and growing a frame inside a loop is quadratic.
column_names = ["state", "gender", "year", "name", "number"]
state_frames = [
    pd.read_csv(path, names=column_names, header=None)
    for path in pathlib.Path('data').glob('*.TXT')
]
# pd.concat raises on an empty list, so fall back to an empty frame with the
# expected columns when no files are found.
df = pd.concat(state_frames) if state_frames else pd.DataFrame(columns=column_names)
df = df.astype({'number': 'int64'})
df.head()

Unnamed: 0,state,gender,year,name,number
0,IN,F,1910,Mary,619
1,IN,F,1910,Helen,324
2,IN,F,1910,Ruth,238
3,IN,F,1910,Dorothy,215
4,IN,F,1910,Mildred,200


In [9]:
#https://www.kaggle.com/springboardroger/naive-bayes-name-gender-classifier
# Sum `number` for each (name, gender) pair, collapsing the state and year columns.
# The aggregation is given as the string 'sum': passing the builtin `sum` is
# deprecated in recent pandas and produces the same result.
names_by_gender = df.groupby(['name', 'gender'], as_index=False).agg({'number': 'sum'})
names_by_gender.head(5)

Unnamed: 0,name,gender,number
0,Aaban,M,12
1,Aadan,M,23
2,Aadarsh,M,5
3,Aaden,M,4114
4,Aadhav,M,37


In [12]:
# Reshape to one row per name with one count column per gender value.
# DataFrame.pivot takes keyword-only arguments since pandas 2.0.
names_by_gender = names_by_gender.pivot(index='name', columns='gender', values='number')
# A name that has no rows for one gender comes out as NaN there; count it as 0.
names_by_gender = names_by_gender.reset_index().fillna(0)
names_by_gender.head()

gender,name,F,M
0,Aaban,0.0,12.0
1,Aadan,0.0,23.0
2,Aadarsh,0.0,5.0
3,Aaden,0.0,4114.0
4,Aadhav,0.0,37.0


In [13]:
# Data cleaning: derive a single gender label per name from the M/F counts.
male_counts = names_by_gender["M"]
female_counts = names_by_gender["F"]

# Signed share of the M count: +1 when only M is non-zero, -1 when only F is.
names_by_gender["Mpercent"] = (male_counts - female_counts) / (male_counts + female_counts)

# Anything above the small positive threshold is labelled male, the rest female.
leans_male = names_by_gender["Mpercent"] > 0.001
names_by_gender["gender"] = np.where(leans_male, "male", "female")

names_by_gender = names_by_gender.set_index("name")
names_by_gender.head()

gender,F,M,Mpercent,gender
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Aaban,0.0,12.0,1.0,male
Aadan,0.0,23.0,1.0,male
Aadarsh,0.0,5.0,1.0,male
Aaden,0.0,4114.0,1.0,male
Aadhav,0.0,37.0,1.0,male


In [76]:
# Represent each name (the frame's index) by counts of its character bigrams.
char_vectorizer = CountVectorizer(analyzer='char', ngram_range=(2, 2))
X = char_vectorizer.fit_transform(names_by_gender.index)

#Convert this matrix to Compressed Sparse Column format
X = X.tocsc()
# Binary target: 1 where the label is 'male', 0 otherwise.
# `np.int` was removed in NumPy 1.24; the builtin `int` is what it aliased.
Y = (names_by_gender["gender"] == 'male').values.astype(int)

<1x568 sparse matrix of type '<class 'numpy.int64'>'
	with 4 stored elements in Compressed Sparse Column format>

In [77]:
# Sanity check: list the bigrams stored for the first row of the feature matrix.
first_row = X[0]
char_vectorizer.inverse_transform(first_row)

[array(['aa', 'ab', 'an', 'ba'], dtype='<U2')]

In [78]:
#TODO: use k-fold cross validation instead
#https://stackoverflow.com/questions/16123572/k-fold-cross-validation-for-naive-bayes-classifier
# 70/30 hold-out split. The seed is fixed so the split, and the accuracies
# computed from it, are the same on every Restart & Run All.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, train_size=0.7, random_state=42)

In [79]:
# Fit a multinomial naive Bayes model (alpha=1) on the training split,
# then report mean accuracy on the training and held-out splits.
clf = MultinomialNB(alpha=1).fit(X_train, Y_train)

training_accuracy, test_accuracy = (
    clf.score(X_train, Y_train),
    clf.score(X_test, Y_test),
)

print(training_accuracy, test_accuracy, sep="\n")

0.733496111412552
0.7280303829517881


In [80]:
@interact_manual
def predict(x="Roger"):
    """Print the gender the fitted classifier predicts for the name ``x``.

    The name is encoded with the already-fitted ``char_vectorizer`` and
    classified with ``clf``; a predicted label of 1 is reported as male,
    anything else as female.
    """
    features = char_vectorizer.transform([x])
    predicted_label = clf.predict(features)[0]
    message = (
        "This is most likely a male name!"
        if predicted_label == 1
        else "This is most likely a female name!"
    )
    print(message)

interactive(children=(Text(value='Roger', description='x'), Button(description='Run Interact', style=ButtonSty…

### Char-RNN

In [29]:
#https://github.com/prdeepakbabu/Python/blob/master/Deep%20learning%20gender/Deep%20Learning%20(RNN%20-%20LSTMs)%20Predict%20Gender%20from%20Name.ipynb

In [None]:
#https://towardsdatascience.com/deep-learning-gender-from-name-lstm-recurrent-neural-networks-448d64553044

In [82]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset

In [None]:
class Names(Dataset):
    """Dataset wrapper for the name data used by the char-RNN (work in progress).

    NOTE(review): the original cell ended right after the ``__init__``
    signature, which is a syntax error. Only storing the argument is
    implemented here; ``__len__`` and ``__getitem__`` (and any character
    encoding) still have to be written before this can feed a DataLoader.
    """

    def __init__(self, name):
        """Keep a reference to the raw ``name`` input.

        Args:
            name: The name data to wrap. Presumably a collection of name
                strings -- TODO confirm once the class is completed.
        """
        self.name = name

In [None]:
class GenderRNN(nn.Module):
    """Two stacked LSTM layers with dropout, followed by a linear classifier.

    The layers are held as separate attributes rather than in ``nn.Sequential``
    because ``nn.LSTM`` returns a ``(output, (h_n, c_n))`` tuple, which
    ``nn.Sequential`` cannot pass on to the next layer.

    Args:
        in_dim: Size of each input feature vector (per time step).
        out_dim: Number of output scores produced per sequence.
        hidden_dim: Hidden size of both LSTM layers. Defaults to 128, the
            width the original draft fed into its final linear layer.
        dropout: Dropout probability applied after each LSTM. Defaults to 0.2.
    """

    def __init__(self, in_dim, out_dim, hidden_dim=128, dropout=0.2):
        # Must run before any sub-module is assigned to ``self``.
        super().__init__()
        self.lstm1 = nn.LSTM(in_dim, hidden_dim, batch_first=True)
        self.drop1 = nn.Dropout(dropout)
        self.lstm2 = nn.LSTM(hidden_dim, hidden_dim, batch_first=True)
        self.drop2 = nn.Dropout(dropout)
        self.fc = nn.Linear(hidden_dim, out_dim)

    def forward(self, x):
        """Return unnormalized scores of shape ``(batch, out_dim)``.

        Args:
            x: Float tensor of shape ``(batch, seq_len, in_dim)``.

        NOTE(review): the final time step is used as the sequence summary,
        which assumes sequences are not right-padded -- confirm against the
        data pipeline once it exists.
        """
        seq_out, _ = self.lstm1(x)
        seq_out = self.drop1(seq_out)
        seq_out, _ = self.lstm2(seq_out)
        # Keep only the last time step of the second LSTM's output.
        last_step = self.drop2(seq_out[:, -1, :])
        return self.fc(last_step)
        