# Naive Bayes

Naive Bayes is a generative algorithm, used here for binary classification: given a name, predict whether it is a boy's name or a girl's name.

## Naive Bayes Assumption : 
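All the dimensions (features) of a data point are assumed to be conditionally independent of each other given the class label, i.e. $P(X \mid Y) = \prod_i P(X_i \mid Y)$.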



In [1]:
import zlib

import numpy as np
import pandas as pd

In [2]:
def NameToFeatureVector(name,dims,chunk_count):
    """Encode a single name as a binary feature vector of length ``26 + dims``.

    The first 26 entries flag which letters (a-z, case-insensitive) occur
    among the last two characters of the name. The remaining ``dims`` entries
    are hashed substring features: for every ``i`` in ``range(chunk_count)``
    the prefix ``name[:i]``, the suffix ``name[-i:]`` and the middle slice
    ``name[i+3:i+5]`` are tagged with their kind, hashed and mapped to one of
    ``dims`` buckets.

    Args:
        name: the name to encode.
        dims: number of hash buckets for the substring features.
        chunk_count: how many prefix/suffix/middle chunks to hash.

    Returns:
        1-D ``np.int8`` array of 0/1 values with ``26 + dims`` entries.
    """
    vec = np.zeros(dims, dtype=np.int8)
    letter_vec = np.zeros(26, dtype=np.int8)

    # Slicing handles names shorter than two characters without dropping
    # any letter (a one-letter name still contributes its only letter).
    for c in name[-2:].lower():
        letter_index = ord(c) - ord('a')
        # Skip anything outside a-z (digits, hyphens, accented letters...)
        # instead of raising IndexError or wrapping around via a negative index.
        if 0 <= letter_index < 26:
            letter_vec[letter_index] = 1

    def bucket(feature_string):
        # The built-in hash() of str is salted per interpreter process, so it
        # would map the same feature to different buckets in every session.
        # CRC32 is stable, which keeps trained probabilities reusable.
        return zlib.crc32(feature_string.encode('utf-8')) % dims

    for i in range(chunk_count):
        # NOTE: for i == 0 the prefix is empty while name[-0:] is the whole
        # name, so the full name is hashed as a "suffix" feature.
        vec[bucket('prefix' + name[:i])] = 1
        vec[bucket('suffix' + name[-i:])] = 1
        vec[bucket('middle' + name[i+3:i+5])] = 1

    return np.concatenate([letter_vec, vec])

def GenerateFeatureVector(names,dims=100,chunk_count=3):
    """Build the feature matrix for a sequence of names.

    Args:
        names: sequence of names, one per output row.
        dims: number of hash buckets passed to ``NameToFeatureVector``.
        chunk_count: number of chunks passed to ``NameToFeatureVector``.

    Returns:
        ``np.int8`` array of shape ``(len(names), dims + 26)`` whose row ``k``
        is the feature vector of ``names[k]``.
    """
    feature_matrix = np.zeros((len(names), dims + 26), dtype=np.int8)
    for row, name in enumerate(names):
        feature_matrix[row, :] = NameToFeatureVector(name, dims, chunk_count)
    return feature_matrix

def PrepareDataset(filepath,no_of_examples_for_training=2000000):
    """Load the name/gender CSV and return a shuffled training sample.

    The CSV must provide a ``Name`` and a ``Gender`` column. Rows whose
    gender is ``'F'`` are labelled -1; every other row is labelled +1.

    Args:
        filepath: path of the CSV file, passed to ``pd.read_csv``.
        no_of_examples_for_training: maximum number of (randomly chosen)
            rows to return.

    Returns:
        Tuple ``(X, Y)``: the feature matrix produced by
        ``GenerateFeatureVector`` and the matching ``np.int8`` label vector,
        both in the same random order.
    """
    data = pd.read_csv(filepath)
    names = list(data['Name'])
    genders = np.array(data['Gender'])

    Y = np.ones(genders.shape[0], dtype=np.int8)
    Y[genders == 'F'] = -1

    # Draw the random sample first so that features are only computed for
    # the rows that are actually returned, not for the whole file.
    ii = np.random.permutation(len(Y))[:no_of_examples_for_training]
    X = GenerateFeatureVector([names[j] for j in ii])
    return X, Y[ii]

# Marginal Probabilities

## 1 = Boy (pos), -1 = Girl (neg)

In [3]:
def NaiveBayesPY(Y):
    """Estimate the class priors P(Y=1) and P(Y=-1) with smoothing.

    One pseudo-example of each class is added to the labels before the
    fraction of +1 labels is computed.

    Args:
        Y: 1-D array of labels, +1 (boy) or -1 (girl).

    Returns:
        Tuple ``(pos, neg)`` with the smoothed P(Y=1) and ``1 - pos``.
    """
    labels = np.asarray(Y)
    # Smoothing: one extra +1 label and one extra -1 label.
    boy_count = np.count_nonzero(labels == 1) + 1
    total_count = len(labels) + 2
    pos = boy_count / total_count
    return pos, 1 - pos

# Conditional Probability
Since Naive Bayes is a generative model, we estimate P(Xi|Y) for every dimension i and multiply these probabilities across all dimensions to obtain P(X|Y).

In [4]:
def NaiveBayesPXY(X,Y):
    """Estimate the per-feature conditionals P(X_i = 1 | Y) with smoothing.

    Each class is smoothed with two pseudo-examples: one with every feature
    set and one with no feature set. That keeps every estimate strictly
    between 0 and 1.

    Args:
        X: ``(n, d)`` array of 0/1 features.
        Y: length-``n`` array of labels, +1 or -1.

    Returns:
        Tuple ``(posprob, negprob)`` of length-``d`` float arrays holding
        P(X_i = 1 | Y = 1) and P(X_i = 1 | Y = -1).
    """
    X = np.asarray(X)
    Y = np.asarray(Y)

    def smoothed_rate(rows):
        # Adding the pseudo-counts to the sums is equivalent to appending an
        # all-ones and an all-zeros row, but avoids copying the whole
        # training matrix into a float64 array just to add a few rows.
        return (np.sum(rows, axis=0) + 1) / (rows.shape[0] + 2)

    posprob = smoothed_rate(X[Y == 1])
    negprob = smoothed_rate(X[Y == -1])

    return posprob, negprob

### Take the log of all the probabilities to avoid precision issues (underflow) caused by multiplying many small numbers

In [5]:
def LogLikelihood(posprob,negprob,X_test,Y_test):
    """Compute log P(x | y) for every row of ``X_test`` under its label.

    For a row labelled +1 the ``posprob`` table is used, otherwise
    ``negprob``. Features equal to 1 contribute ``log(p_i)`` and features
    equal to 0 contribute ``log(1 - p_i)``.

    Args:
        posprob: per-feature P(X_i = 1 | Y = 1).
        negprob: per-feature P(X_i = 1 | Y = -1).
        X_test: ``(n, d)`` array of 0/1 features.
        Y_test: length-``n`` labels selecting the table used for each row.

    Returns:
        Length-``n`` float array of log-likelihoods.
    """
    n, _ = X_test.shape
    scores = np.zeros(n)

    for idx, row in enumerate(X_test):
        # Pick the conditional table that matches the label of this row.
        probs = posprob if Y_test[idx] == 1 else negprob
        present = row == 1
        absent = row == 0
        scores[idx] = (
            np.sum(np.log(probs[present]))
            + np.sum(np.log(1 - probs[absent]))
        )
    return scores

### Predict the label for a given name

In [6]:
def NaiveBayesPredict(pos,neg,posprob,negprob,X_test):
    """Predict +1 (boy) or -1 (girl) for every row of ``X_test``.

    Each row is scored under both classes as log-likelihood plus log-prior;
    the row is labelled -1 only when the +1 score is strictly lower, so ties
    go to +1.

    Args:
        pos: prior P(Y = 1).
        neg: prior P(Y = -1).
        posprob: per-feature P(X_i = 1 | Y = 1).
        negprob: per-feature P(X_i = 1 | Y = -1).
        X_test: ``(n, d)`` array of 0/1 features.

    Returns:
        Length-``n`` float array containing 1.0 or -1.0.
    """
    n, _ = X_test.shape

    # Score every row once as if it were a boy and once as if it were a girl.
    boy_score = LogLikelihood(posprob, negprob, X_test, np.ones(n)) + np.log(pos)
    girl_score = LogLikelihood(posprob, negprob, X_test, np.full((n), -1)) + np.log(neg)

    return np.where(boy_score - girl_score < 0, -1.0, 1.0)
    

In [7]:
# Read the CSV and build the shuffled feature matrix X and label vector Y
# (labels are +1 / -1, see PrepareDataset).
print('loading data...')
X,Y = PrepareDataset('dataset/data.csv')
print('data loaded...')
print('size of data=',X.shape)

loading data...
data loaded...
size of data= (2000000, 126)


In [8]:
print('training classifier')

# Class priors P(Y).
pos,neg = NaiveBayesPY(Y)
print('boys prob:%.2f' % pos,', and girl prob:%.2f' % neg)

# Per-feature conditionals P(X_i | Y).
posprob,negprob = NaiveBayesPXY(X,Y)

# Fraction of training rows whose prediction disagrees with the label.
train_preds = NaiveBayesPredict(pos, neg, posprob, negprob, X)
error = np.mean(train_preds != Y)
print('Training error: %.2f%%' % (100 * error))

training classifier
boys prob:0.44 , and girl prob:0.56
Training error: 23.98%


In [9]:
# Interactive demo: classify names typed by the user until an empty line.
while True:
    print('Please enter a baby name>')
    yourname = input()
    if not yourname:
        break

    xtest = GenerateFeatureVector([yourname])
    pred = NaiveBayesPredict(pos, neg, posprob, negprob, xtest)

    # A positive prediction means "boy", anything else "girl".
    message = (
        "%s, I am sure you are a baby boy.\n"
        if pred[0] > 0
        else "%s, I am sure you are a baby girl.\n"
    )
    print(message % yourname)

Please enter a baby name>
john
john, I am sure you are a baby boy.

Please enter a baby name>
kira
kira, I am sure you are a baby girl.

Please enter a baby name>
chris
chris, I am sure you are a baby boy.

Please enter a baby name>
scott
scott, I am sure you are a baby girl.

Please enter a baby name>

