In [94]:
import pandas as pd
import numpy as np
import math
import operator

## Predict Gender from Name:
The Wikidata page linked from each record (the Person ID field) exposes many attributes, but the question asks to predict gender from the name only, so I did not fetch any other attributes from the page. The approach is simple: derive a few features from the name, for example the last character of the first name as one feature and the length of the first name as another. Since the number of features is very small, I used the K-NN algorithm, which performs decently with few features, although it is very memory intensive.

From the person ID link, one can capture more features like Description, Occupation, Country Of Origin etc.


In [95]:
# Load the names spreadsheet into a pandas DataFrame.
# The path is relative, so 'all_names.xlsx' must be in the notebook's working directory.
data_file = pd.read_excel('all_names.xlsx')

In [96]:
# Report the (rows, columns) of the loaded frame as a quick sanity check.
print(data_file.shape)

(99999, 4)


In [97]:
# Preview the first five rows to see the raw columns.
data_file.head(5)

Unnamed: 0,Person ID,Person Name,Gender,Train/Test
0,http://www.wikidata.org/entity/Q117915,-minu,Male,Test
1,http://www.wikidata.org/entity/Q11515,(.)p(...)nin,Male,Train
2,http://www.wikidata.org/entity/Q23923,12th Planet,Male,Train
3,http://www.wikidata.org/entity/Q129668,2 Chainz,Male,Train
4,http://www.wikidata.org/entity/Q6060,50 Cent,Male,Train


In [98]:
# Encode the target column as integers: 'Male' -> 0, 'Female' -> 1.
data_file.loc[data_file['Gender'] == 'Male', 'Gender'] = 0
data_file.loc[data_file['Gender'] == 'Female', 'Gender'] = 1

In [99]:
# Inspect five random rows to confirm that Gender now holds the 0/1 codes.
data_file.sample(5)


Unnamed: 0,Person ID,Person Name,Gender,Train/Test
1942,http://www.wikidata.org/entity/Q106481,Alan Rickman,0,Test
40806,http://www.wikidata.org/entity/Q152835,Hans von Aachen,0,Train
7465,http://www.wikidata.org/entity/Q562622,Anna Månsdotter,1,Train
35044,http://www.wikidata.org/entity/Q87857,Fritz Grünbaum,0,Train
47317,http://www.wikidata.org/entity/Q467838,Ivy Matsepe-Casaburri,1,Train


### feature engineering
Split the person name into first name, middle name and the remaining part (last name). The first name matters the most in identifying gender.
In other words, there is a correlation between the first name and the target variable, so the first name is critical. I used the last character of the first name as one feature and the length of the first name as another.

In [100]:
def get_first_name(name):
    """Return the first whitespace-separated token of ``name``, lower-cased."""
    tokens = name.lower().split()
    return tokens[0]

In [101]:
# Quick check of get_first_name: dotted initials without spaces stay together as one token.
name = "A.P.J. Abdul Kalam"
print(get_first_name(name))

a.p.j.


In [102]:
def get_middle_name(name):
    """
    Return the second whitespace-separated token of ``name``, lower-cased.

    Only names with more than two tokens are treated as having a middle
    name; for one- or two-token names ``None`` is returned.
    """
    tokens = name.lower().split()
    return tokens[1] if len(tokens) > 2 else None

In [103]:
# Quick check of get_middle_name: with more than two tokens the second one is returned.
name = "Mistry von taylor jr."
print(get_middle_name(name))

von


In [104]:
def get_last_name(name):
    """
    Return the last whitespace-separated token of ``name``, lower-cased.

    For a single-token name this is the same token get_first_name returns.
    """
    tokens = name.lower().split()
    return tokens[-1]

In [105]:
def get_length(name):
    """Return the number of characters in ``name`` (thin wrapper around ``len``)."""
    return len(name)

In [106]:
# Derive the lower-cased first token of every person name.
data_file['First Name'] = data_file['Person Name'].apply(get_first_name)

In [107]:
# Derive the middle token (None when the name has at most two tokens).
data_file['Middle Name'] = data_file['Person Name'].apply(get_middle_name)

In [108]:
# Derive the lower-cased last token of every person name.
data_file['Last Name'] = data_file['Person Name'].apply(get_last_name)

In [109]:
# Feature: number of characters in the first name.
data_file['length'] = data_file['First Name'].apply(get_length)

In [110]:
# Spot-check the derived name columns on five random rows.
data_file.sample(5)

Unnamed: 0,Person ID,Person Name,Gender,Train/Test,First Name,Middle Name,Last Name,length
85624,http://www.wikidata.org/entity/Q116682,Samim,0,Train,samim,,samim,5
39706,http://www.wikidata.org/entity/Q60692,Hamza Kastrioti,0,Train,hamza,,kastrioti,5
81803,http://www.wikidata.org/entity/Q292879,Regan Lauscher,1,Test,regan,,lauscher,5
3229,http://www.wikidata.org/entity/Q86743,Alfred Blaschko,0,Train,alfred,,blaschko,6
26420,http://www.wikidata.org/entity/Q90446,Edwin Hartmann,0,Test,edwin,,hartmann,5


In [111]:
# Encode the last character of a word as its alphabet position: 'a' -> 1 ... 'z' -> 26.
# Characters outside a-z fall outside that range (e.g. '.' -> -50, '0' -> -48).
def create_gender_feature(word):
    """Return ``ord`` of the last character of ``word``, offset so that 'a' maps to 1."""
    final_char = word[-1]
    return ord(final_char) - ord('a') + 1

In [112]:
# Feature: numeric code of the first name's last character.
data_file['Last char'] = data_file['First Name'].apply(create_gender_feature)

In [113]:
# Spot-check the new 'Last char' column on five random rows.
data_file.sample(5)

Unnamed: 0,Person ID,Person Name,Gender,Train/Test,First Name,Middle Name,Last Name,length,Last char
39497,http://www.wikidata.org/entity/Q166169,Gwendolyn Osborne,1,Train,gwendolyn,,osborne,9,14
49624,http://www.wikidata.org/entity/Q9086,Jeremy Wariner,0,Train,jeremy,,wariner,6,25
39801,http://www.wikidata.org/entity/Q446300,Hanna Kvanmo,1,Train,hanna,,kvanmo,5,1
11014,http://www.wikidata.org/entity/Q275724,Audie England,1,Train,audie,,england,5,5
2976,http://www.wikidata.org/entity/Q104586,Alexandra Bujdoso,1,Train,alexandra,,bujdoso,9,1


In [114]:
# Partition the rows using the dataset's own 'Train/Test' flag.
training_samples = data_file[data_file['Train/Test'] == 'Train']
testing_samples = data_file[data_file['Train/Test'] == 'Test']

In [115]:
# Preview the first five training rows.
training_samples.head(5)

Unnamed: 0,Person ID,Person Name,Gender,Train/Test,First Name,Middle Name,Last Name,length,Last char
1,http://www.wikidata.org/entity/Q11515,(.)p(...)nin,0,Train,(.)p(...)nin,,(.)p(...)nin,12,14
2,http://www.wikidata.org/entity/Q23923,12th Planet,0,Train,12th,,planet,4,8
3,http://www.wikidata.org/entity/Q129668,2 Chainz,0,Train,2,,chainz,1,-46
4,http://www.wikidata.org/entity/Q6060,50 Cent,0,Train,50,,cent,2,-48
5,http://www.wikidata.org/entity/Q703112,A-Lin,1,Train,a-lin,,a-lin,5,14


In [116]:
# Preview the first five test rows.
testing_samples.head(5)

Unnamed: 0,Person ID,Person Name,Gender,Train/Test,First Name,Middle Name,Last Name,length,Last char
0,http://www.wikidata.org/entity/Q117915,-minu,0,Test,-minu,,-minu,5,21
6,http://www.wikidata.org/entity/Q713461,A-Mei,1,Test,a-mei,,a-mei,5,9
10,http://www.wikidata.org/entity/Q122001,A. E. Johann,0,Test,a.,e.,johann,2,-50
12,http://www.wikidata.org/entity/Q55771,A. H. Salunkhe,0,Test,a.,h.,salunkhe,2,-50
13,http://www.wikidata.org/entity/Q18219,A. J. Buckley,0,Test,a.,j.,buckley,2,-50


In [117]:
# Sizes of the two partitions as (rows, columns).
print(training_samples.shape)
print(testing_samples.shape)

(79992, 9)
(20007, 9)


In [118]:
# Training inputs as plain Python lists: [last-char code, first-name length] per row,
# plus the matching gender labels.
selected_features = training_samples.loc[:, ['Last char', 'length']]
training_features = selected_features.to_numpy().tolist()
training_labels = training_samples['Gender'].tolist()

In [119]:
# Preview the two-column feature frame used for training.
selected_features.head(6)

Unnamed: 0,Last char,length
1,14,12
2,8,4
3,-46,1
4,-48,2
5,14,5
7,-50,2


In [120]:
# Test inputs built the same way as the training ones: feature pairs and gender labels.
test_features = testing_samples.loc[:, ['Last char', 'length']]
testing_features = test_features.to_numpy().tolist()
testing_labels = testing_samples['Gender'].tolist()

In [121]:
# we use the feature extractor to process the names data.
#features = []
#test_features = []
#for name in training_features:
#    features.append([name])
#for feature in testing_features:
#    test_features.append([feature])    

In [122]:
#convert python array to ndarray
#features = np.array(features)
#labels = np.array(training_labels)
#test_features = np.array(test_features)
#testing_labels = np.array(testing_labels)

In [123]:
# Sanity-check the list conversion: first ten [last-char, length] pairs and the label container type.
print(training_features[:10])
print(type(training_labels))

[[14, 12], [8, 4], [-46, 1], [-48, 2], [14, 5], [-50, 2], [-50, 2], [-50, 2], [-50, 2], [-50, 2]]
<class 'list'>


In [124]:
def euclidean_distance(training_instance, test_instance):
    """
    Compute the Euclidean distance between two feature vectors.

    Parameters:
        training_instance: numeric feature list of a training sample; its
            length determines how many coordinates are compared
        test_instance: numeric feature list of the test sample
    Returns:
        the distance as a float
    """
    squared_diffs = (
        pow(test_instance[i] - training_instance[i], 2)
        for i in range(len(training_instance))
    )
    return math.sqrt(sum(squared_diffs))

In [125]:
# Sanity check for euclidean_distance: these two points are sqrt(2) apart.
data1=[2,1]
data2=[1,2]
euclidean_distance(data1,data2)

1.4142135623730951

In [126]:
def get_neighbours(training_data, training_label, test_instance, k=3):
    """
    Find the k training samples closest to a test instance.

    Parameters:
           training_data: list of numeric feature lists, one per training sample
           training_label: labels aligned index-by-index with training_data
           test_instance: numeric feature list of the sample to classify
           k: number of nearest neighbours to return (default 3)
    Returns:
          list of at most k rows ordered by increasing distance; each row is
          the training features followed by the label and the distance,
          i.e. [*features, label, distance]
    """

    distances = []
    for index, features in enumerate(training_data):
        distance = euclidean_distance(features, test_instance)
        # Build a new row so the caller's training data is never mutated.
        row = list(features)
        row.extend((training_label[index], distance))
        distances.append(row)

    # The distance is always the LAST element of a row. Sorting on a fixed
    # index (3) only worked for exactly two features and would sort by label
    # (or fail) for any other feature count.
    distances.sort(key=operator.itemgetter(-1), reverse=False)
    return distances[:k]

In [127]:
def predict(neighbours):
    """
    Return the majority label among the given neighbour rows.

    Each row is expected to carry its label as the second-to-last element
    (the layout produced by get_neighbours). When several labels share the
    highest count, the one encountered first in ``neighbours`` wins.
    """
    votes = {}
    for row in neighbours:
        label = row[-2]
        votes[label] = votes.get(label, 0) + 1
    ranked = sorted(votes.items(), key=lambda item: item[1], reverse=True)
    return ranked[0][0]

In [128]:
#FOR TESTING the above function
#X_train = [[10,10,10], [2,2,2], [40,40,40]]
#X_test = [3,3,3]
#y_train = ['a', 'b','c']
#y_test  = ['b']

#neighbors = get_neighbours(X_train, y_train, X_test, 2)
#print(type(X))
#print(neighbors)

#predicted_value = predict(neighbors)
#print(predicted_value)
#print(X_train )

### How to apply KNN
 1. convert numpy ndarray to python list
 2. split into training and test sets ( not required here )
 3. Measure accuracy

In [152]:
#(number of persons whose gender was correctly predicted) / (number of persons)
def accuracy_score(true_labels, predicted_labels):
    """
    Return the fraction of positions where the prediction equals the true label.

    Parameters:
        true_labels: sequence of ground-truth labels
        predicted_labels: sequence of predicted labels, same length as true_labels
    Returns:
        correct / total as a float between 0 and 1
    Raises:
        ValueError: if the two sequences differ in length or are empty
    """
    total = len(true_labels)
    # A length mismatch used to be silently ignored when the prediction list
    # was longer, which would report an accuracy over only part of the data.
    if total != len(predicted_labels):
        raise ValueError(
            "true_labels and predicted_labels must have the same length "
            "(got %d and %d)" % (total, len(predicted_labels))
        )
    if total == 0:
        raise ValueError("accuracy is undefined for empty label sequences")

    correct = sum(
        1 for expected, predicted in zip(true_labels, predicted_labels)
        if expected == predicted
    )
    return correct / total


In [156]:
# Sanity check for accuracy_score: 2 of the 4 positions agree, so 0.5 is expected.
a=[1,0,1,1]
b=[1,1,1,0]
print(accuracy_score(a, b))

0.5


In [129]:
# Classify every test sample with the hand-written K-NN (k=10), then score the predictions.
y_pred = []
for test_instance in testing_features:
    neighbors = get_neighbours(training_features, training_labels, test_instance, 10)
    y_pred.append(predict(neighbors))
accuracy_score(testing_labels, y_pred)

NameError: name 'accuracy_score' is not defined

In [157]:
# Print the accuracy of the predictions currently held in y_pred.
print(accuracy_score(testing_labels, y_pred))

0.7603838656470235


### compare results with actual SKLearn package

In [158]:
# Baseline: scikit-learn's K-NN classifier on the same two features.
# Import the k-nearest-neighbours classifier
from sklearn.neighbors import KNeighborsClassifier

# Create the K-NN classifier.
# NOTE(review): n_neighbors=5 here, while the hand-written loop passes k=10 to
# get_neighbours, so the two accuracies are not computed with the same k - confirm intent.
knn = KNeighborsClassifier(n_neighbors=5) # open question: how should k be chosen?

# Fit the model on the training features and labels
knn.fit(training_features, training_labels)

# Predict the test set; this rebinds y_pred, replacing the hand-written K-NN predictions
y_pred = knn.predict(testing_features)
# Scored with the accuracy_score function defined in this notebook
accuracy_score(testing_labels, y_pred)

0.7901734392962463