In [1]:
import pandas as pd
import re

In [2]:
s_loc = './data/sample/'
d_sets = ['Indian-Male-Names.csv','Indian-Female-Names.csv']
d_sets

['Indian-Male-Names.csv', 'Indian-Female-Names.csv']

In [3]:
map_info = {'gender': {0:'Male',1:'Female'}, 'race' : {0:'black',1:'hispanic',2:'white',3:'indian'}}
map_info

{'gender': {0: 'Male', 1: 'Female'},
 'race': {0: 'black', 1: 'hispanic', 2: 'white', 3: 'indian'}}

In [8]:
ind_m = pd.read_csv(s_loc+d_sets[0])
ind_f = pd.read_csv(s_loc+d_sets[1])

In [87]:
ind_m.dropna(how='any',inplace=True)
ind_f.dropna(inplace=True)

In [88]:
ind = ind_m[:6800].append(ind_f)
ind.head()

Unnamed: 0,name,gender,race,length,fl_is_v,ll_is_v,first_2,last_2,first_3,last_3
0,barjraj,0,3,7,0,0,ba,aj,bar,raj
1,ramdin verma,0,3,12,0,1,ra,ma,ram,rma
2,sharat,0,3,6,0,0,sh,at,sha,rat
3,birender,0,3,8,0,0,bi,er,bir,der
4,amit,0,3,4,1,0,am,it,ami,mit


In [89]:
ind.gender.value_counts()

0    6800
1    5684
Name: gender, dtype: int64

In [90]:
len(ind)

12484

## Actual value

In [12]:
y = ind.gender
len(y)

12485

In [13]:
X = ind.name.values
type(X)

numpy.ndarray

## Test-Train split

In [23]:
from sklearn.model_selection import train_test_split

In [35]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2)

In [36]:
X_train.shape

(9988,)

In [37]:
y_test.shape

(2497,)

## Features

In [63]:
def preprocess(word):
    return re.sub(r'[^a-z.\s]+','',str(word))

def name_to_list(names):
    return list(map(lambda x: str(x),names))

In [46]:
names_v = list(map(lambda x: preprocess(x),X_train))
#names_v = list(X_train)

In [47]:
from sklearn.feature_extraction.text import CountVectorizer

In [48]:
vec = CountVectorizer(lowercase=True,token_pattern='.')

In [49]:
name_vec = vec.fit_transform(names_v)

In [50]:
name_vec

<9988x27 sparse matrix of type '<class 'numpy.int64'>'
	with 77906 stored elements in Compressed Sparse Row format>

In [51]:
print(vec.vocabulary_)

{'m': 13, 'a': 1, 'd': 4, 'h': 8, 'o': 15, 'r': 18, 'b': 2, 's': 19, 'u': 21, 'j': 10, 'e': 5, 't': 20, ' ': 0, 'i': 9, 'n': 14, 'g': 7, 'c': 3, 'v': 22, 'l': 12, 'k': 11, 'f': 6, 'y': 25, 'p': 16, 'w': 23, 'z': 26, 'q': 17, 'x': 24}


## Utilities

In [77]:
def name_prep(word):
    #te = re.sub(r'/.*|@.*','',word).strip()
    te = str(word).lower()
    te = re.sub(r'[^a-z.\s].*','',te).strip()
    te = re.sub(r'[^a-z ]+','',te).strip()
    te = re.sub(r' +',' ',te).strip()
    te = re.sub(r'along.*|with.*|and.*',' ',te).strip()
    #te = re.sub(r'\b\w$|\b\w\w$','',te).strip()
    te = re.sub(r'\b\w\b|\b\w\w\b','',te).strip()
    #te = re.sub(r'\bna\b|\bNaN\b','',te).strip()
    if(len(te) < 3):
        return None
    else:
        return te

In [78]:
def conv_input(name,vectorizer):
    pre = name
    return vectorizer.transform([pre])

In [79]:
def predict_from_name(model,name,vectorizer):
    pre = name_prep(name)
    if(pre is None):
        print("Invalid !!!")
        return
    else:
        p = model.predict(conv_input(pre,vectorizer).toarray())
        print("Name : ",name,"\n","Gender :",map_info['gender'][p[0]])

## Model

In [15]:
train_score = {}
test_score = {}

* Naive Bayes

In [66]:
model_1 = 'NavieB'

In [91]:
from sklearn.metrics import classification_report,accuracy_score,confusion_matrix,f1_score

In [32]:
from sklearn.naive_bayes import GaussianNB

In [56]:
naive_b = GaussianNB()

In [58]:
naive_b.fit(name_vec.toarray(),y_train)

GaussianNB(priors=None, var_smoothing=1e-09)

In [59]:
p = naive_b.predict(conv_input('Narendra Modi',vec).toarray())

In [60]:
map_info['gender'][p[0]]

'Male'

In [69]:
test_score[model_1] = naive_b.score(vec.transform(name_to_list(X_test)).toarray(),y_test)
test_score

{'NavieB': 0.6199439327192631}

In [70]:
train_score[model_1] = naive_b.score(name_vec.toarray(),y_train)
train_score

{'NavieB': 0.6259511413696436}

In [80]:
predict_from_name(naive_b,'Narendra Modi',vec)

Name :  Narendra Modi 
 Gender : Male


* Random Forest

In [82]:
from sklearn.ensemble import RandomForestClassifier

In [83]:
model_2 = "RandomForest"

In [84]:
rf = RandomForestClassifier()

In [85]:
rf.fit(name_vec,y_train)



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [86]:
test_score[model_2] = rf.score(vec.transform(name_to_list(X_test)).toarray(),y_test)
test_score

{'NavieB': 0.6199439327192631, 'RandomForest': 0.6936323588305967}

In [87]:
train_score[model_2] = rf.score(name_vec.toarray(),y_train)
train_score

{'NavieB': 0.6259511413696436, 'RandomForest': 0.9582498998798559}

## With custom features

In [92]:
from sklearn.feature_extraction import DictVectorizer

In [93]:
m_result = {'train_score' : {},'test_score' : {}}

### Test-Train split

In [94]:
ind.head()

Unnamed: 0,name,gender,race,length,fl_is_v,ll_is_v,first_2,last_2,first_3,last_3
0,barjraj,0,3,7,0,0,ba,aj,bar,raj
1,ramdin verma,0,3,12,0,1,ra,ma,ram,rma
2,sharat,0,3,6,0,0,sh,at,sha,rat
3,birender,0,3,8,0,0,bi,er,bir,der
4,amit,0,3,4,1,0,am,it,ami,mit


In [95]:
X = ind.iloc[:,3:]
X.columns

Index(['length', 'fl_is_v', 'll_is_v', 'first_2', 'last_2', 'first_3',
       'last_3'],
      dtype='object')

In [96]:
actual_y = ind.gender

In [97]:
X_train, X_test, ay_train, ay_test = train_test_split(X,actual_y,test_size=0.2)

In [98]:
custom_vec = DictVectorizer()

In [99]:
train_features = custom_vec.fit_transform(X_train.to_dict(orient='records')) # convert dataframe to dict like records

In [100]:
X_train.to_dict(orient='records')

[{'length': 17,
  'fl_is_v': 0,
  'll_is_v': 1,
  'first_2': 'da',
  'last_2': 'pa',
  'first_3': 'dal',
  'last_3': 'apa'},
 {'length': 13,
  'fl_is_v': 0,
  'll_is_v': 0,
  'first_2': 'si',
  'last_2': 'un',
  'first_3': 'sit',
  'last_3': 'tun'},
 {'length': 12,
  'fl_is_v': 0,
  'll_is_v': 0,
  'first_2': 'ha',
  'last_2': 'ad',
  'first_3': 'haj',
  'last_3': 'mad'},
 {'length': 8,
  'fl_is_v': 0,
  'll_is_v': 0,
  'first_2': 'ha',
  'last_2': 'er',
  'first_3': 'har',
  'last_3': 'der'},
 {'length': 11,
  'fl_is_v': 0,
  'll_is_v': 1,
  'first_2': 'ma',
  'last_2': 'ra',
  'first_3': 'man',
  'last_3': 'ura'},
 {'length': 10,
  'fl_is_v': 0,
  'll_is_v': 1,
  'first_2': 'su',
  'last_2': 'vi',
  'first_3': 'sur',
  'last_3': 'evi'},
 {'length': 11,
  'fl_is_v': 0,
  'll_is_v': 1,
  'first_2': 'ra',
  'last_2': 'gi',
  'first_3': 'ram',
  'last_3': 'ogi'},
 {'length': 6,
  'fl_is_v': 0,
  'll_is_v': 0,
  'first_2': 'sh',
  'last_2': 'az',
  'first_3': 'sha',
  'last_3': 'waz'},
 {

In [101]:
train_features

<9987x2959 sparse matrix of type '<class 'numpy.float64'>'
	with 55663 stored elements in Compressed Sparse Row format>

* Naive Bayes Classifier

In [102]:
from sklearn.naive_bayes import MultinomialNB

In [108]:
nb_model = MultinomialNB()

In [109]:
len(ay_train)

9987

In [113]:
nb_model.fit(train_features,ay_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [114]:
nb_model.score(train_features,ay_train)

0.8426955041554021

In [115]:
nb_model.score(custom_vec.transform(X_test.to_dict(orient='records')),ay_test)

0.7997597116539847

In [122]:
f1_score(ay_test,nb_model.predict(custom_vec.transform(X_test.to_dict(orient='records'))))

0.7795414462081128

In [126]:
confusion_matrix(ay_test,nb_model.predict(custom_vec.transform(X_test.to_dict(orient='records'))))

array([[1113,  250],
       [ 250,  884]])

In [116]:
from sklearn.tree import DecisionTreeClassifier
 
dclf = DecisionTreeClassifier()

In [117]:
dclf.fit(train_features,ay_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [118]:
dclf.score(train_features,ay_train)

0.9778712326023831

In [119]:
dclf.score(custom_vec.transform(X_test.to_dict(orient='records')),ay_test)

0.7741289547456949

In [127]:
confusion_matrix(ay_test,dclf.predict(custom_vec.transform(X_test.to_dict(orient='records'))))

array([[1115,  248],
       [ 316,  818]])