In [64]:
import pandas as pd
import numpy as np
from sklearn.model_selection import LeaveOneOut
from sklearn.linear_model import LogisticRegression
import operator
import matplotlib.pyplot as plt
from openpyxl import load_workbook
from string import ascii_lowercase

In [132]:
df = pd.read_excel("MergedData.xlsx")
df = df.replace({'Male':0}, regex=True)
df = df.replace({'Female':1}, regex=True)
df = df.dropna()

In [117]:
dfFeatures = df

# A ending?
dfFeatures['A ending?'] = 0
dfFeatures.loc[(dfFeatures['Name'].str.endswith('a', na=False)), 'A ending?'] = 1

# Plosive ending? (p,b,t,d,k,g)
dfFeatures["Plosive ending? (p,b,t,d,k,g)"] = 0
dfFeatures.loc[(dfFeatures['Name'].str.endswith('p', na=False)), "Plosive ending? (p,b,t,d,k,g)"] = 1
dfFeatures.loc[(dfFeatures['Name'].str.endswith('b', na=False)), "Plosive ending? (p,b,t,d,k,g)"] = 1
dfFeatures.loc[(dfFeatures['Name'].str.endswith('t', na=False)), "Plosive ending? (p,b,t,d,k,g)"] = 1
dfFeatures.loc[(dfFeatures['Name'].str.endswith('d', na=False)), "Plosive ending? (p,b,t,d,k,g)"] = 1
dfFeatures.loc[(dfFeatures['Name'].str.endswith('k', na=False)), "Plosive ending? (p,b,t,d,k,g)"] = 1
dfFeatures.loc[(dfFeatures['Name'].str.endswith('g', na=False)), "Plosive ending? (p,b,t,d,k,g)"] = 1

# Sonorant ending? (m,n,ng,l,r)
dfFeatures["Sonorant ending? (m,n,ng,l,r)"] = 0
dfFeatures.loc[(dfFeatures['Name'].str.endswith('m', na=False)), "Sonorant ending? (m,n,ng,l,r)"] = 1
dfFeatures.loc[(dfFeatures['Name'].str.endswith('n', na=False)), "Sonorant ending? (m,n,ng,l,r)"] = 1
dfFeatures.loc[(dfFeatures['Name'].str.endswith('ng', na=False)), "Sonorant ending? (m,n,ng,l,r)"] = 1
dfFeatures.loc[(dfFeatures['Name'].str.endswith('l', na=False)), "Sonorant ending? (m,n,ng,l,r)"] = 1
dfFeatures.loc[(dfFeatures['Name'].str.endswith('r', na=False)), "Sonorant ending? (m,n,ng,l,r)"] = 1

# Fricative ending? (f,v,th,s,z,sh,ch,dge)
dfFeatures["Fricative ending? (f,v,th,s,z,sh,ch,dge)"] = 0
dfFeatures.loc[(dfFeatures['Name'].str.endswith('f', na=False)), "Fricative ending? (f,v,th,s,z,sh,ch,dge)"] = 1
dfFeatures.loc[(dfFeatures['Name'].str.endswith('v', na=False)), "Fricative ending? (f,v,th,s,z,sh,ch,dge)"] = 1
dfFeatures.loc[(dfFeatures['Name'].str.endswith('th', na=False)), "Fricative ending? (f,v,th,s,z,sh,ch,dge)"] = 1
dfFeatures.loc[(dfFeatures['Name'].str.endswith('s', na=False)), "Fricative ending? (f,v,th,s,z,sh,ch,dge)"] = 1
dfFeatures.loc[(dfFeatures['Name'].str.endswith('z', na=False)), "Fricative ending? (f,v,th,s,z,sh,ch,dge)"] = 1
dfFeatures.loc[(dfFeatures['Name'].str.endswith('sh', na=False)), "Fricative ending? (f,v,th,s,z,sh,ch,dge)"] = 1
dfFeatures.loc[(dfFeatures['Name'].str.endswith('dge', na=False)), "Fricative ending? (f,v,th,s,z,sh,ch,dge)"] = 1
dfFeatures.loc[(dfFeatures['Name'].str.endswith('ch', na=False)), "Fricative ending? (f,v,th,s,z,sh,ch,dge)"] = 1

# 1st letter fricative? (f,v,th,s,z,sh,ch,j)
dfFeatures["1st letter fricative? (f,v,th,s,z,sh,ch,j)"] = 0
dfFeatures.loc[(dfFeatures['Name'].str.startswith('f', na=False)), "1st letter fricative? (f,v,th,s,z,sh,ch,j)"] = 1
dfFeatures.loc[(dfFeatures['Name'].str.startswith('v', na=False)), "1st letter fricative? (f,v,th,s,z,sh,ch,j)"] = 1
dfFeatures.loc[(dfFeatures['Name'].str.startswith('th', na=False)), "1st letter fricative? (f,v,th,s,z,sh,ch,j)"] = 1
dfFeatures.loc[(dfFeatures['Name'].str.startswith('s', na=False)), "1st letter fricative? (f,v,th,s,z,sh,ch,j)"] = 1
dfFeatures.loc[(dfFeatures['Name'].str.startswith('z', na=False)), "1st letter fricative? (f,v,th,s,z,sh,ch,j)"] = 1
dfFeatures.loc[(dfFeatures['Name'].str.startswith('sh', na=False)), "1st letter fricative? (f,v,th,s,z,sh,ch,j)"] = 1
dfFeatures.loc[(dfFeatures['Name'].str.startswith('ch', na=False)), "1st letter fricative? (f,v,th,s,z,sh,ch,j)"] = 1
dfFeatures.loc[(dfFeatures['Name'].str.startswith('j', na=False)), "1st letter fricative? (f,v,th,s,z,sh,ch,j)"] = 1

# Total # of sonorants (m,n,l,r,w,y)
dfFeatures["Total # of sonorants (m,n,l,r,w,y)"] = dfFeatures.Name.str.count("[mnlrwy]")

# Total # of vowels (a, e, i, o, u)
dfFeatures["Total # of vowels (a, e, i, o, u)"] = dfFeatures.Name.str.count("[aeiou]")

# 1st letter plosive? (p,b,t,d,k,g)
dfFeatures["1st letter plosive? (p,b,t,d,k,g)"] = 0
dfFeatures.loc[(dfFeatures['Name'].str.startswith('p', na=False)), "1st letter plosive? (p,b,t,d,k,g)"] = 1
dfFeatures.loc[(dfFeatures['Name'].str.startswith('b', na=False)), "1st letter plosive? (p,b,t,d,k,g)"] = 1
dfFeatures.loc[(dfFeatures['Name'].str.startswith('t', na=False)), "1st letter plosive? (p,b,t,d,k,g)"] = 1
dfFeatures.loc[(dfFeatures['Name'].str.startswith('d', na=False)), "1st letter plosive? (p,b,t,d,k,g)"] = 1
dfFeatures.loc[(dfFeatures['Name'].str.startswith('k', na=False)), "1st letter plosive? (p,b,t,d,k,g)"] = 1
dfFeatures.loc[(dfFeatures['Name'].str.startswith('g', na=False)), "1st letter plosive? (p,b,t,d,k,g)"] = 1

# Total # of plosives  (p,b,t,d,k,g)
dfFeatures["Total # of plosives  (p,b,t,d,k,g)"] = dfFeatures.Name.str.count("[pbtdkg]")

# VowelEnding? (a, e, i, o, u,y)
dfFeatures["VowelEnding? (a, e, i, o, u,y)"] = 0
dfFeatures.loc[(dfFeatures['Name'].str.endswith('a', na=False)), "VowelEnding? (a, e, i, o, u,y)"] = 1
dfFeatures.loc[(dfFeatures['Name'].str.endswith('e', na=False)), "VowelEnding? (a, e, i, o, u,y)"] = 1
dfFeatures.loc[(dfFeatures['Name'].str.endswith('i', na=False)), "VowelEnding? (a, e, i, o, u,y)"] = 1
dfFeatures.loc[(dfFeatures['Name'].str.endswith('o', na=False)), "VowelEnding? (a, e, i, o, u,y)"] = 1
dfFeatures.loc[(dfFeatures['Name'].str.endswith('u', na=False)), "VowelEnding? (a, e, i, o, u,y)"] = 1
dfFeatures.loc[(dfFeatures['Name'].str.endswith('y', na=False)), "VowelEnding? (a, e, i, o, u,y)"] = 1

# # of letters
dfFeatures["# of letters"] = dfFeatures.Name.str.len()

# Total # of fricatives (f,v,th,s,z,sh,ch,j)
dfFeatures["# of fricatives (f,v,th,s,z,sh,ch,j)"] = (dfFeatures.Name.str.count("[fvzj]") + dfFeatures.Name.str.count("(sh)|s") 
                                                      + dfFeatures.Name.str.count("(th)") + dfFeatures.Name.str.count("(ch)"))

# 1st letter vowel? (a, e, i, o, u)
dfFeatures["1st letter vowel? (a, e, i, o, u)"] = 0
dfFeatures.loc[(dfFeatures['Name'].str.startswith("a", na=False)), "1st letter vowel? (a, e, i, o, u)"] = 1
dfFeatures.loc[(dfFeatures['Name'].str.startswith("e", na=False)), "1st letter vowel? (a, e, i, o, u)"] = 1
dfFeatures.loc[(dfFeatures['Name'].str.startswith("i", na=False)), "1st letter vowel? (a, e, i, o, u)"] = 1
dfFeatures.loc[(dfFeatures['Name'].str.startswith("o", na=False)), "1st letter vowel? (a, e, i, o, u)"] = 1
dfFeatures.loc[(dfFeatures['Name'].str.startswith("u", na=False)), "1st letter vowel? (a, e, i, o, u)"] = 1

# 1st letter sonorant? (m,n,l,r,w,y)
dfFeatures["1st letter sonorant? (m,n,l,r,w,y)"] = 0
dfFeatures.loc[(dfFeatures['Name'].str.startswith("m", na=False)), "1st letter sonorant? (m,n,l,r,w,y)"] = 1
dfFeatures.loc[(dfFeatures['Name'].str.startswith("n", na=False)), "1st letter sonorant? (m,n,l,r,w,y)"] = 1
dfFeatures.loc[(dfFeatures['Name'].str.startswith("l", na=False)), "1st letter sonorant? (m,n,l,r,w,y)"] = 1
dfFeatures.loc[(dfFeatures['Name'].str.startswith("r", na=False)), "1st letter sonorant? (m,n,l,r,w,y)"] = 1
dfFeatures.loc[(dfFeatures['Name'].str.startswith("w", na=False)), "1st letter sonorant? (m,n,l,r,w,y)"] = 1
dfFeatures.loc[(dfFeatures['Name'].str.startswith("y", na=False)), "1st letter sonorant? (m,n,l,r,w,y)"] = 1

dfFeatures.dropna() #97103

Unnamed: 0,Name,Gender,dataID,A ending?,"Plosive ending? (p,b,t,d,k,g)","Sonorant ending? (m,n,ng,l,r)","Fricative ending? (f,v,th,s,z,sh,ch,dge)","1st letter fricative? (f,v,th,s,z,sh,ch,j)","Total # of sonorants (m,n,l,r,w,y)","Total # of vowels (a, e, i, o, u)","1st letter plosive? (p,b,t,d,k,g)","Total # of plosives (p,b,t,d,k,g)","VowelEnding? (a, e, i, o, u,y)",# of letters,"# of fricatives (f,v,th,s,z,sh,ch,j)","1st letter vowel? (a, e, i, o, u)","1st letter sonorant? (m,n,l,r,w,y)"
0,aaden,0,0,0,0,1,0,0,1.0,3.0,0,1.0,0,5.0,0.0,1,0
1,aadhya,1,0,1,0,0,0,0,1.0,3.0,0,1.0,1,6.0,0.0,1,0
2,aaliyah,1,0,0,0,0,0,0,2.0,4.0,0,0.0,0,7.0,0.0,1,0
3,aanya,1,0,1,0,0,0,0,2.0,3.0,0,0.0,1,5.0,0.0,1,0
4,aarav,0,0,0,0,0,1,0,1.0,3.0,0,0.0,0,5.0,1.0,1,0
5,aarush,0,0,0,0,0,1,0,1.0,3.0,0,0.0,0,6.0,1.0,1,0
6,ab,0,0,0,1,0,0,0,0.0,1.0,0,1.0,0,2.0,0.0,1,0
7,abagail,1,0,0,0,1,0,0,1.0,4.0,0,2.0,0,7.0,0.0,1,0
8,abb,0,0,0,1,0,0,0,0.0,1.0,0,2.0,0,3.0,0.0,1,0
9,abbey,1,0,0,0,0,0,0,1.0,2.0,0,2.0,1,5.0,0.0,1,0


In [138]:
# dfFeatures['Last Letter'] = -1
# dfFeatures['Last Two'] = -1
for c1 in ascii_lowercase:
    dfFeatures.loc[(dfFeatures['Name'].str.endswith(c1)), 'Last Letter'] = ascii_lowercase.index(c1)
    for c2 in ascii_lowercase:
        dfFeatures.loc[(dfFeatures['Name'].str.endswith(c1 + c2)), 
                   'Last Two'] = ascii_lowercase.index(c1) * 26 + ascii_lowercase.index(c2)

ValueError: cannot index with vector containing NA / NaN values

In [113]:
""" Self-tested analysis """
listDrop = ["Name", "dataID"]
dfTest = dfFeatures.drop(listDrop, axis=1)

# Creating logistic regression model that is trained on matrix x containing all numeric attributes and 
# vector y containing corresponding numeric gender binary
model_0= LogisticRegression(max_iter=1000, tol=1e-5, verbose=1)
x = dfTest.drop(['Gender'], axis=1).as_matrix()
y = dfTest[['Gender']].as_matrix().ravel()

model_0.fit(x, y)
x_test = x # testing training model against itself to predict corresponding y values
predictions = model_0.predict(x_test)

# Coefficients and accuracy of training model against itself
coefficients = {}
for i in range(len(dfTest.columns) - 1):
    coefficients[dfTest.columns[i + 1]] = model_0.coef_[0][i]
d = np.asarray(y) - np.asarray(predictions)
accuracy = (len(d) - np.count_nonzero(d)) / len(d)

sorted_x = sorted(coefficients.items(), key=operator.itemgetter(1), reverse=True)
print("Accuracy: " + str(accuracy))
print("\nCoefficients: \n")

sorted_x

[LibLinear]Accuracy: 0.7881712014170665

Coefficients: 



[('A ending?', 2.777374354374381),
 ('Total # of sonorants (m,n,l,r,w,y)', 0.38139787552225235),
 ('1st letter fricative? (f,v,th,s,z,sh,ch,j)', 0.23028899636219463),
 ('1st letter vowel? (a, e, i, o, u)', 0.22258818644831316),
 ('1st letter plosive? (p,b,t,d,k,g)', 0.080519768952285589),
 ('1st letter sonorant? (m,n,l,r,w,y)', 0.07640696327110845),
 ('# of letters', 0.062860609722012808),
 ('Total # of vowels (a, e, i, o, u)', 0.0093899765312054143),
 ('# of fricatives (f,v,th,s,z,sh,ch,j)', -0.033204675305796252),
 ('Total # of plosives  (p,b,t,d,k,g)', -0.16200339440445213),
 ('VowelEnding? (a, e, i, o, u,y)', -0.49016524638265885),
 ('Fricative ending? (f,v,th,s,z,sh,ch,dge)', -2.1866813520749502),
 ('Sonorant ending? (m,n,ng,l,r)', -2.2536606478464658),
 ('Plosive ending? (p,b,t,d,k,g)', -2.3079307879945472)]