In [1]:
from collections import defaultdict
from math import log
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
names = pd.read_csv('../hw/2/names.csv', names=['name', 'gender'], header=None) 
X_train, X_test = train_test_split(names, test_size=0.3)

In [3]:
def train(samples):
    classes = defaultdict(lambda: 0)
    freq = defaultdict(lambda: 0)
    for feats, label in samples:
        classes[label] += 1  # count classes frequencies
        for feat in feats:
            freq[label, feat] += 1  # count features frequencies

    for label, feat in freq:  # normalize features frequencies
        freq[label, feat] /= classes[label]
    for c in classes:  # normalize classes frequencies
        classes[c] /= len(samples)
    print("classes:", len(classes))
    print("freqs:", len(freq))
    return classes, freq  # return P(C) and P(O|C)


In [4]:
def classify(classifier, feats):
    classes, prob = classifier
    print(feats)
    print(prob)
    print(classes.keys())
    print("\n")
    return min(
        classes.keys(),  # calculate argmin(-log(C|O))
        key=lambda cl: -log(classes[cl]) + sum(-log(prob.get((cl, feat), 0.001)) for feat in feats),
    )


In [5]:
def get_features(sample):
    print(sample)
    return sample[-1]  # get last letter


In [8]:
classify(classifier, get_features('Garrick'))

Garrick
k
defaultdict(<function train.<locals>.<lambda> at 0x7ff4458460e0>, {('boy', 'e'): 0.15270326042096574, ('girl', 'n'): 0.08132530120481928, ('boy', 'r'): 0.05984316962443252, ('boy', 'x'): 0.003714403631861329, ('girl', 'e'): 0.25817555938037867, ('boy', 'a'): 0.0330169211721007, ('girl', 'y'): 0.055507745266781414, ('boy', 'y'): 0.09657449442839455, ('girl', 'r'): 0.01333907056798623, ('girl', 'a'): 0.443631669535284, ('boy', 'o'): 0.05530334296326868, ('girl', 'i'): 0.04647160068846816, ('girl', 'l'): 0.025387263339070567, ('boy', 'n'): 0.23194387123400742, ('girl', 'h'): 0.032271944922547334, ('boy', 'k'): 0.01939744118860916, ('boy', 'l'): 0.0751134956665291, ('boy', 'd'): 0.06025588113908378, ('boy', 'h'): 0.02723895996698308, ('girl', 'c'): 0.0008605851979345956, ('boy', 's'): 0.06562113082955015, ('boy', 'm'): 0.018159306644655385, ('boy', 't'): 0.04746182418489476, ('boy', 'g'): 0.009492364836978951, ('girl', 't'): 0.0068846815834767644, ('girl', 'b'): 0.000430292598967

'boy'

In [9]:
X_test['guess'] = X_test.apply(lambda row: classify(classifier, get_features(str(row['name'].strip()))), axis=1)

Barb
b
defaultdict(<function train.<locals>.<lambda> at 0x7ff4458460e0>, {('boy', 'e'): 0.15270326042096574, ('girl', 'n'): 0.08132530120481928, ('boy', 'r'): 0.05984316962443252, ('boy', 'x'): 0.003714403631861329, ('girl', 'e'): 0.25817555938037867, ('boy', 'a'): 0.0330169211721007, ('girl', 'y'): 0.055507745266781414, ('boy', 'y'): 0.09657449442839455, ('girl', 'r'): 0.01333907056798623, ('girl', 'a'): 0.443631669535284, ('boy', 'o'): 0.05530334296326868, ('girl', 'i'): 0.04647160068846816, ('girl', 'l'): 0.025387263339070567, ('boy', 'n'): 0.23194387123400742, ('girl', 'h'): 0.032271944922547334, ('boy', 'k'): 0.01939744118860916, ('boy', 'l'): 0.0751134956665291, ('boy', 'd'): 0.06025588113908378, ('boy', 'h'): 0.02723895996698308, ('girl', 'c'): 0.0008605851979345956, ('boy', 's'): 0.06562113082955015, ('boy', 'm'): 0.018159306644655385, ('boy', 't'): 0.04746182418489476, ('boy', 'g'): 0.009492364836978951, ('girl', 't'): 0.0068846815834767644, ('girl', 'b'): 0.000430292598967297

In [7]:
features = [(get_features(feat), label) for feat, label in zip(X_train['name'], X_train['gender'])]
classifier = train(features)

Lute
Aspen
Christopher
Alex
Edythe
Viola
Bonny
Babyboy
Liller
Elva
Odessa
Reba
Laney
Greggory
Emilia
Gunda
Geno
Paola
Lavona
Lempi
Fredy
Margaretha
Adell
Elena
Darci
Cicero
Grayson
Maliyah
Larry
Mckinley
Vernetta
Brendan
Cordero
Janie
Earnestine
Eugene
Emelie
Maryanne
Connie
Wilfredo
Annie
Polk
Thea
Danyel
Margarete
Dick
Araceli
Trever
Levin
Zane
Janae
Shelba
Chynna
Hadassah
Dawson
Theron
Inga
Bridger
Lizeth
Elodie
Ally
Rosita
Durell
Brittny
Delcie
Elgin
Cheri
Karli
Fatima
Lupe
Javonte
Adeline
Estill
Dionte
Bradford
Iver
Nehemiah
Solomon
Keanna
Jay
Chiquita
Marceline
Joeseph
Jarad
Joella
Hughey
Candyce
Elizah
Jacquelyn
Kamren
Sonny
Zain
Audie
Harry
Sal
Merrilee
Angella
Angelic
Lawton
Melissia
Leala
Jailene
Sigmund
Mitchell
Harold
Judd
Andres
Lige
Adriana
Francisca
Raphael
Kenton
Cyril
Blaine
Harrie
Konnor
Adrianna
Mason
Jacki
Kalie
Dwain
Juli
Raheem
Ruthie
Mauro
Wesley
August
Lawyer
Jazlene
Carissa
Keenen
Ward
Hampton
Juanita
Quiana
Cilla
Mahlon
Zack
Raiden
Francesca
Monique
Kylah
Kirb

In [127]:
print('gender: ', classify(classifier, get_features('Destiney')))



y
gender:  boy


In [128]:
X_test['guess'] = X_test.apply(lambda x: x)

ValueError: Columns must be same length as key

In [129]:
classify(classifier, get_features('Destiney'))



y


'boy'

In [130]:
for row in X_test.iterrows():
    print(row)

(3526, name      Lida
gender    girl
guess      boy
Name: 3526, dtype: object)
(6273, name      Brooklyn
gender        girl
guess          boy
Name: 6273, dtype: object)
(6177, name      Yessenia
gender        girl
guess          boy
Name: 6177, dtype: object)
(2225, name      Derl
gender     boy
guess      boy
Name: 2225, dtype: object)
(6334, name      Shaniece
gender        girl
guess          boy
Name: 6334, dtype: object)
(19, name      Joe
gender    boy
guess     boy
Name: 19, dtype: object)
(5913, name      Takisha
gender       girl
guess         boy
Name: 5913, dtype: object)
(6558, name      Aryanna
gender       girl
guess         boy
Name: 6558, dtype: object)
(426, name      Arther
gender       boy
guess        boy
Name: 426, dtype: object)
(6215, name      Kierra
gender      girl
guess        boy
Name: 6215, dtype: object)
(4271, name       Ala
gender    girl
guess      boy
Name: 4271, dtype: object)
(1410, name      Marlin
gender       boy
guess        boy
Name: 1410, dtyp

In [118]:
X_test.head(20)

Unnamed: 0,name,gender,guess
3526,Lida,girl,boy
6273,Brooklyn,girl,boy
6177,Yessenia,girl,boy
2225,Derl,boy,boy
6334,Shaniece,girl,boy
19,Joe,boy,boy
5913,Takisha,girl,boy
6558,Aryanna,girl,boy
426,Arther,boy,boy
6215,Kierra,girl,boy


In [100]:
type('Milana')

str