In [1]:
import pandas as pd
import numpy as np
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
# cross_validation depracited replaced by model_selection
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

In [2]:
df_imiona = pd.read_csv("https://sebkaz.github.io/teaching/PrzetwarzanieDanych/data/polish_names.csv")

In [3]:
df_imiona.sample(3)

Unnamed: 0,name,gender
1240,Serwacy,m
117,Artus,m
1281,Spycisław,m


In [4]:
alphabet = list('aąbcćdeęfghijklłmnoópqrstuvwxyzźż')
letter_to_num = {}
cnt = 1
for l in alphabet:
    letter_to_num[l] = cnt
    cnt += 1

def name_to_dummy_vector(name: str, vect_len: int):
    result = []
    if vect_len > len(result):
        for _ in range(vect_len - len(name)):
            result.append(0)
    for l in list(name.lower()):
        if l not in letter_to_num:
            result.append(0)
        else:
            result.append(letter_to_num[l])
    
    return result


max_name_len = max(df_imiona['name'].map(str).apply(len))
df_imiona['target'] = df_imiona['gender'].map(lambda data: 1 if data == 'm' else 0)
df_imiona['name_vect'] = df_imiona['name'].map(lambda name: name_to_dummy_vector(name, max_name_len))

df_imiona.sample(10)

Unnamed: 0,name,gender,target,name_vect
341,Dezyderia,f,0,"[0, 0, 0, 0, 6, 7, 31, 30, 6, 7, 23, 12, 1]"
655,Jarogniew,m,1,"[0, 0, 0, 0, 13, 1, 23, 19, 10, 18, 12, 7, 28]"
288,Cieszymysł,m,1,"[0, 0, 0, 4, 12, 7, 24, 31, 30, 17, 30, 24, 16]"
766,Kryspin,m,1,"[0, 0, 0, 0, 0, 0, 14, 23, 30, 24, 21, 12, 18]"
1375,Tomira,f,0,"[0, 0, 0, 0, 0, 0, 0, 25, 19, 17, 12, 23, 1]"
1426,Warcisław,m,1,"[0, 0, 0, 0, 28, 1, 23, 4, 12, 24, 16, 1, 28]"
344,Diana,f,0,"[0, 0, 0, 0, 0, 0, 0, 0, 6, 12, 1, 18, 1]"
266,Cezariusz,m,1,"[0, 0, 0, 0, 4, 7, 31, 1, 23, 12, 26, 24, 31]"
866,Ludwik,m,1,"[0, 0, 0, 0, 0, 0, 0, 15, 26, 6, 28, 12, 14]"
787,Lamberta,f,0,"[0, 0, 0, 0, 0, 15, 1, 17, 3, 7, 23, 25, 1]"


In [5]:
def generate_feature_labels(n, prefix='f_'):
    result = []
    for i in range(n):
        result.append(f'{prefix}{i}')
    return result

feature_labels = generate_feature_labels(max_name_len)

expand_column = lambda x: pd.Series([i for i in x])
df_expanded_name = df_imiona['name_vect'].apply(expand_column)

In [6]:
df = df_imiona.merge(df_expanded_name, left_index=True, right_index=True)

In [7]:
lab_to_rename = {}
cnt = 0
for l in feature_labels:
    lab_to_rename[cnt] = l
    cnt += 1

df.rename(columns=lab_to_rename, inplace=True)

X = df[feature_labels]
y = df['target']

In [8]:
df[['name'] + feature_labels]

Unnamed: 0,name,f_0,f_1,f_2,f_3,f_4,f_5,f_6,f_7,f_8,f_9,f_10,f_11,f_12
0,Abdon,0,0,0,0,0,0,0,0,1,3,6,19,18
1,Abel,0,0,0,0,0,0,0,0,0,1,3,7,15
2,Abercjusz,0,0,0,0,1,3,7,23,4,13,26,24,31
3,Abraham,0,0,0,0,0,0,1,3,23,1,11,1,17
4,Absalon,0,0,0,0,0,0,1,3,24,1,15,19,18
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1700,Zbigniewa,0,0,0,0,31,3,12,10,18,12,7,28,1
1701,Zygfryda,0,0,0,0,0,31,30,10,9,23,30,6,1
1702,Hermenia,0,0,0,0,0,11,7,23,17,7,18,12,1
1703,Hermes,0,0,0,0,0,0,0,11,7,23,17,7,24


In [9]:
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.4)

In [10]:
clf_nb = MultinomialNB()
clf_nb.fit(x_train, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [11]:
y_pred_nb = clf_nb.predict(x_test)

In [12]:
print("Accuracy:",metrics.accuracy_score(y_test, y_pred_nb))

Accuracy: 0.9633431085043989


In [13]:
clf_rf = RandomForestClassifier(n_estimators=100)
clf_rf.fit(x_train, y_train)
y_pred_rfc = clf_rf.predict(x_test)

In [14]:
metrics.accuracy_score(y_test, y_pred_rfc)

0.9882697947214076