In [1]:
import pandas as pd
import numpy as np
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
# cross_validation depracited replaced by model_selection
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

In [2]:
df_imiona = pd.read_csv("https://sebkaz.github.io/teaching/PrzetwarzanieDanych/data/polish_names.csv")

In [3]:
df_imiona.sample(3)

Unnamed: 0,name,gender
872,Lutogniew,m
1161,Raissa,f
1072,Paloma,f


In [4]:
alphabet = list('aąbcćdeęfghijklłmnoópqrstuvwxyzźż')
letter_to_num = {}
cnt = 1
for l in alphabet:
    letter_to_num[l] = cnt
    cnt += 1

def name_to_dummy_vector(name: str, vect_len: int):
    result = []
    if vect_len > len(result):
        for _ in range(vect_len - len(name)):
            result.append(0)
    for l in list(name.lower()):
        if l not in letter_to_num:
            result.append(0)
        else:
            result.append(letter_to_num[l])
    
    return result


max_name_len = max(df_imiona['name'].map(str).apply(len))
df_imiona['target'] = df_imiona['gender'].map(lambda data: 1 if data == 'm' else 0)
df_imiona['name_vect'] = df_imiona['name'].map(lambda name: name_to_dummy_vector(name, max_name_len))

df_imiona.sample(10)

Unnamed: 0,name,gender,target,name_vect
337,Demetria,f,0,"[0, 0, 0, 0, 0, 6, 7, 17, 7, 25, 23, 12, 1]"
1110,Prochor,m,1,"[0, 0, 0, 0, 0, 0, 21, 23, 19, 4, 11, 19, 23]"
1039,Odo,m,1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 19, 6, 19]"
1578,Dajana,f,0,"[0, 0, 0, 0, 0, 0, 0, 6, 1, 13, 1, 18, 1]"
618,Ireneusz,m,1,"[0, 0, 0, 0, 0, 12, 23, 7, 18, 7, 26, 24, 31]"
1157,Rafael,m,1,"[0, 0, 0, 0, 0, 0, 0, 23, 1, 9, 1, 7, 15]"
667,Jewdocha,f,0,"[0, 0, 0, 0, 0, 13, 7, 28, 6, 19, 4, 11, 1]"
110,Arletta,f,0,"[0, 0, 0, 0, 0, 0, 1, 23, 15, 7, 25, 25, 1]"
12,Adaukt,m,1,"[0, 0, 0, 0, 0, 0, 0, 1, 6, 1, 26, 14, 25]"
718,Kasjan,m,1,"[0, 0, 0, 0, 0, 0, 0, 14, 1, 24, 13, 1, 18]"


In [5]:
def generate_feature_labels(n, prefix='f_'):
    result = []
    for i in range(n):
        result.append(f'{prefix}{i}')
    return result

feature_labels = generate_feature_labels(max_name_len)

expand_column = lambda x: pd.Series([i for i in x])
df_expanded_name = df_imiona['name_vect'].apply(expand_column)

In [6]:
df = df_imiona.merge(df_expanded_name, left_index=True, right_index=True)

In [7]:
lab_to_rename = {}
cnt = 0
for l in feature_labels:
    lab_to_rename[cnt] = l
    cnt += 1

df.rename(columns=lab_to_rename, inplace=True)

X = df[feature_labels]
y = df['target']

In [8]:
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.4)

In [9]:
clf_nb = MultinomialNB()
clf_nb.fit(x_train, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [10]:
y_pred_nb = clf_nb.predict(x_test)

In [11]:
print("Accuracy:",metrics.accuracy_score(y_test, y_pred_nb))

Accuracy: 0.9706744868035191


In [12]:
clf_rf = RandomForestClassifier(n_estimators=100)
clf_rf.fit(x_train, y_train)
y_pred_rfc = clf_rf.predict(x_test)

In [13]:
metrics.accuracy_score(y_test, y_pred_rfc)

0.9912023460410557