# 1. Predicting Gender of Brazilian Names Using Machine Learning


## 1.1 Necessary libraries

In [None]:
import pandas as pd                       
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score
from tensorflow import keras
from keras import backend as K
from keras.layers import Dense, Dropout, Flatten, GRU, SimpleRNN, LSTM, Bidirectional, Activation, TimeDistributed
from keras.layers.convolutional import Conv1D, MaxPooling1D
from keras.callbacks import EarlyStopping
from tensorflow.keras import layers
from tensorflow.keras.regularizers import l2
import matplotlib.pyplot as plt

## 1.2 Download the dataset

In [None]:

import requests

# Public gzip-compressed CSV of Brazilian first names with a gender classification.
url = "https://data.brasil.io/dataset/genero-nomes/nomes.csv.gz"
filename = url.split("/")[-1]   # last path component of the URL: "nomes.csv.gz"

# Download first and fail loudly on an HTTP error, so that an error page is never
# written to disk and later mistaken for the archive. The timeout keeps the cell
# from hanging forever when the server does not answer.
r = requests.get(url, timeout=60)
r.raise_for_status()
with open(filename, "wb") as f:
    f.write(r.content)

# Read back the file that was just written (same name, no second hard-coded copy).
df = pd.read_csv(filename)
df.head()

Unnamed: 0,alternative_names,classification,first_name,frequency_female,frequency_male,frequency_total,frequency_group,group_name,ratio
0,AILINE|ALEINE|ALIINE|ALINE|ALINER|ALINHE|ALINN...,F,AALINE,66.0,,66,530550,ALINE,1.0
1,ARAAO|ARAO,M,AARAO,,281.0,281,3526,ARAO,1.0
2,AHARON|AROM|ARON|ARYON|HARON,M,AARON,,676.0,676,3442,ARON,1.0
3,ADA|ADAH|ADAR|ADHA|HADA,F,ABA,82.0,,82,5583,ADA,1.0
4,,M,ABADE,,57.0,57,57,ABADE,1.0


In [None]:
# Optional filter (currently disabled): keep only the rows whose `ratio` is exactly 1.0.
# Uncomment the next line to apply it.
#df = df[df['ratio'] == 1.0].copy()

In [None]:
# Summarize the columns: dtypes and non-null counts
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100787 entries, 0 to 100786
Data columns (total 9 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   alternative_names  70745 non-null   object 
 1   classification     100787 non-null  object 
 2   first_name         100787 non-null  object 
 3   frequency_female   60484 non-null   float64
 4   frequency_male     50932 non-null   float64
 5   frequency_total    100787 non-null  int64  
 6   frequency_group    100787 non-null  int64  
 7   group_name         100787 non-null  object 
 8   ratio              100787 non-null  float64
dtypes: float64(3), int64(2), object(4)
memory usage: 6.9+ MB


### 1.2.1 Preparing the data 

In [None]:
# Target: the gender label turned into an integer category code (0 is F and 1 is M)
y = df['classification'].astype("category").cat.codes.to_numpy()
# Input: the first names, lower-cased
names = df['first_name'].map(str.lower)

In [None]:
# Class balance: number of samples per label, then the total number of samples
print(f"M : {(y == 1).sum()}")
print(f"F : {(y == 0).sum()}")
print(len(y))

M : 45537
F : 55250
100787


## 1.3 Encoding Words

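Neural networks (and machine learning models in general) can only find patterns in numerical data, so the names must be converted into numbers. Here each name is encoded character by character: every character becomes a one-hot vector over the vocabulary, and names shorter than the maximum length are padded with a special `END` token.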

In [None]:
# Character-level encoding setup
maxlen = 20                                               # maximum length of a name; longer names are truncated
# Vocabulary: every distinct character encountered in the names. Joining the names
# with ' ' also puts the space character into the vocabulary.
vocab = set(' '.join([str(i) for i in names]))
vocab.add('END')                                          # padding token for names shorter than maxlen
len_vocab = len(vocab)
# Map each vocabulary entry to a column index. The entries are sorted first because
# the iteration order of a set of strings can differ from one interpreter run to the
# next, which would assign different columns to the same character after a restart.
char_index = {c: i for i, c in enumerate(sorted(vocab))}

# One-hot row: all zeros except a 1 at the index of the character
def set_flag(i):
    """Return a list of length ``len_vocab`` that is 0 everywhere except 1 at index ``i``."""
    one_hot = np.zeros(len_vocab)
    one_hot[i] = 1
    return list(one_hot)

# Truncate each name and turn it into a fixed-length list of one-hot rows
def prepare_encod_names(X):
    """One-hot encode every name in ``X`` character by character.

    Each name is converted to ``str``, cut to its first ``maxlen`` characters,
    and every character is mapped through ``char_index`` to a one-hot row built
    by ``set_flag``. Names shorter than ``maxlen`` are padded at the end with
    the one-hot row of the ``"END"`` token, so every encoded name has exactly
    ``maxlen`` rows.

    Returns a list with one entry per name; each entry is a list of ``maxlen``
    one-hot rows. A character missing from ``char_index`` raises ``KeyError``.
    """
    encoded = []
    for raw in X:
        name = str(raw)[0:maxlen]  # consider only the first maxlen characters
        rows = [set_flag(char_index[ch]) for ch in name]
        # pad up to maxlen with END rows (no padding when the name is already full length)
        rows.extend(set_flag(char_index["END"]) for _ in range(maxlen - len(name)))
        encoded.append(rows)
    return encoded

In [None]:
x = prepare_encod_names(names.values)   # each name is now encoded as a list of maxlen one-hot rows

## 1.4 Split the data into train, validation and test sets

In [None]:
# Hold out 20% of the data for testing, then hold out 20% of the remaining 80% for
# validation: about 64% train, 16% validation and 20% test of the whole dataset.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=28)
x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size=0.2, random_state=40)
# Convert every split to a NumPy array
x_train, y_train, x_test, y_test, x_val, y_val = map(
    np.asarray, (x_train, y_train, x_test, y_test, x_val, y_val)
)

In [None]:
def data2df(x, y):
    """Flatten features and labels into a single int8 DataFrame.

    ``x`` is reshaped to one row per sample (all remaining axes flattened into
    columns) and ``y`` to one row per sample as well. The two are placed side by
    side, the last column is renamed to ``'class'``, and the whole frame is cast
    to ``int8``.

    Returns the combined DataFrame.
    """
    features = pd.DataFrame(data=x.reshape((x.shape[0], -1)))
    target = pd.DataFrame(data=y.reshape((y.shape[0], -1)))

    combined = pd.concat([features, target], axis=1)
    # keep every column label as is, except the last one, which becomes 'class'
    combined.columns = [*combined.columns[:-1], 'class']
    return combined.astype('int8')

In [None]:
# One flattened DataFrame (features + 'class' column) per split
df_train, df_val, df_test = (
    data2df(x_train, y_train),
    data2df(x_val, y_val),
    data2df(x_test, y_test),
)

In [None]:
# Preview only the first rows; a bare last expression gets the rich DataFrame display
df_train.head()

0        0
1        0
2        0
3        0
4        0
        ..
64498    0
64499    0
64500    0
64501    0
64502    1
Name: 0, Length: 64503, dtype: int8


In [None]:
# Fraction of rows that belong to the training split once df_train and df_test are
# stacked. Deliberately NOT rounded: rounding to two decimals shifts the split point
# by a fraction of a percent of the rows, so an unshuffled split on this fraction
# would no longer fall on the df_train / df_test boundary.
train_size = len(df_train) / (len(df_train) + len(df_test))

In [None]:
# Display the computed training fraction
train_size

0.76

In [None]:
# Stack the training rows on top of the test rows (order matters for the unshuffled
# split below). ignore_index=True renumbers the rows 0..n-1; without it both parts
# keep their own 0-based labels and the combined frame has duplicated row labels.
# NOTE: this rebinds `df`, which previously held the raw names table.
df = pd.concat([df_train, df_test], ignore_index=True)

## 1.5 Train models

In [None]:
from pycaret.classification import *

In [None]:
# Initialize the PyCaret experiment. data_split_shuffle=False keeps the row order,
# so the first `train_size` fraction of `df` is used for training.
# session_id pins the random seed so results are reproducible across runs; 4331 is
# the seed of the recorded run (see random_state in the printed best model).
session = setup(data=df, target='class', train_size=train_size, data_split_shuffle=False, session_id=4331)

Unnamed: 0,Description,Value
0,session_id,4331
1,Target,class
2,Target Type,Binary
3,Label Encoded,
4,Original Data,"(84661, 561)"
5,Missing Values,False
6,Numeric Features,560
7,Categorical Features,0
8,Ordinal Features,False
9,High Cardinality Features,False


In [None]:
# Evaluate the candidate classifiers and keep the one returned as best
# (per the comparison table below, the top-ranked row)
best_model = compare_models()

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
et,Extra Trees Classifier,0.9482,0.9893,0.9351,0.9498,0.9424,0.8954,0.8955,9.748
rf,Random Forest Classifier,0.946,0.9885,0.9311,0.9487,0.9398,0.8908,0.891,6.592
lightgbm,Light Gradient Boosting Machine,0.9222,0.9766,0.9129,0.9152,0.914,0.843,0.843,0.467
dt,Decision Tree Classifier,0.921,0.9201,0.9114,0.9139,0.9126,0.8405,0.8405,1.047
knn,K Neighbors Classifier,0.9034,0.96,0.8649,0.9171,0.8902,0.8042,0.8053,221.95
lr,Logistic Regression,0.8672,0.9417,0.8279,0.8725,0.8496,0.7309,0.7318,23.145
svm,SVM - Linear Kernel,0.8661,0.0,0.8317,0.8684,0.8489,0.7287,0.7303,0.837
ridge,Ridge Classifier,0.8604,0.0,0.7946,0.8855,0.8375,0.7158,0.7191,0.68
gbc,Gradient Boosting Classifier,0.8339,0.936,0.6864,0.9283,0.7891,0.6572,0.6777,6.848
ada,Ada Boost Classifier,0.8263,0.9103,0.7335,0.8629,0.7927,0.645,0.6515,5.661


In [None]:
# Show the selected estimator together with its hyperparameters
print(best_model)

ExtraTreesClassifier(bootstrap=False, ccp_alpha=0.0, class_weight=None,
                     criterion='gini', max_depth=None, max_features='auto',
                     max_leaf_nodes=None, max_samples=None,
                     min_impurity_decrease=0.0, min_impurity_split=None,
                     min_samples_leaf=1, min_samples_split=2,
                     min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1,
                     oob_score=False, random_state=4331, verbose=0,
                     warm_start=False)
