Predicting the sex associated with a baby name using logistic regression in PyTorch

### References

- [Logistic Regression with PyTorch](https://proai.org/pytorch-logistic-regression) by [Denny Loevlie](https://twitter.com/DennisLoevlie)
- [PyTorch OO design compared with SkLearn](https://jeancochrane.com/blog/pytorch-functional-api) by [Jean Cochrane](https://JeanCochrane.com)
- [Building Your First Network in PyTorch](https://t.co/m9I4e0tfrC) by [Ta-Ying Cheng](https://www.linkedin.com/in/tim-ta-ying-cheng-411857139/)
- [data.world US baby names since 1880](https://data.world/ssa/baby-names-for-us-states-territories)



In [11]:
>>> import pandas as pd
>>> import numpy as np
>>> pd.options.display.max_rows = 5  # keep displayed DataFrames short
>>> np.random.seed(451)  # seed NumPy's global RNG so later random draws are repeatable

In [12]:
# >>> raw = pd.read_csv('https://proai.org/baby-names-us.csv.gz', index_col=0)
# >>> raw

In [13]:
# >>> df = raw.sample(10_000)

In [14]:
# >>> df = df.set_index(['name', 'sex'])[['count']]
# >>> df

In [15]:
# >>> df.to_csv('name-sex-count-10k.csv.gz')

In [16]:
# >>> raw.groupby(['name', 'sex']).sum().loc[('Maria',)]

In [17]:
>>> df = pd.read_csv('name-sex-count-10k.csv.gz', index_col=[0, 1])  # first two CSV columns form the row MultiIndex
>>> df

Unnamed: 0_level_0,Unnamed: 1_level_0,count
name,sex,Unnamed: 2_level_1
Brittani,F,10
Ida,F,18
...,...,...
Carley,F,11
Kartier,M,10


In [20]:
>>> unique_names = df.index.get_level_values('name').unique()
>>> len(unique_names) / len(df)  # ratio of distinct names to rows

0.4025

In [4]:
# df = df.rename(dict(name='name_', sex='sex_'), axis=1)
# df

Unnamed: 0,region,sex_,year,name_,count,freq
6139665,WV,F,1987,Brittani,10,0.000003
2565339,MD,F,1954,Ida,18,0.000005
...,...,...,...,...,...,...
5744351,VA,F,2007,Carley,11,0.000003
5583882,TX,M,2019,Kartier,10,0.000003


In [21]:
# df = df[['name_', 'sex_', 'count']]
>>> df = df.groupby(['name', 'sex']).sum()  # collapse to one row per (name, sex) pair, summing the other columns
>>> df

Unnamed: 0_level_0,Unnamed: 1_level_0,count
name,sex,Unnamed: 2_level_1
Aaden,M,51
Aahana,F,26
...,...,...
Zya,F,8
Zylah,F,5


In [7]:
# df['name'] = df.index.get_level_values('name_')
# df['sex'] = df.index.get_level_values('sex_')
# df

Unnamed: 0_level_0,Unnamed: 1_level_0,count,name,sex
name_,sex_,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Aaden,M,51,Aaden,M
Aahana,F,26,Aahana,F
...,...,...,...,...
Zya,F,8,Zya,F
Zylah,F,5,Zylah,F


In [8]:
# df.to_csv('name-sex-count-10k.csv.gz')
# df

Unnamed: 0_level_0,Unnamed: 1_level_0,count,name,sex
name_,sex_,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Aaden,M,51,Aaden,M
Aahana,F,26,Aahana,F
...,...,...,...,...
Zya,F,8,Zya,F
Zylah,F,5,Zylah,F


In [9]:
# >>> df = pd.read_csv('name-sex-count-10k.csv.gz', index_col=[0,1])
# >>> df

Unnamed: 0_level_0,Unnamed: 1_level_0,count,name,sex
name_,sex_,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Aaden,M,51,Aaden,M
Aahana,F,26,Aahana,F
...,...,...,...,...
Zya,F,8,Zya,F
Zylah,F,5,Zylah,F


After summing over the regions (states), you end up with fewer than half as many rows as before: one row per unique name-sex pair.

In [11]:
>>> df.groupby(['name', 'sex'])['count'].sum()[('Maria',)]  # summed count per sex for the name 'Maria'

sex
F    69
Name: count, dtype: int64

In [None]:
# df = pd.read_csv('https://proai.org/baby-names-us.csv.gz')
# NOTE(review): `raw` is only assigned in a commented-out cell near the top of
# this notebook, so this line raises NameError unless that cell is restored.
>>> raw.groupby(['name', 'sex'])['count'].sum()[('Avi',)]

In [None]:
>>> df.index = pd.MultiIndex.from_tuples(
...     zip(df['name'], df['sex']), names=['name_', 'sex_'])

In [None]:
>>> df['istrain'] = np.random.rand(len(df)) < .9  # each row is independently True with probability 0.9
>>> print(df)

A list of dicts or a dict of dicts is the fastest way to create a DataFrame from groups of rows.
#### https://stackoverflow.com/users/8727339/mohit-motwani
#### https://stackoverflow.com/a/57001947/623735

In [None]:
>>> df_most_common = {}
>>> for name, group in df.groupby('name'):
...     row_dict = group.iloc[group['count'].argmax()].to_dict()
...     df_most_common[(name, row_dict['sex'])] = row_dict
>>> df_most_common = pd.DataFrame(df_most_common).T
>>> print(df_most_common)

In [None]:
>>> istest = ~df_most_common['istrain'].astype(bool)  # test flag is the complement of the train flag
>>> df_most_common['istest'] = istest
>>> print(df_most_common)

In [None]:
df_most_common[['istest', 'istrain']].sum() / len(df_most_common)  # share of rows flagged test and train

In [None]:
istest = df_most_common['istest']
istest

In [None]:
istest.sum()  # number of rows flagged as test

In [None]:
istest_idx = df_most_common[istest].index  # index labels of the rows flagged as test
istest_idx[:4]

In [None]:
df['istrain'].sum() / len(df)  # share of df rows flagged train

In [None]:
df_most_common.index[:5]
# >>> df['istest'] = df_most_common['istest']

In [None]:
# Copy the per-name test flag onto df; the assignment matches rows by index label.
>>> df['istest'] = df_most_common['istest'].fillna(False)
>>> df

In [None]:
# NOTE(review): fillna(False) runs on df_most_common['istest'] *before* the
# assignment aligns it to df's index, so rows of df that have no entry in
# df_most_common presumably still end up NaN. The isna() mask below appears to
# exist for exactly those rows -- confirm this is the intended split.
>>> df['istest'] = df_most_common['istest'].fillna(False)
>>> istestisna = df['istest'].isna()
>>> istrain = ~(df['istest'][~istestisna]).fillna(False)  # train = not test, computed only where istest is known
>>> df['istrain'] = istrain
>>> df['istrain'].sum() / len(df)

In [None]:
>>> df['istest'].sum() / len(df)  # share of df rows flagged test

In [None]:
istrain = df['istrain'].fillna(False)  # rows with a missing flag count as not-train
istrain.sum() / len(df)

In [None]:
istest = df['istest'].fillna(False)  # rows with a missing flag count as not-test
istest.sum() / len(df)

In [None]:
>>> df['istrain'].sum() / len(df)
0.8589  # <1>
>>> df['istest'].sum() / len(df)
0.0908  # <2>
>>> (df['istrain'] + df['istest']).sum() / len(df)

In [None]:
istest.sum() / len(df)

In [None]:
istrain.sum() / len(df)

In [None]:
istest = df['istest']
# del df['istrain']
# del df['istest']
df['istest'].sum() / len(istest)

In [None]:
>>> from sklearn.feature_extraction.text import TfidfVectorizer
>>> vectorizer = TfidfVectorizer(use_idf=False,  # no inverse-document-frequency reweighting
...     analyzer='char', ngram_range=(1, 3),  # features are character 1-, 2- and 3-grams
...     lowercase=False)  # keep capitalization as written
>>> vectorizer

In [None]:
>>> istest = df['istest'].astype(bool)
>>> istrain = df['istrain'].astype(bool)
>>> vectorizer.fit(df['name'][istrain])  # learn the n-gram vocabulary from the training names only

In [None]:
len(df)

In [None]:
istrain.sum()

In [None]:
istest = df['istest']
istest.sum()

In [None]:
vecs = vectorizer.transform(df['name'])  # vectorize every name in df, train and test alike
vecs = pd.DataFrame.sparse.from_spmatrix(vecs)  # wrap the sparse matrix in a DataFrame
vecs.head()


In [None]:
vecs.columns = vectorizer.get_feature_names_out()  # label each column with its n-gram feature name
vecs.index = df.index  # reuse df's row labels so rows line up with df
vecs.head()[vecs.columns[:5]]

In [None]:
vecs.shape

In [None]:
import torch
torch

In [None]:
class LogisticRegressionNN(torch.nn.Module):
    """Logistic regression expressed as a network: one linear layer followed by a sigmoid."""

    def __init__(self, num_features, num_outputs=1):
        """Create the linear layer mapping ``num_features`` inputs to ``num_outputs`` outputs."""
        super().__init__()
        self.linear = torch.nn.Linear(
            in_features=num_features,
            out_features=num_outputs,
        )

    def forward(self, X):
        """Return the sigmoid of the linear layer applied to ``X``."""
        linear_output = self.linear(X)
        return linear_output.sigmoid()

In [None]:
def make_tensor(X):
    """Convert a numpy ndarray (or an object exposing ``.values``) to a torch.Tensor.

    Args:
        X: A ``torch.Tensor``, a numpy array, or an object with a ``.values``
            attribute holding the underlying array (e.g. a pandas DataFrame).

    Returns:
        ``X`` itself when it is already a ``torch.Tensor``; otherwise a new
        tensor built with ``torch.Tensor(...)``.
    """
    # Check for a Tensor first: torch.Tensor has its own `values` method, so
    # unwrapping via getattr() would replace the tensor with a bound method.
    if isinstance(X, torch.Tensor):
        return X
    X = getattr(X, 'values', X)
    return torch.Tensor(X)

def make_array(x):
    """Convert a torch.Tensor to a numpy array with size-1 dimensions removed.

    Anything without a ``detach`` attribute is returned unchanged.
    """
    if not hasattr(x, 'detach'):
        return x
    squeezed = torch.squeeze(x)
    return squeezed.detach().numpy()

In [None]:
num_features = vecs.shape[1]  # number of unique n-grams in our "vocabulary"
num_outputs = 1    # number of nesses (sexes) to predict, we're predicting only femaleness

In [None]:
from tqdm import tqdm
import time
import json
import copy

def measure_binary_accuracy(y_pred, y):
    """Return the fraction of rounded predictions that equal the rounded truth.

    Both inputs are converted with ``make_array`` and rounded, so the result is
    ``(y_pred == y).sum() / len(y)`` computed on the rounded values.
    """
    predicted = make_array(y_pred).round()
    truth = make_array(y).round()
    matches = predicted == truth
    return matches.sum() / len(truth)

In [None]:
def measure_performance(model, X_train, X_test, y_train, y_test, criterion, i=None):
    """Measure train accuracy plus test loss and accuracy without tracking gradients.

    Args:
        model: Callable mapping a feature tensor to predicted probabilities.
        X_train, X_test: Feature tensors for the train and test rows.
        y_train, y_test: Target tensors for the train and test rows.
        criterion: Loss callable applied to ``(predictions, y_test)``.
        i: Optional step/attempt counter stored in the result. When omitted,
            a module-level ``i`` is used if one exists (the previous implicit
            behaviour), otherwise ``None``.

    Returns:
        dict with keys ``i``, ``accuracy_train``, ``loss_test`` and
        ``accuracy_test``.
    """
    if i is None:
        # Backward compatibility: this function used to read a global loop counter.
        i = globals().get('i')
    with torch.no_grad():
        accuracy_train = measure_binary_accuracy(model(X_train), y_train)
        outputs_test = model(X_test)
        accuracy_test = measure_binary_accuracy(outputs_test, y_test)
        # Match the target's shape instead of squeezing: the loss needs
        # predictions and targets of identical shape, whether y_test is
        # 1-D or a single column.
        loss_test = criterion(outputs_test.reshape(y_test.shape), y_test)
    return dict(i=i,
                accuracy_train=accuracy_train,
                loss_test=loss_test.item(),
                accuracy_test=accuracy_test)

In [None]:
model = LogisticRegressionNN(num_features=vecs.shape[1], num_outputs=1)
model

In [None]:
learning_rate = 0.05
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)
optimizer

In [None]:
# BCE: Binary Cross Entropy
# The per-example weights are taken from the 'count' column of every row of df.
criterion = torch.nn.BCELoss(weight=torch.Tensor(df[['count']].values))
criterion

In [None]:
# Create new majority sex column to hold the most common sex for names used for both sexes
df['majority_sex'] = df['sex']

# Overwrite the minority sex with the majority.
for name_, sex_ in df_most_common.index:
    opposite_sex = 'F' if sex_ == 'M' else 'M'
    minority_key = (name_, opposite_sex)
    # Only names recorded under both sexes have a minority row to overwrite.
    if minority_key not in df.index:
        continue
    # Assign through a single df.loc call: the chained form
    # df['majority_sex'].loc[...] = ... may write to a temporary copy and
    # leave df unchanged.
    df.loc[minority_key, 'majority_sex'] = sex_
    print(f'overwrote ({name_}, {opposite_sex}) with {sex_}')


In [None]:
dupe_names = df[df['name'].duplicated()]['name'].values  # names that occur on more than one row
df[df['name'].isin(dupe_names)]  # show every row for those names

In [None]:
weight_test = torch.Tensor(df[['count']].loc[df_most_common.index].values)
criterion_test = torch.nn.BCELoss(weight=weight_test)
criterion_test

In [None]:
def rand_range(min_value=0.001, max_value=1):
    """Draw one uniform random float between ``min_value`` and ``max_value``.

    Uses NumPy's global random state via ``np.random.rand()``.
    """
    span = max_value - min_value
    unit_sample = np.random.rand()
    return span * unit_sample + min_value

In [None]:
def rand_range_log(min_value=0.001, max_value=1):
    """Draw one random float between the bounds, uniform on a log scale.

    The natural logs of the bounds are passed to ``rand_range`` and the draw is
    exponentiated, so each order of magnitude is equally likely.

    Args:
        min_value: Lower bound; its natural log is taken.
        max_value: Upper bound; its natural log is taken.

    Returns:
        ``np.exp`` of a uniform draw between ``log(min_value)`` and
        ``log(max_value)``.
    """
    min_log = np.log(min_value)
    max_log = np.log(max_value)
    return np.exp(rand_range(min_log, max_log))

Create a table of random hyperparameter values for the optimizer's learning rate (`lr`) and `momentum`.

In [None]:
# lr: learning_rate
# Each entry is the [min, max] range that a hyperparameter is sampled from.
hyperparam_ranges = dict(lr=[0.02, 1.0], momentum=[0.00001, 1.0])
num_attempts = 30
hyperparam_table = []
for _ in range(num_attempts):
    # One log-uniform draw per hyperparameter, in the order the ranges are declared.
    hyperparam_table.append({
        name: rand_range_log(*bounds)
        for name, bounds in hyperparam_ranges.items()
    })
pd.DataFrame(hyperparam_table)

In [None]:
model = LogisticRegressionNN(num_features=vecs.shape[1], num_outputs=1)
model

In [None]:
optimizer = torch.optim.SGD(model.parameters(), **hyperparam_table[0])
optimizer

In [None]:
# Per-row training weights as a column: each training count divided by the mean training count.
weight_train = torch.Tensor(df['count'][istrain].values.reshape(-1, 1) / df['count'][istrain].mean())
weight_train.sum()

In [None]:
df['count'][istrain].values.reshape(sum(istrain), 1)

In [None]:
weight_train.shape

In [None]:
# Per-row weights for the rows not flagged train: each count divided by the mean of those counts.
weight_test = torch.Tensor(df['count'][~istrain].values.reshape(-1, 1) / df['count'][~istrain].mean())
weight_test.sum()

In [None]:
# Random search: train a fresh model for each row of hyperparam_table and
# record the final-epoch metrics back into that row.
num_epochs=1000

t0 = time.time()

# The data tensors and loss functions do not depend on the hyperparameters,
# so build them once instead of once per attempt.
X = vecs.values
X_train = torch.Tensor(X[istrain])
X_test = torch.Tensor(X[~istrain])
y_train = torch.Tensor((df[['sex']] == 'F').astype(int).values[istrain])
y_test = torch.Tensor((df[['majority_sex']] == 'F').astype(int).values[~istrain])
# BCE: Binary Cross Entropy weighted by the number of babies with that first name and sex
criterion_train = torch.nn.BCELoss(weight=weight_train)
criterion_test = torch.nn.BCELoss(weight=weight_test)

for i, hyperparam_values in enumerate(hyperparam_table):
    t1 = time.time()
    model = LogisticRegressionNN(num_features=vecs.shape[1], num_outputs=1)
    # Pass the sampled momentum too; previously only lr reached the optimizer,
    # so the momentum column of the table had no effect on training.
    optimizer = torch.optim.SGD(
        model.parameters(),
        lr=hyperparam_values['lr'],
        momentum=hyperparam_values.get('momentum', 0))

    pbar_epochs = tqdm(range(num_epochs), desc='Epoch:', total=num_epochs)
    results = [None] * num_epochs
    for epoch in pbar_epochs:
        optimizer.zero_grad() # Setting our stored gradients equal to zero
        outputs = model(X_train)
        loss_train = criterion_train(outputs, y_train)
        loss_train.backward() # Computes the gradient of the given tensor w.r.t. the weights/bias
        optimizer.step() # Updates weights and biases with the optimizer (SGD)
        # Evaluation needs no gradients; skip building the autograd graph.
        with torch.no_grad():
            outputs_test = model(X_test)
            loss_test = criterion_test(outputs_test, y_test).item()
        accuracy_test = measure_binary_accuracy(outputs_test, y_test)
        results[epoch] = dict(loss_train=loss_train.item(), loss_test=loss_test, accuracy_test=accuracy_test)
    t2 = time.time()
    results[-1]['time_per_attempt'] = t2 - t1
    results[-1]['total_time'] = t2 - t0
    # Keep only the final epoch's metrics alongside the hyperparameters.
    hyperparam_table[i].update(results[-1])
    print(f'attempt {i}/{len(hyperparam_table)}')
    for k, v in hyperparam_table[i].items():
        print(f'{k}: {v:04f}')

In [None]:
hyperparam_table[-1]




In [None]:
hyperparam_df = pd.DataFrame(hyperparam_table).sort_values('accuracy_test')
hyperparam_df

In [None]:
def fit(model=model, X=vecs.values, y=(df[['sex']] == 'F').values, optimizer=None,
        num_epochs=30, learning_rate=.1, criterion=criterion):
    """Train ``model`` on the rows selected by the global ``istrain`` mask.

    Args:
        model: Module to train in place.
        X: 2-D array of features, one row per row of ``df``.
        y: Array of targets aligned with ``X``.
        optimizer: Optimizer to step. When ``None``, plain SGD over
            ``model.parameters()`` with ``learning_rate`` is created.
        num_epochs: Number of full-batch gradient steps.
        learning_rate: Step size used only when ``optimizer`` is ``None``.
        criterion: Loss callable applied to ``(predictions, targets)``.
            NOTE(review): a weighted loss presumably needs weights that match
            the training rows -- confirm the default satisfies that.

    Returns:
        list of dicts, one per epoch, with keys ``epoch`` and ``loss_train``.
    """
    if optimizer is None:
        optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)
    # as_tensor with an explicit dtype also accepts the boolean default `y`.
    X_train = torch.as_tensor(X[istrain], dtype=torch.float32)
    y_train = torch.as_tensor(y[istrain], dtype=torch.float32)

    results = []
    for epoch in tqdm(range(num_epochs), desc='Epoch', total=num_epochs):
        optimizer.zero_grad() # Setting our stored gradients equal to zero
        outputs = model(X_train)
        loss_train = criterion(outputs, y_train)
        loss_train.backward() # Computes the gradient of the given tensor w.r.t. the weights/bias
        optimizer.step() # Updates weights and biases with the optimizer (SGD)
        results.append(dict(epoch=epoch, loss_train=loss_train.item()))
    return results

In [None]:
test_loss = hyperparam_df['loss_test']
test_accuracy = hyperparam_df['accuracy_test']
# Marker size grows with test accuracy; opacity grows with test loss.
# Alpha must stay within [0, 1], so clip it in case an attempt has a large loss.
marker_alpha = (test_loss * .7).clip(0, 1)
hyperparam_df.plot(x='lr', y='momentum', kind='scatter', logy=True, logx=True, grid=True, marker='o', s=100*test_accuracy**2, alpha=marker_alpha)

In [None]:
results = fit()

In [None]:
pd.DataFrame(results)

In [None]:
# model.score(vecs[~istrain], y[~istrain], sample_weight=df['count'][~istrain])

In [None]:
# model.classes_


In [None]:
# Vectorize a few example names with the fitted vectorizer and label the result.
names = ['Dewey', 'Kemal', 'Copeland', 'Vishvesh']
ourvecs = pd.DataFrame.sparse.from_spmatrix(vectorizer.transform(names))
ourvecs.columns = vectorizer.get_feature_names_out()
# Pair every name with the label 'M' for the row index.
ourvecs.index = [(name, 'M') for name in names]
ourvecs

In [None]:
# The assignment had no right-hand side (a SyntaxError). Converting the
# example vectors to a tensor is presumably what was intended -- confirm.
ourtensors = make_tensor(ourvecs)

In [None]:
names = ['Maria', 'Syndee', 'Aditi', 'Constance']
# Use a separate variable: reassigning `vecs` would overwrite the matrix of
# training vectors that earlier cells read.
name_vecs = pd.DataFrame.sparse.from_spmatrix(vectorizer.transform(names))
name_vecs.columns = vectorizer.get_feature_names_out()
# NOTE(review): the index labels every name 'M', as in the previous cell --
# confirm whether 'F' was intended for these names.
name_vecs.index = list(zip(names, 'M'*len(names)))
# Call the module directly so this works for any torch model, including one
# without a predict_proba() method; no gradients are needed for prediction.
with torch.no_grad():
    probabilities = model(make_tensor(name_vecs))
pd.DataFrame(probabilities.numpy()[:, 0], index=name_vecs.index)

In [None]:
class LogisticRegressionNumpyNN(LogisticRegressionNN):
    """LogisticRegressionNN with scikit-learn style ``predict_proba`` and ``predict`` methods.

    The constructor is inherited unchanged from ``LogisticRegressionNN``.
    """

    def predict_proba(self, X):
        """Return the model's output tensor for ``X`` after converting it with ``make_tensor``."""
        return self.forward(make_tensor(X))

    def predict(self, X):
        """Return a numpy int array: 1 where the predicted probability exceeds 0.5, else 0."""
        # Disable gradient tracking so the output can be converted to numpy directly.
        with torch.no_grad():
            probabilities = self.predict_proba(X)
        return (probabilities.numpy() > 0.5).astype(int)
    
# ', '.join([v for v in dir(LogisticRegression) if v[0] != '_'])