Predicting baby name sex using PyTorch for Logistic Regression

### References

- [Logistic Regression with PyTorch](https://proai.org/pytorch-logistic-regression) by [Denny Loevlie](https://twitter.com/DennisLoevlie)
- [PyTorch OO design compared with SkLearn](https://jeancochrane.com/blog/pytorch-functional-api) by [Jean Cochrane](https://JeanCochrane.com)
- [Building Your First Network in PyTorch](https://t.co/m9I4e0tfrC) by [Ta-Ying Cheng](https://www.linkedin.com/in/tim-ta-ying-cheng-411857139/)
- [data.world US baby names since 1880](https://data.world/ssa/baby-names-for-us-states-territories)



In [2]:
import numpy as np
import pandas as pd
# neither year nor len are statistically significant predictors of sex
from pathlib import Path

In [3]:
CWD = Path('.').absolute().resolve()
DATA_DIR = '.nlpia2-data'
DATA_FILE = 'baby-names-region.csv.gz'
CWD

PosixPath('/home/hobs/code/tangibleai/nlpia2/src/nlpia2/ch05')

In [4]:
parent = CWD
data_dir = parent / DATA_DIR 
filepath = data_dir / DATA_FILE
for i in range(10):
    print(filepath)
    if filepath.is_file():
        break
    parent = parent.parent
    data_dir = parent / DATA_DIR 
    filepath = data_dir / DATA_FILE
filepath

/home/hobs/code/tangibleai/nlpia2/src/nlpia2/ch05/.nlpia2-data/baby-names-region.csv.gz
/home/hobs/code/tangibleai/nlpia2/src/nlpia2/.nlpia2-data/baby-names-region.csv.gz
/home/hobs/code/tangibleai/nlpia2/src/.nlpia2-data/baby-names-region.csv.gz
/home/hobs/code/tangibleai/nlpia2/.nlpia2-data/baby-names-region.csv.gz


PosixPath('/home/hobs/code/tangibleai/nlpia2/.nlpia2-data/baby-names-region.csv.gz')

In [5]:
df = pd.read_csv(filepath)

In [6]:
np.random.seed(451)
df = df.sample(10_000)
df.head()

Unnamed: 0,region,sex,year,name,count,freq
6139665,WV,F,1987,Brittani,10,3e-06
2565339,MD,F,1954,Ida,18,5e-06
22297,AK,M,1988,Maxwell,5,1e-06
5114650,TN,F,1972,Charlene,24,8e-06
2126395,KS,M,1954,Todd,11,3e-06


In [7]:
names = df['name'].unique()
names[:10]

array(['Brittani', 'Ida', 'Maxwell', 'Charlene', 'Todd', 'Aubrey',
       'Arianna', 'Otis', 'Trenton', 'Faustino'], dtype=object)

In [8]:
len(names) / len(df)

0.4025

In [9]:
# df = pd.get_dummies(df, columns=['region'])
# df.head()

In [10]:
df = df.groupby(['name', 'sex']).sum()
df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,year,count,freq
name,sex,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Aaden,M,2008,51,1.5e-05
Aahana,F,2018,26,9e-06
Aahil,M,2019,5,2e-06
Aaleyah,F,2010,17,5e-06
Aalia,F,4033,13,4e-06


In [11]:
df['name'] = df.index.get_level_values('name')
df['sex'] = df.index.get_level_values('sex')
df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,year,count,freq,name,sex
name,sex,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Aaden,M,2008,51,1.5e-05,Aaden,M
Aahana,F,2018,26,9e-06,Aahana,F
Aahil,M,2019,5,2e-06,Aahil,M
Aaleyah,F,2010,17,5e-06,Aaleyah,F
Aalia,F,4033,13,4e-06,Aalia,F


In [None]:
df_with_duplicates = df;

In [1]:
df.query('name == "Sam"')


NameError: name 'df' is not defined

In [13]:
df.loc[pd.IndexSlice['Chris', :]]

Unnamed: 0_level_0,year,count,freq,name,sex
sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
F,1983,5,2e-06,Chris,F
M,7850,239,6.9e-05,Chris,M


In [14]:
df['istrain'] = np.random.rand(len(df)) < .9
df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,year,count,freq,name,sex,istrain
name,sex,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Aaden,M,2008,51,1.5e-05,Aaden,M,True
Aahana,F,2018,26,9e-06,Aahana,F,True
Aahil,M,2019,5,2e-06,Aahil,M,True
Aaleyah,F,2010,17,5e-06,Aaleyah,F,True
Aalia,F,4033,13,4e-06,Aalia,F,True


In [15]:
istrain = df['istrain']
del df['istrain']
istrain.sum() / len(istrain)

0.9042000943841435

In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(analyzer='char', ngram_range=(1, 3), lowercase=False)
vectorizer

TfidfVectorizer(analyzer='char', lowercase=False, ngram_range=(1, 3))

In [17]:
vectorizer.fit(df['name'][istrain])

TfidfVectorizer(analyzer='char', lowercase=False, ngram_range=(1, 3))

In [18]:
vecs = vectorizer.transform(df['name'])
vecs = pd.DataFrame.sparse.from_spmatrix(vecs)
vecs.head()


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,3653,3654,3655,3656,3657,3658,3659,3660,3661,3662
0,0.193687,0.395438,0.506137,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.182726,0.373059,0.0,0.45487,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.185518,0.378759,0.0,0.461821,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.166921,0.34079,0.0,0.0,0.38949,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.214766,0.438473,0.0,0.0,0.501131,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [19]:
vecs.columns = vectorizer.get_feature_names_out()
vecs.index = df.index
vecs.head()[vecs.columns[:5]]

Unnamed: 0_level_0,Unnamed: 1_level_0,A,Aa,Aad,Aah,Aal
name,sex,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Aaden,M,0.193687,0.395438,0.506137,0.0,0.0
Aahana,F,0.182726,0.373059,0.0,0.45487,0.0
Aahil,M,0.185518,0.378759,0.0,0.461821,0.0
Aaleyah,F,0.166921,0.34079,0.0,0.0,0.38949
Aalia,F,0.214766,0.438473,0.0,0.0,0.501131


In [20]:
vecs.shape

(4238, 3663)

In [21]:
import torch
torch

<module 'torch' from '/home/hobs/anaconda3/envs/nlpia2/lib/python3.8/site-packages/torch/__init__.py'>

In [22]:
class LogisticRegressionNN(torch.nn.Module):

    def __init__(self, num_features, num_outputs=1):
         super().__init__()
         self.linear = torch.nn.Linear(num_features, num_outputs)

    def forward(self, X):
        return torch.sigmoid(self.linear(X))

In [23]:
def make_tensor(X):
    """ Convert numpy ndarray to torch.Tensor """
    X = getattr(X, 'values', X)
    return X if isinstance(X, torch.Tensor) else torch.Tensor(X)

def make_array(x):
    """ Convert torch.Tensor to numpy 1-D array """
    if hasattr(x, 'detach'):
        return torch.squeeze(x).detach().numpy()
    return x

In [24]:
num_features = vecs.shape[1]  # number of unique n-grams in our "vocabulary"
num_outputs = 1    # number of nesses (sexes) to predict, we're predicting only femaleness

In [25]:
from tqdm import tqdm
import time
import json
import copy

# Fraction of the tensors y_pred and y that are the same 
# (y_pred == y).sum() / len(y)
def measure_binary_accuracy(y_pred, y):
    """ Round y_pred and y then count the preds that are equal to the truth to compute fraction correct """
    y_pred = make_array(y_pred).round()
    y = make_array(y).round()
    num_correct = (y_pred == y).sum()
    return num_correct / len(y)

In [26]:
def measure_performance(model, X_train, X_test, y_train, y_test, criterion):
    with torch.no_grad():
        # Calculating the loss and accuracy for the train dataset
        accuracy_train = measure_binary_accuracy(model(X_train), y_train)
        outputs_test = torch.squeeze(model(X_test))
        accuracy_test = measure_binary_accuracy(outputs_test, y_test)
        loss_test = criterion(outputs_test, y_test)
        return dict(i=i, 
                    # loss_train=loss.item(),
                    accuracy_train=accuracy_train,
                    loss_test=loss_test.item(),
                    accuracy_test=accuracy_test)

In [27]:
model = LogisticRegressionNN(num_features=vecs.shape[1], num_outputs=1)
model

LogisticRegressionNN(
  (linear): Linear(in_features=3663, out_features=1, bias=True)
)

In [28]:
learning_rate = 0.01
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)
optimizer

SGD (
Parameter Group 0
    dampening: 0
    lr: 0.01
    momentum: 0
    nesterov: False
    weight_decay: 0
)

In [29]:
# BCE: Binary Cross Entropy
criterion = torch.nn.BCELoss()
criterion

BCELoss()

In [30]:
def rand_range(min_value=0.001, max_value=1):
    scale = max_value - min_value
    return scale * np.random.rand() + min_value

In [31]:
def rand_range_log(min_value=0.001, max_value=1):
    min_log = np.log(min_value)
    max_log = np.log(max_value)
    return np.exp(rand_range(np.log(min_value), np.log(max_value)))

Create random hyperparameter table for optimizer learning_rate and momentum

In [37]:
# lr: learning_rate
hyperparam_ranges = dict(lr=[0.001, 1.0], momentum=[0.00001, 1.0])
hyperparam_table = []
num_attempts = 10
for i in range(num_attempts):
    hyperparam_values = dict()
    for k, v in hyperparam_ranges.items():
        hyperparam_values[k] = rand_range_log(*hyperparam_ranges[k])
    hyperparam_table.append(hyperparam_values)
for d in hyperparam_table:
    d['momentum'] = 0
pd.DataFrame(hyperparam_table)

Unnamed: 0,lr,momentum
0,0.073095,0
1,0.001478,0
2,0.001076,0
3,0.002069,0
4,0.015807,0
5,0.487281,0
6,0.011604,0
7,0.002934,0
8,0.0119,0
9,0.185356,0


In [38]:
model = LogisticRegressionNN(num_features=vecs.shape[1], num_outputs=1)
model

LogisticRegressionNN(
  (linear): Linear(in_features=3663, out_features=1, bias=True)
)

In [39]:
optimizer = torch.optim.SGD(model.parameters(), **hyperparam_table[0])
optimizer

SGD (
Parameter Group 0
    dampening: 0
    lr: 0.07309499536069454
    momentum: 0
    nesterov: False
    weight_decay: 0
)

In [1]:
# pbar = tqdm(hyperparam_table, desc='Training attempt', total=len(hyperparam_table))
num_epochs=200

t0 = time.time()
for i, hyperparam_values in enumerate(hyperparam_table):
    t1 = time.time()
    model = LogisticRegressionNN(num_features=vecs.shape[1], num_outputs=1)
    optimizer = torch.optim.Adam(model.parameters(), **hyperparam_values)
    criterion_train = torch.nn.BCELoss(weight=torch.Tensor(df[['count']][istrain].values))  # BCE: Binary Cross Entropy
    criterion_test = torch.nn.BCELoss(weight=torch.Tensor(df[['count']][~istrain].values))  # BCE: Binary Cross Entropy
    X = vecs.values
    y = (df[['sex']] == 'F').values
    X_train = torch.Tensor(X[istrain])
    X_test = torch.Tensor(X[~istrain])
    y_train = torch.Tensor(y[istrain])
    y_test = torch.Tensor(y[~istrain])

    pbar_epochs = tqdm(range(num_epochs), desc='Epoch:', total=num_epochs)
    results = [None] * num_epochs
    for epoch in pbar_epochs:
        optimizer.zero_grad() # Setting our stored gradients equal to zero
        outputs = model(X_train)
        loss_train = criterion_train(outputs, y_train) 
        loss_train.backward() # Computes the gradient of the given tensor w.r.t. the weights/bias
        loss_train = loss_train.item()
        optimizer.step() # Updates weights and biases with the optimizer (SGD)
        # print(f'Train loss: {np.round(loss_train.detach().numpy(), 4):0.4f}')
        outputs_test = model(X_test)
        loss_test = criterion_test(outputs_test, y_test).item()
        accuracy_test = measure_binary_accuracy(outputs_test, y_test)
        results[epoch] = dict(loss_train=loss_train, loss_test=loss_test, accuracy_test=accuracy_test)
        # pbar_epochs.set_description(f'loss_train/test: {loss_train:.4f}/{loss_test:.4f}')
    t2 = time.time()
    results[-1]['time_per_attempt'] = t2 - t1
    results[-1]['total_time'] = t2 - t0
    hyperparam_table[i].update(results[-1])
    print(f'attempt {i:04d}/{len(hyperparam_table)}')
    for k, v in hyperparam_table[i].items():
        print(f'{k}: {v}')

NameError: name 'time' is not defined

In [58]:
hyperparam_table[-1]['loss_train'].item()



AttributeError: 'float' object has no attribute 'item'

In [132]:
hyperparam_df = pd.DataFrame(hyperparam_table)
hyperparam_df

Unnamed: 0,lr,momentum,loss_train
0,0.006897,0.000149,"tensor(0.6831, grad_fn=<BinaryCrossEntropyBack..."
1,0.003799,0.002150,"tensor(0.6836, grad_fn=<BinaryCrossEntropyBack..."
2,0.101760,0.005800,"tensor(0.6828, grad_fn=<BinaryCrossEntropyBack..."
3,0.580667,0.237036,"tensor(0.6832, grad_fn=<BinaryCrossEntropyBack..."
4,0.845265,0.000022,"tensor(0.6834, grad_fn=<BinaryCrossEntropyBack..."
...,...,...,...
95,0.042495,0.006755,"tensor(0.6835, grad_fn=<BinaryCrossEntropyBack..."
96,0.014527,0.123773,"tensor(0.6824, grad_fn=<BinaryCrossEntropyBack..."
97,0.008243,0.000038,"tensor(0.6825, grad_fn=<BinaryCrossEntropyBack..."
98,0.005621,0.001474,"tensor(0.6826, grad_fn=<BinaryCrossEntropyBack..."


In [131]:
def fit(model=model, X=vecs.values, y=(df[['sex']] == 'F').values, optimizer=None,
        num_epochs=30, learning_rate=.1, criterion=criterion, optimizer=optimizer):
    pbar = tqdm(range(num_epochs), desc='Epoch', total=num_epochs)
    X_train = torch.Tensor(X[istrain])
    X_test = torch.Tensor(X[~istrain])
    y_train = torch.Tensor(y[istrain])
    y_test = torch.Tensor(y[~istrain])

    results = []
    for i in pbar:
        optimizer.zero_grad() # Setting our stored gradients equal to zero
        outputs = model(X_train)
        loss_train = criterion(outputs, y_train) 
        loss_train.backward() # Computes the gradient of the given tensor w.r.t. the weights/bias
        optimizer.step() # Updates weights and biases with the optimizer (SGD)
        # print(f'Train loss: {np.round(loss_train.detach().numpy(), 4):0.4f}')
#         num_learning_rate_steps = 20
#         if not i % np.round(num_epochs / num_learning_rate_steps):
#             results.append(
#                 measure_performance(model=model,
#                                     X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test,
#                                     criterion=criterion))
#             results[-1]['loss_train'] = loss_train.item()
#             results[-1]['learning_rate'] = learning_rate
#             pbar.set_description(
#                 f'LR: {np.round(learning_rate, 2)}'
#                 f', Test loss: {np.round(results[-1]["loss_test"], 2)}'
#                 f', Train loss: {np.round(results[-1]["loss_train"], 3)}'
#                 f', Test acc: {np.round(results[-1]["accuracy_test"], 2)}'
#                 f', Train acc: {np.round(results[-1]["accuracy_train"], 3)}'
#             )
#             # learning_rate *= .9
    return results

SyntaxError: duplicate argument 'optimizer' in function definition (3048412353.py, line 1)

In [46]:
results = fit()

Epoch:  67%|██████████████████████████████████████████████████████████████████████████████████████████▋                                             | 20/30 [00:00<00:00, 102.87it/s]

Train loss: 0.6725
Train loss: 0.6725
Train loss: 0.6725
Train loss: 0.6725
Train loss: 0.6725
Train loss: 0.6725
Train loss: 0.6725
Train loss: 0.6725
Train loss: 0.6725
Train loss: 0.6725
Train loss: 0.6725
Train loss: 0.6725
Train loss: 0.6725
Train loss: 0.6725
Train loss: 0.6725
Train loss: 0.6725
Train loss: 0.6725
Train loss: 0.6725
Train loss: 0.6724
Train loss: 0.6724
Train loss: 0.6724
Train loss: 0.6724
Train loss: 0.6724
Train loss: 0.6724
Train loss: 0.6724
Train loss: 0.6724
Train loss: 0.6724


Epoch: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 30/30 [00:00<00:00, 106.04it/s]

Train loss: 0.6724
Train loss: 0.6724
Train loss: 0.6724





In [None]:
pd.DataFrame(results)

In [None]:
# model.score(vecs[~istrain], y[~istrain], sample_weight=df['count'][~istrain])

In [None]:
# model.classes_


In [None]:
names = ['Dewey', 'Kemal', 'Copeland', 'Vishvesh']
ourvecs = vectorizer.transform(names)
ourvecs = pd.DataFrame.sparse.from_spmatrix(ourvecs)
ourvecs.columns = vectorizer.get_feature_names_out()
ourvecs.index = list(zip(names, 'M'*len(names)))
ourvecs

In [None]:
ourtensors = 

In [None]:
names = ['Maria', 'Syndee', 'Aditi', 'Constance']
vecs = vectorizer.transform(names)
vecs = pd.DataFrame.sparse.from_spmatrix(vecs)
vecs.columns = vectorizer.get_feature_names_out()
vecs.index = list(zip(names, 'M'*len(names)))
pd.DataFrame(model.predict_proba(vecs)[:,0], index=vecs.index)

In [None]:
class LogisticRegressionNumpyNN(LogisticRegressionNN):

    def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)

    def predict_proba(self, X):
        return self.forward(make_tensor(X))
    
    def predict(self, X):
        return (np.array(self.forward(make_tesnor(X))) > 0.5).astype(int)
    
# ', '.join([v for v in dir(LogisticRegression) if v[0] != '_'])