# Statistical Learning and Deep Learning HW3



### Q1

#### Q1.1

1. Read the dataset.

In [1]:
import pickle
from collections import Counter

import numpy as np
import pandas as pd
from sklearn import preprocessing

# Load the given-name dataset and preview its first rows
# (the printed preview shows the columns gname, sex and fold).
df = pd.read_csv('./ds/namesex_data_v2.csv', sep=',')
print(df.head(n=5))

  gname  sex  fold
0    承憲    1     9
1    均平    1     7
2    思安    0     6
3    佑誠    1     3
4    乃馨    0     0


2. Split the dataset into training, validation, stack, and test.

In [2]:
# Partition the rows by their pre-assigned fold value:
# folds 0-6 -> training, 7 -> validation, 8 -> stack, 9 -> test.
# reset_index() gives every split a fresh 0..n-1 index (the old row labels
# are kept as an extra 'index' column).
train = df[df['fold'] <= 6].reset_index()
valid = df[df['fold'] == 7].reset_index()
stack = df[df['fold'] == 8].reset_index()
test = df[df['fold'] == 9].reset_index()

# For each split, x is the Series of given names and y the label array.
x_train, y_train = train['gname'], train['sex'].to_numpy()
x_valid, y_valid = valid['gname'], valid['sex'].to_numpy()
x_stack, y_stack = stack['gname'], stack['sex'].to_numpy()
x_test, y_test = test['gname'], test['sex'].to_numpy()

# Sanity check: x and y of every split must have the same number of rows.
print(f'x_train: {x_train.shape}, y_train: {y_train.shape}')
print(f'x_valid: {x_valid.shape}, y_valid: {y_valid.shape}')
print(f'x_stack: {x_stack.shape}, y_stack: {y_stack.shape}')
print(f'x_test: {x_test.shape}, y_test: {y_test.shape}')

x_train: (7483,), y_train: (7483,)
x_valid: (1110,), y_valid: (1110,)
x_stack: (1073,), y_stack: (1073,)
x_test: (1064,), y_test: (1064,)


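3. Apply one-hot encoding: each name becomes a binary vector with one indicator per character or full name that occurs at least twice in the training set, plus a catch-all `_Other_Feature_` column.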

In [3]:
# Build the feature vocabulary from the TRAINING names only.
# Every name contributes the name itself as one token; names longer than one
# character additionally contribute each of their characters.
tokens = []
for name in x_train.to_numpy():
    if len(name) > 1:
        tokens.extend(name)
    tokens.append(name)

# Count every token in a single pass (the previous list.count() per distinct
# token rescanned the whole token list each time).
token_count = Counter(tokens)
token_count.pop(' ', None)  # a blank is never a feature

feature_set = set(token_count)

# Keep tokens seen at least twice. Counter preserves first-occurrence order,
# so the resulting column order is the same on every run instead of depending
# on set iteration order.
feature_count = {tok: cnt for tok, cnt in token_count.items() if cnt >= 2}
feature = list(feature_count)

# Catch-all column for tokens that did not make it into the vocabulary.
feature.append('_Other_Feature_')
print(f'# of feature: {len(feature)}')

# of feature: 1630


In [6]:
def oneHot(df):
    """Encode names as binary indicator rows over the global ``feature`` list.

    For each name the candidate tokens are its individual characters and, when
    the name is longer than one character, the full name as well. Every
    candidate found in ``feature`` sets that column to 1. If at least one
    candidate is not in ``feature``, the ``'_Other_Feature_'`` column is set
    to 1 too.

    Parameters
    ----------
    df : sequence of str
        The names to encode (a pandas Series in this notebook, despite the
        parameter name). Names are read by position, so any index is accepted.

    Returns
    -------
    tuple
        ``(one_hot, one_hot.columns)`` where ``one_hot`` is an integer
        DataFrame with one row per name, a fresh ``0..n-1`` index and the
        entries of ``feature`` as columns.
    """
    # Map token -> column position once; avoids a linear scan of the feature
    # list for every candidate token.
    col_of = {f: j for j, f in enumerate(feature)}
    names = list(df)

    # Fill a plain NumPy matrix and wrap it at the end. Writing through
    # one_hot[f][idx] is chained assignment, which pandas does not guarantee
    # to modify the original frame.
    encoded = np.zeros((len(names), len(feature)), dtype=np.int64)
    for row, name in enumerate(names):
        # create feature list
        f_list = list(name)
        if len(name) > 1:
            f_list.append(name)

        # label
        known_cols = [col_of[f] for f in f_list if f in col_of]
        if known_cols:
            encoded[row, known_cols] = 1

        # label other feature
        if len(known_cols) < len(f_list):
            encoded[row, col_of['_Other_Feature_']] = 1

    one_hot = pd.DataFrame(encoded, index=np.arange(len(names)), columns=feature)
    return one_hot, one_hot.columns

# Encode every split with the vocabulary held in `feature`.
# NOTE: each x_* name is rebound from a Series of names to its encoded
# DataFrame, so re-running this cell requires re-running the split cell first.
(x_train, x_train_col), (x_valid, x_valid_col) = oneHot(x_train), oneHot(x_valid)
(x_stack, x_stack_col), (x_test, x_test_col) = oneHot(x_stack), oneHot(x_test)

# Each encoded matrix should now have one column per entry of `feature`.
print(f'x_train: {x_train.shape}, y_train: {y_train.shape}')
print(f'x_valid: {x_valid.shape}, y_valid: {y_valid.shape}')
print(f'x_stack: {x_stack.shape}, y_stack: {y_stack.shape}')
print(f'x_test: {x_test.shape}, y_test: {y_test.shape}')

x_train: (7483, 1630), y_train: (7483,)
x_valid: (1110, 1630), y_valid: (1110,)
x_stack: (1073, 1630), y_stack: (1073,)
x_test: (1064, 1630), y_test: (1064,)


#### Q1.2



In [8]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import recall_score, precision_score, f1_score, accuracy_score

# Tune the LogisticRegression parameter C on the validation fold:
# 20 candidates spaced geometrically between 1e-4 and 1e3, scored by F1.
c_coef = np.geomspace(0.0001, 1000, num=20)
print(f'test c from {c_coef}')

best_c, best_f1 = 0.0001, -np.inf
for c in c_coef:
    # Fit on the training split only; the validation split is used for scoring.
    clf = LogisticRegression(C=c, max_iter=1000)
    clf.fit(x_train, y_train)
    y_pred = clf.predict(x_valid)
    f1 = f1_score(y_valid, y_pred)
    print(f'c: {c}, f1 score: {f1}')

    # Strict '>' means that on a tie the earlier (smaller) C is kept.
    if f1 > best_f1:
        best_c, best_f1 = c, f1
print(f'best c: {best_c}')
print(f'best f1: {best_f1}')

test c from [1.00000000e-04 2.33572147e-04 5.45559478e-04 1.27427499e-03
 2.97635144e-03 6.95192796e-03 1.62377674e-02 3.79269019e-02
 8.85866790e-02 2.06913808e-01 4.83293024e-01 1.12883789e+00
 2.63665090e+00 6.15848211e+00 1.43844989e+01 3.35981829e+01
 7.84759970e+01 1.83298071e+02 4.28133240e+02 1.00000000e+03]
c: 0.0001, f1 score: 0.0
c: 0.00023357214690901214, f1 score: 0.0
c: 0.000545559478116852, f1 score: 0.007532956685499058
c: 0.0012742749857031334, f1 score: 0.3879699248120301
c: 0.002976351441631319, f1 score: 0.7491785323110624
c: 0.0069519279617756054, f1 score: 0.8430232558139534
c: 0.01623776739188721, f1 score: 0.8638941398865785
c: 0.0379269019073225, f1 score: 0.876763875823142
c: 0.08858667904100823, f1 score: 0.887841658812441
c: 0.2069138081114788, f1 score: 0.8895184135977338
c: 0.4832930238571752, f1 score: 0.8932955618508026
c: 1.1288378916846884, f1 score: 0.8945386064030133
c: 2.6366508987303554, f1 score: 0.8953817153628653
c: 6.1584821106602545, f1 score:

In [9]:
# train
# Refit on training + validation data with the selected C, then evaluate on
# the test fold.
x_train_valid = np.concatenate((x_train, x_valid), axis=0)
y_train_valid = np.concatenate((y_train, y_valid), axis=0)
clf = LogisticRegression(C=best_c, max_iter=1000).fit(x_train_valid, y_train_valid)

# np.concatenate returns a plain ndarray, so the model is fitted without
# feature names. Predict on an ndarray as well so the input type matches what
# the model was fitted on (passing the DataFrame triggers scikit-learn's
# feature-name mismatch warning).
y_pred = clf.predict(np.asarray(x_test))
print(f'test accuracy: {accuracy_score(y_test, y_pred)}')
print(f'test precision: {precision_score(y_test, y_pred)}')
print(f'test recall: {recall_score(y_test, y_pred)}')
print(f'test f1_score: {f1_score(y_test, y_pred)}')

test accuracy: 0.8843984962406015
test precision: 0.8596837944664032
test recall: 0.893223819301848
test f1_score: 0.8761329305135951


In [23]:
# Inspect the fitted model: list the 20 features whose coefficients have the
# largest magnitude, printed from the smallest of those 20 up to the largest.
coef = clf.coef_[0]
abs_coef = np.abs(coef)
max20 = np.argsort(abs_coef)[-20:]
print('20 features with max absolute coefficient:')
for feature_name, weight in zip(x_test_col[max20], coef[max20]):
    print(feature_name, weight)

20 features with max absolute coefficient:
蓉 -7.591875204860507
松 7.611568050486719
靜 -7.651856120839868
卉 -7.81030203147433
絃 -7.847451300799661
嵐 -7.8865929876929535
潔 -8.027304272764102
綾 -8.13603792095069
婕 -8.149280074918783
妤 -8.153022708560817
薇 -8.213300618176929
鋒 8.215409855051858
萱 -8.286320153200782
美 -8.317119550063008
婷 -8.385077004701543
凌 -8.445682823137451
雯 -8.472757656139578
玲 -8.634615167127475
森 8.69677968696523
傑 9.267762713895557


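On the test set, accuracy, precision, recall, and F1 score are all above 0.85 (about 0.884, 0.860, 0.893, and 0.876, respectively).
As for the features, characters that often appear in boys' names (e.g. 傑, 森, 鋒, 松) have positive coefficients, while those that often appear in girls' names (e.g. 玲, 雯, 婷, 美) have negative coefficients; 16 of the 20 largest-magnitude coefficients are negative.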