In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn
import torch.nn.functional as F
import torch.optim
import sklearn.model_selection
np.set_printoptions(precision=2, suppress=True)

In [2]:
class G:
    # global variables
    pass

In [3]:
G.df_train_full = pd.read_csv('X_train', dtype=np.int32)
G.df_test_full = pd.read_csv('X_test', dtype=np.int32)
G.y_train = torch.tensor(
    np.array(open('Y_train').read().strip('\n').split('\n'),
             dtype=np.int8),
    dtype=torch.float64)

In [4]:
def extract(df):
    c = len(df.columns)
    n = len(df)
    d = 1 + c + 5

    X = torch.zeros((n, d), dtype=torch.float64)
    X[:, 0] = 1  # bias
    X[:, 1:1+c] = torch.tensor(df.values)

    # quadratic term
    # ['age', 'fnlwgt', 'hours_per_week', 'capital_gain', 'capital_loss']
    idx = [1,2,4,5,6]
    X[:, 1+c:1+c+5] = X[:, idx] ** 2

    return X

def preprocess(df_train, df_test):
    X_train = extract(df_train)
    X_test = extract(df_test)
    X_full = torch.cat((X_train, X_test), dim=0)
    
    means = torch.mean(X_full, dim=0)
    stds = torch.std(X_full, dim=0)

    # normalize
    # ['age', 'fnlwgt', 'hours_per_week', 'capital_gain', 'capital_loss']
    idx = [1,2,4,5,6]
    X_train[:, idx] = (X_train[:, idx] - means[idx]) / stds[idx]
    X_test[:, idx] = (X_test[:, idx] - means[idx]) / stds[idx]

    return X_train, X_test, means, stds

G.X_train, G.X_test, G.means, G.stds = preprocess(G.df_train_full, G.df_test_full)

In [5]:
# np.savetxt('a.csv',G.X_train,fmt='%.2f',delimiter=',')

In [19]:
class LogisticRegression(torch.nn.Module):
    def __init__(self, dim):
        super().__init__()
        self.linear = torch.nn.Linear(dim, 1, bias=False).type(torch.float64)

    def forward(self, x):
        fx = F.sigmoid(self.linear(x))
        return fx
    
    def weights(self):
        return (self.linear.weight, self.linear.bias)

In [20]:
class Trainer:
    def __init__(self, k, dim):
        self.k = k
        self.kfold = sklearn.model_selection.KFold(k)
        self.models = [LogisticRegression(dim) for _ in range(k+1)]
        self.optim = [torch.optim.Adam(
                            self.models[i].parameters())
                      for i in range(k+1)]
        self.loss_func = torch.nn.BCELoss()
        self.step_ctr = 0
        self.train_loss = None
        self.val_accur = None
    
    def __getstate__(self):
        # Copy the object's state from self.__dict__ which contains
        # all our instance attributes. Always use the dict.copy()
        # method to avoid modifying the original state.
        state = self.__dict__.copy()
        # Remove the unpicklable entries.
        del state['X']
        del state['Xval']
        del state['y']
        del state['yval']
        return state
    
    def setdata(self, X, y):
        self.n, self.d = X.shape
        
        # K-fold cross validation sets, index 0 is full dataset
        self.X = [X]
        self.Xval = [None]
        self.y = [y]
        self.yval = [None]
        
        for idx_train, idx_val in self.kfold.split(X, y):
            self.X.append(X[idx_train])
            self.y.append(y[idx_train])
            self.Xval.append(X[idx_val])
            self.yval.append(y[idx_val])
        
        # one-hot form of y
#         self.y_oh = []
#         self.yval_oh = []
#         for i in range(self.k+1):
#             self.y_oh.append(torch.zeros((len(self.y[i]), 2)))
#             self.y_oh[i].index_select(0, self.y[i].type(torch.int64))
#             if i == 0:
#                 self.yval_oh.append(None)
#             else:
#                 self.yval_oh.append(torch.zeros((len(self.yval[i]), 2)))
#                 self.yval_oh[i].index_select(0, self.yval[i].type(torch.int64))
            
    def _step(self, i, steps):
        for j in range(steps):
            fx = self.models[i](self.X[i])
            loss = self.loss_func(fx, self.y[i])
            self.optim[i].zero_grad()
            loss.backward()
            print(loss)
            print(self.models[i].linear.weight.grad)
            self.optim[i].step()
        return loss.item()
    
    def step(self, steps=1, log=True):
        train_loss = [self._step(i, steps) for i in range(self.k+1)]
        val_accur = []
        for i in range(1, self.k+1):
            y_pred = torch.round(torch.squeeze(self.models[i](self.Xval[i]))).type(torch.bool)
            val_accur.append(torch.sum(y_pred == self.yval[i]) / (len(self.yval[i]) + 1e-10))
            
        if log:
            print('train loss', np.mean(train_loss), train_loss)
            print('validation accuracy', np.mean(val_accur), val_accur)
            print(self.models[0].weights())
            print()
            
        self.step_ctr += steps
        self.train_loss = train_loss
        self.val_accur = val_accur

In [21]:
G.trainer = Trainer(3, G.X_train.shape[1])
G.trainer.setdata(G.X_train, G.y_train)
num_iter = 1
for i in range(num_iter):
    print('i', i)
    G.trainer.step(10, True)
#         joblib.dump(G.model, '2/'+str(i)+'.pkl')


i 0
tensor(76.0634, dtype=torch.float64, grad_fn=<BinaryCrossEntropyBackward>)
tensor([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]],
       dtype=torch.float64)
tensor(76.0634, dtype=torch.float64, grad_fn=<BinaryCrossEntropyBackward>)
tensor([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        

In [9]:
def predict(X, w):
    y = sigmoid(X @ w)
    print(y[:100])
    y = np.rint(y).astype(np.int8)
    return y

G.y_test = predict(G.X_test, G.model.w[0])#joblib.load('logistic-85589.pkl'))
df_pred = pd.DataFrame({
    'id': np.arange(1, len(G.X_test)+1),
    'label': G.y_test
})
df_pred.to_csv('submission.csv', index=False)
print(df_pred['label'].values[:100])

[0.   0.16 0.31 0.83 0.   0.01 0.01 0.87 0.   0.04 0.68 0.6  0.01 0.25
 0.6  0.86 0.   0.34 0.01 0.83 0.65 0.   0.   0.03 0.37 0.87 0.   0.01
 0.3  0.07 0.91 0.01 0.05 0.17 0.01 0.21 0.6  0.   0.   0.   0.76 0.59
 0.14 0.04 0.   0.41 0.06 0.77 0.   0.18 0.   0.01 0.   0.38 0.04 0.
 0.28 0.86 0.9  0.12 0.09 0.11 0.06 0.   0.03 0.02 0.99 0.   0.07 0.42
 1.   0.62 0.06 0.   0.06 0.   0.26 0.23 0.35 0.   0.42 0.08 0.03 1.
 0.2  0.02 0.   0.51 0.01 0.86 0.1  0.01 0.8  0.27 0.04 0.   0.02 0.1
 0.32 0.   0.28 0.01 0.   0.01 0.03 0.01 0.4  0.02 0.   0.   0.   0.18
 0.41 0.   0.22 0.01 0.32 0.   0.03 0.1  0.08 0.   0.01 0.07 0.01 0.01
 0.95 0.01 0.19 0.97 0.42 0.   0.   0.89 0.35 0.03 0.41 0.09 0.2  0.02
 0.76 0.   0.94 0.85 0.03 0.06 0.09 0.25 1.   0.02 0.   0.03 0.05 0.01
 0.78 0.   0.01 0.03 0.01 0.33 0.07 0.23 0.09 0.04 0.77 0.39 0.62 0.35
 0.   0.1  0.79 0.22 0.02 0.48 0.06 0.54 0.13 0.25 0.35 0.59 0.39 1.
 0.03 0.3  0.03 0.08 0.05 0.01 0.03 0.   0.88 0.29 0.04 0.01 0.01 0.03
 0.1  0.94 0.

array([0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1,
       0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1,
       0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0,
       0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,