In [1]:
import pandas as pd
import numpy as np
import joblib
np.set_printoptions(precision=2, suppress=True)

In [2]:
class G:
    # global variables
    pass

In [3]:
class LogisticRegression:
    def __init__(self, X, y, b1=0.99, b2=0.999):
        self.n = self.d = 0
        self.b1 = b1
        self.b2 = b2

        self.w = [np.zeros(self.d) for _ in range(4)]
        self.m = [np.zeros(self.d) for _ in range(4)]
        self.v = [0, 0, 0, 0]
        self.step_ctr = 0
    
    def __getstate__(self):
        # Copy the object's state from self.__dict__ which contains
        # all our instance attributes. Always use the dict.copy()
        # method to avoid modifying the original state.
        state = self.__dict__.copy()
        # Remove the unpicklable entries.
        del state['X']
        del state['XT']
        del state['Xval']
        del state['y']
        del state['yval']
        return state
    
    def setdata(self, X, y):
        self.n, self.d = X.shape

        self.X = [X[~mask[i], :] for i in range(4)]
        self.XT = [self.X[i].T for i in range(len(self.X))]
        self.Xval = [X[mask[i], :] for i in range(4)]
        self.y = [y[~mask[i]] for i in range(4)]
        self.yval = [y[mask[i]] for i in range(4)]
        
        # 3-fold cross validation sets, 0 is full
        ncv = (self.n + 2) // 3
        mask = [np.zeros(self.n, dtype=np.bool_) for _ in range(4)]
        for i in range(1, 4):
            mask[i][(i-1)*ncv:i*ncv] = True        

    def _step(self, i, steps):
        for j in range(steps):
            fx = sigmoid(self.X[i] @ self.w[i])
            grad = self.XT[i] @ (fx - self.y[i])
            self.m[i] = self.b1 * self.m[i] + (1 - self.b1) * grad
            self.v[i] = self.b2 * self.v[i] + (1 - self.b2) * np.sum(grad ** 2)
            self.w[i] = self.w[i] - self.m[i] / (1 - self.b1) / np.sqrt(self.v[i] / (1 - self.b2))
        loss = - self.y[i] @ np.log(fx) - (1 - self.y[i]) @ np.log(1 - fx)
        return loss
    
    def step(self, steps=1, log=True):
        train_loss = [self._step(i, steps) for i in range(4)]
        val_accur = []
        for i in range(4):
            y_pred = np.rint(sigmoid(self.Xval[i] @ self.w[i])).astype(np.int8)
            val_accur.append(np.count_nonzero(y_pred == self.yval[i]) / (len(self.yval[i]) + 1e-10))
            
        if log:
            print('train loss', np.mean(train_loss), train_loss)
            print('validation accuracy', np.mean(val_accur[1:]), val_accur)
            print(self.w[0])
            print()
            
        self.step_ctr += steps
        self.train_loss = train_loss
        self.val_accur = val_accur

def sigmoid(z):
    return 1 / (1 + np.exp(-z))

In [4]:
G.df_train_full = pd.read_csv('X_train', dtype=np.int32)
G.df_test_full = pd.read_csv('X_test', dtype=np.int32)
G.y_train = np.array(open('Y_train').read().strip('\n').split('\n'), dtype=np.int8)

In [5]:
def extract(df):
    c = len(df.columns)
    n = len(df)
    d = 1 + c + 5

    X = np.zeros((n, d))
    X[:, 0] = 1  # bias
    X[:, 1:1+c] = df.values

    # ['age', 'fnlwgt', 'hours_per_week', 'capital_gain', 'capital_loss']
    idx = [1,2,4,5,6]
    X[:, 1+c:1+c+5] = X[:, idx] ** 2

    return X


def preprocess_train(df):
    df = df.copy()
    col_mean = df.mean()
    col_std = df.std()

    # normalize
    normcols = ['age', 'fnlwgt', 'hours_per_week']
    normcols += ['capital_gain', 'capital_loss']
    df[normcols] = (df[normcols] - df[normcols].mean()) / df[normcols].std()
#     scale_down_cols = ['capital_gain', 'capital_loss']
#     df[scale_down_cols] = df[scale_down_cols] / df[scale_down_cols].mean()

    return extract(df), normcols, col_mean, col_std

G.X_train, G.normcols, G.col_mean, G.col_std = preprocess_train(G.df_train_full)

In [15]:
# np.savetxt('a.csv',G.X_train,fmt='%.2f',delimiter=',')
#print(G.X_train[:10, :10])

In [7]:
# G.model = joblib.load('1/49.pkl')

In [57]:
G.model = LogisticRegression()
G.model.setdata(G.X_train, G.y_train)
num_iter = 50
for i in range(num_iter):
    print('i', i)
    G.model.step(200, True)
    joblib.dump(G.model, '1/'+str(i)+'.pkl')

i 0




train loss nan [nan, nan, nan, nan]
validation accuracy 0.847394119627802 [0.0, 0.8471531232725185, 0.8477059148700862, 0.8473233207408012]
[-2.51  0.83  0.09  0.5  -0.31  0.09  0.23  0.14 -0.65 -0.03  0.2   0.16
 -1.22 -0.54 -0.2  -0.37 -1.38 -1.32 -0.89 -1.22 -1.76 -2.35 -1.8   0.36
  0.23  1.18  3.17  0.13  1.35 -0.71  2.4   0.09 -0.91  0.69  1.11 -1.02
 -0.87 -1.39 -0.13  0.07 -0.09 -0.26  0.65 -0.98 -0.58 -0.52 -0.38 -0.63
  0.19  0.27  0.37  0.09 -0.34 -0.39 -0.32 -0.18 -1.32 -1.58 -0.53  1.42
 -1.29 -0.17 -0.   -0.91 -0.14  0.3   0.43 -0.55 -0.68  0.34 -0.6  -0.07
 -0.43  0.37  0.28  0.55 -0.43 -0.09 -0.06 -0.01 -0.03  0.01  0.03 -0.32
 -0.02  0.22  0.53  0.02  0.33 -0.06 -1.38 -0.25 -0.14 -0.17  0.76  0.04
 -0.09 -0.54 -0.01 -0.64  0.16 -0.07 -0.1   0.33 -0.49  0.17 -0.14 -0.56
 -0.06  4.14  0.08 -0.14]

i 1
train loss nan [nan, nan, nan, nan]
validation accuracy 0.8535057783691529 [0.0, 0.8525889073152676, 0.8527731711811235, 0.8551552566110674]
[-2.13  0.76  0.08  0.89  0.42 

i 10
train loss 7494.394894621808 [10010.026886241196, 6624.889347532362, 6654.331244056241, 6688.332100657433]
validation accuracy 0.855194928889075 [0.0, 0.8521282476506279, 0.8561820526994578, 0.8572744863171394]
[-2.3   0.79  0.09  0.88  2.52  0.05  0.4   0.49 -0.17 -0.44  0.07  0.28
 -0.35 -0.28 -2.07  0.17 -0.71 -0.62 -0.29 -1.28 -0.98 -1.12 -0.91  0.52
  0.53  1.15  2.26  0.03  1.46 -4.8   2.08  0.38 -1.17  2.12  1.15 -1.11
 -1.37 -1.28 -0.63  0.12 -0.79  0.14  0.9  -0.75 -0.54 -0.22 -0.66 -2.88
  0.64  0.8   0.43  0.78  0.01 -0.26 -0.68 -0.08 -0.93 -1.08 -0.29  0.75
 -0.83 -0.14 -0.47 -0.63 -0.24  1.32  0.52 -0.59 -1.88  0.55 -1.6   0.19
 -0.52  0.59  0.72  0.62 -0.82 -0.06  0.08 -0.06 -0.53  0.15  0.22 -0.23
  0.15  0.91  0.9   0.2   0.49 -0.34 -0.41 -0.57 -1.49 -0.62  0.55  0.27
  0.19 -0.28  0.01 -0.88  0.26 -0.25 -0.32  0.37 -0.93  0.83 -0.04 -0.42
 -0.01 -0.13  0.05 -0.08]

i 11
train loss 7494.091568481148 [10009.666496838783, 6624.722771494544, 6653.885822841925, 6688.09

i 20
train loss 7495.335591811208 [10013.211371980018, 6628.702564993711, 6652.138585205531, 6687.28984506557]
validation accuracy 0.8553484934293886 [0.0, 0.8523125115164838, 0.8560899207665298, 0.857643048005152]
[-2.55  0.79  0.09  0.89  2.53  0.05  0.4   0.63 -0.03 -0.88  0.22  0.42
 -0.21 -0.14 -3.05  0.49 -0.57 -0.47 -0.14 -1.14 -0.84 -0.98 -0.77  0.67
  0.68  1.3   2.41  0.18  1.61 -7.24  2.23  0.53 -1.21  2.1   1.13 -1.15
 -1.41 -1.32 -0.67  0.17 -0.98  0.18  0.95 -0.7  -0.49 -0.18 -0.61 -3.39
  0.69  0.85  0.47  0.83  0.06 -0.39 -0.74 -0.11 -0.95 -1.12 -0.32  0.7
 -0.88 -0.19 -0.52 -0.68 -0.28  1.36  0.55 -0.56 -1.84  0.58 -1.62  0.22
 -0.49  0.62  0.75  0.66 -0.79 -0.02  0.12 -0.15 -0.96  0.19  0.25 -0.2
  0.19  0.94  0.94  0.24  0.52 -0.3  -0.35 -0.53 -2.47 -0.59  0.58  0.3
  0.23 -0.25  0.04 -0.85  0.29 -0.21 -0.28  0.41 -0.89  0.87 -0.01 -0.42
 -0.01 -0.13  0.06 -0.08]

i 21
train loss 7494.7402859694885 [10010.26691601196, 6628.944072045899, 6652.396820136721, 6687.353335

i 30
train loss nan [10013.997655455052, 6624.1197032871205, nan, 6687.090302236726]
validation accuracy 0.855164215415074 [0.0, 0.8521282476506279, 0.8561820526994578, 0.8571823458951363]
[-2.65  0.79  0.09  0.88  2.53  0.04  0.4   0.71  0.05 -1.16  0.29  0.5
 -0.13 -0.06 -3.52  0.67 -0.52 -0.42 -0.1  -1.09 -0.79 -0.93 -0.72  0.72
  0.73  1.35  2.46  0.23  1.66 -8.09  2.28  0.58 -1.23  2.09  1.11 -1.17
 -1.42 -1.34 -0.69  0.17 -0.98  0.19  0.95 -0.69 -0.49 -0.17 -0.61 -3.45
  0.69  0.85  0.48  0.83  0.06 -0.49 -0.76 -0.13 -0.97 -1.14 -0.34  0.68
 -0.9  -0.21 -0.54 -0.7  -0.3   1.38  0.56 -0.54 -1.83  0.6  -1.61  0.24
 -0.48  0.63  0.76  0.67 -0.78 -0.    0.13 -0.24 -1.09  0.2   0.27 -0.19
  0.2   0.96  0.95  0.25  0.54 -0.28 -0.34 -0.51 -2.98 -0.57  0.59  0.31
  0.24 -0.24  0.06 -0.83  0.31 -0.19 -0.27  0.42 -0.88  0.88  0.01 -0.42
 -0.01 -0.13  0.04 -0.08]

i 31
train loss nan [10011.409324159657, 6623.943264166245, nan, 6687.5327502498985]
validation accuracy 0.8552870693110782 [0.0

i 40
train loss nan [10014.6196615714, 6626.386813080211, nan, 6687.115932376668]
validation accuracy 0.8552563586667689 [0.0, 0.8522203795835558, 0.8559977888336019, 0.8575509075831488]
[-2.7   0.79  0.09  0.89  2.53  0.05  0.4   0.76  0.1  -1.35  0.35  0.55
 -0.08 -0.01 -3.82  0.79 -0.49 -0.39 -0.07 -1.06 -0.77 -0.9  -0.69  0.75
  0.75  1.38  2.48  0.26  1.68 -8.55  2.31  0.61 -1.24  2.08  1.11 -1.17
 -1.43 -1.35 -0.7   0.17 -0.98  0.19  0.95 -0.69 -0.49 -0.17 -0.61 -3.45
  0.69  0.85  0.48  0.83  0.06 -0.56 -0.77 -0.14 -0.98 -1.15 -0.35  0.67
 -0.91 -0.22 -0.55 -0.71 -0.31  1.39  0.57 -0.53 -1.82  0.61 -1.6   0.25
 -0.47  0.64  0.77  0.68 -0.77  0.01  0.14 -0.32 -1.13  0.21  0.28 -0.18
  0.21  0.97  0.96  0.26  0.55 -0.27 -0.32 -0.5  -3.31 -0.56  0.6   0.32
  0.25 -0.23  0.07 -0.82  0.32 -0.18 -0.26  0.43 -0.87  0.89  0.02 -0.42
 -0.01 -0.13  0.06 -0.08]

i 41
train loss nan [10012.868394338631, 6626.797697642041, nan, 6687.940350624745]
validation accuracy 0.8552256423630761 [0.0, 

In [8]:
def preprocess_test(df, normcols, col_mean, col_std):
    df = df.copy()
    
    # normalize
    for cnam in normcols:
        df[cnam] = (df[cnam] - col_mean[cnam]) / col_std[cnam]
    
    return extract(df)

G.X_test = preprocess_test(G.df_test_full, G.normcols, G.col_mean, G.col_std)

In [9]:
def test(X, w):
    y = sigmoid(X @ w)
    print(y[:1000])
    y = np.rint(y).astype(np.int8)
    return y

G.y_test = test(G.X_test, G.model.w[0])#joblib.load('logistic-85589.pkl'))

[0.   0.16 0.31 0.83 0.   0.01 0.01 0.87 0.   0.04 0.68 0.6  0.01 0.25
 0.6  0.86 0.   0.34 0.01 0.83 0.65 0.   0.   0.03 0.37 0.87 0.   0.01
 0.3  0.07 0.91 0.01 0.05 0.17 0.01 0.21 0.6  0.   0.   0.   0.76 0.59
 0.14 0.04 0.   0.41 0.06 0.77 0.   0.18 0.   0.01 0.   0.38 0.04 0.
 0.28 0.86 0.9  0.12 0.09 0.11 0.06 0.   0.03 0.02 0.99 0.   0.07 0.42
 1.   0.62 0.06 0.   0.06 0.   0.26 0.23 0.35 0.   0.42 0.08 0.03 1.
 0.2  0.02 0.   0.51 0.01 0.86 0.1  0.01 0.8  0.27 0.04 0.   0.02 0.1
 0.32 0.   0.28 0.01 0.   0.01 0.03 0.01 0.4  0.02 0.   0.   0.   0.18
 0.41 0.   0.22 0.01 0.32 0.   0.03 0.1  0.08 0.   0.01 0.07 0.01 0.01
 0.95 0.01 0.19 0.97 0.42 0.   0.   0.89 0.35 0.03 0.41 0.09 0.2  0.02
 0.76 0.   0.94 0.85 0.03 0.06 0.09 0.25 1.   0.02 0.   0.03 0.05 0.01
 0.78 0.   0.01 0.03 0.01 0.33 0.07 0.23 0.09 0.04 0.77 0.39 0.62 0.35
 0.   0.1  0.79 0.22 0.02 0.48 0.06 0.54 0.13 0.25 0.35 0.59 0.39 1.
 0.03 0.3  0.03 0.08 0.05 0.01 0.03 0.   0.88 0.29 0.04 0.01 0.01 0.03
 0.1  0.94 0.

In [10]:
df_pred = pd.DataFrame({
    'id': np.arange(1, len(G.X_test)+1),
    'label': G.y_test
})
df_pred.to_csv('submission.csv', index=False)
df_pred['label'].values[:1000]

array([0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1,
       0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1,
       0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0,
       0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,