In [1]:
import pandas as pd
import numpy as np
import joblib
np.set_printoptions(precision=2, suppress=True)

In [2]:
class G:
    # global variables
    pass

In [19]:
class LogisticRegression:
    def __init__(self, b1=0.99, b2=0.999):
        self.n = self.d = 0
        self.b1 = b1
        self.b2 = b2
        self.step_ctr = 0
    
    def __getstate__(self):
        # Copy the object's state from self.__dict__ which contains
        # all our instance attributes. Always use the dict.copy()
        # method to avoid modifying the original state.
        state = self.__dict__.copy()
        # Remove the unpicklable entries.
        del state['X']
        del state['XT']
        del state['Xval']
        del state['y']
        del state['yval']
        return state
    
    def setdata(self, X, y):
        self.n, self.d = X.shape

        self.v = [0, 0, 0, 0]
        self.w = [np.zeros(self.d) for _ in range(4)]
        self.m = [np.zeros(self.d) for _ in range(4)]
        
        # 3-fold cross validation sets, 0 is full
        ncv = (self.n + 2) // 3
        mask = [np.zeros(self.n, dtype=np.bool_) for _ in range(4)]
        for i in range(1, 4):
            mask[i][(i-1)*ncv:i*ncv] = True  

        self.X = [X[~mask[i], :] for i in range(4)]
        self.XT = [self.X[i].T for i in range(len(self.X))]
        self.Xval = [X[mask[i], :] for i in range(4)]
        self.y = [y[~mask[i]] for i in range(4)]
        self.yval = [y[mask[i]] for i in range(4)]      

    def _step(self, i, steps):
        for j in range(steps):
            fx = sigmoid(self.X[i] @ self.w[i])
            grad = self.XT[i] @ (fx - self.y[i])
            
            # Adam
            self.m[i] = self.b1 * self.m[i] + (1 - self.b1) * grad
            self.v[i] = self.b2 * self.v[i] + (1 - self.b2) * np.sum(grad ** 2)
            self.w[i] = self.w[i] - self.m[i] / (1 - self.b1) / np.sqrt(self.v[i] / (1 - self.b2))

        loss = - self.y[i] @ np.log(fx) - (1 - self.y[i]) @ np.log(1 - fx)
        return loss

    def step(self, steps=1, log=True):
        train_loss = [self._step(i, steps) for i in range(4)]
        val_accur = []
        for i in range(1, 4):
            y_pred = np.rint(sigmoid(self.Xval[i] @ self.w[i])).astype(np.int8)
            val_accur.append(np.count_nonzero(y_pred == self.yval[i]) / (len(self.yval[i]) + 1e-10))
            
        if log:
            print('train loss', np.mean(train_loss), train_loss)
            print('validation accuracy', np.mean(val_accur[1:]), val_accur)
            print(self.w[0])
            print()
            
        self.step_ctr += steps
        self.train_loss = train_loss
        self.val_accur = val_accur

def sigmoid(z):
    return 1 / (1 + np.exp(-z))

In [4]:
G.df_train_full = pd.read_csv('X_train', dtype=np.int32)
G.df_test_full = pd.read_csv('X_test', dtype=np.int32)
G.y_train = np.array(open('Y_train').read().strip('\n').split('\n'), dtype=np.int8)

In [22]:
def normalize(df, means, stds):
#     cols = ['age', 'fnlwgt', 'hours_per_week', 'capital_gain', 'capital_loss']
    cols = df.columns
    df = df.copy()
    df[cols] = (df[cols] - means[cols]) / stds[cols]
    return df

def extract(df):
    c = len(df.columns)
    n = len(df)
    d = 1 + c #+ 5*5 + 5

    X = np.zeros((n, d), dtype=np.float64)
    X[:, 0] = 1  # bias
    X[:, 1:1+c] = df.values

    # quadratic term (including cross product)
    # ['age', 'fnlwgt', 'hours_per_week', 'capital_gain', 'capital_loss']
#     idx = np.array([1, 2, 4, 5, 6])
#     st = 1 + c
#     en = 1 + c + 5*5
#     X[:, st:en] = (X[:, idx[:, None]] * X[:, idx[None, :]]).reshape((n, 5*5))
    
#     # cubic term
#     st = 1 + c + 25
#     en = 1 + c + 25 + 5
#     X[:, st:en] = X[:, idx] ** 3

    return X

def preprocess(df_train, df_test):
    df_all = pd.concat((df_train, df_test))
    means = df_all.mean()
    stds = df_all.std()

    df_train = normalize(df_train, means, stds)
    df_test = normalize(df_test, means, stds)
    X_train = extract(df_train)
    X_test = extract(df_test)
    
    print('n,d', X_train.shape)

    return X_train, X_test

G.X_train, G.X_test = preprocess(G.df_train_full, G.df_test_full)
# np.savetxt('a.csv',G.X_train,fmt='%.2f',delimiter=',')

n,d (32561, 107)


In [23]:
G.model = LogisticRegression()
G.model.setdata(G.X_train, G.y_train)
num_iter = 50
for i in range(num_iter):
    print('i', i)
    G.model.step(200, True)
#     joblib.dump(G.model, '1/'+str(i)+'.pkl')

i 0




train loss nan [nan, nan, nan, 7276.264864366456]
validation accuracy 0.8465932602753259 [0.8423622627602649, 0.8466924636078786, 0.846494056942773]
[-2.74  0.38  0.01  0.59  2.81  0.3   0.39  0.13 -0.01 -0.75  0.13  0.01
 -0.08 -0.05 -1.7  -0.02 -0.35 -0.25 -0.07 -0.24 -0.07  0.   -0.05  0.08
  0.03  0.37  0.23  0.01  0.3  -4.22  0.15  0.1   0.04  0.06  0.52 -0.07
 -0.46 -0.06 -0.2   0.08  0.01  0.1   0.24 -0.18 -0.11 -0.12 -0.35 -1.06
  0.26  0.08  0.09  0.11 -0.02 -0.07  0.07 -0.1  -0.34 -0.14  0.05  0.49
 -0.05  0.02 -0.05 -0.04  0.06  0.05  0.02 -0.02 -0.09  0.08 -0.41 -0.06
  0.04  0.04  0.05  0.02 -0.07  0.04  0.03 -0.32 -0.04  0.02 -0.03  0.04
  0.01 -0.01  0.07 -0.01  0.09 -0.04 -0.01 -0.01 -1.52 -0.02  0.07 -0.
 -0.02 -0.04  0.03 -0.07  0.03  0.08 -0.06  0.16 -0.04  0.04 -0.06]

i 1
train loss nan [nan, nan, nan, 6955.2605687092]
validation accuracy 0.8506472945291856 [0.8474295190713023, 0.849824949327429, 0.8514696397309421]
[-2.62  0.38  0.07  0.39  2.8   0.28  0.4   0.14 

train loss 7698.8621476478165 [10282.510594691603, 6790.435170099066, 6821.716823934049, 6900.786001866547]
validation accuracy 0.8519371500792521 [0.8492721577298613, 0.8522203795835558, 0.8516539205749484]
[-2.37  0.35  0.07  0.41  2.38  0.26  0.37  0.12  0.01 -0.84  0.1   0.07
 -0.07 -0.02 -1.9  -0.03 -0.15 -0.16 -0.05 -0.1  -0.12 -0.19 -0.14  0.08
  0.09  0.38  0.23 -0.04  0.31 -4.86  0.24  0.1  -0.23  0.06  0.77 -0.08
 -0.54 -0.14 -0.09 -0.01 -0.02  0.02  0.25 -0.17 -0.14 -0.07 -0.25 -0.29
  0.17  0.08  0.08  0.11 -0.03 -0.08 -0.08  0.18 -0.09 -0.3   0.09  0.26
 -0.05  0.03 -0.03 -0.03  0.03  0.04  0.03 -0.02 -0.08  0.03 -0.07 -0.
 -0.02  0.03  0.02  0.04 -0.02 -0.    0.01 -0.36 -0.02  0.    0.   -0.01
  0.01  0.02  0.05  0.01  0.03 -0.01 -0.04 -0.02 -1.7  -0.02  0.05  0.01
  0.01 -0.01  0.   -0.04  0.01 -0.01 -0.    0.13 -0.04  0.02  0.01]

i 11
train loss 7698.862146126678 [10282.510593197734, 6790.435168620346, 6821.716822267511, 6900.786000421122]
validation accuracy 0.8519371

train loss 7698.862145857265 [10282.510592853887, 6790.435168386377, 6821.716822029647, 6900.786000159152]
validation accuracy 0.8519832160457161 [0.8492721577298613, 0.8523125115164838, 0.8516539205749484]
[-2.37  0.35  0.07  0.41  2.38  0.26  0.37  0.12  0.01 -0.84  0.1   0.07
 -0.07 -0.02 -1.9  -0.03 -0.15 -0.16 -0.05 -0.1  -0.12 -0.19 -0.14  0.08
  0.09  0.38  0.23 -0.04  0.31 -4.86  0.24  0.1  -0.23  0.06  0.77 -0.08
 -0.54 -0.14 -0.09 -0.01 -0.02  0.02  0.25 -0.17 -0.14 -0.07 -0.25 -0.29
  0.17  0.08  0.08  0.11 -0.03 -0.08 -0.08  0.18 -0.09 -0.3   0.09  0.26
 -0.05  0.03 -0.03 -0.03  0.03  0.04  0.03 -0.02 -0.08  0.03 -0.07 -0.
 -0.02  0.03  0.02  0.04 -0.02 -0.    0.01 -0.36 -0.02  0.    0.   -0.01
  0.01  0.02  0.05  0.01  0.03 -0.01 -0.04 -0.02 -1.7  -0.02  0.05  0.01
  0.01 -0.01  0.   -0.04  0.01 -0.01 -0.    0.13 -0.04  0.02  0.01]

i 21
train loss 7698.862145857234 [10282.510592853852, 6790.435168386293, 6821.716822029646, 6900.786000159149]
validation accuracy 0.85198321

train loss 7698.873332434666 [10282.534683053611, 6790.441285974624, 6821.730392668929, 6900.786968041499]
validation accuracy 0.851706811757857 [0.8494564215957172, 0.851943983784772, 0.8514696397309421]
[-2.37  0.35  0.07  0.41  2.38  0.26  0.37  0.12  0.01 -0.84  0.1   0.07
 -0.07 -0.02 -1.9  -0.03 -0.15 -0.16 -0.05 -0.1  -0.12 -0.19 -0.14  0.08
  0.09  0.38  0.23 -0.04  0.31 -4.86  0.24  0.1  -0.23  0.06  0.76 -0.08
 -0.54 -0.14 -0.09 -0.01 -0.02  0.02  0.25 -0.17 -0.14 -0.07 -0.25 -0.29
  0.17  0.08  0.08  0.11 -0.03 -0.08 -0.08  0.18 -0.09 -0.3   0.09  0.26
 -0.05  0.03 -0.03 -0.03  0.03  0.04  0.03 -0.02 -0.08  0.03 -0.07 -0.
 -0.02  0.03  0.02  0.04 -0.02 -0.    0.01 -0.36 -0.02  0.    0.   -0.01
  0.01  0.02  0.05  0.01  0.03 -0.01 -0.04 -0.02 -1.7  -0.02  0.05  0.01
  0.01 -0.01  0.   -0.04  0.01 -0.01 -0.    0.13 -0.04  0.02  0.01]

i 31
train loss 7698.864407381008 [10282.514247374147, 6790.437095202043, 6821.71997814284, 6900.786308805002]
validation accuracy 0.85184501390

train loss 7698.954713577719 [10282.698735764643, 6790.515101992634, 6821.717932885746, 6900.887083667853]
validation accuracy 0.8518450139017866 [0.8490878938640054, 0.8521282476506279, 0.8515617801529453]
[-2.37  0.35  0.07  0.4   2.38  0.26  0.37  0.12  0.01 -0.84  0.1   0.07
 -0.07 -0.02 -1.9  -0.03 -0.15 -0.16 -0.05 -0.1  -0.12 -0.19 -0.14  0.08
  0.09  0.38  0.23 -0.04  0.31 -4.86  0.24  0.1  -0.23  0.06  0.76 -0.08
 -0.54 -0.14 -0.09 -0.01 -0.02  0.02  0.25 -0.17 -0.14 -0.07 -0.25 -0.29
  0.17  0.08  0.08  0.11 -0.03 -0.08 -0.08  0.18 -0.09 -0.3   0.09  0.26
 -0.05  0.03 -0.03 -0.03  0.03  0.04  0.03 -0.02 -0.08  0.03 -0.07 -0.
 -0.02  0.03  0.02  0.04 -0.02 -0.    0.01 -0.36 -0.02  0.    0.   -0.01
  0.01  0.02  0.05  0.01  0.03 -0.01 -0.04 -0.02 -1.7  -0.02  0.05  0.01
  0.01 -0.01  0.   -0.04  0.01 -0.01 -0.    0.13 -0.04  0.02  0.01]

i 41
train loss 7699.124425336464 [10283.319500023184, 6790.635333442023, 6821.750996237356, 6900.791871643294]
validation accuracy 0.85198321

In [21]:
def test(X, w):
    y = sigmoid(X @ w)
    print(y[:100])
    y = np.rint(y).astype(np.int8)
    return y

G.y_test = test(G.X_test, G.model.w[0])
df_pred = pd.DataFrame({
    'id': np.arange(1, len(G.X_test)+1),
    'label': G.y_test
})
df_pred.to_csv('submission.csv', index=False)
df_pred['label'].values[:100]

[0. 0. 0. 1. 0. 0. 0. 1. 0. 0. 1. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 1. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.
 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 1. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 0. 0. 0. 0. 0. 1. 0. 0. 1. 0. 0. 0.
 0. 0. 0. 0.]




array([0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0,
       0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0], dtype=int8)

In [27]:
yyy = joblib.load('1/49.pkl')
print(yyy.w[0])
import pickle
pickle.dump(yyy.w[0], open('log/1.pkl','wb'))

[-2.74  0.79  0.09  0.89  2.53  0.05  0.4   0.8   0.14 -1.48  0.38  0.59
 -0.05  0.03 -4.02  0.88 -0.47 -0.38 -0.05 -1.04 -0.75 -0.89 -0.67  0.76
  0.77  1.39  2.5   0.27  1.7  -8.84  2.32  0.62 -1.24  2.07  1.1  -1.18
 -1.44 -1.35 -0.7   0.17 -0.98  0.19  0.96 -0.69 -0.48 -0.17 -0.61 -3.45
  0.69  0.85  0.48  0.83  0.06 -0.6  -0.77 -0.14 -0.98 -1.15 -0.35  0.66
 -0.92 -0.23 -0.55 -0.72 -0.32  1.4   0.58 -0.52 -1.81  0.61 -1.6   0.25
 -0.46  0.65  0.78  0.69 -0.76  0.01  0.15 -0.39 -1.13  0.22  0.29 -0.17
  0.22  0.98  0.97  0.27  0.56 -0.27 -0.32 -0.5  -3.54 -0.56  0.61  0.33
  0.26 -0.22  0.08 -0.82  0.33 -0.17 -0.25  0.44 -0.86  0.9   0.03 -0.42
 -0.01 -0.13  0.06 -0.08]
