In [1]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
import matplotlib.pyplot as plt

## Load data

In [2]:
def make_int(text):
    label = text.strip('" ')
    return int(-1 if label == '<=50K' else 1)

column_names = ['age', 'fnlwgt', 'education-num', 'capital-gain', 'capital-loss', 'hours-per-week']
train_X = pd.read_csv('data/hw2/train.csv', usecols=[0, 2, 4, 10, 11, 12], names=column_names)
train_Y = pd.read_csv('data/hw2/train.csv', usecols=[14], names=['label'],converters={'label': make_int})

test_X = pd.read_csv('data/hw2/test.csv', usecols=[0, 2, 4, 10, 11, 12], names=column_names)
# test_Y = pd.read_csv('data/hw2/test.csv', usecols=[14], names=['label']).replace('>50K', 1).replace('<=50K', -1)

In [3]:
print(f'train data shape: {train_X.shape}')
print(f'test data shape: {test_X.shape}')
train_X.head()

train data shape: (43957, 6)
test data shape: (4885, 6)


Unnamed: 0,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week
0,34,287315,9,0,0,40
1,43,145175,13,0,0,42
2,45,33798,14,0,0,40
3,23,180497,13,0,0,32
4,65,145628,6,0,0,40


In [4]:
train_Y.head()

Unnamed: 0,label
0,-1
1,1
2,-1
3,-1
4,-1


In [5]:
test_X.head()

Unnamed: 0,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week
0,36,126569,10,0,0,40
1,26,68346,14,0,0,10
2,58,225394,9,0,1902,40
3,60,78913,10,0,0,50
4,20,218215,10,0,0,30


## Process data

#### Scale data

In [6]:
scaled_train_X = pd.DataFrame(preprocessing.scale(train_X), columns=column_names)
scaled_test_X = pd.DataFrame(preprocessing.scale(test_X), columns=column_names)

  """Entry point for launching an IPython kernel.
  


In [7]:
scaled_test_X.head()

Unnamed: 0,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week
0,-0.198185,-0.608772,-0.028374,-0.152805,-0.223378,-0.038573
1,-0.926873,-1.177546,1.50318,-0.152805,-0.223378,-2.353443
2,1.404928,0.356638,-0.411262,-0.152805,4.230006,-0.038573
3,1.550665,-1.074318,-0.028374,-0.152805,-0.223378,0.73305
4,-1.364085,0.286507,-0.028374,-0.152805,-0.223378,-0.810197


#### Split data

In [8]:
def split_data(df, ratio):
    set1 = df.sample(frac=ratio)
    set2 = df.sample(frac=(1 - ratio))
    return set1, set2

In [9]:
regularization_train, regularization_validation = split_data(scaled_train_X.join(train_Y), 0.9)

In [10]:
assert regularization_train.shape[0] + regularization_validation.shape[0] == train_X.shape[0]
regularization_train.head()

Unnamed: 0,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week,label
9816,-1.286399,-0.542022,-1.199565,-0.144172,-0.21644,-0.439197,-1
33902,-0.046329,-0.333539,-0.420255,-0.144172,4.90819,-0.033567,-1
40099,-0.484001,2.269092,-0.420255,-0.144172,-0.21644,0.777693,-1
40804,1.193741,-0.388468,-1.589219,-0.144172,-0.21644,-0.033567,-1
14833,2.28792,-0.127086,-0.420255,-0.144172,-0.21644,-1.412709,-1


In [11]:
reg_validate_X = regularization_validation.drop(['label'], axis=1)
reg_validate_Y = regularization_validation['label']

## Stochastic Gradient Descent

In [91]:
def gradient_of_cost(feature_vec, label, a, b, lam=0):
    if (label * np.dot(feature_vec, a.T) + b) < 1:
        gradient_a = lam * a - (feature_vec * label)
        gradient_b = 0 - label
    else:
        gradient_a = lam * a
        gradient_b = 0
  
    return [gradient_a, gradient_b]
# gradient_of_cost(np.array([2.287920, -0.453098, 0.359055, 1.099318, -0.21644, 0.777693]), label=1, a=np.array([1,2,3,4,5,6]) )

In [92]:
def step(feature_vec, label,a, b, lam, learning_rate):
#   c = cost_function(x_batch,y_batch,a=a_est,b=b_est)
    g = gradient_of_cost(feature_vec, label,a=a,b=b,lam=lam)
    a_new = a - learning_rate * g[0]
    b_new = b - learning_rate * g[1]
    return a_new, b_new

In [97]:
def stochastic_gradient_descent(X, Y, initial_a, initial_b, steps, learning_rate, lam):
    a = initial_a
    b = initial_b

    for i in range(steps):
        n = np.random.randint(0, len(X)) # Batch size of 1.
        a, b = step(X.iloc[n], Y.iloc[n], a, b, lam, learning_rate)

    print(f"Final a,b = {(a, b)}")
    return a, b

In [98]:
def seperate_hold_out(X, size):
    random_indices = np.random.randint(0, len(X), size=size)
    return X.drop(X.index[random_indices]), X.iloc[random_indices]

In [99]:
set_hold_out_size = 50
initial_a = np.random.rand(1,6).flatten() - 1 
initial_b = 0
steps = 300
X, hold_out_set = seperate_hold_out(regularization_train, set_hold_out_size)
assert X.shape[0] + hold_out_set.shape[0] == regularization_train.shape[0]

reg_train_X = X.drop(['label'], axis=1)
reg_train_Y = X['label']

W = stochastic_gradient_descent(reg_train_X, reg_train_Y, initial_a, initial_b, steps, learning_rate=0.1, lam=1)

Final a,b = (age              -0.400580
fnlwgt           -0.126785
education-num     0.053168
capital-gain      0.248423
capital-loss      0.189215
hours-per-week   -0.156674
dtype: float64, -15.79999999999996)


## Regularization constant λ