In [62]:
%load_ext autoreload
%autoreload 2

numerical_features = ["age", "fnlwgt", "education-num",  "capital-gain",
                      "capital-loss", "hours-per-week"]
categorical_features = ["workclass", "education","marital-status", "occupation", 
                      "relationship", "race", "sex", "native-country",]

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

def encode_categorical_features(df):
    ret = {}
    for col in categorical_features:
        encoder = LabelEncoder()
        encoder.fit(df[col])
        df[col] = encoder.transform(df[col])
        ret[col] = encoder
    return ret
 
def encode_categorical_features_for_test(df, encoders):
    for col in categorical_features:
        encoder = encoders[col]
        df[col] = encoder.transform(df[col])

train = pd.read_csv('../data/adult/train.csv', names=["age", "workclass", "fnlwgt", "education", 
                                                "education-num", "marital-status", "occupation", 
                                                "relationship", "race", "sex", "capital-gain",
                                                "capital-loss", "hours-per-week",
                                                "native-country", "target"],
                    index_col=False, sep=", ")
encoders = encode_categorical_features(train)

test = pd.read_csv('../data/adult/test.csv', names=["age", "workclass", "fnlwgt", "education", 
                                                "education-num", "marital-status", "occupation", 
                                                "relationship", "race", "sex", "capital-gain",
                                                "capital-loss", "hours-per-week",
                                                "native-country", "target"],
                   index_col=False, sep=", ")
test["target"] = test.target.apply(lambda s: s.rstrip('.'))
encode_categorical_features_for_test(test, encoders)


y_train = train.target.values
del train['target']
X_train = train.as_matrix()

y_test = test.target.values
del test['target']
X_test = test.as_matrix()


one_hot = OneHotEncoder(categorical_features=[1, 3, 5, 6, 7, 8, 9, 13],
                        sparse=False,
                        handle_unknown="ignore")
X_train = one_hot.fit_transform(X_train)
X_test = one_hot.transform(X_test)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload




In [63]:
def feature_map(label_encoders, one_hot_encoder):
    def get_categorical_index(feature_range, f):
        for i in xrange(len(feature_range)):
            if feature_range[i] <= f < feature_range[i+1]:
                return i
    
    ret = {}
    feature_range = one_hot_encoder.feature_indices_
    for f in one_hot_encoder.active_features_:
        categorical_idx = get_categorical_index(feature_range, f)
        column_name = categorical_features[categorical_idx]
        label_encoder = label_encoders[column_name]
        feature_value = label_encoder.inverse_transform(f - feature_range[categorical_idx])
        ret[f] = "%s=%s" % (column_name, feature_value)
    for i, col in enumerate(numerical_features):
        ret[f+i] = col
    return ret
        

In [66]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from simple_ml.logistic_regression import LogisticRegression as LR
#est = RandomForestClassifier(n_estimators=100, oob_score=True)

from sklearn.preprocessing import MinMaxScaler
mm = MinMaxScaler()
X_train = mm.fit_transform(X_train)
X_test = mm.transform(X_test)

est = LR(learning_rate=1e-3, C = 1000, print_every = 100, verbose = True)
est.fit(X_train, y_train)
yp = est.predict(X_test)

print f1_score(y_test, yp, pos_label='>50K'), \
    precision_score(y_test, yp, pos_label='>50K'), \
    recall_score(y_test, yp, pos_label='>50K'), accuracy_score(y_test, yp)

epoch: 100, loss: 361.040911, diff: 0.161725
epoch: 200, loss: 350.816491, diff: 0.067583
epoch: 300, loss: 345.678662, diff: 0.039517
epoch: 400, loss: 342.460016, diff: 0.026397
epoch: 500, loss: 340.219744, diff: 0.019123
epoch: 600, loss: 338.548295, diff: 0.014686
epoch: 700, loss: 337.235173, diff: 0.011798
epoch: 800, loss: 336.161036, diff: 0.009822
epoch: 900, loss: 335.253525, diff: 0.008417
epoch: 1000, loss: 334.466524, diff: 0.007383
epoch: 1100, loss: 333.769480, diff: 0.006600
epoch: 1200, loss: 333.141496, diff: 0.005990
epoch: 1300, loss: 332.567903, diff: 0.005504
epoch: 1400, loss: 332.038160, diff: 0.005108
epoch: 1500, loss: 331.544544, diff: 0.004778
epoch: 1600, loss: 331.081291, diff: 0.004498
epoch: 1700, loss: 330.644029, diff: 0.004256
epoch: 1800, loss: 330.229387, diff: 0.004044
epoch: 1900, loss: 329.834725, diff: 0.003855
epoch: 2000, loss: 329.457944, diff: 0.003685
epoch: 2100, loss: 329.097347, diff: 0.003531
epoch: 2200, loss: 328.751542, diff: 0.0033

In [73]:
est = LR(learning_rate=0.01, C = 30, verbose = False)
idx = range(len(y_train))
for epoch in xrange(50000):
    import random; random.shuffle(idx)
    loss = est.partial_fit(X_train[idx[:128], :], y_train[idx[:128]])
    if epoch % 100 == 0:
        print loss
    
yp = est.predict(X_test)

print f1_score(y_test, yp, pos_label='>50K'), \
    precision_score(y_test, yp, pos_label='>50K'), \
    recall_score(y_test, yp, pos_label='>50K'), accuracy_score(y_test, yp)

20.7944154168
11.4535017499
10.0233401114
8.97699676224
10.8452445361
10.6550673729
8.57766572767
9.21062899891
10.7380562034
9.63277653476
9.66348790479
8.7695429698
12.1047279088
10.5358333314
10.9006779499
12.3478442555
9.47739263528
10.842326165
10.6146828773
11.5933962594
10.0785549001
10.4444584268
9.33245286614
9.29698215483
9.11034281863
9.46452577657
8.9377761077
8.39235757552
10.4610176258
9.6860801238
10.0573128455
9.84414521091
11.3345510217
14.785238066
10.198134043
8.17120299742
9.31543953385
10.8367621158
9.37873654153
11.2377239579
8.00217150307
11.9822410323
9.29134065332
11.4368374591
7.6528908688
11.4106081023
11.7795389236
7.75812388301
11.5637556885
10.1055602551
8.87653736965
11.6817135927
8.94007619641
8.49670942755
9.2494372817
7.25835929972
10.1735209862
10.9486713647
12.6853146998
10.7944595711
12.2601171268
11.9575977162
10.8621571815
9.70828115963
8.26726694781
12.139766925
8.72699769008
9.91870353065
10.8220980615
9.73859844958
12.1625748136
11.4464338802
1

In [72]:
10000/ (len(y_train) / 128)

39

In [35]:
yp = est.predict(X_test)

print f1_score(y_test, yp, pos_label='>50K'), \
    precision_score(y_test, yp, pos_label='>50K'), \
    recall_score(y_test, yp, pos_label='>50K'), accuracy_score(y_test, yp)
    
#print est.oob_score_

0.609186858927 0.711211384936 0.532761310452 0.838523432222


In [36]:
est = LogisticRegression()
est.fit(X_train, y_train)
yp = est.predict(X_test)

print f1_score(y_test, yp, pos_label='>50K'), \
    precision_score(y_test, yp, pos_label='>50K'), \
    recall_score(y_test, yp, pos_label='>50K'), accuracy_score(y_test, yp)

0.651749856569 0.726807421625 0.590743629745 0.850869111234


In [11]:
X_test.shape, yp.shape

((16281, 108), (2,))

In [108]:
one_hot.feature_indices_, X_train.shape

(array([  0,   9,  25,  32,  47,  53,  58,  60, 102]), (32561, 108))

In [109]:
one_hot.active_features_

array([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,
        13,  14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,
        26,  27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,
        39,  40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,
        52,  53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,
        65,  66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,
        78,  79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,
        91,  92,  93,  94,  95,  96,  97,  98,  99, 100, 101])

In [93]:
np.unique(yp)

array(['<=50K', '>50K'], dtype=object)

In [131]:
encoders['relationship'].inverse_transform([0, 1, 2, 3, 4, 5])

array(['Husband', 'Not-in-family', 'Other-relative', 'Own-child',
       'Unmarried', 'Wife'], dtype=object)

In [147]:
t = feature_map(encoders, one_hot)
factors = {}
for i in t:
    factors[t[i]] = est.feature_importances_[i]
sorted(factors.items(), key = lambda entry: -entry[1])

[('education-num', 0.16401887115242858),
 ('fnlwgt', 0.14975915822985375),
 ('capital-loss', 0.092639631406943396),
 ('marital-status=Married-civ-spouse', 0.066191952583513153),
 ('capital-gain', 0.060522737999528696),
 ('relationship=Husband', 0.045345842235381004),
 ('hours-per-week', 0.029940768203915905),
 ('marital-status=Never-married', 0.02377248675577362),
 ('occupation=Exec-managerial', 0.018464862874700914),
 ('occupation=Prof-specialty', 0.015157510142159389),
 ('education=Bachelors', 0.012026281923540591),
 ('relationship=Wife', 0.010791971195198244),
 ('workclass=Private', 0.010115619632280635),
 ('education=Masters', 0.0096513375786985676),
 ('relationship=Not-in-family', 0.0096486350722927508),
 ('sex=Female', 0.0092107944906236923),
 ('workclass=Self-emp-not-inc', 0.0083175793058911113),
 ('education=HS-grad', 0.0079101626806830968),
 ('sex=Male', 0.0078782451118840809),
 ('occupation=Other-service', 0.0075570147767558826),
 ('occupation=Sales', 0.0067471481074680949),


In [146]:
factors.items()[:10]

[('native-country=Scotland', 7.7783795610909129e-05),
 ('marital-status=Married-civ-spouse', 0.066191952583513153),
 ('occupation=Armed-Forces', 7.0970759921354901e-06),
 ('education=Bachelors', 0.012026281923540591),
 ('marital-status=Never-married', 0.02377248675577362),
 ('native-country=El-Salvador', 0.00022545060224273239),
 ('native-country=Guatemala', 0.0001304308110601515),
 ('workclass=State-gov', 0.0045157118141743221),
 ('native-country=Cambodia', 0.00030422875680783719),
 ('race=Amer-Indian-Eskimo', 0.00122692168056128)]

In [163]:
X_train[:, 104]

array([ 13.,  13.,   9., ...,   9.,   9.,   9.])