In [87]:
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.model_selection import KFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_validate

Importing dataset

In [15]:
dataset_raw = pd.read_csv("dataset_183_adult.csv",header=0)
print(dataset_raw.head(10))

   age         workclass  fnlwgt  education  education-num  \
0    2         State-gov   77516  Bachelors             13   
1    3  Self-emp-not-inc   83311  Bachelors             13   
2    2           Private  215646    HS-grad              9   
3    3           Private  234721       11th              7   
4    1           Private  338409  Bachelors             13   
5    2           Private  284582    Masters             14   
6    3           Private  160187        9th              5   
7    3  Self-emp-not-inc  209642    HS-grad              9   
8    1           Private   45781    Masters             14   
9    2           Private  159449  Bachelors             13   

          marital-status         occupation   relationship   race     sex  \
0          Never-married       Adm-clerical  Not-in-family  White    Male   
1     Married-civ-spouse    Exec-managerial        Husband  White    Male   
2               Divorced  Handlers-cleaners  Not-in-family  White    Male   
3     Mar

Exploring unique values in dataset

In [30]:
for col in dataset_raw.columns:
    print("Column: " + col)
    print("Number of unique values: " + str(pd.unique(dataset_raw[col]).shape[0]))
    print(pd.unique(dataset_raw[col]))
    print("\n")

Column: age
Number of unique values: 5
[2 3 1 0 4]


Column: workclass
Number of unique values: 9
['State-gov' 'Self-emp-not-inc' 'Private' 'Federal-gov' 'Local-gov' '?'
 'Self-emp-inc' 'Without-pay' 'Never-worked']


Column: fnlwgt
Number of unique values: 28523
[ 77516  83311 215646 ... 173449  89686 350977]


Column: education
Number of unique values: 16
['Bachelors' 'HS-grad' '11th' 'Masters' '9th' 'Some-college' 'Assoc-acdm'
 'Assoc-voc' '7th-8th' 'Doctorate' 'Prof-school' '5th-6th' '10th'
 '1st-4th' 'Preschool' '12th']


Column: education-num
Number of unique values: 16
[13  9  7 14  5 10 12 11  4 16 15  3  6  2  1  8]


Column: marital-status
Number of unique values: 7
['Never-married' 'Married-civ-spouse' 'Divorced' 'Married-spouse-absent'
 'Separated' 'Married-AF-spouse' 'Widowed']


Column: occupation
Number of unique values: 15
['Adm-clerical' 'Exec-managerial' 'Handlers-cleaners' 'Prof-specialty'
 'Other-service' 'Sales' 'Craft-repair' 'Transport-moving'
 'Farming-fishing' 

In [107]:
print("FnlWgt: Continuous data stats")
print("Min: " + str(dataset_raw['fnlwgt'].min()))
print("Max: " + str(dataset_raw['fnlwgt'].max()))
print("Mean: " + str(dataset_raw['fnlwgt'].mean()))
print("Variance: " + str(dataset_raw['fnlwgt'].var()))

FnlWgt: Continuous data stats
Min: 12285
Max: 1490400
Mean: 189664.13459727284
Variance: 11152210185.574848


Since the dataset has mostly categorical data we will need to factorize them. Age, capitalloss, capitalgain, hoursperweek and education-num are factorized so we don't need to touch them. Education and education-num have a 1-1 relationship where education num is the factorized version so we can remove that column.
On the other hand fnlwgt is a continuous number, so depending on the algorithm we are using we will need to normalize using either z-score or min-max

Factorizing the categorical data

In [60]:
factorized_all = dataset_raw.loc[:, dataset_raw.columns != 'education'].copy()
to_factorize = ['class','native-country', 'sex', 'race', 'relationship', 'occupation', 'marital-status', 'workclass']
stacked = factorized_all[to_factorize].stack()
factorized_all[to_factorize] = pd.Series(stacked.factorize()[0], index=stacked.index).unstack().rank(method='dense')

In [61]:
for col in factorized_all.columns:
    print("Column: " + col)
    print("Number of unique values: " + str(pd.unique(factorized_all[col]).shape[0]))
    print(pd.unique(factorized_all[col]))
    print("\n")

Column: age
Number of unique values: 5
[2 3 1 0 4]


Column: workclass
Number of unique values: 9
[1. 2. 3. 5. 6. 4. 7. 8. 9.]


Column: fnlwgt
Number of unique values: 28523
[ 77516  83311 215646 ... 173449  89686 350977]


Column: education-num
Number of unique values: 16
[13  9  7 14  5 10 12 11  4 16 15  3  6  2  1  8]


Column: marital-status
Number of unique values: 7
[1. 2. 3. 4. 5. 6. 7.]


Column: occupation
Number of unique values: 15
[ 1.  2.  3.  4.  5.  6.  8.  9. 10. 11. 12.  7. 13. 14. 15.]


Column: relationship
Number of unique values: 6
[1. 2. 3. 4. 5. 6.]


Column: race
Number of unique values: 5
[1. 2. 3. 4. 5.]


Column: sex
Number of unique values: 2
[1. 2.]


Column: capitalgain
Number of unique values: 5
[1 0 4 2 3]


Column: capitalloss
Number of unique values: 5
[0 3 1 2 4]


Column: hoursperweek
Number of unique values: 5
[2 0 3 4 1]


Column: native-country
Number of unique values: 42
[ 1.  2.  3.  4.  5.  6.  7.  8.  9. 10. 11. 12. 13. 14. 15. 16. 17. 18.
 

Now that the data is factorized we need to min max normalize all features

In [80]:
normal_min_max = factorized_all.copy().values
min_max_scaler = preprocessing.MinMaxScaler()
dataset_scaled = min_max_scaler.fit_transform(normal_min_max)
print(dataset_scaled.shape)
print(dataset_scaled[0:20,:])

(48842, 14)
[[0.5        0.         0.04413121 0.8        0.         0.
  0.         0.         0.         0.25       0.         0.5
  0.         0.        ]
 [0.75       0.125      0.04805174 0.8        0.16666667 0.07142857
  0.2        0.         0.         0.         0.         0.
  0.         0.        ]
 [0.5        0.25       0.13758131 0.53333333 0.33333333 0.14285714
  0.         0.         0.         0.         0.         0.5
  0.         0.        ]
 [0.75       0.25       0.15048626 0.4        0.16666667 0.14285714
  0.2        0.25       0.         0.         0.         0.5
  0.         0.        ]
 [0.25       0.25       0.22063507 0.8        0.16666667 0.21428571
  0.4        0.25       1.         0.         0.         0.5
  0.02439024 0.        ]
 [0.5        0.25       0.18421909 0.86666667 0.16666667 0.07142857
  0.4        0.         1.         0.         0.         0.5
  0.         0.        ]
 [0.75       0.25       0.10006123 0.26666667 0.5        0.28571429
  0. 

Now that the dataset has been factorized and normalized, we can build our models

Logistic Regression

In [85]:
kf10 = KFold(n_splits=10)
logreg_acc = []
logreg_pres = []
logreg_rec = []
logreg_err = []
logreg_f1 = []
for train_index, test_index in kf10.split(dataset_scaled):
    train, test = dataset_scaled[train_index], dataset_scaled[test_index]
    logreg = LogisticRegression(solver="liblinear").fit(train[:,0:12],train[:,13])
    predictions = logreg.predict(test[:,0:12])
    c_matrix = confusion_matrix(test[:,13], predictions)
    print(abs(logreg.coef_[0]))
    print(c_matrix)
    print("\n")

[[ 1.89543554  0.73869571  0.701705    4.64635296 -0.44199952 -0.53662057
  -0.32304578 -0.62108266 -1.1393819   3.73196017  1.9657544   1.97384637]]
[[3489  211]
 [ 668  517]]


[[ 1.92143488  0.70799386  0.61698928  4.57713281 -0.46949232 -0.54824568
  -0.30657991 -0.56167111 -1.11227278  3.74248149  1.96245776  2.02881641]]
[[3536  211]
 [ 627  511]]




[[ 1.92295595  0.72845244  0.5724042   4.6555245  -0.48000053 -0.55234459
  -0.30072097 -0.62360661 -1.1169472   3.73292237  1.89639308  2.00605045]]
[[3496  216]
 [ 630  542]]


[[ 1.90846906  0.70968099  0.56749797  4.58761166 -0.47336676 -0.52040874
  -0.33160547 -0.62553811 -1.11549888  3.70362643  1.95766793  1.99691441]]
[[3512  204]
 [ 640  528]]




[[ 1.91810525  0.71499693  0.6150228   4.5612632  -0.45807488 -0.51244556
  -0.33437879 -0.63019994 -1.09801183  3.73758031  1.97990941  1.99027036]]
[[3489  205]
 [ 646  544]]


[[ 1.902086    0.76329928  0.59840946  4.58961277 -0.44254548 -0.54945709
  -0.31553012 -0.62256356 -1.12141568  3.7248896   1.99318434  1.94390391]]
[[3503  197]
 [ 635  549]]




[[ 1.9259821   0.70387513  0.63530044  4.61164599 -0.51531152 -0.50153855
  -0.31981617 -0.65942112 -1.11858674  3.69900817  1.89203617  1.99513408]]
[[3469  213]
 [ 654  548]]


[[ 1.88607905  0.68629871  0.65887836  4.61892365 -0.41213508 -0.50537485
  -0.31758421 -0.68087193 -1.1120942   3.70365163  1.93609642  1.9767217 ]]
[[3554  191]
 [ 639  500]]




[[ 1.91536214  0.66897872  0.67071626  4.59934647 -0.42184088 -0.54277734
  -0.30385062 -0.65002378 -1.13052646  3.71410411  1.98728699  2.00340947]]
[[3516  221]
 [ 642  505]]


[[ 1.92604651  0.72721285  0.53901366  4.5929849  -0.46522622 -0.53620003
  -0.32408365 -0.63830527 -1.10364229  3.71863598  1.98039604  1.99912455]]
[[3510  212]
 [ 631  531]]




In [109]:
logreg_1 = LogisticRegression(solver="liblinear")
print("Logistic Regression Metrics: k=10")
scoring_metrics = {'precision': 'test_precision', 'recall': 'test_recall', 'f1': 'test_recall', 'accuracy': 'test_accuracy'}
scores = cross_validate(logreg_1, dataset_scaled[:,0:12], dataset_scaled[:,13], cv=10, scoring=list(scoring_metrics.keys()), return_estimator=True)
for metric in scoring_metrics.keys():
    print("Average " + metric + ": " + str(sum(scores[scoring_metrics[metric]])/ len(scores[scoring_metrics[metric]])))
log_importances = []
for estimator in scores['estimator']:
    feature_importance = abs(estimator.coef_[0])
    feature_importance = 100 * (feature_importance / feature_importance.max())
    log_importances.append(feature_importance)

Logistic Regression Metrics: k=10


Average precision: 0.7169721193413964
Average recall: 0.45135440957615114
Average f1: 0.45135440957615114
Average accuracy: 0.8260717885209798


In [105]:
scores['estimator'][0].coef_[0]

1.895295175913726

In [112]:
print(log_importances)
print(type(log_importances))
print(log_importances[0])
print(type(log_importances[0]))

[array([ 40.81149705,  15.9780503 ,  14.97457107, 100.        ,
         9.53176934,  11.53346939,   6.97655703,  13.48723154,
        24.52808499,  80.3269876 ,  42.41666731,  42.62507052]), array([ 41.99237116,  15.27239379,  13.73597998, 100.        ,
        10.24027845,  11.94258235,   6.63836233,  12.11551681,
        24.29629167,  81.79710235,  42.83792028,  44.19666634]), array([ 41.2884388 ,  15.79741365,  12.24060364, 100.        ,
        10.31027596,  11.82501274,   6.47854174,  13.39624065,
        23.9584309 ,  80.15606959,  40.69440014,  43.02856368]), array([ 41.57305242,  15.37849509,  12.40151946, 100.        ,
        10.26605786,  11.45690773,   7.22816712,  13.62284808,
        24.33490938,  80.58630215,  42.54029139,  43.49800111]), array([ 42.04161082,  15.66472205,  13.24419118, 100.        ,
        10.11824709,  11.20608041,   7.35820632,  13.85165695,
        24.06578217,  81.96629895,  43.47592246,  43.68589806]), array([ 41.46105576,  16.67053241,  13.30844

In [113]:
log_importances[0] + log_importances[1]

array([ 82.80386821,  31.2504441 ,  28.71055105, 200.        ,
        19.77204778,  23.47605174,  13.61491936,  25.60274835,
        48.82437666, 162.12408995,  85.25458758,  86.82173686])