# Processing Adult Income Dataset with Skale

This notebook will show how we can use Skale to process the Adult Income Dataset. 

First, we will see how to load and explore the raw data. Finally, we will build a logistic regression classifier and evaluate how well it predicts whether an adult earns more or less than 50K a year.

Let's require some libraries:

In [1]:
var co = require('co');
var sc = require('skale-engine').context();
var plot = require('plotter').plot;

var CSVDataFrame = require('./CSVDataFrame.js');
var StandardScaler = require('skale-ml').StandardScaler;
var LogisticRegressionWithSGD = require('skale-ml').LogisticRegressionWithSGD;
var BinaryClassificationMetrics = require('skale-ml').BinaryClassificationMetrics;

undefined

# Loading the CSV file

Our data is stored in a CSV file named 'adult.data'. Let's load it and show the first five rows.

In [2]:
// Column names for the 15 fields of the Adult Income file, in file order.
var fields = [
    "Age",
    "Workclass",
    "fnlwgt",
    "Education",
    "Education-Num",
    "Marital-Status",
    "Occupation",
    "Relationship",
    "Race",
    "Sex",
    "Capital-Gain",
    "Capital-Loss",
    "Hours-per-week",
    "Country",
    "Target"
];

// Wrap the training file in a data frame bound to the Skale context.
// NOTE(review): ',' and '?' are presumably the field delimiter and the
// missing-value marker — confirm against CSVDataFrame.js.
var df = new CSVDataFrame(sc, fields, 'adult.data', ',', '?');

// Display the first five rows; $$done$$ is passed as the completion callback.
$$async$$ = df.show(5, $$done$$);

┌────────────┬────────────┬────────────┬────────────┬────────────┬────────────┬────────────┬────────────┬────────────┬────────────┬────────────┬────────────┬────────────┬────────────┬────────────┐
│ Age        │ Workclass  │ fnlwgt     │ Education  │ Education… │ Marital-S… │ Occupation │ Relations… │ Race       │ Sex        │ Capital-G… │ Capital-L… │ Hours-per… │ Country    │ Target     │
├────────────┼────────────┼────────────┼────────────┼────────────┼────────────┼────────────┼────────────┼────────────┼────────────┼────────────┼────────────┼────────────┼────────────┼────────────┤
│ 39         │ State-gov  │ 77516      │ Bachelors  │ 13         │ Never-mar… │ Adm-cleri… │ Not-in-fa… │ White      │ Male       │ 2174       │ 0          │ 40         │ United-St… │ <=50K      │
├────────────┼────────────┼────────────┼────────────┼────────────┼────────────┼────────────┼────────────┼────────────┼────────────┼────────────┼────────────┼────────────┼────────────┼────────────┤
│ 50         │ 

null

# Generating the Age distribution as a PNG file

In [3]:
// Describe the "Age" column; the recorded run below reports creating Age.png.
// $$done$$ is passed as the completion callback (presumably the kernel's
// async-cell hook — confirm against the kernel documentation).
$$async$$ = df.describe("Age", $$done$$);

Creating Age.png


null

![title](Age.png)

# Encoding the categorical features

In [4]:
// Derive a numerically encoded frame from df and preview its first five rows.
// df itself is left as-is: later cells still select raw string columns from it.
var edf = df.number_encode_features()
$$async$$ = edf.show(5, $$done$$);

┌────────────┬────────────┬────────────┬────────────┬────────────┬────────────┬────────────┬────────────┬────────────┬────────────┬────────────┬────────────┬────────────┬────────────┬────────────┐
│ Age        │ Workclass  │ fnlwgt     │ Education  │ Education… │ Marital-S… │ Occupation │ Relations… │ Race       │ Sex        │ Capital-G… │ Capital-L… │ Hours-per… │ Country    │ Target     │
├────────────┼────────────┼────────────┼────────────┼────────────┼────────────┼────────────┼────────────┼────────────┼────────────┼────────────┼────────────┼────────────┼────────────┼────────────┤
│ 39         │ 0          │ 77516      │ 0          │ 13         │ 0          │ 0          │ 0          │ 0          │ 0          │ 2174       │ 0          │ 40         │ 0          │ 0          │
├────────────┼────────────┼────────────┼────────────┼────────────┼────────────┼────────────┼────────────┼────────────┼────────────┼────────────┼────────────┼────────────┼────────────┼────────────┤
│ 50         │ 

null

# Correlation between Education and Education-Num

In [6]:
// Show the first 15 rows of the two education columns side by side,
// taken from the raw (unencoded) frame so the category names stay readable.
$$async$$ = df.select(["Education", "Education-Num"]).show(15, $$done$$);

┌────────────┬────────────┐
│ Education  │ Education… │
├────────────┼────────────┤
│ Bachelors  │ 13         │
├────────────┼────────────┤
│ Bachelors  │ 13         │
├────────────┼────────────┤
│ HS-grad    │ 9          │
├────────────┼────────────┤
│ 11th       │ 7          │
├────────────┼────────────┤
│ Bachelors  │ 13         │
├────────────┼────────────┤
│ Masters    │ 14         │
├────────────┼────────────┤
│ 9th        │ 5          │
├────────────┼────────────┤
│ HS-grad    │ 9          │
├────────────┼────────────┤
│ Masters    │ 14         │
├────────────┼────────────┤
│ Bachelors  │ 13         │
├────────────┼────────────┤
│ Some-coll… │ 10         │
├────────────┼────────────┤
│ Bachelors  │ 13         │
├────────────┼────────────┤
│ Bachelors  │ 13         │
├────────────┼────────────┤
│ Assoc-acdm │ 12         │
├────────────┼────────────┤
│ 7th-8th    │ 4          │
└────────────┴────────────┘


null

# Delete Education field from data frame

In [7]:
// Remove the "Education" column from the encoded frame and preview the result.
// NOTE(review): edf is reassigned from itself, so re-running this cell calls
// drop() on a frame that has already lost the column — confirm drop() tolerates
// that before relying on out-of-order re-execution.
edf = edf.drop(["Education"]);
$$async$$ = edf.show(5, $$done$$)

┌────────────┬────────────┬────────────┬────────────┬────────────┬────────────┬────────────┬────────────┬────────────┬────────────┬────────────┬────────────┬────────────┬────────────┐
│ Age        │ Workclass  │ fnlwgt     │ Education… │ Marital-S… │ Occupation │ Relations… │ Race       │ Sex        │ Capital-G… │ Capital-L… │ Hours-per… │ Country    │ Target     │
├────────────┼────────────┼────────────┼────────────┼────────────┼────────────┼────────────┼────────────┼────────────┼────────────┼────────────┼────────────┼────────────┼────────────┤
│ 39         │ 0          │ 77516      │ 13         │ 0          │ 0          │ 0          │ 0          │ 0          │ 2174       │ 0          │ 40         │ 0          │ 0          │
├────────────┼────────────┼────────────┼────────────┼────────────┼────────────┼────────────┼────────────┼────────────┼────────────┼────────────┼────────────┼────────────┼────────────┤
│ 50         │ 1          │ 83311      │ 13         │ 1          │ 1          │ 

null

# Correlation between Sex and Relationship

In [8]:
// Show the first five rows of "Sex" next to "Relationship" from the raw frame.
$$async$$ = df.select(["Sex", "Relationship"]).show(5, $$done$$);

┌────────────┬────────────┐
│ Sex        │ Relations… │
├────────────┼────────────┤
│ Male       │ Not-in-fa… │
├────────────┼────────────┤
│ Male       │ Husband    │
├────────────┼────────────┤
│ Male       │ Not-in-fa… │
├────────────┼────────────┤
│ Male       │ Husband    │
├────────────┼────────────┤
│ Female     │ Wife       │
└────────────┴────────────┘


null

# Extract a LabeledPoint Dataset from our encoded Data Frame

In [9]:
// Build the labeled dataset from the encoded frame, with "Target" as the label.
// Later cells read each element as p[0] (kept as-is) and p[1] (scaled, then
// passed to the model), i.e. presumably [label, features].
// NOTE(review): ["*"] presumably selects every remaining column as a feature —
// confirm in CSVDataFrame.js.
var training_set = edf.toLabeledPoint("Target", ["*"]);

undefined

# Scale features to zero-mean, unit variance

In [10]:
// Filled in by the fit callback below; read by the training cell.
var training_set_std;
var scaler = new StandardScaler();

// Fit the scaler on the second element of every labeled point, then rebuild
// the dataset with that element transformed and the first element untouched.
// The scaler is handed to map() as an extra argument rather than captured.
$$async$$ = scaler.fit(
    training_set.map(point => point[1]),
    function() {
        training_set_std = training_set
            .map((point, fitted) => [point[0], fitted.transform(point[1])], scaler)
            .persist();
        $$done$$();
    }
);

undefined

# Train logistic regression with SGD on standardized training set

In [11]:
// Number of iterations passed to model.train().
var nIterations = 10;

// Options given to LogisticRegressionWithSGD; the ROC cell also prints them
// in the curve's legend.
var parameters = {
    regParam: 0.01,
    stepSize: 1
};

var model = new LogisticRegressionWithSGD(training_set_std, parameters);

// Train on the standardized training set; $$done$$ is the completion callback.
$$async$$ = model.train(nIterations, $$done$$);

null

# Evaluate on the test set and generate the ROC curve

In [12]:
// Load the held-out test file with the same column layout as the training data.
var vdf = new CSVDataFrame(sc, fields, 'adult.test', ',', '?');

// NOTE(review): the numeric encoding is recomputed here from the test file
// alone. If number_encode_features() assigns codes per frame, the same category
// may get a different code than in the training frame — confirm in
// CSVDataFrame.js that the two encodings agree.
var evdf = vdf.drop(["Education"]).number_encode_features();

// Scale the test features with the scaler fitted in the scaling cell, so the
// test points go through the same transform as the training points.
var test_set_std = evdf
    .toLabeledPoint("Target", ["*"])
    .map((p, scaler) => [p[0], scaler.transform(p[1])], scaler);

// Each pair is [prediction, label]: the order is swapped relative to the
// labeled points, which carry the label first. Map over training_set_std
// instead to get the training-set curve.
var predictionAndLabels = test_set_std.map((p, model) => [model.predict(p[1]), p[0]], model);
var metrics = new BinaryClassificationMetrics(predictionAndLabels);

$$async$$ = metrics.roc(function(err, roc) {
    // Report a failed job as the cell result, the same way the cells that pass
    // $$done$$ directly as the callback do, instead of plotting an empty curve.
    if (err) return $$done$$(err);

    // Collect the curve as {x: y}, both rounded to two decimals. Each entry's
    // second element is read as an [x, y] pair (presumably [false-positive
    // rate, true-positive rate]). Points that round to the same x overwrite
    // each other. Object.keys visits own keys only and keeps the loop index
    // out of the kernel's global scope; `roc || []` keeps the original
    // tolerance for a missing result.
    var xy = {};
    Object.keys(roc || []).forEach(function(key) {
        var point = roc[key][1];
        xy[point[0].toFixed(2)] = point[1].toFixed(2);
    });
    // Force the curve through the origin.
    xy['0.00'] = '0.00';

    var data = {};
    data['regParam: ' + parameters.regParam + ', stepSize: ' + parameters.stepSize] = xy;
    // Diagonal reference line drawn alongside the model's curve.
    data['Random'] = {0: 0, 1: 1};

    // plot() receives $$done$$ as its finish hook, so the cell ends after plotting.
    plot({title: 'Logistic Regression ROC Curve', data: data, filename: 'roc.png', finish: $$done$$});
});

null

![title](roc.png)

In [13]:
// Print the weights held by the trained model (13 values in the recorded run).
console.log(model.weights)

[ 0.5892014010000115,
  0.21493410060228174,
  0.013819127163207047,
  0.8530827599943013,
  -0.014500366785590515,
  -0.182481500630961,
  -0.27119571075565263,
  -0.12998625855212997,
  -0.5131808211413618,
  0.7271266998919182,
  0.38625689011670966,
  0.5014530789388013,
  -0.03274029015397746 ]


undefined

In [18]:
// Print the result of accuracyByThreshold; the recorded output lists
// [threshold, accuracy] pairs for thresholds 0 to 0.9 in steps of 0.1.
$$async$$ = metrics.accuracyByThreshold(function(err, accuracy) {
    // Report a failed job as the cell result instead of logging `undefined`.
    if (err) return $$done$$(err);

    console.log(accuracy);
    $$done$$();
});

[ [ 0, 0.2456839309428951 ],
  [ 0.1, 0.4063745019920319 ],
  [ 0.2, 0.5415006640106241 ],
  [ 0.30000000000000004, 0.6488047808764941 ],
  [ 0.4, 0.7175298804780876 ],
  [ 0.5, 0.7592961487383798 ],
  [ 0.6000000000000001, 0.7865869853917663 ],
  [ 0.7000000000000001, 0.800929614873838 ],
  [ 0.8, 0.800199203187251 ],
  [ 0.9, 0.7886454183266932 ] ]


undefined