# Brief notes to talk over

- This is the first hack night that we've done, so I'm only going to talk for a minute or two...
- The point: to get all of you coding, practicing, learning, helping each other out.
- What is Kaggle?
  - Show page, give some examples
  - See Meetup page for the link.
  - Sign up at Kaggle if you haven't yet
  - What is our dataset? What should be predicted?
  - Show Overview and other tabs
  - Public / Private Leaderboard and other information
- May put up examples on the projector if many people are stuck at around the same point
- Perhaps stop with 15 minutes to go (how long do we have the room?), look at the private leaderboard, see who is near the top, and ask if they'd like to explain their methodology

# Private

- Kaggle competition link is: https://www.kaggle.com/t/b4c66c1c24cc4a7983156152a7e0f84f
- Dataset is https://archive.ics.uci.edu/ml/datasets/Adult
- Kaggle training data & labels are just `adult.data` from there.
- To generate test data, labels, and a sample submission, use this shell kludge:

```
# Intermediate file holding one 0/1 label per line; removed at the end.
TMPFILE="adult.test.tmp"
# Output files: test features, true labels, and a sample submission.
DATA="adult.test.X"
TARGETS="adult.test.y"
SAMPLE="submission_sample.csv"

# Skip the first line of adult.test, keep only field 15 (the label),
# rewrite the two label strings as 0 / 1, and drop blank lines.
tail -n +2 < adult.test | cut -d , -f 15 \
    | sed -e "s/ <=50K./0/g" -e "s/ >50K./1/g" -e '/^[[:space:]]*$/d' > "${TMPFILE}"

# True labels, numbered from 0 in file order.
echo "id,label" > "${TARGETS}"
awk '{printf "%d,%s\n", NR-1, $0}' "${TMPFILE}" >> "${TARGETS}"
# Sample submission: the same labels in shuffled order, numbered from 0.
echo "id,label" > "${SAMPLE}"
shuf < "${TMPFILE}" | awk '{printf "%d,%s\n", NR-1, $0}' >> "${SAMPLE}"

# Features: every field except the label. Blank lines are removed here as
# well, so that row N of the feature file always corresponds to id N above.
tail -n +2 < adult.test | sed '/^[[:space:]]*$/d' | cut -d , -f 15 --complement > "${DATA}"

rm "${TMPFILE}"
```

## Reading data

In [1]:
import pandas as pd
import xgboost



In [2]:
# Column names for the header-less CSV; the last one ("income") is the label.
columns = (
    "age", "workclass", "fnlwgt", "education", "education_num",
    "marital_status", "occupation", "relationship", "race", "sex",
    "capital_gain", "capital_loss", "hours_per_week", "native_country",
    "income",
)
# The file has no header row, fields are separated by ", " (hence
# skipinitialspace), and a question mark marks a missing value.
data = pd.read_csv(
    "adult.data",
    names=columns,
    na_values="?",
    skipinitialspace=True,
    index_col=False,
)

In [3]:
# Test features use the same layout as the training file, minus the final
# "income" column.
test_data = pd.read_csv(
    "adult.test.X",
    names=columns[:-1],
    na_values="?",
    skipinitialspace=True,
    index_col=False,
)

## Plots

In [None]:
# `plt` is needed for plt.show() below but is not imported by the notebook's
# import cell, so import it here to keep this cell runnable on a fresh kernel.
import matplotlib.pyplot as plt

# data.hist() returns an array of Axes; hide the axis decorations on each
# subplot so only the histogram shapes remain.
plots = data.hist()
for subplot in plots.flat:
    subplot.axis('off')
plt.show()

## Preprocessing

In [4]:
def fill_missing(df):
    """Fill missing values in the columns workclass, native_country, and
    occupation.

    The input dataframe is modified in place; nothing is returned.  Other
    columns are left untouched.
    """
    fill_values = {
        # Fill the NAs in workclass with the mode ('Private' outnumbers
        # all other categories combined):
        "workclass": "Private",
        # Do likewise for native_country (vast majority are from US):
        "native_country": "United-States",
        # NAs in occupation occur primarily where workclass is also NA,
        # but no particular value dominates all the others.  This is still
        # ~6% of our data - so for now, fill it with a new value and treat
        # it perhaps like it has information.
        "occupation": "Other",
    }
    for col, value in fill_values.items():
        # Assign the filled column back instead of calling
        # df.<col>.fillna(..., inplace=True): the chained in-place form
        # operates on a temporary Series and does not reliably update `df`
        # (it warns on recent pandas and is a no-op under Copy-on-Write).
        df[col] = df[col].fillna(value)

In [5]:
# Impute the missing categorical values in both the training and test frames
# (fill_missing modifies each frame in place).
for frame in (data, test_data):
    fill_missing(frame)

In [6]:
def feature_xform(df):
    """Select and transform features from a raw dataframe, turning the
    categorical columns into numerical form.

    'df' may or may not contain the label column 'income'; it is excluded
    from the result either way.  The input dataframe is not modified.

    Returns a new DataFrame 'X' of features only (labels are NOT
    returned): the remaining numeric columns, followed by the one-hot
    columns named '<column>_<category>', followed by 'male' and
    'net_capital'.
    """
    # Extract just the features (everything but 'income'):
    X = df.drop(columns="income", errors="ignore")
    # One-hot encode everything in this tuple; get_dummies appends the
    # encoded columns (prefixed with the original column name) and drops
    # the originals.  An explicit integer dtype keeps the output numeric
    # on pandas versions whose default dummy dtype is bool.
    onehot_cols = ("workclass", "education", "marital_status", "occupation",
                   "relationship", "race", "native_country")
    X = pd.get_dummies(X, columns=list(onehot_cols), dtype="uint8")
    X = X.assign(
        # Gender is binary (here at least):
        male=(X["sex"] == "Male").astype(int),
        # 'capital_gain' and 'capital_loss' never appear together and can
        # probably be turned to one feature:
        net_capital=X["capital_gain"] - X["capital_loss"],
    )
    # Drop the columns replaced above, plus 'fnlwgt', which appears to be
    # meaningless here as it's relative to the state, which isn't given:
    return X.drop(columns=["sex", "fnlwgt", "capital_gain", "capital_loss"])

In [7]:
# Run the same feature transformation over the training and test frames.
X_train, X_test = (feature_xform(frame) for frame in (data, test_data))

In [8]:
# Binary target: 1 where income is ">50K", 0 otherwise.
y_train = data["income"].eq(">50K").astype(int)

In [9]:
# Restrict both feature matrices to the columns they have in common, in
# X_train's order.  This replaces a hard-coded drop of
# "native_country_Holand-Netherlands" (presumably a category absent from the
# test data - verify), which raised KeyError when the cell was run a second
# time.  This form is idempotent and also copes with other mismatches.
shared_cols = [col for col in X_train.columns if col in X_test.columns]
X_train = X_train[shared_cols]
X_test = X_test[shared_cols]

# xgboost training, prediction & submission

In [10]:
# Gradient-boosted classifier with 150 estimators and a fixed seed, fitted
# on the training features and labels.
xgb = xgboost.XGBClassifier(
    n_estimators=150,
    seed=12345,
    nthread=-1,
)
xgb.fit(X_train, y_train)

XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=150, nthread=-1,
       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=12345, silent=True, subsample=1)

In [39]:
# NOTE(review): order_importance is defined in a later cell of this notebook;
# on a fresh kernel that cell must be run before this one.
# Show the 25 features the fitted xgboost model ranks as most important.
xgb_importance = order_importance(
    X_train.columns, xgb.feature_importances_, "Importance")
xgb_importance.head(25)

Unnamed: 0,Feature,Importance
0,net_capital,0.287968
1,age,0.16568
2,hours_per_week,0.088757
3,education_num,0.084813
4,marital_status_Married-civ-spouse,0.043393
5,relationship_Wife,0.029586
6,male,0.018738
7,workclass_Self-emp-not-inc,0.018738
8,occupation_Exec-managerial,0.017751
9,workclass_Private,0.014793


In [11]:
# Predict labels for the test features with the fitted xgboost classifier.
y_test_pred = xgb.predict(X_test)

In [15]:
# Build the submission table: one "label" column of predictions, with the
# row index written out as the "id" column.
submission = pd.DataFrame({"label": y_test_pred}).rename_axis(index="id")
submission.to_csv("submission.csv")

# Cross-validation example

In [30]:
# Beware, this is slow to run (especially when using all input variables)
from sklearn.model_selection import cross_val_score
import sklearn.svm
# RBF-kernel support vector classifier, scored with 5-fold cross-validation;
# the array of per-fold scores is the cell's output.
clf = sklearn.svm.SVC(kernel='rbf')
scores = cross_val_score(estimator=clf, X=X_train, y=y_train, cv=5)
scores

array([ 0.85920467,  0.86210074,  0.86701474,  0.8647113 ,  0.86532555])

In [31]:
# This is much faster
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
# Logistic regression scored with 10-fold cross-validation.
clf = LogisticRegression()
scores = cross_val_score(estimator=clf, X=X_train, y=y_train, cv=10)
# Report the mean score with a spread of two standard deviations.
mean_score = scores.mean()
spread = scores.std() * 2
print("Accuracy: %0.3f ± %0.3f" % (mean_score, spread))

Accuracy: 0.846 ± 0.011


# Random Forests & feature importance

In [43]:
def order_importance(columns, ranks, name, ascending=False):
    """Pair each feature name with its score and return them sorted.

    'columns' holds the feature names and 'ranks' the matching scores;
    'name' is the label given to the score column.  The result is a
    DataFrame with the columns "Feature" and 'name', sorted by 'name'
    (descending unless 'ascending' is true) and re-indexed from 0.
    """
    table = pd.DataFrame({"Feature": list(columns), name: ranks})
    ordered = table.sort_values(name, ascending=ascending)
    return ordered.reset_index(drop=True)

In [45]:
import sklearn.ensemble
# Fix random_state (same value as the xgboost seed used in this notebook) so
# the fitted forest, and therefore the importance table below, is
# reproducible from run to run.
rf = sklearn.ensemble.RandomForestClassifier(n_jobs=-1, random_state=12345)
rf = rf.fit(X_train, y_train)
# Show the 25 features with the highest random-forest importance.
order_importance(X_train.columns, rf.feature_importances_, "Importance")[:25]

Unnamed: 0,Feature,Importance
0,age,0.242081
1,net_capital,0.124096
2,hours_per_week,0.119954
3,relationship_Husband,0.075012
4,education_num,0.064737
5,marital_status_Married-civ-spouse,0.060024
6,occupation_Exec-managerial,0.0187
7,occupation_Prof-specialty,0.017527
8,male,0.017452
9,marital_status_Never-married,0.015837


## RFECV with a simple classifier to rank features

In [46]:
from sklearn.tree import DecisionTreeClassifier
import sklearn.feature_selection
# Fix random_state (same value as the xgboost seed used in this notebook) so
# the trees fitted during elimination, and hence the ranking, are
# reproducible from run to run.
clf = DecisionTreeClassifier(random_state=12345)
# Recursive feature elimination with cross-validation around the tree.
selector = sklearn.feature_selection.RFECV(clf)
selector = selector.fit(X_train, y_train)
# Sort ascending by rank and show the first 25 features.
order_importance(X_train.columns, selector.ranking_, "RFE rank", True)[:25]

Unnamed: 0,Feature,RFE rank
0,net_capital,1
1,education_num,1
2,marital_status_Married-civ-spouse,1
3,age,2
4,hours_per_week,3
5,workclass_Private,4
6,occupation_Craft-repair,5
7,race_White,6
8,native_country_United-States,7
9,male,8
