Load python modules. 

In [None]:
import numpy as np
import pandas as pd # Data frames
import sklearn as skl
import matplotlib.pyplot as plt # Visuals
%matplotlib inline
from urllib.request import urlopen # Get data from UCI Machine Learning Repository
from sklearn.linear_model import LogisticRegression #logistic regression functions
from sklearn.feature_selection import RFE
from sklearn.cross_validation import train_test_split #split into train and test sets
from sklearn import metrics
from sklearn.metrics import roc_curve, auc #visualizations 
from sklearn.multiclass import OneVsRestClassifier
from sklearn import svm, datasets


Import the dataframe. 

The dataset is from the UCI Machine Learning repository. Out of the fourteen variables, the only variables being used are age, workclass, fnlwgt, education, marital status, relationship, race, sex, capital gain, capital loss, native country, hours worked per week, and income.

We classified income as 0 if it is under $50,000 per year and 1 if it is above, and sex as 0 if male and 1 if female. 

In [118]:
data_URL = "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data"

columnNames = ['age', 'workclass', 'fnlwgt', 'education', 'educationNum', 'marital', 'occupation', 'relationship', 'race', 'sex', 'capitalGain', 'capitalLoss', 'hoursPerWeek', 'nativeCountry', 'income']
columnsToUse = ['age', 'workclass', 'fnlwgt', 'education', 'educationNum', 'marital', 'relationship', 'race', 'sex', 'capitalGain', 'capitalLoss', 'nativeCountry', 'hoursPerWeek', 'income']

census = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data', header=None, names=columnNames, sep=',\s+')

census = census.loc[:, columnsToUse]

s = {'Male':0, 'Female':1}
inc = {'<=50K':0, '>50K':1}
census.replace({'income':inc, 'sex':s}, inplace=True)


census = pd.read_csv(urlopen(data_URL), names = names)





Since the attributes are primarily categorical, we need to find a way to classify it as numerical. So using the pandas function get_dummies, we created dummy columns for the variables. 

Having two columns for income is redundant, so we chose to delete the attribute for income being greater than $50,000.

Income is the variable we are predicting on, so to create our training and test sets, we removed them. 

In [119]:

censusDummies = pd.get_dummies(census)
censusDummies.head()

del censusDummies["income_ >50K"]


In [120]:
features = censusDummies.columns[:-1]
print(features)


Index(['age', 'fnlwgt', 'educationNum', 'capitalGain', 'capitalLoss',
       'hoursPerWeek', 'workclass_ ?', 'workclass_ Federal-gov',
       'workclass_ Local-gov', 'workclass_ Never-worked',
       ...
       'nativeCountry_ Portugal', 'nativeCountry_ Puerto-Rico',
       'nativeCountry_ Scotland', 'nativeCountry_ South',
       'nativeCountry_ Taiwan', 'nativeCountry_ Thailand',
       'nativeCountry_ Trinadad&Tobago', 'nativeCountry_ United-States',
       'nativeCountry_ Vietnam', 'nativeCountry_ Yugoslavia'],
      dtype='object', length=108)


Time to start the logistic regression analysis! 

We begin by splitting the data into training and test sets using an 80/20 ratio. 

In [122]:
logreg = LogisticRegression()
X = censusDummies
y = censusDummies.loc[:,'income_ <=50K']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=13, test_size=0.2)
print("Train has " + str(X_train.shape[0]) + " entries")
print("Test has " + str(X_test.shape[0]) + " entries")

Train has 26048 entries
Test has 6513 entries


We generate a frequency table using to measure the relationship of the variables in the training and test sets. 

Using a logistic regression analysis will provide about 80% accuracy with the attributes we are including for the training set. For the test set, the accuracy is also about 80%. 

In [130]:
#train set
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_train)
print(pd.crosstab(y_train, y_pred, rownames=['Predicted Values'], colnames=['Actual Values']))
print('LogReg provides %s ' % metrics.accuracy_score(y_train, y_pred) + " accuracy")

Actual Values      0.0    1.0
Predicted Values             
0.0               1648   4639
1.0                695  19066
LogReg provides 0.795224201474  accuracy


In [131]:
#test set
y_pred_test = logreg.predict(X_test)
print(pd.crosstab(y_test, y_pred_test, rownames=['Predicted Values'], colnames=['Actual Values']))
print('LogReg provides %s ' % metrics.accuracy_score(y_test, y_pred_test) + " accuracy")

Actual Values     0.0   1.0
Predicted Values           
0.0               426  1128
1.0               165  4794
LogReg provides 0.801473975127  accuracy


Here are some sexy visualizations. 

This is an ROC (receiver operating characteristic) metric to evaluate classifier output quality. We want to plot the true positive rate on the Y axis and the false positive rate on the X axis. We predicted one class against the other and computed the ROC area for each class. 

In [None]:
# Learn to predict each class against the other
classifier = OneVsRestClassifier(svm.SVC(kernel='linear', probability=True))
y_score = classifier.fit(X_train, y_train).decision_function(X_test)

# Compute ROC curve and ROC area for each class
fpr = dict()
tpr = dict()
roc_auc = dict()
for i in range(n_classes):
    fpr[i], tpr[i], _ = roc_curve(y_pred_test[:, i], y_score[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])

# Compute micro-average ROC curve and ROC area
fpr["micro"], tpr["micro"], _ = roc_curve(y_test.ravel(), y_score.ravel())
roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])

In [None]:
# First aggregate all false positive rates
all_fpr = np.unique(np.concatenate([fpr[i] for i in range(n_classes)]))

# Then interpolate all ROC curves at this points
mean_tpr = np.zeros_like(all_fpr)
for i in range(n_classes):
    mean_tpr += interp(all_fpr, fpr[i], tpr[i])

# Finally average it and compute AUC
mean_tpr /= n_classes

fpr["macro"] = all_fpr
tpr["macro"] = mean_tpr
roc_auc["macro"] = auc(fpr["macro"], tpr["macro"])

# Plot all ROC curves
plt.figure()
plt.plot(fpr["micro"], tpr["micro"],
         label='micro-average ROC curve (area = {0:0.2f})'
              ''.format(roc_auc["micro"]),
         color='deeppink', linestyle=':', linewidth=4)

plt.plot(fpr["macro"], tpr["macro"],
         label='macro-average ROC curve (area = {0:0.2f})'
               ''.format(roc_auc["macro"]),
         color='navy', linestyle=':', linewidth=4)

colors = cycle(['aqua', 'darkorange', 'cornflowerblue'])
for i, color in zip(range(n_classes), colors):
    plt.plot(fpr[i], tpr[i], color=color, lw=lw,
             label='ROC curve of class {0} (area = {1:0.2f})'
             ''.format(i, roc_auc[i]))

plt.plot([0, 1], [0, 1], 'k--', lw=lw)
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Some extension of Receiver operating characteristic to multi-class')
plt.legend(loc="lower right")
plt.show()

In [None]:
#fpr, tpr, _ = roc_curve(y_pred_test, y_score)