http://archive.ics.uci.edu/ml/datasets/Adult

In [2]:
import pandas as pd
import seaborn as sns
import numpy as np

In [3]:
train=pd.read_csv("http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data")

In [4]:
train.columns = ['age', 'workclass', 'fnlwgt', 'education', 'education_num', 'marital_status', 
               'occupation', 'relationship', 'race', 'sex', 'capital_gain', 'capital_loss',
              'hours_pw', 'native_country', 'income']
train.describe()

Unnamed: 0,age,fnlwgt,education_num,capital_gain,capital_loss,hours_pw
count,32560.0,32560.0,32560.0,32560.0,32560.0,32560.0
mean,38.581634,189781.8,10.08059,1077.615172,87.306511,40.437469
std,13.640642,105549.8,2.572709,7385.402999,402.966116,12.347618
min,17.0,12285.0,1.0,0.0,0.0,1.0
25%,28.0,117831.5,9.0,0.0,0.0,40.0
50%,37.0,178363.0,10.0,0.0,0.0,40.0
75%,48.0,237054.5,12.0,0.0,0.0,45.0
max,90.0,1484705.0,16.0,99999.0,4356.0,99.0


In [5]:
train['workclass'] = train.workclass.str.strip()
train['income'] = train.income.str.strip()
train['marital_status'] = train.marital_status.str.strip()
train['race'] = train.race.str.strip()
train['sex'] = train.sex.str.strip()
train['relationship'] = train.relationship.str.strip()

In [6]:
# create numerical columns representing the categorical data
# workclass: Private, Self-emp-not-inc, Self-emp-inc, Federal-gov, Local-gov, State-gov, Without-pay
# , Never-worked. 
train['workclass_num'] = train.workclass.map({'?':0, 'Private':8, 'State-gov':1, 'Federal-gov':2
                                            , 'Self-emp-not-inc':3, 'Self-emp-inc':4, 'Local-gov':5
                                            , 'Without-pay':6, 'Never-worked':7})
train['over50K'] = np.where(train.income == '<=50K', False, True)
# marital-status: Married-civ-spouse, Divorced, Never-married, Separated, Widowed, Married-spouse-absent
# , Married-AF-spouse. 
train['marital_num'] = train['marital_status'].map({'?':0, 'Widowed':7, 'Divorced':1, 'Separated':2
                                                  , 'Never-married':3, 'Married-civ-spouse':4
                                                  , 'Married-AF-spouse':4, 'Married-spouse-absent':5
                                                   , 'Separated':6})
# race: White, Asian-Pac-Islander, Amer-Indian-Eskimo, Other, Black. 
train['race_num'] = train.race.map({'?':0, 'White':5, 'Black':1, 'Asian-Pac-Islander':2
                                    , 'Amer-Indian-Eskimo':3, 'Other':4})
train['sex_num'] = np.where(train.sex == 'Female', 0, 1)
# relationship: Wife, Own-child, Husband, Not-in-family, Other-relative, Unmarried. 
train['rel_num'] = train.relationship.map({'?':0, 'Not-in-family':2, 'Unmarried':2, 'Own-child':2
                                         , 'Other-relative':0, 'Husband':1, 'Wife':1})

In [7]:
train.corr()

Unnamed: 0,age,fnlwgt,education_num,capital_gain,capital_loss,hours_pw,workclass_num,over50K,marital_num,race_num,sex_num,rel_num
age,1.0,-0.076646,0.036527,0.077674,0.057775,0.068756,-0.169729,0.234039,0.202199,0.026663,0.088832,-0.24441
fnlwgt,-0.076646,1.0,-0.043159,0.000437,-0.010259,-0.01877,0.037738,-0.009481,-0.004061,-0.078802,0.026882,0.009498
education_num,0.036527,-0.043159,1.0,0.122627,0.079932,0.148127,-0.078247,0.335182,-0.027389,0.0517,0.012256,-0.027984
capital_gain,0.077674,0.000437,0.122627,1.0,-0.031614,0.078409,-0.022263,0.223333,0.030314,0.016099,0.048477,-0.065813
capital_loss,0.057775,-0.010259,0.079932,-0.031614,1.0,0.054256,-0.015258,0.150523,0.034538,0.020751,0.045572,-0.063479
hours_pw,0.068756,-0.01877,0.148127,0.078409,0.054256,1.0,0.038689,0.22969,0.003319,0.052206,0.229312,-0.161909
workclass_num,-0.169729,0.037738,-0.078247,-0.022263,-0.015258,0.038689,1.0,-0.038364,-0.050884,0.012609,-0.011282,0.035783
over50K,0.234039,-0.009481,0.335182,0.223333,0.150523,0.22969,-0.038364,1.0,0.16869,0.084973,0.215995,-0.349548
marital_num,0.202199,-0.004061,-0.027389,0.030314,0.034538,0.003319,-0.050884,0.16869,1.0,-0.004148,0.13744,-0.353754
race_num,0.026663,-0.078802,0.0517,0.016099,0.020751,0.052206,0.012609,0.084973,-0.004148,1.0,0.110271,-0.065798


In [50]:
X = train[['workclass_num', 'education_num', 'marital_num', 'race_num', 'sex_num', 'rel_num'
          , 'capital_gain', 'capital_loss', 'age', 'hours_pw']]
y = train.over50K

In [9]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

In [40]:
alg = LogisticRegression(solver='liblinear')

# Compute the accuracy score for all the cross validation folds.
scores = cross_val_score(alg, X, y, cv=3)
# Take the mean of the scores (because we have one for each fold)
print("scores.mean() for LogisticRegression is", scores.mean())

# Train the algorithm using all the training data
alg.fit(X, y)

scores.mean() for LogisticRegression is 0.8277027410217891


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='liblinear',
          tol=0.0001, verbose=0, warm_start=False)

In [41]:
kf=KFold(3)

predictions = []
for tr, test in kf.split(train):
    # The predictors we're using the train the algorithm.  
    train_predictors = (X.iloc[tr,:])
    # The target we're using to train the algorithm.
    train_target = y.iloc[tr]
    # Training the algorithm using the predictors and target.
    alg.fit(train_predictors, train_target)
    # We can now make predictions on the test fold
    test_predictions = alg.predict(X.iloc[test,:])
    predictions.append(test_predictions)

# The predictions are in three separate numpy arrays.  Concatenate them into one.  
# We concatenate them on axis 0, as they only have one axis.
predictions = np.concatenate(predictions, axis=0)

accuracy = sum(predictions[predictions == y]) / float(len(predictions))
scores = cross_val_score(alg, X, y, cv=3)
print("scores.mean() is", scores.mean())
print("accuracy of logistic regression is", accuracy)

scores.mean() is 0.8214678915968817
accuracy of logistic regression is 0.12527641277641277


In [23]:
from sklearn import tree

In [44]:
# Fit your first decision tree: my_tree_one
my_tree_one = tree.DecisionTreeClassifier()
my_tree_one = my_tree_one.fit(X, y)
# Look at the importance and score of the included features
#print("Look at the importance and score of the included features from my tree one")
#print(X.columns.to_list())
#print("importance features from my tree one")
#print(my_tree_one.feature_importances_)
print("score of the included features from my tree one", my_tree_one.score(X, y))

score of the included features from my tree one 0.8722972972972973


In [45]:
#Control overfitting by setting "max_depth" to 10 and "min_samples_split" to 5 : my_tree_two
max_depth = 10
min_samples_split = 5
my_tree_two = tree.DecisionTreeClassifier(max_depth = 10, min_samples_split = 5, random_state = 1)
my_tree_two = my_tree_two.fit(X, y)
#Print the score of the new decison tree
#print("my tree two feature_importances_")
#print(my_tree_two.feature_importances_)
print("score of the included features from my tree two", my_tree_two.score(X, y))


score of the included features from my tree two 0.8616707616707616


In [35]:
from sklearn.ensemble import RandomForestClassifier

In [49]:
forest = RandomForestClassifier(max_depth = 10, min_samples_split=2, n_estimators = 100, random_state = 1)
my_forest = forest.fit(X, y)

# Print the score of the fitted random forest
print("Print the score of the fitted random forest", my_forest.score(X, y))

scores = cross_val_score(alg, X, y, cv=3)

# Take the mean of the scores (because we have one for each fold)
print("Mean score for Random forest classifier", scores.mean())

#Request and print the `.feature_importances_` attribute
#print("print the `.feature_importances_` attribute")
#print("my tree two")
#print(my_tree_two.feature_importances_)
#print("my forest")
#print(my_forest.feature_importances_)

Print the score of the fitted random forest 0.859490171990172
Mean score for Random forest classifier 0.8264743095895112
