In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import random
from sklearn.tree import DecisionTreeClassifier, export_graphviz

In [2]:
# Load adult_with_heading.csv into a DataFrame.
# NOTE(review): this is an absolute, machine-specific Windows path — the notebook only
# runs on a machine where this exact file location exists.
adult_tmp = pd.read_csv("D:\\wb\\python\\ML\\adult\\adult_with_heading.csv")


In [3]:
adult_tmp.columns

Index(['age', 'workclass', 'fnlwgt', 'education', 'education-num',
       'marital-status', 'occupation', 'relationship', 'race', 'sex',
       'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
       'class'],
      dtype='object')

In [4]:
# Derived feature: element-wise sum of the capital-gain and capital-loss columns.
adult_tmp['cap-gain-loss'] = (
    adult_tmp['capital-gain']
    + adult_tmp['capital-loss']
)

In [5]:
# Strip leading whitespace from 'marital-status' (the raw values carry a prefix space).
# This is done on the full frame, so every later subset of adult_tmp sees clean labels.
adult_tmp.loc[:,'marital-status'] = adult_tmp['marital-status'].str.lstrip()  # column has prefix space

In [6]:
adult_tmp

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,class,cap-gain-loss
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K,2174
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K,0
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K,0
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K,0
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K,0
32557,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K,0
32558,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K,0
32559,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,<=50K,0


### Code to partition the data

In [7]:
# Hold out 25% of the rows as the test partition; the fixed seed keeps the shuffle repeatable.
adult_train, adult_test = train_test_split(
    adult_tmp,
    random_state=7,
    test_size=0.25,
)

In [8]:
adult_tmp.shape

(32561, 16)

In [9]:
adult_train.shape

(24420, 16)

In [10]:
adult_test.shape


(8141, 16)

### Random partition

Note the index column: the row labels are in shuffled order!
So be careful when manipulating this DataFrame!
Because the index is no longer sequential, operations that align on it (add, concat, etc. with another DataFrame) match rows by label rather than by position!

In [11]:
adult_train.head(20)

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,class,cap-gain-loss
22659,22,Private,34616,Some-college,10,Never-married,Sales,Not-in-family,White,Female,0,0,30,United-States,<=50K,0
25090,31,Private,91666,HS-grad,9,Never-married,Exec-managerial,Own-child,White,Male,0,0,60,United-States,<=50K,0
18617,25,Private,193773,HS-grad,9,Never-married,Other-service,Not-in-family,Black,Female,0,0,35,United-States,<=50K,0
8482,26,Private,247025,HS-grad,9,Never-married,Protective-serv,Unmarried,White,Male,0,0,44,United-States,<=50K,0
1544,69,?,473040,5th-6th,3,Divorced,?,Not-in-family,White,Male,0,0,40,United-States,<=50K,0
10515,48,Federal-gov,147397,HS-grad,9,Married-civ-spouse,Adm-clerical,Husband,White,Male,0,0,36,United-States,<=50K,0
12544,43,Local-gov,186995,HS-grad,9,Divorced,Protective-serv,Unmarried,White,Female,0,0,40,United-States,<=50K,0
28519,39,Federal-gov,376455,Prof-school,15,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,1887,50,United-States,>50K,1887
1589,23,Private,121471,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Female,0,0,40,United-States,<=50K,0
27680,38,Private,105150,Bachelors,13,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,0,45,United-States,>50K,0


Build CART Decision Tree

In [12]:
# Target: the 'class' column of the training partition, kept as a one-column DataFrame.
y = adult_train.loc[:, ['class']]


In [13]:
# Categorical predictor: 'marital-status' from the training partition, as a one-column DataFrame.
mar_cat = adult_train.loc[:, ['marital-status']]

In [14]:
mar_cat[['marital-status']]

Unnamed: 0,marital-status
22659,Never-married
25090,Never-married
18617,Never-married
8482,Never-married
1544,Divorced
...,...
5699,Never-married
10742,Married-civ-spouse
16921,Married-civ-spouse
25796,Married-civ-spouse


In [15]:
# (disabled) No lstrip is needed here: 'marital-status' was already stripped on adult_tmp
# before the train/test split, so mar_cat inherits the cleaned labels.
# mar_cat.loc[:,'marital-status'] = mar_cat['marital-status'].str.lstrip()

In [16]:
# Collapse the three "married" variants into a single 'Married' category.
mar_cat = mar_cat.replace(
    dict.fromkeys(
        ['Married-AF-spouse', 'Married-civ-spouse', 'Married-spouse-absent'],
        'Married',
    )
)

In [17]:
mar_cat

Unnamed: 0,marital-status
22659,Never-married
25090,Never-married
18617,Never-married
8482,Never-married
1544,Divorced
...,...
5699,Never-married
10742,Married
16921,Married
25796,Married


In [18]:
# One-hot encode the collapsed marital-status labels as uint8 indicator columns.
dummies = pd.get_dummies(
    data=mar_cat['marital-status'],
    dtype='uint8',
)

In [19]:
dummies

Unnamed: 0,Divorced,Married,Never-married,Separated,Widowed
22659,0,0,1,0,0
25090,0,0,1,0,0
18617,0,0,1,0,0
8482,0,0,1,0,0
1544,1,0,0,0,0
...,...,...,...,...,...
5699,0,0,1,0,0
10742,0,1,0,0,0
16921,0,1,0,0,0
25796,0,1,0,0,0


In [20]:
# Feature matrix: the numeric cap-gain-loss column side by side with the marital-status indicators.
x = pd.concat([adult_train[['cap-gain-loss']], dummies], axis='columns')
x

Unnamed: 0,cap-gain-loss,Divorced,Married,Never-married,Separated,Widowed
22659,0,0,0,1,0,0
25090,0,0,0,1,0,0
18617,0,0,0,1,0,0
8482,0,0,0,1,0,0
1544,0,1,0,0,0,0
...,...,...,...,...,...,...
5699,0,0,0,1,0,0
10742,0,0,1,0,0,0
16921,0,0,1,0,0,0
25796,2415,0,1,0,0,0


In [21]:
# Class labels shown in the exported tree.
# NOTE(review): export_graphviz expects class_names in ascending order of the fitted
# classes — confirm this matches the values in the 'class' column.
y_names = ["<=50K", ">50K"]
# Take the feature labels straight from the training matrix instead of a hand-copied list,
# so they cannot drift out of sync with the actual column order of x.
x_names = list(x.columns)

In [22]:
# Fit a CART (Gini) tree capped at 5 leaves on the training features and labels.
# random_state is pinned because scikit-learn permutes the features at each split, so
# ties between equally good splits could otherwise resolve differently from run to run.
cart01 = DecisionTreeClassifier(
    criterion="gini",
    max_leaf_nodes=5,
    random_state=7,
).fit(x, y)

In [23]:
# Write the fitted tree to a Graphviz .dot file, labelling split features with x_names
# and classes with y_names.
# NOTE(review): absolute, machine-specific output path — the directory must already exist.
export_graphviz(cart01, out_file="D:\\wb\\python\\ML\\adult\\cart01.dot", feature_names=x_names, class_names=y_names)

### test the model

In [24]:
# Predict a class label for every row of x.
# NOTE(review): x was built from adult_train, so these are in-sample predictions,
# not an evaluation on the held-out adult_test partition.
predClass = cart01.predict(x)

In [25]:
type(predClass)

numpy.ndarray

#### The output of predict() is a NumPy array (numpy.ndarray)!

In [26]:
# Wrap the prediction array in a one-column DataFrame named 'predict'
# (it gets a fresh 0..n-1 index, not the row labels of x).
predClass = pd.Series(predClass, name='predict').to_frame()

### We need to make the index of DataFrame x sequential before concatenating the result!

In [27]:
predClass

Unnamed: 0,predict
0,<=50K
1,<=50K
2,<=50K
3,<=50K
4,<=50K
...,...
24415,<=50K
24416,<=50K
24417,<=50K
24418,>50K


In [28]:
# Work on an independent (deep) copy so the feature matrix x itself stays untouched.
x_copy = x.copy(deep=True)
x_copy

Unnamed: 0,cap-gain-loss,Divorced,Married,Never-married,Separated,Widowed
22659,0,0,0,1,0,0
25090,0,0,0,1,0,0
18617,0,0,0,1,0,0
8482,0,0,0,1,0,0
1544,0,1,0,0,0,0
...,...,...,...,...,...,...
5699,0,0,0,1,0,0
10742,0,0,1,0,0,0
16921,0,0,1,0,0,0
25796,2415,0,1,0,0,0


In [29]:
# Move the shuffled row labels into an 'index' column and get a fresh 0..n-1 index,
# so the rows line up positionally with the prediction frame.
# Built from x rather than from x_copy itself so re-running this cell gives the same
# result instead of stacking another index column onto the previous output.
x_copy = x.reset_index()
x_copy

Unnamed: 0,index,cap-gain-loss,Divorced,Married,Never-married,Separated,Widowed
0,22659,0,0,0,1,0,0
1,25090,0,0,0,1,0,0
2,18617,0,0,0,1,0,0
3,8482,0,0,0,1,0,0
4,1544,0,1,0,0,0,0
...,...,...,...,...,...,...,...
24415,5699,0,0,0,1,0,0
24416,10742,0,0,1,0,0,0
24417,16921,0,0,1,0,0,0
24418,25796,2415,0,1,0,0,0


In [30]:
# Show the features next to their predicted class (first 30 rows).
pd.concat([x_copy, predClass], axis='columns').head(30)

Unnamed: 0,index,cap-gain-loss,Divorced,Married,Never-married,Separated,Widowed,predict
0,22659,0,0,0,1,0,0,<=50K
1,25090,0,0,0,1,0,0,<=50K
2,18617,0,0,0,1,0,0,<=50K
3,8482,0,0,0,1,0,0,<=50K
4,1544,0,1,0,0,0,0,<=50K
5,10515,0,0,1,0,0,0,<=50K
6,12544,0,1,0,0,0,0,<=50K
7,28519,1887,0,1,0,0,0,>50K
8,1589,0,0,0,1,0,0,<=50K
9,27680,0,0,1,0,0,0,<=50K


In [31]:
# NOTE(review): this def rebinds the name `dummies`, which an earlier cell assigned to the
# one-hot DataFrame; re-running that earlier concat cell after this one will no longer work.
def dummies(src, mapping=None):
    """Collapse category labels, then one-hot encode them.

    Parameters
    ----------
    src : pandas.Series or pandas.DataFrame
        Categorical values to encode. The input is not modified.
    mapping : dict, optional
        ``{old_label: new_label}`` replacements applied before encoding.
        When omitted, 'Married-AF-spouse', 'Married-civ-spouse' and
        'Married-spouse-absent' are all folded into 'Married'.

    Returns
    -------
    pandas.DataFrame
        One uint8 indicator column per remaining category. When ``src`` is a
        DataFrame, pandas prefixes each column with the source column name
        (e.g. 'marital-status_Married').
    """
    if mapping is None:
        mapping = {
            'Married-AF-spouse': 'Married',
            'Married-civ-spouse': 'Married',
            'Married-spouse-absent': 'Married',
        }
    # An explicitly empty mapping means "encode the labels as they are".
    collapsed = src.replace(mapping) if mapping else src
    return pd.get_dummies(collapsed, dtype='uint8')

In [32]:
# Try the helper on the marital-status Series and keep the result for inspection.
tmp = dummies(mar_cat.loc[:, 'marital-status'])

In [33]:
tmp

Unnamed: 0,Divorced,Married,Never-married,Separated,Widowed
22659,0,0,1,0,0
25090,0,0,1,0,0
18617,0,0,1,0,0
8482,0,0,1,0,0
1544,1,0,0,0,0
...,...,...,...,...,...
5699,0,0,1,0,0
10742,0,1,0,0,0
16921,0,1,0,0,0
25796,0,1,0,0,0


In [34]:
# Build the feature matrix with the dummies() helper, feeding it the marital-status Series.
pd.concat(
    [adult_train[['cap-gain-loss']], dummies(mar_cat['marital-status'])],
    axis='columns',
)

Unnamed: 0,cap-gain-loss,Divorced,Married,Never-married,Separated,Widowed
22659,0,0,0,1,0,0
25090,0,0,0,1,0,0
18617,0,0,0,1,0,0
8482,0,0,0,1,0,0
1544,0,1,0,0,0,0
...,...,...,...,...,...,...
5699,0,0,0,1,0,0
10742,0,0,1,0,0,0
16921,0,0,1,0,0,0
25796,2415,0,1,0,0,0


In [35]:
# Passing the one-column DataFrame (instead of the Series) makes get_dummies prefix every
# indicator column with 'marital-status_'.
pd.concat(
    [adult_train[['cap-gain-loss']], dummies(adult_train[['marital-status']])],
    axis='columns',
)

Unnamed: 0,cap-gain-loss,marital-status_Divorced,marital-status_Married,marital-status_Never-married,marital-status_Separated,marital-status_Widowed
22659,0,0,0,1,0,0
25090,0,0,0,1,0,0
18617,0,0,0,1,0,0
8482,0,0,0,1,0,0
1544,0,1,0,0,0,0
...,...,...,...,...,...,...
5699,0,0,0,1,0,0
10742,0,0,1,0,0,0
16921,0,0,1,0,0,0
25796,2415,0,1,0,0,0
