In [1]:
# -----------------------------------------------------------------
# Decision Tree Classifier
# Predict whether an adult's income is <=50K or >50K from census data
# -----------------------------------------------------------------

In [2]:
# Import libraries
import pandas as pd

In [3]:
# Load the adult-income census extract (path is relative to the notebook's
# working directory) and display the frame for a quick visual check.
# NOTE(review): the dummy columns generated further down mix names such as
# 'wc_ Local-gov' and 'marital status_Married', which hints that some category
# values carry a leading space - confirm, and consider skipinitialspace=True.
DATASET_PATH = 'AdultIncomeData.csv'
data = pd.read_csv(DATASET_PATH)
data

Unnamed: 0,age,wc,education,marital status,race,gender,hours per week,IncomeClass
0,38,Private,HS-grad,Divorced,White,Male,40,<=50K
1,28,Private,Bachelors,Married,Black,Female,40,<=50K
2,37,Private,Masters,Married,White,Female,40,<=50K
3,31,Private,Masters,Never-married,White,Female,50,>50K
4,42,Private,Bachelors,Married,White,Male,40,>50K
...,...,...,...,...,...,...,...,...
19782,53,Private,Masters,Married,White,Male,40,>50K
19783,22,Private,Some-college,Never-married,White,Male,40,<=50K
19784,40,Private,HS-grad,Married,White,Male,40,>50K
19785,58,Private,HS-grad,Widowed,White,Female,40,<=50K


In [4]:
# Count the missing values in each column
# (isna is the alias of isnull; sum works column-wise by default)
data.isna().sum()

age               0
wc                0
education         0
marital status    0
race              0
gender            0
hours per week    0
IncomeClass       0
dtype: int64

In [5]:
# Check the data types of each field.
# Columns reported as `object` hold text categories; these are the ones that
# get expanded into dummy variables in the next step, while the int64 columns
# pass through unchanged.
data.dtypes

age                int64
wc                object
education         object
marital status    object
race              object
gender            object
hours per week     int64
IncomeClass       object
dtype: object

In [6]:
# Create dummy variables: one-hot encode every text column so that all
# features (and the label) are numeric.
# drop_first=True drops the first level of each category so the remaining
# indicators are not redundant; for the two-level IncomeClass column this
# leaves a single 0/1 column that serves as the target.
# dtype is pinned to 'uint8' because pandas >= 2.0 emits bool indicators by
# default, which would change the 0/1 values and the uint8 predictions shown
# in this notebook's outputs. Older pandas already defaulted to uint8, so the
# result there is unchanged.
data_prep = pd.get_dummies(data, drop_first=True, dtype='uint8')
data_prep

Unnamed: 0,age,hours per week,wc_ Local-gov,wc_ Never-worked,wc_ Private,education_ Doctorate,education_ HS-grad,education_ Masters,education_ Preschool,education_ Prof-school,education_ Some-college,marital status_ Never-married,marital status_ Widowed,marital status_Married,race_ Asian-Pac-Islander,race_ Black,race_ Other,race_ White,gender_ Male,IncomeClass_ >50K
0,38,40,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,1,1,0
1,28,40,0,0,1,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0
2,37,40,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,1,0,0
3,31,50,0,0,1,0,0,1,0,0,0,1,0,0,0,0,0,1,0,1
4,42,40,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19782,53,40,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,1,1,1
19783,22,40,0,0,1,0,0,0,0,0,1,1,0,0,0,0,0,1,1,0
19784,40,40,0,0,1,0,1,0,0,0,0,0,0,1,0,0,0,1,1,1
19785,58,40,0,0,1,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0


In [7]:
# Separate the predictors from the label by position: the label is the final
# column of data_prep and every column before it is a feature.
target_pos = data_prep.shape[1] - 1
X = data_prep.iloc[:, :target_pos]
Y = data_prep.iloc[:, target_pos]

In [8]:
# Hold out 30% of the rows as a test set. Stratifying on Y keeps the class
# proportions the same in both partitions, and the fixed random_state makes
# the split repeatable.
from sklearn.model_selection import train_test_split

X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=42,
    stratify=Y,
)

In [9]:
# Build a decision tree with a fixed seed and fit it on the training
# partition. fit() returns the estimator itself, so construction and training
# are chained; the bare name on the last line displays the fitted model.
from sklearn.tree import DecisionTreeClassifier

dtc = DecisionTreeClassifier(random_state=42).fit(X_train, Y_train)
dtc

DecisionTreeClassifier(random_state=42)

In [11]:
# Test the model: predict the income class for every row of the held-out
# test set. The predictions use the same 0/1 coding as the target column
# (the last column of data_prep).
Y_predict = dtc.predict(X_test)
Y_predict

array([0, 0, 0, ..., 0, 1, 0], dtype=uint8)

In [12]:
# Evaluate the model with a confusion matrix of actual vs. predicted classes.
# In scikit-learn's layout rows are the true labels and columns the predicted
# ones, so the diagonal holds the correctly classified counts.
from sklearn.metrics import confusion_matrix

cm = confusion_matrix(y_true=Y_test, y_pred=Y_predict)
cm

array([[3825,  548],
       [ 727,  837]])

In [14]:
# Overall accuracy on the test set: the fraction of test rows whose predicted
# class matches the actual class (the confusion-matrix diagonal over the total).
score = dtc.score(X_test, Y_test)
score

0.7852450732693279