In [1]:
# -----------------------------------------------------------------
# Compare multiple Classifiers for different train and test values
# -----------------------------------------------------------------

In [2]:
# Import libraries
import pandas as pd

In [3]:
# Read dataset
data = pd.read_csv('AdultIncomeData.csv')
data

Unnamed: 0,age,wc,education,marital status,race,gender,hours per week,IncomeClass
0,38,Private,HS-grad,Divorced,White,Male,40,<=50K
1,28,Private,Bachelors,Married,Black,Female,40,<=50K
2,37,Private,Masters,Married,White,Female,40,<=50K
3,31,Private,Masters,Never-married,White,Female,50,>50K
4,42,Private,Bachelors,Married,White,Male,40,>50K
...,...,...,...,...,...,...,...,...
19782,53,Private,Masters,Married,White,Male,40,>50K
19783,22,Private,Some-college,Never-married,White,Male,40,<=50K
19784,40,Private,HS-grad,Married,White,Male,40,>50K
19785,58,Private,HS-grad,Widowed,White,Female,40,<=50K


In [4]:
# Create Dummy variables
data_prep = pd.get_dummies(data, drop_first=True)

In [5]:
data_prep

Unnamed: 0,age,hours per week,wc_ Local-gov,wc_ Never-worked,wc_ Private,education_ Doctorate,education_ HS-grad,education_ Masters,education_ Preschool,education_ Prof-school,education_ Some-college,marital status_ Never-married,marital status_ Widowed,marital status_Married,race_ Asian-Pac-Islander,race_ Black,race_ Other,race_ White,gender_ Male,IncomeClass_ >50K
0,38,40,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,1,1,0
1,28,40,0,0,1,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0
2,37,40,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,1,0,0
3,31,50,0,0,1,0,0,1,0,0,0,1,0,0,0,0,0,1,0,1
4,42,40,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19782,53,40,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,1,1,1
19783,22,40,0,0,1,0,0,0,0,0,1,1,0,0,0,0,0,1,1,0
19784,40,40,0,0,1,0,1,0,0,0,0,0,0,1,0,0,0,1,1,1
19785,58,40,0,0,1,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0


In [6]:
# Create X and Y Variables
X = data_prep.iloc[:, :-1]
Y = data_prep.iloc[:, -1]

In [7]:
# Import and train Decision Tree Classifier
from sklearn.tree import DecisionTreeClassifier
dtc = DecisionTreeClassifier(random_state=1234)

# Import and train Random Forest Classifier
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(random_state=1234)

# Import and train Support Vector Classifier
from sklearn.svm import SVC
svc = SVC(kernel='rbf', gamma=0.5)

In [8]:
# Import and perform cross validation
from sklearn.model_selection import cross_validate
cv_results_dtc = cross_validate(dtc, X, Y, cv=10, return_train_score=True)
cv_results_rfc = cross_validate(rfc, X, Y, cv=10, return_train_score=True)
cv_results_svc = cross_validate(svc, X, Y, cv=10, return_train_score=True)

In [9]:
# Get the average of all the results
import numpy as np
dtc_test_average = np.average(cv_results_dtc['test_score'])
rfc_test_average = np.average(cv_results_rfc['test_score'])
svc_test_average = np.average(cv_results_svc['test_score'])

In [10]:
print('dtc_test_average:', dtc_test_average)
print('rfc_test_average:', rfc_test_average)
print('svc_test_average:', svc_test_average)

dtc_test_average: 0.7810691737459707
rfc_test_average: 0.7971904185045097
svc_test_average: 0.8036085674097743


In [11]:
# Get the average of all the training scores
dtc_train_average = np.average(cv_results_dtc['train_score'])
rfc_train_average = np.average(cv_results_rfc['train_score'])
svc_train_average = np.average(cv_results_svc['train_score'])

In [12]:
print('dtc_train_average:', dtc_train_average)
print('rfc_train_average:', rfc_train_average)
print('svc_train_average:', svc_train_average)

dtc_train_average: 0.9043535882172298
rfc_train_average: 0.9042918191721612
svc_train_average: 0.8744798848765454


In [13]:
# print the results 
print()
print()
print('        ','Decision Tree  ', 'Random Forest  ','Support Vector   ')
print('        ','---------------', '---------------','-----------------')

print('Test  : ',
      round(dtc_test_average, 4), '        ',
      round(rfc_test_average, 4), '        ',
      round(svc_test_average, 4))

print('Train : ',
      round(dtc_train_average, 4), '        ',
      round(rfc_train_average, 4), '        ',
      round(svc_train_average, 4))




         Decision Tree   Random Forest   Support Vector   
         --------------- --------------- -----------------
Test  :  0.7811          0.7972          0.8036
Train :  0.9044          0.9043          0.8745
