In [1]:
# Decision Tree
# Dataset: census dataset
'''
from anaconda prompt, type 
pip install pydotplus + <ENTER>
conda install python-graphviz
'''

# import the libraries
import pandas as pd
from sklearn.tree import DecisionTreeClassifier as dtc
from sklearn.model_selection import train_test_split

from sklearn.metrics import accuracy_score, classification_report as cr

from sklearn import tree
from IPython.display import Image

from pandas_ml import ConfusionMatrix

import numpy as np
from sklearn.feature_selection import RFE
from sklearn import preprocessing # for label encoding
from io import StringIO
import pydotplus



# read the input file
path="D:\data science\csv\census1.csv"
census = pd.read_csv(path)
census.head()

# number of rows
len(census)

# print the columns
col = list(census.columns)
print(col)

# count of Rows and Columns
census.shape

# describe the dataset (R,C)
# --------------------------
census.dtypes

# get the counts of the classes
print(census.salary.value_counts())

# randomly shuffle the dataset. 
# frac=1 means returns all rows after sorting
# census = census.sample(frac=1)
# census.head(50)

# print the columns
print(census.columns)

fact_cols = census.select_dtypes(include=["object"]).columns.values
print(fact_cols)


# analyse the factor columns
for f in fact_cols:
    print("Factor variable = ", f)
    print(census[f].unique())
    print("***")


# 1) remove spaces
def removeSpaces(x):
    x = x.strip()
    return(x)

# parameters:
# df = dataframe (pandas)
# factv: factor variable
# oldv: list of old values to be replaced
# newv: new value to be replaced with
    
def replaceFactorValues(df,factv,oldv,newv):
    if (len(factv) == len(oldv) == len(newv) ):
        
        for i in range(0,len(factv)):
            df[factv[i]] [df[factv[i]].isin(oldv[i])] = newv[i]
            
            # internally, the above code translates to ...
            # census.workclass [census.workclass.isin(['','',''])] = 'new'
        msg = "SUCCESS: 1 Updates done"
    else:
        msg = "ERRCODE:-1  Inconsistent length in the input lists"
    
    return(msg)
 
    
# apply the function to all the factor variables
for f in fact_cols:
    census[f] = census[f].apply(removeSpaces) #apply function passes each row to the called function in series format and return whole output
    
# 2) reduce levels
factv=[]; oldv=[]; newv=[]

factv.append("workclass")
oldv.append(['State-gov','Federal-gov', 'Local-gov', '?'])
newv.append('govt')

factv.append("workclass")
oldv.append(['Self-emp-not-inc', 'Self-emp-inc'])
newv.append("self")

factv.append("workclass")
oldv.append(['Private','Without-pay', 'Never-worked'])
newv.append("private")

factv.append("education")
oldv.append(['9th', '7th-8th', '5th-6th', '10th', '1st-4th', 'Preschool'])
newv.append("school")

factv.append("education")
oldv.append(['HS-grad', '11th', '12th'])
newv.append("high school")

factv.append("education")
oldv.append(['Bachelors', 'Masters', 'Some-college', 'Assoc-acdm', 'Assoc-voc', 'Doctorate', 'Prof-school'])
newv.append("college")

factv.append("marital")
oldv.append(['Married-civ-spouse', 'Married-spouse-absent', 'Married-AF-spouse'])
newv.append("married")

factv.append("marital")
oldv.append(['Divorced', 'Separated'])
newv.append("divorced")

factv.append("marital")
oldv.append(['Never-married', 'Widowed'])
newv.append("single")


print(factv)
print(oldv)
print(newv)


ret = replaceFactorValues(census,factv,oldv,newv)
print(ret)


# Exercise: similarly, do the same for the other factor variables also


# replace all the factors with numbers
# label encoding

def convertFactorsToNum(df,fact_cols):
    le = preprocessing.LabelEncoder()
    
    for f in fact_cols:
        df[f] = le.fit_transform(df[f])
    
    return(1)

ret = convertFactorsToNum(census,fact_cols)
print(ret)


# split the data into train and test
train,test = train_test_split(census,test_size=0.3)

# split further into trainx,trainy / testx,testy
trainx=train.drop(['salary'],axis=1)
trainy=train['salary']
print('TRAIN: {} {}'.format(trainx.shape,trainy.shape))

testx=test.drop(['salary'],axis=1)
testy=test['salary']
print('TEST: {} {}'.format(testx.shape,testy.shape))


# build the Decision Tree models
# 1) GINI model
# 2) ENTROPY model

# 1) GINI model
m_gini = dtc(criterion="gini",max_depth=5,
             min_samples_leaf=5).fit(trainx,trainy)
print(m_gini)

# predict 
p_gini = m_gini.predict(testx)

# confusion matrix
cm1 = ConfusionMatrix(list(testy), list(p_gini))
print(cm1)

# classification report
print(cr(testy,p_gini))

# accuracy score
print("Gini Accuracy = ", accuracy_score(testy,p_gini)*100)

testy.value_counts()


# 2) ENTROPY model
m_entropy = dtc(criterion="entropy",max_depth=5,
             min_samples_leaf=5).fit(trainx,trainy)

p_entropy = m_entropy.predict(testx)

# confusion matrix
cm2 = ConfusionMatrix(list(testy), list(p_entropy))
cm2

# classification report
print(cr(testy,p_entropy))

# accuracy score
print("Entropy Accuracy = ", accuracy_score(testy,p_entropy)*100)


# visualise the decision tree
# tree visualisation
features = list(trainx.columns)
features

# y-variable - classes
classes = trainy.unique()
classes

dot_data = StringIO()
tree.export_graphviz(m_gini, out_file=dot_data,
                     filled=True,
                     rounded=True,
                     special_characters=True,
                     feature_names = features,
                     class_names = ["0","1"])

graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
Image(graph.create_png())


# feature selection
# RFE: recursive feature elimination

rfe = RFE(m_gini,3).fit(testx,testy)
support = rfe.support_
ranking = rfe.ranking_

list_cols = trainx.columns

df_rfe = pd.DataFrame({"columns":list_cols, 
                           "support":support, 
                           "ranking":ranking})
    
print(df_rfe.sort_values('ranking'))


# Assignment: 
# next model will be only with the significant features 
# i.e features that have ranking=1

# compare the results of the 2 models

# 2) build a logistic regression model and compare the results with DT




['age', 'workclass', 'education', 'marital', 'relationship', 'race', 'gender', 'salary']
<=50K    14098
>50K      4433
Name: salary, dtype: int64
Index(['age', 'workclass', 'education', 'marital', 'relationship', 'race',
       'gender', 'salary'],
      dtype='object')
['workclass' 'education' 'marital' 'relationship' 'race' 'gender' 'salary']
Factor variable =  workclass
[' State-gov' ' Self-emp-not-inc' ' Private' ' Federal-gov' ' Local-gov'
 ' ?' ' Self-emp-inc' ' Without-pay' ' Never-worked']
***
Factor variable =  education
[' Bachelors' ' HS-grad' ' 11th' ' Masters' ' 9th' ' Some-college'
 ' Assoc-acdm' ' Assoc-voc' ' 7th-8th' ' Doctorate' ' Prof-school'
 ' 5th-6th' ' 10th' ' 1st-4th' ' Preschool' ' 12th']
***
Factor variable =  marital
[' Never-married' ' Married-civ-spouse' ' Divorced'
 ' Married-spouse-absent' ' Separated' ' Married-AF-spouse' ' Widowed']
***
Factor variable =  relationship
[' Not-in-family' ' Husband' ' Wife' ' Own-child' ' Unmarried'
 ' Other-relative']
***

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


SUCCESS: 1 Updates done
1
TRAIN: (12971, 7) (12971,)
TEST: (5560, 7) (5560,)
DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=5,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=5, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')
Predicted  False  True  __all__
Actual                         
False       3811   397     4208
True         645   707     1352
__all__     4456  1104     5560
              precision    recall  f1-score   support

           0       0.86      0.91      0.88      4208
           1       0.64      0.52      0.58      1352

    accuracy                           0.81      5560
   macro avg       0.75      0.71      0.73      5560
weighted avg       0.80      0.81      0.81      5560

Gini Accuracy =  81.2589928057554


InvocationException: GraphViz's executables not found

In [2]:
fact_cols

array(['workclass', 'education', 'marital', 'relationship', 'race',
       'gender', 'salary'], dtype=object)

In [3]:
census.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18531 entries, 0 to 18530
Data columns (total 8 columns):
age             18531 non-null int64
workclass       18531 non-null int32
education       18531 non-null int32
marital         18531 non-null int32
relationship    18531 non-null int32
race            18531 non-null int32
gender          18531 non-null int32
salary          18531 non-null int32
dtypes: int32(7), int64(1)
memory usage: 651.6 KB
