## CART using Python

### Import required Libraries

In [1]:
import pandas                   as     pd
import numpy                    as     np
import sklearn.datasets         as     datasets

from   sklearn                  import tree
from   sklearn.tree             import DecisionTreeClassifier
from   sklearn.externals.six    import StringIO  
from   IPython.display          import Image  
from   sklearn.tree             import export_graphviz 

from   sklearn.cross_validation import cross_val_score
from   sklearn.cross_validation import KFold

import pydotplus
import pydot

from   sklearn.model_selection  import  train_test_split



### Read data file

In [2]:
df              = pd.read_csv("D:/Consulting/Saksoft/Analytics1/data/customer_data_03012018_final.csv")
df.columns

Index(['Customer Id', 'CustomerChurn', 'LastInvoiceDate', 'AvgInvoiceAmount',
       'LocationName', 'Zip', 'AvgMiles', 'Invoicevalue_range', 'miles_range',
       'NoCalls', 'ClaimsMade', 'DelayedQuote', 'PickupDelay', 'DeliveryDelay',
       'RevenueDecrease', 'PriceChangedPostQuote', 'CompetitorsPresent ',
       'Current Inflation'],
      dtype='object')

### Understand Data

In [3]:
df.columns        = ['Customer Id', 'CustomerChurn', 'LastInvoiceDate', 'AvgInvoiceAmount','LocationName',\
                     'Zip', 'AvgMiles', 'Invoicevalue_range', 'miles_range','NoCalls', 'ClaimsMade',\
                     'DelayedQuote', 'PickupDelay', 'DeliveryDelay','RevenueDecrease',\
                     'PriceChangedPostQuote', 'CompetitorsPresent','Current Inflation']
feature_names   = df.columns
print(feature_names)
print(df.info())

Index(['Customer Id', 'CustomerChurn', 'LastInvoiceDate', 'AvgInvoiceAmount',
       'LocationName', 'Zip', 'AvgMiles', 'Invoicevalue_range', 'miles_range',
       'NoCalls', 'ClaimsMade', 'DelayedQuote', 'PickupDelay', 'DeliveryDelay',
       'RevenueDecrease', 'PriceChangedPostQuote', 'CompetitorsPresent',
       'Current Inflation'],
      dtype='object')
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 600 entries, 0 to 599
Data columns (total 18 columns):
Customer Id              600 non-null int64
CustomerChurn            600 non-null int64
LastInvoiceDate          600 non-null object
AvgInvoiceAmount         600 non-null int64
LocationName             600 non-null object
Zip                      600 non-null object
AvgMiles                 600 non-null float64
Invoicevalue_range       600 non-null object
miles_range              600 non-null object
NoCalls                  600 non-null int64
ClaimsMade               600 non-null int64
DelayedQuote             600 non-null int64

In [4]:
print(df.head(2).T)

                                      0                      1
Customer Id                       26342                  38851
CustomerChurn                         0                      0
LastInvoiceDate               05-Aug-14              17-Oct-17
AvgInvoiceAmount                   4935                  24312
LocationName           Carlyle,Illinois  White Rock,New Mexico
Zip                               62231                  87547
AvgMiles                        433.502                2401.16
Invoicevalue_range        (2e+03,1e+04]          (2e+04,3e+04]
miles_range               (2e+02,1e+03]          (2e+03,3e+03]
NoCalls                               0                      0
ClaimsMade                            0                      0
DelayedQuote                          0                      1
PickupDelay                           0                      1
DeliveryDelay                         0                      1
RevenueDecrease                       0                

### Observations

* There are no missing values and data looks OK
* All variables are numeric and the following variables need to be converted to Categorical variables:

| Sl No | Variable |
|-- | ----------------------- |
| 1 | Customer Id |
| 2 | CustomerChurn |
| 3 | LastInvoiceDate |
| 4 | NoCalls |
| 5 | ClaimsMade |
| 6 | DelayedQuote |
| 7 | PickupDelay |
| 8 | DeliveryDelay |
| 9 | RevenueDecrease |
| 10 | PriceChangedPostQuote |
| 11 | CompetitorsPresent |
| 12 | Current Inflation | 

In [5]:
categorical_var =  ['Customer Id','CustomerChurn','LastInvoiceDate','NoCalls','ClaimsMade',\
                    'DelayedQuote','PickupDelay','DeliveryDelay','RevenueDecrease','PriceChangedPostQuote',\
                    'CompetitorsPresent','Current Inflation']
numerical_var   =  feature_names.drop(categorical_var)
print(categorical_var)
print(numerical_var)

['Customer Id', 'CustomerChurn', 'LastInvoiceDate', 'NoCalls', 'ClaimsMade', 'DelayedQuote', 'PickupDelay', 'DeliveryDelay', 'RevenueDecrease', 'PriceChangedPostQuote', 'CompetitorsPresent', 'Current Inflation']
Index(['AvgInvoiceAmount', 'LocationName', 'Zip', 'AvgMiles',
       'Invoicevalue_range', 'miles_range'],
      dtype='object')


In [6]:
for i in range(len(categorical_var)):
    df[categorical_var[i]] = df[categorical_var[i]].astype('category') 

In [7]:
df.describe()

Unnamed: 0,AvgInvoiceAmount,AvgMiles
count,600.0,600.0
mean,16630.098333,1620.235688
std,11107.307773,1110.807322
min,912.0,31.218972
25%,8575.75,815.385751
50%,14247.0,1383.05557
75%,22125.75,2171.013968
max,92261.0,9191.058795


In [8]:
df.describe(include = ['category'])

Unnamed: 0,Customer Id,CustomerChurn,LastInvoiceDate,NoCalls,ClaimsMade,DelayedQuote,PickupDelay,DeliveryDelay,RevenueDecrease,PriceChangedPostQuote,CompetitorsPresent,Current Inflation
count,600,600,600,600,600,600,600,600,600,600,600,600
unique,596,2,499,6,4,2,2,2,2,2,2,2
top,55325,0,19-Feb-17,0,0,0,0,0,0,0,0,1
freq,2,528,4,531,571,566,564,581,576,579,580,324


### Create X, independent variable and y dependent variable

In [9]:
feature_names  =  feature_names.drop(['CustomerChurn','Customer Id','LastInvoiceDate'])
X              =  df.loc[:,feature_names] 
y              =  df.loc[:,'CustomerChurn']
label_names    =  y.unique()
print(X.shape)
print(y.shape)

(600, 15)
(600,)


### Check if the data set is balanced or not

In [10]:
y.value_counts().apply(lambda x: [x, ( x * 100) / df.shape[0]])

0    [528, 88.0]
1     [72, 12.0]
Name: CustomerChurn, dtype: object

Since class, 1 (Churn) customers form only 12% of the data set, we have a unbalanced data set.

### Split the data into training and test data set

In [11]:
seed      = 12234
test_size = 0.75
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = test_size, random_state = seed, stratify = y) 

Recursive partitioning is a fundamental tool in data mining. It helps us explore the structure of a set of data, while developing easy to visualize decision rules for predicting a categorical (classification tree) or continuous (regression tree) outcome.

CART Modeling via DecisionTreeClassifier

Classification and Regression Trees (as described by Brieman, Freidman, Olshenm and Stone) can be generated through the DecisionTreeClassifier package.

i) Grow the tree

For controlling tree growth, we set the following parameters:
* max_depth: The maximum depth of the tree.
* min_samples_split: The minimum number of samples required to split an internal node
* min_samples_leaf: min no of samples at a leaf node
* min_impurity_decrease : A node will be split if this split induces a decrease of the impurity greater than or equal to this value. 

In [12]:
feature_names = ['NoCalls', 'ClaimsMade', 'DelayedQuote', 'PickupDelay', \
                 'DeliveryDelay', 'RevenueDecrease', 'PriceChangedPostQuote', 'CompetitorsPresent',\
                 'Current Inflation']
class_names   =  ['No Churn','Churned']

In [13]:
clf   =  DecisionTreeClassifier(criterion = "gini", splitter = 'random', \
                                    min_samples_split = 200, min_samples_leaf = 50, min_impurity_decrease = 2,\
                                    max_leaf_nodes = 10, max_depth= 60)
clf.fit(X_train,y_train)

ValueError: could not convert string to float: 'Sturgeon,Pennsylvania'

In [None]:
dot_data = StringIO()
export_graphviz(clf, out_file = dot_data, feature_names = feature_names, class_names = class_names, \
                rounded = True, proportion = False, precision = 2, filled = True)
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())  

In [None]:
(graph,) = pydot.graph_from_dot_file('tree.dot')
graph.write_png('tree.png')

In [None]:
Image('tree.png')

### Prune the tree

In [None]:
for ln in range(100,2000,200):
    dtree = tree.DecisionTreeClassifier(criterion = "gini", splitter = 'random', \
                                    max_leaf_nodes = ln, min_samples_leaf = 500,\
                                    max_depth= 5000)
    dtree.fit(X_train,y_train)
    dot_data = StringIO()
    export_graphviz(clf, out_file = dot_data, feature_names = feature_names, class_names = class_names, \
                rounded = True, proportion = False, precision = 2, filled = True)
    graph = pydotplus.graph_from_dot_data(dot_data.getvalue()) 
    (graph,) = pydot.graph_from_dot_file('tree.dot')
    graph.write_png('tree.png')
    Image('tree.png')

http://dataaspirant.com/2017/02/01/decision-tree-algorithm-python-with-scikit-learn/