# Gini Vs Entropy
----

## Step 1: Import Required Modules

In [1]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score,confusion_matrix

## Step 2: Load Data 

In [2]:
# Load the dataset. The CSV is expected in the current working directory;
# prefer a relative path (or a configurable data directory) over os.chdir
# to a machine-specific absolute folder.
df = pd.read_csv("Wisconsin_Breast_Cancer_Dataset.csv")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 33 columns):
id                         569 non-null int64
diagnosis                  569 non-null object
radius_mean                569 non-null float64
texture_mean               569 non-null float64
perimeter_mean             569 non-null float64
area_mean                  569 non-null float64
smoothness_mean            569 non-null float64
compactness_mean           569 non-null float64
concavity_mean             569 non-null float64
concave points_mean        569 non-null float64
symmetry_mean              569 non-null float64
fractal_dimension_mean     569 non-null float64
radius_se                  569 non-null float64
texture_se                 569 non-null float64
perimeter_se               569 non-null float64
area_se                    569 non-null float64
smoothness_se              569 non-null float64
compactness_se             569 non-null float64
concavity_se               569 non

In [3]:
# Feature matrix: positional columns 2..31, i.e. the 30 columns that follow
# `id` (position 0) and `diagnosis` (position 1) in df.
X = df.iloc[:,2:32]
type(X)

pandas.core.frame.DataFrame

In [4]:
# Inspect the selected feature columns (names, non-null counts, dtypes)
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 30 columns):
radius_mean                569 non-null float64
texture_mean               569 non-null float64
perimeter_mean             569 non-null float64
area_mean                  569 non-null float64
smoothness_mean            569 non-null float64
compactness_mean           569 non-null float64
concavity_mean             569 non-null float64
concave points_mean        569 non-null float64
symmetry_mean              569 non-null float64
fractal_dimension_mean     569 non-null float64
radius_se                  569 non-null float64
texture_se                 569 non-null float64
perimeter_se               569 non-null float64
area_se                    569 non-null float64
smoothness_se              569 non-null float64
compactness_se             569 non-null float64
concavity_se               569 non-null float64
concave points_se          569 non-null float64
symmetry_se                569 

In [8]:
# Target labels: the `diagnosis` column, stored as strings ('M' / 'B')
y=df["diagnosis"]
# Peek at a small slice of the labels
y[15:25]

15    M
16    M
17    M
18    M
19    B
20    B
21    B
22    M
23    M
24    M
Name: diagnosis, dtype: object

In [10]:
# Encode the string labels as integers: 'M' -> 1, 'B' -> 0.
# Mapping straight from df["diagnosis"] keeps this cell idempotent on re-run, and the
# explicit int64 cast avoids relying on replace() silently downcasting an object
# column (deprecated in recent pandas). Any unexpected label becomes NaN and makes
# the cast fail loudly here instead of leaking into the model.
y = df["diagnosis"].map({'M': 1, 'B': 0}).astype('int64')
SEED = 1 # for reproducing

## Step 3: Create Training and Test sets

In [11]:
# Hold out 20% of the rows for testing. Stratifying on y preserves the class
# proportions in both splits, and SEED makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=SEED,
)

In [12]:
X_train.shape # (455, 30)

(455, 30)

In [13]:
y_train.shape # (455,) - one label per training row

(455,)

In [14]:
X_test.shape # (114, 30)

(114, 30)

In [15]:
y_test.shape # (114,) - one label per test row

(114,)

## Step 4: Create Model using criterion as *entropy*

In [16]:
# Decision tree that scores candidate splits by entropy;
# depth is capped at 8 and the seed is fixed for repeatability.
dt_entropy = DecisionTreeClassifier(
    criterion='entropy',
    max_depth=8,
    random_state=SEED,
)

In [17]:
# Fit dt_entropy on the training split (features X_train, labels y_train)
dt_entropy.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=8,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=1,
            splitter='best')

In [18]:
# Predict test-set labels with the entropy-based tree
y_pred = dt_entropy.predict(X_test)

In [19]:
# Accuracy of the entropy-based tree on the test set (true labels vs. y_pred)
accuracy_entropy = accuracy_score(y_test, y_pred)

In [20]:
# Display the test accuracy of the entropy-based tree
accuracy_entropy

0.9298245614035088

## Step 5: Create Model using criterion as *gini*

In [21]:
# Decision tree that scores candidate splits by the gini index;
# same depth cap (8) and seed as the entropy model so the two are comparable.
dt_gini = DecisionTreeClassifier(
    criterion='gini',
    max_depth=8,
    random_state=SEED,
)

In [22]:
# Fit dt_gini to the training set
dt_gini.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=8,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=1,
            splitter='best')

In [23]:
# Use dt_gini to predict test set labels
y_pred_gini = dt_gini.predict(X_test)

In [24]:
# Accuracy of the gini-based tree on the test set (true labels vs. y_pred_gini)
accuracy_gini = accuracy_score(y_test, y_pred_gini)

In [25]:
# Display the test accuracy of the gini-based tree
accuracy_gini

0.9298245614035088

## Step 6: Compare Entropy and Gini Accuracy

In [26]:
# Report both test-set accuracies: entropy first, then the gini index
for label, score in (
    ('Accuracy achieved by using entropy: ', accuracy_entropy),
    ('Accuracy achieved by using the gini index: ', accuracy_gini),
):
    print(label, score)

Accuracy achieved by using entropy:  0.9298245614035088
Accuracy achieved by using the gini index:  0.9298245614035088


> **Note:** The two models achieve exactly the same accuracy on this test set. In practice, the gini index and entropy usually lead to very similar trees and results. The gini index is **slightly** faster to compute (it does not require a logarithm) and is the default criterion used by scikit-learn's `DecisionTreeClassifier`.