In [1]:
# Import pandas and numpy
import pandas as pd
import numpy as np

# Import warnings
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load dataset 'census_cleaned.csv'
df_census = pd.read_csv('census_cleaned.csv')

# Split data into X and y
X = df_census.iloc[:,:-1]
y = df_census.iloc[:,-1]

# Import train_test_split
from sklearn.model_selection import train_test_split

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=2)

In [3]:
# Import Decision Tree classifier
from sklearn.tree import DecisionTreeClassifier

# Import accuracy_score
from sklearn.metrics import accuracy_score

# Initialize classification model
clf = DecisionTreeClassifier(random_state=2)

# Fit model on training data
clf.fit(X_train, y_train)

# Make predictions for test data
y_pred = clf.predict(X_test)

# Calculate accuracy
accuracy_score(y_pred, y_test)

0.8131679154894976

In [5]:
# Download bike_rentals_cleaned dataset
df_bikes = pd.read_csv('bike_rentals_cleaned.csv')

# Split data into X and y
X_bikes = df_bikes.iloc[:,:-1]
y_bikes = df_bikes.iloc[:,-1]

# Import Linear Regression
from sklearn.linear_model import LinearRegression

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X_bikes, y_bikes, random_state=2)

In [6]:
# Import Decision Tree Regressor
from sklearn.tree import DecisionTreeRegressor

# Import cross_val_score
from sklearn.model_selection import cross_val_score

In [7]:
# Initialize Decision Tree Regressor
reg = DecisionTreeRegressor(random_state=2)

# Obtain scores of cross-validation using mean squared error
scores = cross_val_score(reg, X_bikes, y_bikes, scoring='neg_mean_squared_error', cv=5)

# Take square root of the scores
rmse = np.sqrt(-scores)

# Display mean score
print('RMSE mean: %0.2f' % (rmse.mean()))

RMSE mean: 1233.36


In [8]:
# Initialize and score DecisionTreeRegressor on training set
reg = DecisionTreeRegressor()
reg.fit(X_train, y_train)
y_pred = reg.predict(X_train)
from sklearn.metrics import mean_squared_error
reg_mse = mean_squared_error(y_train, y_pred)
reg_rmse = np.sqrt(reg_mse)
reg_rmse

0.0

In [9]:
# Import GridSearchCV
from sklearn.model_selection import GridSearchCV

# Choose max_depth hyperparameters
params = {'max_depth':[None,2,3,4,6,8,10,20]}

# Initialize regression model as reg
reg = DecisionTreeRegressor(random_state=2)

# Initialize GridSearchCV as grid_reg
grid_reg = GridSearchCV(reg, params, scoring='neg_mean_squared_error', cv=5, n_jobs=-1)

# Fit grid_reg on X_train and y_train
grid_reg.fit(X_train, y_train)

# Extract best parameters
best_params = grid_reg.best_params_

# Print best hyperparameters
print("Best params:", best_params)

Best params: {'max_depth': 6}


In [10]:
# Compute best score
best_score = np.sqrt(-grid_reg.best_score_)

# Print best score
print("Training score: {:.3f}".format(best_score))

Training score: 951.398


In [11]:
# Extract best model
best_model = grid_reg.best_estimator_

# Predict test set labels
y_pred = best_model.predict(X_test)

# Import mean_squared_error from sklearn.metrics as MSE 
from sklearn.metrics import mean_squared_error

# Compute rmse_test
rmse_test = mean_squared_error(y_test, y_pred)**0.5

# Print rmse_test
print('Test score: {:.3f}'.format(rmse_test))

Test score: 864.670


In [12]:
# Create grid_search function
def grid_search(params, reg=DecisionTreeRegressor(random_state=2)):

    # Instantiate GridSearchCV as grid_reg
    grid_reg = GridSearchCV(reg, params, scoring='neg_mean_squared_error', cv=5, n_jobs=-1)
    
    # Fit grid_reg on X_train and y_train
    grid_reg.fit(X_train, y_train)

    # Extract best params
    best_params = grid_reg.best_params_

    # Print best params
    print("Best params:", best_params)
    
    # Compute best score
    best_score = np.sqrt(-grid_reg.best_score_)

    # Print best score
    print("Training score: {:.3f}".format(best_score))

    # Predict test set labels
    y_pred = grid_reg.predict(X_test)

    # Compute rmse_test
    rmse_test = mean_squared_error(y_test, y_pred)**0.5

    # Print rmse_test
    print('Test score: {:.3f}'.format(rmse_test))

In [13]:
X_train.shape

(548, 12)

In [14]:
grid_search(params={'min_samples_leaf':[1,2,4,6,8,10,20,30]})

Best params: {'min_samples_leaf': 8}
Training score: 896.083
Test score: 855.620


In [15]:
grid_search(params={'max_depth':[None,2,3,4,6,8,10,20],'min_samples_leaf':[1,2,4,6,8,10,20,30]})

Best params: {'max_depth': 6, 'min_samples_leaf': 2}
Training score: 870.396
Test score: 913.000


In [16]:
grid_search(params={'max_depth':[5,6,7,8,9],'min_samples_leaf':[3,5,7,9]})

Best params: {'max_depth': 9, 'min_samples_leaf': 7}
Training score: 888.905
Test score: 878.538


There are too many decision tree hyperparameters to consistently use them all. In my experience, max_depth, max_features, min_samples_leaf, max_leaf_nodes, min_impurity_decrease, and min_samples_split are often sufficient.

# Case Study - Heart Disease

In [17]:
# Upload heart.csv to dataFrame
df_heart = pd.read_csv('heart_disease.csv')

# Show first five rows
df_heart.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [18]:
# split data into X and y
X = df_heart.iloc[:,:-1]
y = df_heart.iloc[:,-1]

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=2)

In [19]:
# Initialize Decision Tree Classifier
model = DecisionTreeClassifier(random_state=2)

# Obtain scores of cross-validation
scores = cross_val_score(model, X, y, cv=5)

# Display accuracy
print('Accuracy:', np.round(scores, 2))

# Display mean accuracy
print('Accuracy mean: %0.2f' % (scores.mean()))

Accuracy: [0.74 0.85 0.77 0.73 0.7 ]
Accuracy mean: 0.76


RandomizedSearch CLF function
When fine-tuning many hyperparameters, GridSearchCV can take too much time. The scikit-learn library provides RandomizedSearchCV as a wonderful alternative. RandomizedSearchCV works in the same way as GridSearchCV, but instead of trying all hyperparameters, it tries a random number of combinations. It's not meant to be exhaustive. It's meant to find the best combinations in limited time.

In [21]:
# Import RandomizedSearchCV
from sklearn.model_selection import RandomizedSearchCV

def randomized_search_clf(params, runs=20, clf=DecisionTreeClassifier(random_state=2)):

    # Instantiate GridSearchCV as grid_reg
    rand_clf = RandomizedSearchCV(clf, params, n_iter=runs, 
                                  cv=5, n_jobs=-1, random_state=2)
    
    # Fit grid_reg on X_train and y_train
    rand_clf.fit(X_train, y_train)

    # Extract best estimator
    best_model = rand_clf.best_estimator_
    
    # Extract best score
    best_score = rand_clf.best_score_

    # Print best score
    print("Training score: {:.3f}".format(best_score))

    # Predict test set labels
    y_pred = best_model.predict(X_test)

    # Compute accuracy
    accuracy = accuracy_score(y_test, y_pred)

    # Print accuracy
    print('Test score: {:.3f}'.format(accuracy))
        
    # Return best model
    return best_model

In [25]:
randomized_search_clf(params={'criterion':['entropy', 'gini'],
                              'splitter':['random', 'best'],
                          'min_weight_fraction_leaf':[0.0, 0.0025, 0.005, 0.0075, 0.01],
                          'min_samples_split':[2, 3, 4, 5, 6, 8, 10],
                          'min_samples_leaf':[1, 0.01, 0.02, 0.03, 0.04],
                          'min_impurity_decrease':[0.0, 0.0005, 0.005, 0.05, 0.10, 0.15, 0.2],
                          'max_leaf_nodes':[10, 15, 20, 25, 30, 35, 40, 45, 50, None],
                          'max_features':['auto', 0.95, 0.90, 0.85, 0.80, 0.75, 0.70],
                          'max_depth':[None, 2,4,6,8],
                          'min_weight_fraction_leaf':[0.0, 0.0025, 0.005, 0.0075, 0.01, 0.05]
                         })

Training score: 0.798
Test score: 0.855




In [26]:
randomized_search_clf(params={'max_depth':[None, 6, 7],
'max_features':['auto', 0.78],
'max_leaf_nodes':[45, None],
'min_samples_leaf':[1, 0.035, 0.04, 0.045, 0.05],
'min_samples_split':[2, 9, 10],
'min_weight_fraction_leaf': [0.0, 0.05, 0.06, 0.07],
},
runs=100)



Training score: 0.802
Test score: 0.868




In [27]:
# Initialize Decision Tree Classifier
model = DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=7,
            max_features=0.78, max_leaf_nodes=45,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=0.045, min_samples_split=9,
            min_weight_fraction_leaf=0.06, presort=False, random_state=2,
            splitter='best')

# Obtain scores of cross-validation
scores = cross_val_score(model, X, y, cv=5)

# Display accuracy
print('Accuracy:', np.round(scores, 2))

# Display mean accuracy
print('Accuracy mean: %0.2f' % (scores.mean()))

TypeError: __init__() got an unexpected keyword argument 'min_impurity_split'

In [28]:
best_clf = DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=7,
                       max_features=0.78, max_leaf_nodes=45,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=0.045, min_samples_split=9,
                       min_weight_fraction_leaf=0.06, presort=False,
                       random_state=2, splitter='best')
best_clf.fit(X, y)

TypeError: __init__() got an unexpected keyword argument 'min_impurity_split'

In [29]:
best_clf.feature_importances_

NameError: name 'best_clf' is not defined

In [30]:
# Zip columns and feature_importances_ into dict
feature_dict = dict(zip(X.columns, best_clf.feature_importances_))

# Import operator
import operator

# Sort dict by values (as list of tuples)
sorted(feature_dict.items(), key=operator.itemgetter(1), reverse=True)[0:3]

NameError: name 'best_clf' is not defined