In [1]:
import pandas as pd
import numpy as np
df = pd.read_csv('diabetes.csv')
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [2]:
# Continuing from previous lab, let's apply Z-Scoring and outlier fixing to our features
y = df['Outcome']
x = df.iloc[:, :-1]
stats = x.describe().T
mu = stats['mean']
sigma = stats['std']
x = (x-mu)/sigma
for column in x.columns:
    column_max = x[column][x[column] <= 3].max()
    column_min = x[column][x[column] >= -3].min()
    x.loc[x[column] > 3, column] = column_max
    x.loc[x[column] < -3, column] = column_min
x.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,0.63953,0.847771,0.149543,0.906679,-0.692439,0.20388,0.468187,1.425067
1,-0.844335,-1.122665,-0.160441,0.530556,-0.692439,-0.683976,-0.364823,-0.190548
2,1.233077,1.942458,-0.263769,-1.287373,-0.692439,-1.102537,0.604004,-0.105515
3,-0.844335,-0.997558,-0.160441,0.154433,0.123221,-0.493721,-0.920163,-1.040871
4,-1.141108,0.503727,-1.503707,0.906679,0.765337,1.408828,2.985325,-0.020483


In [3]:
from sklearn.model_selection import train_test_split
xtrain,xtest,ytrain,ytest = train_test_split(x,y, test_size=0.2)

In [4]:
xtrain.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
565,-0.547562,-0.809897,-0.78041,-0.409752,0.071158,-0.747395,0.833383,-0.955839
273,-0.844335,-1.560539,0.459528,1.846987,-0.301963,0.153145,-0.150534,-1.040871
300,-1.141108,1.44203,-2.330333,-1.287373,-0.692439,0.038992,1.108035,-0.27558
746,-0.844335,0.816495,1.286153,1.282802,-0.692439,2.195214,-0.343696,-0.530677
215,2.420169,0.941602,0.046215,1.220115,1.659093,1.24394,0.815274,0.404679


In [5]:
xtest.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
747,-0.844335,-1.247772,0.252871,1.282802,-0.197837,1.814705,1.8837,-0.105515
146,1.52985,-1.998414,0.562856,1.032053,-0.692439,0.102411,-1.134452,0.659776
460,1.52985,-0.027978,0.149543,0.091745,-0.206514,-1.419628,0.788111,1.255002
50,-0.844335,-0.559683,0.562856,-0.597814,0.019094,-1.5972,0.057718,-0.955839
59,-1.141108,-0.497129,-0.263769,1.282802,0.539729,1.205889,-0.902054,-0.955839


In [6]:
# let's first try a Gradient Descent model
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score
model = SGDClassifier()
model.fit(xtrain, ytrain)
p = model.predict(xtest)
accuracy_score(p, ytest)

0.7727272727272727

In [8]:
# Hyperparameters are values that you as developers adjust to tweak the model
# learning rate is the factor with which a model reduces its error and attempts
# to optimize itself
# Maximum iterations is how many times model will attempt to retrain and fix error
hp_max_iter = 1000
hp_lr_type = ['constant', 'adaptive', 'optimal']
hp_lr = [1e-3, 0.1, 3e-3, 1]

m1 = SGDClassifier(max_iter=hp_max_iter, tol=hp_lr[0], learning_rate=hp_lr_type[0], eta0=0.01)
m1.fit(xtrain,ytrain)
p = m1.predict(xtest)
a = accuracy_score(p, ytest)
print('With constant learning rate and initial learning rate of 0.001, accuracy= ' + str(a))

m2 = SGDClassifier(max_iter=hp_max_iter, learning_rate=hp_lr_type[2], eta0=0.01)
m2.fit(xtrain,ytrain)
p = m2.predict(xtest)
a = accuracy_score(p, ytest)
print('With optimal learning rate and initial learning rate ignored, accuracy= ' + str(a))

m3 = SGDClassifier(max_iter=hp_max_iter, tol=hp_lr[2], learning_rate=hp_lr_type[1], eta0=0.01)
m3.fit(xtrain,ytrain)
p = m3.predict(xtest)
a = accuracy_score(p, ytest)
print('With adaptive learning rate and initial learning rate of 0.003, accuracy= ' + str(a))

m4 = SGDClassifier(max_iter=hp_max_iter, tol=hp_lr[0], learning_rate=hp_lr_type[1], eta0=0.01)
m4.fit(xtrain,ytrain)
p = m4.predict(xtest)
a = accuracy_score(p, ytest)
print('With adaptive learning rate and initial learning rate of 1, accuracy= ' + str(a))


With constant learning rate and initial learning rate of 0.001, accuracy= 0.7532467532467533
With optimal learning rate and initial learning rate ignored, accuracy= 0.7077922077922078
With adaptive learning rate and initial learning rate of 0.003, accuracy= 0.7597402597402597
With adaptive learning rate and initial learning rate of 1, accuracy= 0.7597402597402597


In [None]:
# 'constant': Keeps the learning rate constant at the value specified by eta0.
# This is a good option if you already know a good learning rate for your data and problem.

# 'optimal': Uses a formula to set the learning rate to 1.0 / (alpha * (t + t0))
# where t is the time step (iteration) and t0 is chosen based on a heuristic proposed by Leon Bottou.
# The alpha is the inverse of the regularization strength.
# Note that when using 'optimal', the eta0 parameter is ignored.
# This option can be a good starting point without the need to tune the
# learning rate, but it may not always be the best choice for all problems.

# 'adaptive': Keeps the learning rate constant as long as training loss keeps decreasing.
# Each time two consecutive epochs fail to decrease training loss by a certain tolerance,
# or fail to increase validation score by a certain tolerance if early_stopping is True,
# the learning rate is divided by 5. This strategy can be useful when you want to
# finely adjust the learning rate in later stages of training.
# For 'adaptive', the eta0 and tol parameters are relevant.

In [None]:
# Now let's try some other algorithm, maybe decision trees
from sklearn.tree import DecisionTreeClassifier
# how deep is a tree is a good hyperparameter.
# let's create various depths of trees
hp_depth = [ 10, 20, 30, 50, 90, 100, 150, 250, 350, 500, 600, 750 ]

for depth in hp_depth:
    model = DecisionTreeClassifier(max_depth= depth)
    model.fit(xtrain, ytrain)
    p = model.predict(xtest)
    a= accuracy_score(p, ytest)
    print('With depth = ' + str(depth) +' acc = ' + str(a))


In [None]:
# let's add an additional layer of how many features to consider in a tree

hp_max_features = [1,2,3,4,5,6,7,8]
hp_depth = [ 10, 20, 30, 50, 90, 100, 150, 250, 350, 500, 600, 750 ]

# Example of a Grid Search
# grid search is an exhaustive search on all possible combinations of given hyperparameters
# we have a grid search library, but first let's observe how does the grid look like

max_accuracy = -1
best_max_feature = 0
best_depth = 0

for max_feat in hp_max_features:
    for depth in hp_depth:
        model = DecisionTreeClassifier(max_depth= depth, max_features=max_feat)
        model.fit(xtrain, ytrain)
        p = model.predict(xtest)
        a= accuracy_score(p, ytest)
        print('With depth = ' + str(depth) +' and max features= ' + str(max_feat)+ ' acc = ' + str(a))
        if(a > max_accuracy):
            max_accuracy = a
            best_max_feature = max_feat
            best_depth = depth

print()
print('Best Accuracy = ' + str(max_accuracy))
print('At the depth of = ' + str(best_depth))
print('With features selected = ' + str(best_max_feature))

In [None]:
# Random forest generates a lot of decision trees, and then goes with the majority decision
# it has n_estimators hyperparameter that controls how many trees will be generated
from sklearn.ensemble import RandomForestClassifier
hp_trees = [10, 50, 100, 150, 200, 250, 300, 350]



best_acc = -1
best_tree_count = 0

for trees in hp_trees:
    model = RandomForestClassifier(n_estimators= trees)
    model.fit(xtrain, ytrain)
    p = model.predict(xtest)
    a = accuracy_score(p, ytest)
    if a > best_acc:
        best_acc = a
        best_tree_count = trees
    print('With tree count = ' + str(trees) + ' , acc = ' + str(a))

print()
print('Best model was with estimators = ' + str(best_tree_count))

# everytime you run, the results could be different as trees are generated randomly
# the best model is a 'STATE' of the model, not a particular set of hyperparameters

In [None]:
# Another way to search for Hyperparameters besides Grid Search is Random Search
# we may not get the 'best' hyperparameter set, but we will get the most 'workable' set

# Use grid search when hyperparameters are finite
# otherwise, random search can also provide decent answers

# this will take some time to execute while the search happens

from sklearn.model_selection import RandomizedSearchCV
import numpy as np
param_distributions = {
    'n_estimators': np.arange(10, 200, 10),  # Searching over a range from 10 to 190 inclusive, in steps of 10
    'max_depth': [None] + list(np.arange(5, 50, 5)),
    'min_samples_split': np.arange(2, 11),
    'min_samples_leaf': np.arange(1, 11),
    'bootstrap': [True, False],
    'criterion': ['gini', 'entropy']
}

rf = RandomForestClassifier()

# Initialize the RandomizedSearchCV object
random_search = RandomizedSearchCV(estimator=rf, param_distributions=param_distributions, n_iter=100)

# Fit it to the data
random_search.fit(xtrain, ytrain)

# Print the best parameters and the best score
print("Best Parameters:", random_search.best_params_)
print("Best Score:", random_search.best_score_)

In [None]:
#Gini Impurity
# Gini impurity measures the likelihood of incorrect classification of a
# new element assuming that the element
# is randomly labeled according to the distribution of labels in the set.
# A Gini score gives an idea of how often a randomly chosen element from the set would
# be incorrectly labeled if it was randomly labeled according to
# the distribution of labels in the subset.
# A Gini score of 0 means all elements belong to a single class, implying perfect purity.

#Entropy
#Entropy is a concept borrowed from information theory.
# It represents the amount of information disorder or uncertainty.
#In the context of decision trees, entropy helps determine how a split will
# organize the data by measuring the impurity in a group of examples.
#Entropy is 0 when all samples in a group belong to the same class.

#Comparison
#Both Gini impurity and entropy are used for the same purpose: to choose the
# splits in a decision tree.
#Gini impurity is slightly faster to compute, so it's a good default choice.
#Entropy might produce slightly more balanced trees,
# but it's computationally more intensive due to the logarithm function.