In [40]:
import numpy as np
import pandas as pd

In [41]:
# Load the iris CSV straight from a GitHub gist (network access required).
# NOTE(review): the URL embeds a revision hash, so it presumably points at a fixed snapshot - confirm.
data = pd.read_csv('https://gist.githubusercontent.com/curran/a08a1080b88344b0c8a7/raw/639388c2cbc2120a14dcf466e85730eb8be498bb/iris.csv')

In [42]:
# Every column except the last becomes the feature matrix; the last column is the target.
# .values turns both into NumPy arrays so they can be indexed with the split index arrays later on.
features = data.iloc[:,:-1].values
label = data.iloc[:,-1].values

# Cross-Validation technique

### Goal:
### 1. To get the minimum score threshold
### 2. To understand what max score I can achieve from the dataset
### 3. To extract the best training and testing sample that can give the best score

In [43]:
#1. Decide which algorithm you want to evaluate

from sklearn.linear_model import LogisticRegression
# Estimator with default hyperparameters; it is the one handed to cross_val_score below.
modelAlgo = LogisticRegression()

In [44]:
# Baseline: score the estimator with 10-fold cross-validation.
import warnings
# Blanket filter: this silences every warning for the rest of the kernel session,
# not only the ones raised in this cell.
warnings.filterwarnings('ignore')
from sklearn.model_selection import cross_val_score

# One score per fold comes back as an array.
scores = cross_val_score(modelAlgo, features, label, cv=10)

scores

array([1.        , 1.        , 1.        , 0.93333333, 0.93333333,
       0.93333333, 0.8       , 0.93333333, 1.        , 1.        ])

In [45]:
#1. To get the minimum score threshold
# The threshold reported here is the mean of the per-fold cross-validation scores.
print("Minimum Score Threshold is : ",scores.mean())

Minimum Score Threshold is :  0.9533333333333334


In [46]:
#2. To understand what max score I can achieve from the dataset
# Reported as the best single-fold score from the cross-validation run.
print("Max Score Achievable is: ",scores.max())

Max Score Achievable is:  1.0


In [47]:
### 3. To extract the best training and testing sample that can give the best score

In [48]:
### There exists multiple indirect ways to extract the sample.
### One of them is K-Fold Cross Validation Technique

In [49]:
#Method1: K-Fold Cross Validation Technique

#1. Initialize the algorithm

from sklearn.linear_model import LogisticRegression
modelAlgo = LogisticRegression()

#2. Initialize K-Fold Function

from sklearn.model_selection import KFold
# n_splits mirrors the cv value given to cross_val_score.
# NOTE(review): per the scikit-learn docs, cross_val_score with an integer cv and a
# classifier uses stratified folds, so these shuffled KFold splits are not the exact
# folds scored earlier - confirm that is acceptable.
kfold = KFold(n_splits=10,
              shuffle=True,
              random_state=1)  # fixed seed so the very same splits can be regenerated later

#3. Walk through the splits and report every one whose test score is perfect.
# NOTE(review): choosing a split because its test score is perfect makes that test
# score an optimistic estimate of real-world performance.

counter = 0  # stays defined even if the splitter yields nothing

# counter is the 1-based split number used to identify the split afterwards.
for counter, (train, test) in enumerate(kfold.split(features), start=1):
    # Extract training set and testing set for this split
    X_train, X_test = features[train], features[test]
    y_train, y_test = label[train], label[test]

    # Fit data with model algo
    modelAlgo.fit(X_train, y_train)

    # Score the held-out part once and reuse the value for both the check and the report.
    test_score = modelAlgo.score(X_test, y_test)
    if test_score >= 1.0:
        train_score = modelAlgo.score(X_train, y_train)
        print("Test Score: {}, Train Score: {}, for Sample Split {}".format(test_score, train_score, counter))
        

Test Score: 1.0, Train Score: 0.9555555555555556, for Sample Split 1
Test Score: 1.0, Train Score: 0.9481481481481482, for Sample Split 3
Test Score: 1.0, Train Score: 0.9629629629629629, for Sample Split 5
Test Score: 1.0, Train Score: 0.9629629629629629, for Sample Split 7
Test Score: 1.0, Train Score: 0.9555555555555556, for Sample Split 9


In [50]:
#Extract the final Train Test Split --- Best Sample

from sklearn.model_selection import KFold
# Same n_splits / shuffle / random_state as the search cell, so the identical
# sequence of splits is regenerated here.
kfold = KFold(n_splits=10,
              shuffle=True,
              random_state=1)

counter = 0  # stays defined even if the splitter yields nothing

# counter is the 1-based split number, matching the numbering printed by the search cell.
for counter, (train, test) in enumerate(kfold.split(features), start=1):
    if counter == 5:
        X_train, X_test, y_train, y_test = features[train], features[test], label[train], label[test]
        # Stop here so train/test/counter keep describing the split that was selected.
        break
else:
    # Without this guard a missing split would silently leave X_train/X_test
    # holding whatever an earlier cell assigned.
    raise ValueError("Sample split 5 was not produced by the splitter")

In [51]:
#Deploy My Model

# Train a fresh estimator on the training portion of the selected split.
finalModel = LogisticRegression()
# fit() is the cell's last expression, so the fitted estimator's repr is what gets displayed.
finalModel.fit(X_train,y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [52]:
# Score on the same rows the model was fitted on.
finalModel.score(X_train,y_train)

0.9629629629629629

In [23]:
# Score on the held-out rows of the selected split.
finalModel.score(X_test,y_test)

1.0

In [24]:
# Number of rows in the training portion of the selected split.
len(X_train)

135

In [25]:
# Number of rows in the held-out portion of the selected split.
len(X_test)


15

In [26]:
# Re-derive the feature matrix and target with the same expressions as the first extraction cell.
# NOTE(review): presumably repeated so the cells below can be run on their own - confirm.
features = data.iloc[:,:-1].values
label = data.iloc[:,-1].values
# Trailing expression: displays the whole feature array.
features

array([[5.1, 3.5, 1.4, 0.2],
       [4.9, 3. , 1.4, 0.2],
       [4.7, 3.2, 1.3, 0.2],
       [4.6, 3.1, 1.5, 0.2],
       [5. , 3.6, 1.4, 0.2],
       [5.4, 3.9, 1.7, 0.4],
       [4.6, 3.4, 1.4, 0.3],
       [5. , 3.4, 1.5, 0.2],
       [4.4, 2.9, 1.4, 0.2],
       [4.9, 3.1, 1.5, 0.1],
       [5.4, 3.7, 1.5, 0.2],
       [4.8, 3.4, 1.6, 0.2],
       [4.8, 3. , 1.4, 0.1],
       [4.3, 3. , 1.1, 0.1],
       [5.8, 4. , 1.2, 0.2],
       [5.7, 4.4, 1.5, 0.4],
       [5.4, 3.9, 1.3, 0.4],
       [5.1, 3.5, 1.4, 0.3],
       [5.7, 3.8, 1.7, 0.3],
       [5.1, 3.8, 1.5, 0.3],
       [5.4, 3.4, 1.7, 0.2],
       [5.1, 3.7, 1.5, 0.4],
       [4.6, 3.6, 1. , 0.2],
       [5.1, 3.3, 1.7, 0.5],
       [4.8, 3.4, 1.9, 0.2],
       [5. , 3. , 1.6, 0.2],
       [5. , 3.4, 1.6, 0.4],
       [5.2, 3.5, 1.5, 0.2],
       [5.2, 3.4, 1.4, 0.2],
       [4.7, 3.2, 1.6, 0.2],
       [4.8, 3.1, 1.6, 0.2],
       [5.4, 3.4, 1.5, 0.4],
       [5.2, 4.1, 1.5, 0.1],
       [5.5, 4.2, 1.4, 0.2],
       [4.9, 3

In [27]:
#control over test size
from sklearn.model_selection import StratifiedShuffleSplit
sss = StratifiedShuffleSplit(n_splits=10, test_size=0.25, random_state=1)

# Peek at the first split only: a (train_indices, test_indices) pair of index arrays.
i = next(sss.split(features, label))
print(i)


(array([137,  21, 133, 135,  74, 101,   2,  57, 105, 140,  45,  96, 120,
        85,  26,  79,  41,  30,  60, 128,  71,  36, 126,  32,  38, 104,
        69,  27, 122,  52,  25, 109,  23,  80,  70,  78, 146, 125,  77,
       147, 119, 116,  86,  58, 124, 132,  44, 117,  14,  97,  29,  39,
        59,  28, 141,  34, 130,  42,  46,  55, 139,  18,   3,  88,  35,
        31,  51, 138,  87,  89,   4,  22,  61, 103,  63,  83,  82, 118,
        20,  53, 110,  72,  90,  19,  17,  40,  94, 129, 144,  33,  67,
       123, 100,  24,  47, 114,  13, 115,  92,  56,  98, 131,  10, 143,
        81, 136,  48, 134,  43,  95,  37,  64], dtype=int64), array([113,   7,   0,  11,  93,  49,  99, 106,  12,  91, 107, 127, 121,
        54, 112,  76, 145,  73,  68,  66,  50, 108, 142,  84,   8,   9,
        15,  62, 148,   1,   5, 149,  75,   6,  16,  65, 111, 102],
      dtype=int64))


In [28]:
#If you want to take charge of the splits, you will need to use Method2.

In [29]:
#Method 2: StratifiedShuffleSplit

#1. Initialize the algorithm

from sklearn.linear_model import LogisticRegression
modelAlgo = LogisticRegression()

#2. Initialize StratifiedShuffleSplit Function

from sklearn.model_selection import StratifiedShuffleSplit
# test_size is set explicitly here (a quarter of the rows is held out in each split);
# the fixed random_state lets the same splits be regenerated later.
sss = StratifiedShuffleSplit(n_splits=10,
                             test_size=0.25,
                             random_state=1)

#3. Walk through the splits and report every one whose test score is perfect.
# NOTE(review): choosing a split because its test score is perfect makes that test
# score an optimistic estimate of real-world performance.

counter = 0  # stays defined even if the splitter yields nothing

# counter is the 1-based split number used to identify the split afterwards.
for counter, (train, test) in enumerate(sss.split(features, label), start=1):
    # Extract training set and testing set for this split
    X_train, X_test = features[train], features[test]
    y_train, y_test = label[train], label[test]

    # Fit data with model algo
    modelAlgo.fit(X_train, y_train)

    # Score the held-out part once and reuse the value for both the check and the report.
    test_score = modelAlgo.score(X_test, y_test)
    if test_score >= 1.0:
        train_score = modelAlgo.score(X_train, y_train)
        print("Test Score: {}, Train Score: {}, for Sample Split {}".format(test_score, train_score, counter))
        

Test Score: 1.0, Train Score: 0.9464285714285714, for Sample Split 2
Test Score: 1.0, Train Score: 0.9375, for Sample Split 9


In [30]:
#Extract the final Train Test Split --- Best Sample


from sklearn.model_selection import StratifiedShuffleSplit
# Same n_splits / test_size / random_state as the search cell, so the identical
# sequence of splits is regenerated here.
sss = StratifiedShuffleSplit(n_splits=10,
                             test_size=0.25,
                             random_state=1)

counter = 0  # stays defined even if the splitter yields nothing

# counter is the 1-based split number, matching the numbering printed by the search cell.
for counter, (train, test) in enumerate(sss.split(features, label), start=1):
    if counter == 2:
        X_train, X_test, y_train, y_test = features[train], features[test], label[train], label[test]
        # Stop here so train/test/counter keep describing the split that was selected.
        break
else:
    # Without this guard a missing split would silently leave X_train/X_test
    # holding whatever an earlier cell assigned.
    raise ValueError("Sample split 2 was not produced by the splitter")

In [31]:
# Number of training rows in the split extracted by the StratifiedShuffleSplit cell above.
len(X_train)

112