In [2]:
import numpy as np
import pandas as pd

from sklearn import datasets
from sklearn import model_selection

In [5]:
def create_folds(data, n_splits=5, target_col='target', random_state=None):
    """Assign every row of a regression dataset to a stratified k-fold.

    A continuous target cannot be stratified directly, so it is first cut
    into equal-width bins (bin count chosen with Sturges' rule) and the bin
    labels are used as the classes for ``StratifiedKFold``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the features and the ``target_col`` column. The frame
        passed in is left untouched; all work happens on a shuffled copy.
    n_splits : int, default 5
        Number of folds to create.
    target_col : str, default 'target'
        Name of the continuous column that drives the stratification.
    random_state : int or None, default None
        Seed for the row shuffle. Pass an integer to get the same fold
        assignment on every run.

    Returns
    -------
    pandas.DataFrame
        A shuffled copy of ``data`` with a fresh 0..n-1 index and an extra
        integer column ``kfold`` holding the validation-fold id of each row.

    Raises
    ------
    ValueError
        If ``data`` has fewer rows than ``n_splits``.
    """
    if len(data) < n_splits:
        raise ValueError(
            f"Cannot create {n_splits} folds from {len(data)} row(s)."
        )

    # sample() returns a new frame, so adding columns below never leaks back
    # into the caller's DataFrame. Resetting the index makes labels equal to
    # positions, which the .loc assignment in the loop relies on.
    data = data.sample(frac=1, random_state=random_state).reset_index(drop=True)

    # Placeholder fold id; every row is overwritten by the loop below.
    data['kfold'] = -1

    # Sturges' rule: number of bins = floor(1 + log2(n)).
    num_bins = int(np.floor(1 + np.log2(len(data))))

    # Bin labels are kept in a local variable instead of a temporary column,
    # so an existing 'bins' column in the input is neither overwritten nor
    # dropped.
    bins = pd.cut(data[target_col], bins=num_bins, labels=False)

    kf = model_selection.StratifiedKFold(n_splits=n_splits)

    # Only the validation indices matter: each row belongs to exactly one
    # validation fold, and that fold number becomes its 'kfold' value.
    for fold, (_, val_idx) in enumerate(kf.split(X=data, y=bins.values)):
        data.loc[val_idx, 'kfold'] = fold

    return data

In [9]:
# Build a synthetic regression problem. The fixed random_state makes the
# generated features and targets identical on every Restart & Run All.
X, y = datasets.make_regression(
    n_samples=15000, n_features=50, n_targets=1, random_state=42
)

# Wrap the feature matrix in a DataFrame with columns f_0 ... f_{n-1},
# then attach the regression target as the 'target' column.
dataframe = pd.DataFrame(X, columns=[f"f_{i}" for i in range(X.shape[1])])
dataframe['target'] = y

In [10]:
# Assign each row of the synthetic dataset to one of the stratified folds
data = create_folds(data=dataframe)

# Preview the first ten rows, including the new kfold column
data.head(n=10)

Unnamed: 0,f_0,f_1,f_2,f_3,f_4,f_5,f_6,f_7,f_8,f_9,...,f_42,f_43,f_44,f_45,f_46,f_47,f_48,f_49,target,kfold
0,2.492325,1.070852,0.47736,-0.620012,0.729972,-0.053869,1.397471,0.709393,-0.065964,1.340829,...,2.193122,-0.554792,-0.371128,1.275036,0.006246,0.348745,0.568431,-0.201032,-81.775897,0
1,-0.749186,2.561816,-2.488821,0.420585,-1.343083,-0.762133,0.240154,-0.075503,-0.521662,0.29635,...,-0.791403,0.233124,1.661573,-0.750208,-0.895977,-1.462385,0.678136,0.491582,-104.970644,0
2,1.907534,1.926741,-0.038297,-2.302338,0.553651,1.991752,-0.760846,-1.472087,-0.647947,-2.248676,...,1.062029,0.896338,-0.047607,0.011817,0.574407,0.114865,2.424726,0.62935,-43.604316,0
3,-1.081938,-0.516089,1.017909,-1.517974,0.006111,0.108015,-0.653014,1.726017,0.387743,0.20424,...,0.636436,-0.343777,-1.449525,-1.119866,-0.515269,-1.446118,0.857717,-1.658295,-377.259493,0
4,1.038984,-0.662497,0.542178,-0.776799,0.92751,-0.003449,1.030087,0.126624,-0.384758,0.783741,...,-1.09773,0.820468,0.751257,1.245949,0.126868,1.104906,-0.161697,-0.680615,4.874714,0
5,0.460903,-1.777217,0.028865,-0.344065,-0.616947,2.019348,-0.330181,1.093486,-1.217125,0.828434,...,0.483543,0.104807,-0.275312,0.249059,2.322166,0.737168,1.551121,0.668277,-231.77261,0
6,1.576397,0.133422,0.190395,2.853154,0.426041,0.369636,-0.287523,1.2859,-0.821442,0.315292,...,-0.484574,-0.323461,1.365866,0.156961,0.122007,0.349755,0.54488,-0.006968,259.906492,0
7,0.078603,0.395332,-1.499116,1.461926,-0.41333,-0.29263,-1.04873,-0.509611,0.215941,0.054956,...,0.191444,0.540572,-0.629865,-1.811889,-1.80579,0.76667,-0.646485,1.670798,370.762923,0
8,0.292958,-1.043416,1.13603,1.62793,2.454502,0.591528,0.592256,-1.002885,-1.278983,1.686943,...,-0.684864,1.714899,1.633021,0.67171,0.076106,-0.482084,0.984215,0.59392,177.63191,0
9,0.451194,-1.105537,0.377384,-1.127186,0.873496,-2.517326,0.448561,0.408196,-1.11205,0.361039,...,1.714515,-1.816804,-0.438696,0.639707,-1.547553,0.075843,-0.611069,-0.362763,-345.812621,0
