In [4]:
import pandas as pd
from sklearn import model_selection

In [55]:
class CrossValidation:
    """Tag every row of a dataframe with a cross-validation fold id.

    The fold id is written to a ``kfold`` column (initialised to ``-1``).
    Supported ``problem_type`` values:

    * ``"binary_classification"`` / ``"multi_class_classification"`` --
      stratified k-fold on the single target column.
    * ``"single_col_reg"`` / ``"multiple_col_reg"`` -- plain k-fold.
    * ``"holdout_<pct>"`` -- the last ``<pct>`` percent of the rows get fold
      ``1``, all preceding rows get fold ``0``.
    * ``"multilabel_classification"`` -- stratified k-fold on the number of
      labels per row, where labels are separated by ``multilabel_delimiter``.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to split. When ``shuffle`` is falsy the frame is used as-is, so
        the ``kfold`` column is added to the caller's object; when ``shuffle``
        is truthy a shuffled, re-indexed copy is used instead.
    target_cols : list of str
        Names of the target column(s).
    shuffle : bool
        Whether to shuffle the rows once before the folds are assigned.
    multilabel_delimiter : str, default ","
        Separator between labels; only read for multilabel problems.
    problem_type : str
        One of the values listed above. It is effectively required: leaving it
        unset makes ``split`` raise "Problem type not understood".
    num_folds : int, default 5
        Number of folds for the k-fold based problem types.
    random_state : int, default 42
        Seed used for the optional shuffle.
    """

    def __init__(self,
                 df,
                 target_cols,
                 shuffle,
                 multilabel_delimiter=",",
                 problem_type=None,
                 num_folds=5,
                 random_state=42
                 ):
        self.dataframe = df
        self.target_cols = target_cols
        self.num_targets = len(target_cols)
        self.num_folds = num_folds
        self.shuffle = shuffle
        self.multilabel_delimiter = multilabel_delimiter
        self.random_state = random_state
        self.problem_type = problem_type

        if self.shuffle:
            # Seed the shuffle so that repeated runs produce the same folds.
            self.dataframe = self.dataframe.sample(
                frac=1, random_state=self.random_state
            ).reset_index(drop=True)
        self.dataframe["kfold"] = -1

    def _single_target(self):
        """Return the only target column, or raise if there is not exactly one."""
        if self.num_targets != 1:
            raise Exception("Invalid number of targets for this problem type")
        return self.target_cols[0]

    def _assign_folds(self, splitter, y=None):
        """Write the fold number of each validation split into ``kfold``."""
        kfold_pos = self.dataframe.columns.get_loc("kfold")
        for fold, (_, val_idx) in enumerate(splitter.split(X=self.dataframe, y=y)):
            # The splitter yields positional indices, so assign by position;
            # label-based assignment is only correct for a 0..n-1 index.
            self.dataframe.iloc[val_idx, kfold_pos] = fold

    def split(self):
        """Fill the ``kfold`` column according to ``problem_type``.

        Returns
        -------
        pandas.DataFrame
            The stored dataframe with ``kfold`` populated.

        Raises
        ------
        Exception
            If the number of targets does not fit the problem type, the
            classification target has a single unique value, the holdout
            percentage is outside 0-100, or the problem type is unknown.
        ValueError
            If the ``holdout_<pct>`` suffix is not an integer.
        """
        problem = self.problem_type

        if problem in ("binary_classification", "multi_class_classification"):
            target = self._single_target()
            if self.dataframe[target].nunique() == 1:
                raise Exception("Only one unique value found!")
            # Rows were already shuffled in __init__ when requested.
            kf = model_selection.StratifiedKFold(n_splits=self.num_folds,
                                                 shuffle=False)
            self._assign_folds(kf, y=self.dataframe[target].values)

        elif problem in ("single_col_reg", "multiple_col_reg"):
            if problem == "single_col_reg":
                self._single_target()
            elif self.num_targets < 1:
                raise Exception("Invalid number of targets for this problem type")
            # Plain KFold ignores the targets, so any number of target
            # columns is acceptable for the multi-column case.
            kf = model_selection.KFold(n_splits=self.num_folds, shuffle=False)
            self._assign_folds(kf)

        elif isinstance(problem, str) and problem.startswith("holdout_"):
            holdout_percentage = int(problem.split("_")[1])
            if not 0 <= holdout_percentage <= 100:
                raise Exception("Invalid holdout percentage")
            num_rows = len(self.dataframe)
            num_holdout_samples = int(num_rows * holdout_percentage / 100)
            cut = num_rows - num_holdout_samples
            kfold_pos = self.dataframe.columns.get_loc("kfold")
            self.dataframe.iloc[:cut, kfold_pos] = 0
            self.dataframe.iloc[cut:, kfold_pos] = 1

        elif problem == "multilabel_classification":
            target = self._single_target()
            # Stratify on how many labels each row carries.
            label_counts = self.dataframe[target].apply(
                lambda x: len(str(x).split(self.multilabel_delimiter))
            )
            kf = model_selection.StratifiedKFold(n_splits=self.num_folds)
            self._assign_folds(kf, y=label_counts)

        else:
            raise Exception("Problem type not understood")

        return self.dataframe

In [56]:
# Load the training data and tag each row with a regression k-fold split.
# NOTE(review): absolute, machine-specific path -- consider a configurable data directory.
df=pd.read_csv("F:/Git/House Prediction/train.csv")
cv=CrossValidation(df,
                   target_cols=["SalePrice"],
                   shuffle=False,
                   # Only the multilabel branch of split() reads the delimiter;
                   # it is passed explicitly so the constructor call is complete.
                   multilabel_delimiter=",",
                   problem_type="single_col_reg"
                  )
df_split=cv.split()
# Sanity check: number of rows assigned to each fold.
print(df_split.kfold.value_counts())

4    292
3    292
2    292
1    292
0    292
Name: kfold, dtype: int64


In [18]:
# Preview the first five rows of the frame returned by cv.split().
df_split.head(5)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice,kfold
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,,,,0,2,2008,WD,Normal,208500,-1
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,,,,0,5,2007,WD,Normal,181500,-1
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,,,,0,9,2008,WD,Normal,223500,-1
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,,,,0,2,2006,WD,Abnorml,140000,-1
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,,,,0,12,2008,WD,Normal,250000,-1


In [39]:
# Preview the first five rows of the frame that was loaded from the CSV.
df.head(5)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice,kfold
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,,,,0,2,2008,WD,Normal,208500,-1
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,,,,0,5,2007,WD,Normal,181500,-1
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,,,,0,9,2008,WD,Normal,223500,-1
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,,,,0,2,2006,WD,Abnorml,140000,-1
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,,,,0,12,2008,WD,Normal,250000,-1


In [41]:
# Stand-alone 5-fold splitter without shuffling, for the manual assignment below the class.
kf = model_selection.KFold(5, shuffle=False)

In [51]:
# Tag each row of df with the number of the split in which it is a validation row.
# NOTE(review): .loc is label-based, so this assumes df has the default 0..n-1 index.
fold_splits = kf.split(X=df, y=df.SalePrice)
for fold_number, (_, validation_rows) in enumerate(fold_splits):
    df.loc[validation_rows, "kfold"] = fold_number

In [52]:
# Number of rows assigned to each fold in df_split.
fold_counts = df_split["kfold"].value_counts()
print(fold_counts)

4    292
3    292
2    292
1    292
0    292
Name: kfold, dtype: int64
