In [4]:
import pandas as pd
import numpy as np
df = pd.read_csv('https://raw.githubusercontent.com/dsrscientist/dataset1/master/winequality-red.csv')
X = df.drop(["quality"], axis=1)
y = df['quality']

In [161]:
class CrossValidation:
    def __init__ (self, model, X, y, scoring, cv = 5, seed = 42):
        import numpy as np
        import pandas as pd
        """Initializing the variables"""
        self.model = model
        self.X = X
        self.y = y
        self.scoring = scoring
        self.cv = cv
        np.random.seed = seed
        
    def pairwise(self,iterable):
        import itertools
        "[s0,s1,s2,s3...] -> (s0,s1), (s1,s2), (s2, s3), ..."
        a, b = itertools.tee(iterable)
        next(b, None)
        return zip(a, b)
    
    def SplitData(self):
        """This method will retrun the split of data based on values(target variable)"""
        self.df = X.copy()
        self.df["Y"] = y
        #print(self.df.dtypes)
        splitDataDict = {}
        y_values_index = y.value_counts().index
        y_values = y.value_counts().values
        for i,j in zip(y_values_index, y_values):
            tempData = self.df[self.df["Y"] == i]
            splitDataDict[i] = tempData.sample(j).reset_index(drop=True)
            
        return splitDataDict
    
    def stratifySplit(self):
        """Stratigically split the data with respect to the cross validation"""
        splitDataDict = self.SplitData() 
        StratifyDict = dict()
        for splitNo in range(self.cv):
            tempDataFrame = pd.DataFrame(columns=self.df.columns)
            StratifyDict[splitNo] = tempDataFrame
            # print(tempDataFrame)
        for col,val in splitDataDict.items():
            tempRange = range(0,val.shape[0], [1 if val.shape[0]//self.cv == 0 else val.shape[0]//self.cv][0])
            #print(list(tempRange))
            for split, key in zip(self.pairwise(tempRange),StratifyDict.keys()):
                splitDataStratified = val.loc[range(split[0],split[1])]
                # print(StratifyDict[key].shape, splitDataStratified.shape)
                StratifyDict[key] = StratifyDict[key].append(splitDataStratified, ignore_index=True)

        return StratifyDict
    
    def retrunPerformanceParams(self):
        """can add different parameters"""
        from sklearn.metrics import accuracy_score, r2_score,f1_score
        params = None
        if self.scoring == "accuracy":
            params = accuracy_score
            return params
        elif self.scoring == 'r2':
            params = r2_score
            return params
        elif self.scoring == "f1":
            params = f1_score
            return params
        
    
    
    def Validate(self):
        import copy
        dataDict = self.stratifySplit()
        PerformanceMetric = self.retrunPerformanceParams()
        finalScore = []
        finalScoreCount = []
        CVList = list(dataDict.keys())
        for en, testSet in enumerate(CVList):
            tempList = copy.copy(CVList)
            tempList.pop(testSet)
            testData = dataDict[testSet].reset_index(drop = True)
            trainData = dataDict[tempList[0]].reset_index(drop=True)
            for tempTrain in range(1, len(tempList)+1):
                trainData = trainData.append(dataDict[tempTrain], ignore_index=True)
            X_train, y_train, X_test, y_test = trainData.drop(columns=["Y"]),trainData["Y"],testData.drop(columns=["Y"]),testData["Y"]
            y_train, y_test = pd.to_numeric(y_train),pd.to_numeric(y_test)
            self.model.fit(X_train, y_train)
            pred = self.model.predict(X_test)
            score = PerformanceMetric(y_test, pred)
            finalScore.append(score)
            finalScoreCount.append(f"Iteration - {en+1}")
            # print(X_train.shape, y_train.shape, X_test.shape, y_test.shape, score, y_test.value_counts())
        finalScore.append(sum(finalScore)/len(finalScore))
        finalScoreCount.append("CV")
        return pd.DataFrame({"Scores":finalScore}, index = finalScoreCount)
        
        
    def __str__(self):
        return "This is a test algorithm"

In [162]:
from sklearn.tree import DecisionTreeClassifier

c = CrossValidation(DecisionTreeClassifier(), X,y, "accuracy")
c.Validate()

Unnamed: 0,Scores
Iteration - 1,0.213075
Iteration - 2,1.0
Iteration - 3,1.0
Iteration - 4,1.0
Iteration - 5,1.0
CV,0.842615


In [163]:
df_1 = pd.read_csv("https://raw.githubusercontent.com/dsrscientist/dataset1/master/abalone.csv")
df_1 = df_1.drop(["Sex"], axis=1)

In [164]:
df_1.isna().sum()

Length            0
Diameter          0
Height            0
Whole weight      0
Shucked weight    0
Viscera weight    0
Shell weight      0
Rings             0
dtype: int64

In [165]:
df_1.dtypes

Length            float64
Diameter          float64
Height            float64
Whole weight      float64
Shucked weight    float64
Viscera weight    float64
Shell weight      float64
Rings               int64
dtype: object

In [166]:
X  = df_1.drop(["Rings"], axis=1)
y = df_1["Rings"]

In [167]:
from sklearn.neighbors import KNeighborsClassifier

In [168]:
newClass = CrossValidation(KNeighborsClassifier(),X,y,"accuracy")
newClass.Validate()

Unnamed: 0,Scores
Iteration - 1,0.211864
Iteration - 2,0.444175
Iteration - 3,0.459951
Iteration - 4,0.458738
Iteration - 5,0.451128
CV,0.405171


In [170]:
1 if 1 ==1 else 0 # got an error in final testing so used this logic in line 43

1