In [None]:
import numpy as np 
import pandas as pd
from sklearn import model_selection

In [None]:
def create_folds(data, target, num_splits, random_state=None):
    """Return a shuffled copy of ``data`` with a stratified ``kfold`` column.

    The rows are shuffled, the ``target`` column is cut into equal-width bins
    (bin count from Sturges' rule, floored), and ``StratifiedKFold`` is run on
    the bin labels instead of the raw target values.

    Parameters
    ----------
    data : pandas.DataFrame
        Input frame. It is left untouched; all new columns are written to a
        shuffled copy.
    target : str
        Name of the column in ``data`` that is binned and stratified on.
    num_splits : int
        Number of folds, forwarded to ``StratifiedKFold`` as ``n_splits``.
    random_state : int or None, optional
        Seed forwarded to ``DataFrame.sample`` for the shuffle. The default
        ``None`` keeps the shuffle unseeded, as before.

    Returns
    -------
    pandas.DataFrame
        Shuffled copy of ``data`` with a fresh ``0..n-1`` index and an integer
        ``kfold`` column holding the fold in which each row is used for
        validation.
    """
    # Shuffle first: sample() returns a new frame, so every column written
    # below lands on the copy and never on the caller's DataFrame.
    shuffled = data.sample(frac=1, random_state=random_state).reset_index(drop=True)

    # -1 marks a row that has not been assigned to a fold yet.
    shuffled["kfold"] = -1

    # Sturges' rule for the number of bins (floored; rounding is an
    # equally valid choice).
    num_bins = int(np.floor(1 + np.log2(len(shuffled))))

    # Keep the bin labels in a local Series rather than a temporary "bins"
    # column, so an existing column with that name is not overwritten.
    bin_labels = pd.cut(shuffled[target], bins=num_bins, labels=False)

    kf = model_selection.StratifiedKFold(n_splits=num_splits)

    # Stratify on the bin labels, not on the raw target. The split yields
    # positional indices, which match the labels of the freshly reset index.
    for fold, (_, valid_idx) in enumerate(kf.split(X=shuffled, y=bin_labels.values)):
        shuffled.loc[valid_idx, "kfold"] = fold

    return shuffled

In [None]:
# Read train.csv from the input directory into a DataFrame.
train_df = pd.read_csv("../input/happy-whale-and-dolphin/train.csv")

In [None]:
# Preview only the first rows instead of rendering the whole table.
train_df.head()

In [None]:
## Merge misspelled / alternate species labels into one canonical name each.

# Exact-value mapping (alias -> canonical label). Only labels that match a key
# in full are rewritten; labels that merely contain a key are left alone.
species_aliases = {
    "kiler_whale": "killer_whale",
    "bottlenose_dolpin": "bottlenose_dolphin",
    "pilot_whale": "short_finned_pilot_whale",
    "globis": "short_finned_pilot_whale",
}

print("Total species before finding duplicates :", train_df["species"].nunique(dropna=False))
# Assign the whole column back in one step. The previous chained form
# train_df['species'][mask] = ... writes through an intermediate Series, which
# raises SettingWithCopyWarning and does nothing under pandas Copy-on-Write.
train_df["species"] = train_df["species"].replace(species_aliases)
print("Total species after :", train_df["species"].nunique(dropna=False))

In [None]:
## Convert species labels into integer codes.
## species maps label -> code; species_inv maps code -> label.

species = {label: code for code, label in enumerate(train_df.species.unique())}
# Build a real inverse dict. The previous {(a, b) for ...} form was a set
# comprehension that produced a set of (code, label) tuples, not a lookup table.
species_inv = {code: label for label, code in species.items()}

In [None]:
# individual maps individual_id -> integer code; individual_inv maps code -> individual_id.
individual = {ident: code for code, ident in enumerate(train_df.individual_id.unique())}
# Invert `individual` itself. The previous version iterated species.items()
# (copy-paste slip) and built a set of tuples instead of a dict.
individual_inv = {code: ident for ident, code in individual.items()}

In [None]:
# Swap every species label for its integer code from `species`
# (a label missing from the mapping raises KeyError).
train_df["species"] = list(map(species.__getitem__, train_df["species"]))

In [None]:
# Swap every individual_id for its integer code from `individual`
# (an id missing from the mapping raises KeyError).
train_df['individual_id'] = list(map(individual.__getitem__, train_df['individual_id']))

In [None]:
# Preview only the first rows of the encoded table instead of rendering all of it.
train_df.head()

In [None]:
# Shuffle the rows and tag each one with one of 10 folds.
# NOTE(review): create_folds cuts the target into equal-width bins before
# stratifying, so the integer species codes are grouped into ranges here;
# confirm that binning a categorical code is intended.
train_df = create_folds(data=train_df, target="species", num_splits=10)

In [None]:
# Per-column summary: smallest value, largest value, non-null count, distinct count.
train_df.agg(["min", "max", "count", "nunique"])

In [None]:
# Write the processed table to train.csv in the working directory;
# index=False leaves the row index out of the file.
train_df.to_csv("train.csv", index=False)

In [None]:
## Don't forget the preprocessing steps done above: they are needed again when deploying.