# Stratified Splitting

## Loading Data

In [None]:
%pylab inline
import pickle
# Load the split MSRP corpus: X is later indexed as the sample matrix and
# Y is used as the class labels for stratification.
# NOTE: pickle.load executes arbitrary code from the file; only load
# pickles produced by a trusted step of this project.
with open('data/cleaned-scaled-data.pkl', 'rb') as data_file:
    # The context manager closes the file handle once the data is loaded.
    X, Y = pickle.load(data_file)

## Stratified Split

A stratified split is used in order to achieve the same class distribution in the training and the test sets.

Code from "Python Data Science Cookbook", page 538, Subramanian2015.

In [None]:
from sklearn.model_selection import StratifiedShuffleSplit

def get_train_test(x, y, train_size=0.7, random_state=None):
    """
    Prepare a stratified train and test split.

    x            -- samples; must support indexing with an integer index array
    y            -- class labels aligned with x, indexed the same way
    train_size   -- fraction of the samples placed in the training set;
                    the remainder (1 - train_size) becomes the test set
    random_state -- forwarded to StratifiedShuffleSplit; pass an int to make
                    the split reproducible

    Returns the tuple (train_x, train_y, test_x, test_y).
    """
    test_size = 1 - train_size
    # Only a single split is ever returned, so request exactly one instead of
    # generating several and silently discarding all but the last.
    stratified_split = StratifiedShuffleSplit(n_splits=1,
                                              test_size=test_size,
                                              random_state=random_state)
    train_index, test_index = next(stratified_split.split(x, y))

    train_x, test_x = x[train_index], x[test_index]
    train_y, test_y = y[train_index], y[test_index]
    return train_x, train_y, test_x, test_y

# Split the loaded data; get_train_test returns its results in the order
# (train x, train y, test x, test y).
Xs_train, Ys_train, Xs_test, Ys_test = get_train_test(X,Y)

In [None]:
def get_class_distribution(y):
    """Given an array of class labels
    Return the class distribution

    y may be a numpy array or any sequence numpy can convert to one.
    The result is a dict mapping each distinct label to the fraction
    (between 0 and 1) of entries carrying that label. An empty input
    yields an empty dict.
    """
    # One pass over the data gives every distinct label with its count,
    # instead of re-scanning y once per label.
    labels, counts = np.unique(np.asarray(y), return_counts=True)
    # The total is the same for every class, so compute it once.
    total = int(counts.sum())
    dist_percentage = {class_label: int(count) / (1.0 * total)
                       for class_label, count in zip(labels, counts)}
    return dist_percentage

In [None]:
def _print_class_distribution(title, dataset):
    """
    Print a titled class distribution report for one dataset,
    reading the class labels from its last column.
    """
    distribution = get_class_distribution(dataset[:, -1])
    print("\n%s" % title)
    print("=========================================\n")
    for k, v in distribution.items():
        # v is the fraction (0..1) returned by get_class_distribution.
        print("Class label =%d, percentage records =%.2f" % (k, v))


def print_class_label_split(train, test):
    """
    Print the class distribution
    in test and train dataset

    train, test -- 2-D arrays whose last column holds the class label
                   (the labels are read with [:, -1]).
    """
    _print_class_distribution("Train data set class label distribution", train)
    _print_class_distribution("Test data set class label distribution", test)

In [None]:
# `train` and `test` are not defined at this point in the notebook (later
# cells bind them to 1-D fold index arrays, which cannot be sliced with
# [:, -1]). print_class_label_split reads the labels from the last column of
# a 2-D array, so pass the stratified label vectors as single-column arrays.
# Assumes Ys_train / Ys_test are numpy arrays -- TODO confirm.
print_class_label_split(Ys_train.reshape(-1, 1), Ys_test.reshape(-1, 1))

In [None]:
# Inspect the shapes of the four outputs of the stratified split.
Xs_test.shape,Xs_train.shape, Ys_test.shape, Ys_train.shape

In [None]:
# Persist the stratified split. Note the stored order:
# [train x, test x, train y, test y].
data = [Xs_train, Xs_test, Ys_train, Ys_test]
with open('data/strat-split-data.pkl', 'wb') as out_file:
    # The context manager flushes and closes the file even if dump fails.
    pickle.dump(data, out_file)

## Stratified KFold

In [None]:
from sklearn.model_selection import KFold,StratifiedKFold

In [None]:
def class_distribution(y):
    """
    Print the fraction of entries in y that belong to each class.

    y -- iterable of hashable class labels. One line is printed per distinct
         label, in order of first appearance; nothing is printed for an
         empty input.
    """
    from collections import Counter

    class_dist = Counter(y)
    total = sum(class_dist.values())

    for k, v in class_dist.items():
        print ("\tclass %d percentage =%0.2f"%(k,v/(1.0*total)))

In [None]:
#3 folds
kfolds = KFold(n_splits=3)
# enumerate numbers the folds from 1; the previous `fold_count==1` was a
# comparison, not an increment, so every fold was reported as "Fold 1".
for fold_count, (train, test) in enumerate(kfolds.split(X, Y), start=1):
    # train / test are index arrays into X and Y for this fold.
    print("Fold %d x train shape"%(fold_count),X[train].shape,
          " x test shape",X[test].shape)

#Stratified KFold
skfolds = StratifiedKFold(n_splits=3)
# Training labels of every fold, keyed by fold number (starting at 1).
Yk_train = {}
for fold_count, (train, test) in enumerate(skfolds.split(X, Y), start=1):
    print("\nFold %d x train shape"%(fold_count),X[train].shape,
          " x test shape",X[test].shape)
    Yk_train[fold_count] = Y[train]
    # NOTE: these three are overwritten on every iteration, so after the loop
    # they hold the data of the last fold only.
    Xk_train = X[train]
    Yk_test = Y[test]
    Xk_test = X[test]
    print ("Train Class Distribution")
    # Report the labels of the current fold; passing the whole Yk_train dict
    # would iterate over its keys (the fold numbers) instead of class labels.
    class_distribution(Yk_train[fold_count])
    print ("Test Class Distribution")
    class_distribution(Yk_test)

In [None]:
# Inspect the k-fold results: the type of the test samples, then the shape
# of the training labels stored for fold 1.
type(Xk_test)
Yk_train[1].shape

In [None]:
# Persist the stratified k-fold data in the order
# [train x, test x, train y, test y].
# NOTE: Yk_train is a dict of per-fold label arrays, while the other three
# entries hold the arrays of the last fold only.
data = [Xk_train, Xk_test, Yk_train, Yk_test]
with open('data/strat-ksplit-data.pkl', 'wb') as out_file:
    # The context manager flushes and closes the file even if dump fails.
    pickle.dump(data, out_file)