# Stratified Splitting

## Loading Data

In [2]:
%pylab inline
import pickle
# Load the split MSRP corpus. Each pickle holds a (features, labels) pair.
# NOTE: pickle.load can execute arbitrary code -- only load files you trust.
# Context managers make sure the file handles are closed after reading.
with open('data/cleaned-scaled-data.pkl', 'rb') as data_file:
    X, Y = pickle.load(data_file)
with open('data/clean-scaled-positive-data.pkl', 'rb') as data_file:
    Xs, Ys = pickle.load(data_file)

Populating the interactive namespace from numpy and matplotlib


## Stratified Split

A stratified split is used so that the training and test sets have the same class distribution.

Code adapted from "Python Data Science Cookbook" (Subramanian, 2015), page 538.

In [4]:
from sklearn.model_selection import StratifiedShuffleSplit

def get_train_test(x, y, train_size=0.7, random_state=None):
    """
    Prepare a single stratified train/test split.

    Parameters
    ----------
    x : array of samples; it is indexed with integer index arrays, so it
        must support NumPy-style fancy indexing.
    y : array of class labels aligned with ``x`` (same indexing requirement).
    train_size : float, fraction of the samples placed in the training set;
        the remainder forms the test set. Defaults to 0.7.
    random_state : forwarded to ``StratifiedShuffleSplit``. The default
        (None) gives a different split on every call; pass an int to make
        the split reproducible.

    Returns
    -------
    train_x, train_y, test_x, test_y -- in that order.
    """
    test_size = 1 - train_size
    # Only one split is ever returned, so only one is generated (the
    # previous version built five and silently discarded the first four).
    stratified_split = StratifiedShuffleSplit(
        n_splits=1, test_size=test_size, random_state=random_state)
    train_index, test_index = next(stratified_split.split(x, y))
    return x[train_index], y[train_index], x[test_index], y[test_index]

# Split Xs/Ys into stratified train and test parts.
# Note the return order of get_train_test: train_x, train_y, test_x, test_y.
Xs_train, Ys_train, Xs_test, Ys_test = get_train_test(Xs,Ys)

In [28]:
def get_class_distribution(y):
    """Given an array of class labels
    Return the class distribution

    Parameters
    ----------
    y : array-like of class labels. Plain Python sequences are accepted as
        well as NumPy arrays, because the input is converted with
        ``np.asarray`` first.

    Returns
    -------
    dict mapping each distinct label to the fraction of entries carrying
    that label (a float between 0 and 1). Keys appear in sorted label
    order. An empty input yields an empty dict.
    """
    labels = np.asarray(y)
    # A single pass gives the distinct labels (sorted) and their counts.
    classes, counts = np.unique(labels, return_counts=True)
    total = labels.size
    # With no labels the comprehension is empty, so no division by zero occurs.
    return {cls: float(cnt) / total for cls, cnt in zip(classes, counts)}

In [29]:
def print_class_label_split(train,test):
    """
    Report the share of records per class label,
    first for the training labels and then for the test labels.
    """
    sections = (
        ("\nTrain dat set class label distribution", train),
        ("\nTest data set class label distribution", test),
    )
    for heading, labels in sections:
        # Compute the distribution before printing anything for this section.
        shares = get_class_distribution(labels)
        print(heading)
        print("=========================================\n")
        for label in shares:
            print("Class label =%d, percentage records =%.2f"%(label, shares[label]))

In [30]:
# Compare the class label proportions of the training and test labels.
print_class_label_split(Ys_train,Ys_test)


Train dat set class label distribution

Class label =0, percentage records =0.33
Class label =1, percentage records =0.67

Test data set class label distribution

Class label =0, percentage records =0.33
Class label =1, percentage records =0.67


In [31]:
# Display the shapes of the four split arrays as a quick sanity check.
Xs_test.shape,Xs_train.shape, Ys_test.shape, Ys_train.shape

((1736, 43), (4050, 43), (1736,), (4050,))

In [51]:
# Persist the stratified split. The stored order is
# [X_train, X_test, Y_train, Y_test].
data = [Xs_train, Xs_test, Ys_train, Ys_test]
# The context manager guarantees the file is flushed and closed.
with open('data/strat-split-data.pkl', 'wb') as out_file:
    pickle.dump(data, out_file)

## Stratified KFold

In [33]:
from sklearn.model_selection import KFold,StratifiedKFold

In [34]:
def class_distribution(y):
    """
    Print, for every distinct label in ``y``, the fraction of entries
    that carry it. Labels are listed in order of first appearance.
    Nothing is returned.
    """
    counts = {}
    n_seen = 0
    for label in y:
        counts[label] = counts.get(label, 0) + 1
        n_seen += 1

    for label in counts:
        fraction = counts[label] / (1.0 * n_seen)
        print ("\tclass %d percentage =%0.2f"%(label, fraction))

In [38]:
# Plain 10-fold split, shown for comparison with the stratified version below.
kfolds = KFold(n_splits=10)
for fold_count, (train, test) in enumerate(kfolds.split(X, Y), start=1):
    print("Fold %d x train shape"%(fold_count),X[train].shape,
          " x test shape",X[test].shape)

# Stratified 10-fold split: report shapes and class distribution per fold.
skfolds = StratifiedKFold(n_splits=10)
for fold_count, (train, test) in enumerate(skfolds.split(X, Y), start=1):
    print("\nFold %d x train shape"%(fold_count),X[train].shape,
          " x test shape",X[test].shape)
    # NOTE: these names are overwritten on every iteration, so after the
    # loop they refer to the LAST fold only.
    Xk_train, Xk_test = X[train], X[test]
    Yk_train, Yk_test = Y[train], Y[test]
    print ("Train Class Distribution")
    class_distribution(Yk_train)
    print ("Test Class Distribution")
    class_distribution(Yk_test)

Fold 1 x train shape (5207, 43)  x test shape (579, 43)
Fold 2 x train shape (5207, 43)  x test shape (579, 43)
Fold 3 x train shape (5207, 43)  x test shape (579, 43)
Fold 4 x train shape (5207, 43)  x test shape (579, 43)
Fold 5 x train shape (5207, 43)  x test shape (579, 43)
Fold 6 x train shape (5207, 43)  x test shape (579, 43)
Fold 7 x train shape (5208, 43)  x test shape (578, 43)
Fold 8 x train shape (5208, 43)  x test shape (578, 43)
Fold 9 x train shape (5208, 43)  x test shape (578, 43)
Fold 10 x train shape (5208, 43)  x test shape (578, 43)

Fold 1 x train shape (5207, 43)  x test shape (579, 43)
Train Class Distribution
	class 1 percentage =0.67
	class 0 percentage =0.33
Test Class Distribution
	class 1 percentage =0.67
	class 0 percentage =0.33

Fold 2 x train shape (5207, 43)  x test shape (579, 43)
Train Class Distribution
	class 1 percentage =0.67
	class 0 percentage =0.33
Test Class Distribution
	class 1 percentage =0.67
	class 0 percentage =0.33

Fold 3 x train sha

In [50]:
# Persist the fold arrays as [X_train, X_test, Y_train, Y_test].
# NOTE: Xk_*/Yk_* hold only the last fold produced by the stratified
# k-fold loop, so a single fold is saved here, not all ten.
data = [Xk_train, Xk_test, Yk_train, Yk_test]
# The context manager guarantees the file is flushed and closed.
with open('data/strat-10ksplit-data.pkl', 'wb') as out_file:
    pickle.dump(data, out_file)