In [1]:
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import StratifiedShuffleSplit

In [2]:
def createStratifiedShuffleSplits(df, extreme_data, strat_feature, dest_directory) :
    """Create five stratified 80/20 train/test splits of ``df`` and save each as CSV.

    The splits are produced by scikit-learn's ``StratifiedShuffleSplit`` with
    ``n_splits=5``, ``test_size=0.2`` and ``random_state=42``, so repeated runs
    on the same data give the same splits. Each split is an independent
    stratified shuffle of the whole frame; unlike a K-fold partition, the test
    sets of different splits may share rows.

    Parameters
    ----------
    df : pandas.DataFrame
        The data to split.
    extreme_data : bool
        Documented by the original author as True when the data holds extreme
        values of the strat field and False otherwise. The function body does
        not read it; it is kept so existing callers continue to work.
    strat_feature : str
        Name of the column to stratify on (e.g. ``occlus_elast_cat`` or
        ``occlusive_cat``).
    dest_directory : str
        Directory that receives the split files. It is created if missing.

    Side effects
    ------------
    For each split ``i`` in 1..5, prints the ``strat_feature`` distribution of
    the train and test sets and writes ``strat_fold_<i>_train.csv`` and
    ``strat_fold_<i>_test.csv`` into ``dest_directory`` (index column included).
    """
    # Make sure the output location exists so to_csv does not fail on a fresh checkout.
    os.makedirs(dest_directory, exist_ok=True)

    split = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=42)
    fold_indices = split.split(df, df[strat_feature])
    for i, (train_index, test_index) in enumerate(fold_indices, start=1):
        # split() yields *positional* row indices, so select with .iloc.
        # Using .loc would treat them as index labels and pick the wrong rows
        # (or raise KeyError) whenever df does not have a default 0..n-1 index.
        strat_train_set = df.iloc[train_index]
        strat_test_set = df.iloc[test_index]

        # Report the distribution of records across the stratification
        # categories; it should closely match that of the entire dataset.
        print_value_counts(strat_train_set, 'training set ' + str(i), strat_feature)
        print_value_counts(strat_test_set, 'test set ' + str(i), strat_feature)

        # Save each train/test set split to a .csv file.
        file_stem = 'strat_fold_' + str(i)
        strat_train_set.to_csv(os.path.join(dest_directory, file_stem + '_train.csv'))
        strat_test_set.to_csv(os.path.join(dest_directory, file_stem + '_test.csv'))

In [3]:
def print_value_counts(df, df_name, fieldName):
    """Print how the rows of ``df`` are distributed over the values of one column.

    Parameters
    ----------
    df : pandas.DataFrame
        The frame to summarise.
    df_name : str
        Human-readable label for ``df``, used in the heading line.
    fieldName : str
        Name of the column whose values are counted.

    Output (to stdout): a heading, a column-title line, each distinct value
    with its count divided by the total number of rows in ``df``, the total
    row count, and a trailing blank line. Returns None.
    """
    heading = fieldName + ' Distribution: ' + df_name
    print(heading)
    print('Value \t Percentage')

    # Fractions are relative to every row of df (len(df)), so rows with a
    # missing value in the column still count towards the denominator.
    row_total = len(df)
    fractions = df[fieldName].value_counts() / row_total
    print(fractions)

    print('Total count:', row_total)
    print('')

In [4]:
# Load stateData.csv into a DataFrame. The path is relative, so it is resolved
# against the working directory the notebook kernel was started in.
state_df = pd.read_csv('../RF_datasets/stateData.csv')

In [6]:
# Build the five stratified train/test splits of state_df on its "strat" column
# and write them as CSV files under 'stratified_data'.
# NOTE(review): extreme_data is passed for interface completeness; the function
# as defined in this notebook does not read it.
createStratifiedShuffleSplits(
    df=state_df,
    extreme_data=True,
    strat_feature="strat",
    dest_directory='stratified_data',
)

strat Distribution: training set 1
Value 	 Percentage
5    0.2
4    0.2
3    0.2
2    0.2
1    0.2
Name: strat, dtype: float64
Total count: 40

strat Distribution: test set 1
Value 	 Percentage
5    0.2
4    0.2
3    0.2
2    0.2
1    0.2
Name: strat, dtype: float64
Total count: 10

strat Distribution: training set 2
Value 	 Percentage
5    0.2
4    0.2
3    0.2
2    0.2
1    0.2
Name: strat, dtype: float64
Total count: 40

strat Distribution: test set 2
Value 	 Percentage
5    0.2
4    0.2
3    0.2
2    0.2
1    0.2
Name: strat, dtype: float64
Total count: 10

strat Distribution: training set 3
Value 	 Percentage
5    0.2
4    0.2
3    0.2
2    0.2
1    0.2
Name: strat, dtype: float64
Total count: 40

strat Distribution: test set 3
Value 	 Percentage
5    0.2
4    0.2
3    0.2
2    0.2
1    0.2
Name: strat, dtype: float64
Total count: 10

strat Distribution: training set 4
Value 	 Percentage
5    0.2
4    0.2
3    0.2
2    0.2
1    0.2
Name: strat, dtype: float64
Total count: 40

stra