In [2]:
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import StratifiedShuffleSplit

In [3]:
def createStratifiedShuffleSplits(df, extreme_data, strat_feature, dest_directory):
    """Write five stratified 80/20 train/test splits of ``df`` to CSV files.

    The splits are produced by ``StratifiedShuffleSplit(n_splits=5,
    test_size=0.2, random_state=42)``. Each split is an independent stratified
    shuffle of the whole frame, not a partition into disjoint folds.

    For every split ``i`` (1..5) the distribution of ``strat_feature`` in both
    halves is printed with ``print_value_counts`` and the halves are saved as
    ``strat_fold_<i>_train.csv`` and ``strat_fold_<i>_test.csv`` inside
    ``dest_directory``.

    Parameters:

    df -- the pandas dataframe to split
    extreme_data -- boolean value - True for creating splits for the data having
                    extreme values of the strat field, False for the data with
                    non-extreme values. NOTE: the body does not read this flag;
                    it is kept so existing callers keep working.
    strat_feature -- name of the field to do stratified sampling on,
                     e.g. occlus_elast_cat or occlusive_cat
    dest_directory -- destination directory for the split files; created if it
                      does not exist yet.

    Returns None.
    """
    # Make sure the target folder exists so to_csv does not fail on a fresh checkout.
    if dest_directory:
        os.makedirs(dest_directory, exist_ok=True)

    split = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=42)
    fold_indices = split.split(df, df[strat_feature])
    for i, (train_index, test_index) in enumerate(fold_indices, start=1):
        # split() yields *positional* row numbers. Selecting with .loc would
        # treat them as index labels and pick the wrong rows (or raise KeyError)
        # whenever df does not carry a default 0..n-1 index, e.g. after filtering.
        strat_train_set = df.iloc[train_index]
        strat_test_set = df.iloc[test_index]

        # Report the distribution of records across the stratification categories;
        # it should be very similar to the distribution for the entire dataset.
        print_value_counts(strat_train_set, 'training set ' + str(i), strat_feature)
        print_value_counts(strat_test_set, 'test set ' + str(i), strat_feature)

        # Save each train/test split to its own .csv file.
        strat_train_set.to_csv(
            os.path.join(dest_directory, 'strat_fold_' + str(i) + '_train.csv'))
        strat_test_set.to_csv(
            os.path.join(dest_directory, 'strat_fold_' + str(i) + '_test.csv'))

In [4]:
def print_value_counts(df, df_name, fieldName):
    """Print how the rows of ``df`` are distributed over the values of one column.

    Printed, in order: a heading naming the column and the data set, a
    column-title line, each value's share of the rows (``value_counts`` divided
    by the row count, i.e. fractions rather than percentages), the total row
    count, and a blank line.

    Parameters:
        df        -- pandas DataFrame to summarise
        df_name   -- label for the data set, used only in the heading
        fieldName -- name of the column whose values are counted
    """
    heading = ''.join((fieldName, ' Distribution: ', df_name))
    print(heading)
    print('Value \t Percentage')

    shares = df[fieldName].value_counts().div(len(df))
    print(shares)

    print('Total count: {}'.format(len(df)))
    print()

In [5]:
# Load the state-level data set from the sibling RF_datasets folder (path is relative
# to the notebook's working directory).
state_df = pd.read_csv(filepath_or_buffer='../RF_datasets/stateData.csv')

In [6]:
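# Disabled example call, kept for reference.
# NOTE(review): dest_directory below is a machine-specific absolute path; point it at a
# project-relative folder before re-enabling this call.
#createStratifiedShuffleSplits(df = state_df, extreme_data = True, strat_feature = "num deaths", dest_directory = '/Users/samsonweiner/Desktop/SeniorDesign/epidemic_modeling/SeniorDesign/state_rf')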

In [7]:
# Toy data for trying out StratifiedShuffleSplit: six 2-feature samples,
# three labelled 0 and three labelled 1.
X2 = np.array([[1, 2], [3, 4]] * 3)
Y2 = np.array([0] * 3 + [1] * 3)

# Five random stratified splits, each holding out 20% of the samples.
sss = StratifiedShuffleSplit(
    n_splits=5,
    test_size=0.2,
    random_state=0,
)
sss.get_n_splits(X2, Y2)

5

In [16]:
# Print the row positions that land in the train and test side of every split.
# The number of lines printed follows whatever `sss` is bound to when this cell
# runs; the saved output shows 4 splits, matching the n_splits=4 splitter that a
# later cell assigns to the same name.
for train_index, test_index in sss.split(X=X2, y=Y2):
    print("TRAIN:", train_index, "TEST:", test_index)

TRAIN: [2 5 1 3] TEST: [0 4]
TRAIN: [0 4 3 1] TEST: [2 5]
TRAIN: [0 4 3 1] TEST: [2 5]
TRAIN: [5 4 1 2] TEST: [0 3]


In [10]:
# Preview the first five rows of the loaded state data.
state_df.head(n=5)

Unnamed: 0.1,Unnamed: 0,population,num deaths,education,unemployment,MME per cap
0,AL,4863300.0,364.7475,24.5,5.8,1037.82423
1,AK,741894.0,92.73675,29.0,6.9,757.650696
2,AZ,6931071.0,790.142094,28.4,5.4,849.237172
3,AR,2988248.0,176.306632,22.0,4.0,922.833575
4,CA,39250017.0,1923.250833,32.6,5.5,462.578769


In [11]:
# Split the frame into a feature matrix (every column from position 3 onward)
# and a target vector (the column at position 2), both as NumPy arrays.
# NOTE(review): the selection is purely positional - confirm against the
# head() preview that positions 2 and 3+ are the intended target and features.
X, Y = (
    state_df.iloc[:, 3:].values,
    state_df.iloc[:, 2].values,
)

In [14]:
# Rebind `sss` to a splitter that yields four (instead of five) stratified
# 80/20 splits; any cell that uses `sss` afterwards sees this configuration.
sss = StratifiedShuffleSplit(
    n_splits=4,
    test_size=0.2,
    random_state=0,
)
sss.get_n_splits(X2, Y2)

4

In [13]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the rows as the final test set; the fixed seed keeps the
# partition reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.1,
    random_state=42,
)

In [18]:
# Build five train/validation splits of the training portion. Each entry of
# trainingSets is the 4-tuple returned by train_test_split:
# (x_train_split, x_test_split, y_train_split, y_test_split).
trainingSets = []
for i in range(5):
    # Offset the seed by the fold number. With a constant random_state every
    # iteration reproduced exactly the same partition, so all five "folds"
    # were identical copies. Fold 0 still uses seed 42.
    x_train_split, x_test_split, y_train_split, y_test_split = train_test_split(
        x_train, y_train, test_size=0.11, random_state=42 + i
    )
    trainingSets.append((x_train_split, x_test_split, y_train_split, y_test_split))

In [34]:
# Write every (train, validation) pair held in trainingSets to
# stratified_data/strat_fold_<i>_train.csv and stratified_data/strat_fold_<i>_test.csv,
# numbering the folds from 0.
# NOTE(review): the files are named "strat_fold_*", but trainingSets is built with
# plain train_test_split (no stratify argument) - confirm the naming is intended.
output_dir = 'stratified_data'

# to_csv creates/truncates the files itself, so no explicit open() is needed
# (the previous open() calls leaked two unclosed handles per iteration).
# Create the folder up front instead, so a fresh checkout does not fail.
os.makedirs(output_dir, exist_ok=True)

for i, (x_fold_train, x_fold_test, y_fold_train, y_fold_test) in enumerate(trainingSets):
    df_train = pd.DataFrame(x_fold_train)
    df_test = pd.DataFrame(x_fold_test)

    # Store the target values under column label 3.
    # assumes the feature arrays have exactly three columns (labels 0-2), so
    # label 3 is free and no feature gets overwritten - TODO confirm
    df_train[3] = y_fold_train
    df_test[3] = y_fold_test

    df_train.to_csv(os.path.join(output_dir, 'strat_fold_' + str(i) + '_train.csv'))
    df_test.to_csv(os.path.join(output_dir, 'strat_fold_' + str(i) + '_test.csv'))