## 5-Fold CV Splits Generation
This notebook splits the data set into 5 folds for 5-fold cross-validation (CV).

In [1]:
import pandas as pd
import os
from sklearn.model_selection import StratifiedShuffleSplit


# NOTE(review): the working directory below is an absolute path on one specific
# machine; every relative path used later in this notebook resolves against it.
SPLITS_WORK_DIR = "/Users/joejohnson/Documents/Research/Unilever/ssnc_absorb_model/splits_stratified_k_fold"
INGREDIENTS_CSV = "ingred_all_fields.csv"

os.chdir(SPLITS_WORK_DIR)

# Load the full ingredient table and report how many records it holds.
unilever_1_df = pd.read_csv(INGREDIENTS_CSV)
print(unilever_1_df.shape[0])

124


In [7]:
## augment data set with new feature - occlusive + elastomer

import numpy as np

# Combined amount of the two ingredient fields.
unilever_1_df["occlus_elast_sum"] = unilever_1_df["occlusive"] + unilever_1_df["elastomer"]

# Bin the combined value on a natural-log scale: ceil(ln(sum + 1) - 0.45).
# A sum of 0 gives ceil(-0.45), which numpy reports as -0.0.
occlus_elast_cat = np.ceil(np.log(unilever_1_df["occlus_elast_sum"] + 1) - 0.45)

# Collapse every value that is not below 4 into the top category 4.
# The result is assigned back to the column rather than calling
# .where(..., inplace=True) on unilever_1_df["occlus_elast_cat"]: an in-place
# method on a column selection is a chained assignment, and under pandas
# Copy-on-Write it only changes a temporary object, leaving the frame uncapped.
unilever_1_df["occlus_elast_cat"] = occlus_elast_cat.where(occlus_elast_cat < 4, 4)

print(unilever_1_df["occlus_elast_cat"].value_counts(sort = False))
print(len(unilever_1_df))

# create dataframe with only those records for which occlusive_cat < 4 (filters out the records with
# high occlusive value)
# unilever_2_df = unilever_1_df.loc[unilever_1_df["occlusive_cat"] < 4]

# print(unilever_2_df["occlusive_cat"].value_counts(sort = False))
# print(len(unilever_2_df))



-0.0    35
 1.0     8
 3.0    26
 4.0    40
 2.0    15
Name: occlus_elast_cat, dtype: int64
124


In [9]:
## augment data set with transformation on occlusive feature.

import numpy as np


# Bin the occlusive value on a natural-log scale: ceil(ln(occlusive + 1) - 0.5).
occlusive_cat = np.ceil(np.log(unilever_1_df["occlusive"] + 1) - 0.5)

# Collapse every value that is not below 4 into the top category 4.
# The result is assigned back to the column rather than calling
# .where(..., inplace=True) on unilever_1_df["occlusive_cat"]: an in-place
# method on a column selection is a chained assignment, and under pandas
# Copy-on-Write it only changes a temporary object, leaving the frame uncapped.
unilever_1_df["occlusive_cat"] = occlusive_cat.where(occlusive_cat < 4, 4)

# create dataframe with only those records for which occlusive_cat < 4 (filters out the records with
# high occlusive value)
unilever_2_df = unilever_1_df.loc[unilever_1_df["occlusive_cat"] < 4]

print(unilever_1_df["occlusive_cat"].value_counts(sort = False))
print(len(unilever_1_df))
print(unilever_2_df["occlusive_cat"].value_counts(sort = False))
print(len(unilever_2_df))



-0.0    69
 1.0    35
 3.0    10
 4.0     5
 2.0     5
Name: occlusive_cat, dtype: int64
124
-0.0    69
 1.0    35
 3.0    10
 2.0     5
Name: occlusive_cat, dtype: int64
119


In [4]:
def print_value_counts(df, df_name, fieldName):
    """Print how the rows of ``df`` are distributed over the values of one column.

    Parameters:
        df        -- pandas DataFrame to summarise.
        df_name   -- label for the frame, shown in the report title.
        fieldName -- name of the column whose values are counted.

    The figures printed under the "Percentage" heading are fractions of
    ``len(df)`` (0..1), not values multiplied by 100. Returns None.
    """
    title = fieldName + ' Distribution: ' + df_name
    print(title, 'Value \t Percentage', sep='\n')

    # Share of all rows taken by each distinct value, most frequent first.
    shares = df[fieldName].value_counts() / len(df)
    print(shares)

    # Row total followed by one blank separator line.
    print(f'Total count: {len(df)}', end='\n\n')


In [5]:
def createStratifiedShuffleSplits(df, extreme_data, strat_feature, dest_directory, extreme_threshold=4):
    """Create five stratified 80/20 train/test splits of ``df`` and save them as CSV files.

    The splits are stratified on ``strat_feature`` over the complete frame. Each
    train and test set is then reduced to either the extreme or the non-extreme
    records of that same field before being written out.

    Parameters:
        df                -- the pandas DataFrame to split. Rows are selected by
                             position, so the frame does not need a default
                             0..n-1 index.
        extreme_data      -- boolean. True keeps only the records whose
                             ``strat_feature`` value is >= ``extreme_threshold``;
                             False keeps only the records below it.
        strat_feature     -- name of the field to stratify and filter on,
                             e.g. occlus_elast_cat or occlusive_cat.
        dest_directory    -- destination directory for the split files; created
                             if it does not exist yet.
        extreme_threshold -- smallest ``strat_feature`` value treated as extreme
                             (default 4, the top category used in this notebook).

    Output files (i = 1..5), written with the DataFrame index as first column:
        <dest_directory>/strat_fold_<i>_train.csv
        <dest_directory>/strat_fold_<i>_test.csv

    NOTE(review): StratifiedShuffleSplit draws each split independently, so the
    five test sets may share records; they are not guaranteed to be the
    disjoint partitions of a classic k-fold scheme. Confirm this is intended.
    """
    print_value_counts(df, "Complete Data Set", strat_feature)

    # to_csv does not create missing directories, so make sure the target exists.
    os.makedirs(dest_directory, exist_ok=True)

    split = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=42)
    for i, (train_index, test_index) in enumerate(split.split(df, df[strat_feature]), start=1):
        # split() yields positional row numbers, so they must go through .iloc;
        # .loc would treat them as index labels and pick the wrong rows (or
        # raise KeyError) whenever df's index is not exactly 0..n-1.
        strat_train_set = df.iloc[train_index]
        strat_test_set = df.iloc[test_index]

        # report the distribution of records across the strat categories.
        # should be very similar to distribution for entire dataset.
        print_value_counts(strat_train_set, 'training set ' + str(i), strat_feature)
        print_value_counts(strat_test_set, 'test set ' + str(i), strat_feature)

        # keep only the extreme or only the non-extreme records of the strat
        # field, in both the training set and the test set.
        if extreme_data:
            strat_train_set = strat_train_set[strat_train_set[strat_feature] >= extreme_threshold]
            strat_test_set = strat_test_set[strat_test_set[strat_feature] >= extreme_threshold]
            filter_note = ' after non-high occlus/elast data points removed'
        else:
            strat_train_set = strat_train_set[strat_train_set[strat_feature] < extreme_threshold]
            strat_test_set = strat_test_set[strat_test_set[strat_feature] < extreme_threshold]
            filter_note = ' after high occlus/elast data points removed'

        # report the distribution again for the records that survived the filter.
        print_value_counts(strat_train_set, 'training set ' + str(i) + filter_note, strat_feature)
        print_value_counts(strat_test_set, 'test set ' + str(i) + filter_note, strat_feature)

        # save each train/test set split to a .csv file.
        strat_train_set.to_csv(os.path.join(dest_directory, 'strat_fold_' + str(i) + '_train.csv'))
        strat_test_set.to_csv(os.path.join(dest_directory, 'strat_fold_' + str(i) + '_test.csv'))


In [9]:
# Write the split files stratified on occlus_elast_cat, keeping only the
# non-extreme records (extreme_data = False).

createStratifiedShuffleSplits(
    df=unilever_1_df,
    extreme_data=False,
    strat_feature="occlus_elast_cat",
    dest_directory='./5_fold_stratified_splits_high_30_occlusives_elastomers_false',
)

# cannot make the no-occlusive train/test splits using the stratified shuffle splits approach - those files
# need to be made by manually eliminating the records with occlusive_cat = 4.  The random split will not work
# the way we want given that the records with occlusive_cat = 4 have been eliminated from the dataset.
# do not use this code!...
# createStratifiedShuffleSplits(df = unilever_2_df, fieldName = "occlusive_cat", directoryName = './5_fold_stratified_splits_no_high_occlusives')


occlus_elast_cat Distribution: Complete Data Set
Value 	 Percentage
 4.0    0.322581
-0.0    0.282258
 3.0    0.209677
 2.0    0.120968
 1.0    0.064516
Name: occlus_elast_cat, dtype: float64
Total count: 124

occlus_elast_cat Distribution: training set 1
Value 	 Percentage
 4.0    0.323232
-0.0    0.282828
 3.0    0.212121
 2.0    0.121212
 1.0    0.060606
Name: occlus_elast_cat, dtype: float64
Total count: 99

occlus_elast_cat Distribution: test set 1
Value 	 Percentage
 4.0    0.32
-0.0    0.28
 3.0    0.20
 2.0    0.12
 1.0    0.08
Name: occlus_elast_cat, dtype: float64
Total count: 25

occlus_elast_cat Distribution: training set 1 after high occlus/elast data points removed
Value 	 Percentage
-0.0    0.417910
 3.0    0.313433
 2.0    0.179104
 1.0    0.089552
Name: occlus_elast_cat, dtype: float64
Total count: 67

occlus_elast_cat Distribution: test set 1 after high occlus/elast data points removed
Value 	 Percentage
-0.0    0.411765
 3.0    0.294118
 2.0    0.176471
 1.0    0.117

In [11]:
# Write the split files stratified on occlus_elast_cat, keeping only the
# extreme records (extreme_data = True).

createStratifiedShuffleSplits(
    df=unilever_1_df,
    extreme_data=True,
    strat_feature="occlus_elast_cat",
    dest_directory='./5_fold_stratified_splits_high_30_occlusives_elastomers_true',
)

# cannot make the no-occlusive train/test splits using the stratified shuffle splits approach - those files
# need to be made by manually eliminating the records with occlusive_cat = 4.  The random split will not work
# the way we want given that the records with occlusive_cat = 4 have been eliminated from the dataset.
# do not use this code!...
# createStratifiedShuffleSplits(df = unilever_2_df, fieldName = "occlusive_cat", directoryName = './5_fold_stratified_splits_no_high_occlusives')


occlus_elast_cat Distribution: Complete Data Set
Value 	 Percentage
 4.0    0.322581
-0.0    0.282258
 3.0    0.209677
 2.0    0.120968
 1.0    0.064516
Name: occlus_elast_cat, dtype: float64
Total count: 124

occlus_elast_cat Distribution: training set 1
Value 	 Percentage
 4.0    0.323232
-0.0    0.282828
 3.0    0.212121
 2.0    0.121212
 1.0    0.060606
Name: occlus_elast_cat, dtype: float64
Total count: 99

occlus_elast_cat Distribution: test set 1
Value 	 Percentage
 4.0    0.32
-0.0    0.28
 3.0    0.20
 2.0    0.12
 1.0    0.08
Name: occlus_elast_cat, dtype: float64
Total count: 25

occlus_elast_cat Distribution: training set 1 after high occlus/elast data points removed
Value 	 Percentage
4.0    1.0
Name: occlus_elast_cat, dtype: float64
Total count: 32

occlus_elast_cat Distribution: test set 1 after high occlus/elast data points removed
Value 	 Percentage
4.0    1.0
Name: occlus_elast_cat, dtype: float64
Total count: 8

occlus_elast_cat Distribution: training set 2
Value 	 P