## 14-split-dataset

In [None]:
# import packages
import pandas as pd
import janitor
import numpy as np
from sklearn.model_selection import train_test_split

In [None]:
# load both raw csv exports and clean their column names

# archaeological soil samples
# -> the set of example soil particles
# (skiprows=[1] leaves out the line directly below the header row)
soil_data = (
    pd.read_csv('data_raw/archaeological_soil_data.csv', sep=',', skiprows=[1])
    .clean_names()
)

# lithic experimental samples
# -> the set of example stone particles
stone_data = (
    pd.read_csv('data_raw/lithic_experimental_data.csv', sep=',', skiprows=[1])
    .clean_names()
)

In [None]:
# create a function that performs the tasks that we have just done in steps
# takes the file locations of our two csvs and returns the full dataframe
def combine_data(soil_csv_location, stone_csv_location):
    """
    Read the soil and the stone csv and stack them into one labelled dataframe

    Parameters
    ----------
    soil_csv_location : str
        location of the csv with the soil particles
    stone_csv_location : str
        location of the csv with the stone particles

    Returns
    -------
    pandas.DataFrame
        the soil rows followed by the stone rows, with an extra column
        'stone_soil' holding 'soil' or 'stone' and an index renumbered from 0
    """
    labelled_frames = []
    for csv_location, label in ((soil_csv_location, 'soil'), (stone_csv_location, 'stone')):
        # skiprows=[1] leaves out the line directly below the header row
        particles = pd.read_csv(csv_location, sep=',', skiprows=[1]).clean_names()
        labelled_frames.append(particles.assign(stone_soil=label))
    # ignore_index gives the stacked rows a fresh 0..n-1 index
    return pd.concat(labelled_frames, ignore_index=True)

In [None]:
# build the full dataset with the "combine_data" function
full_data = combine_data(
    soil_csv_location='data_raw/archaeological_soil_data.csv',
    stone_csv_location='data_raw/lithic_experimental_data.csv',
)
# text preview of the first and the last five rows
print(full_data.head())
print(full_data.tail())

      id  img_id      da      dp  fwidth  flength  fthickness  elength  \
0  25611   10977  13.303  15.911  12.651   17.110      11.317   15.814   
1  48302   15470  12.578  16.192  12.966   16.210      11.119   14.483   
2  32915   12616  12.534  16.888  11.852   16.679       9.440   16.400   
3  22866   10293  12.242  16.833  12.716   17.865      10.748   15.674   
4  10277    7209  11.012  13.255   8.301   15.206       8.301   15.127   

   ethickness  ewidth  ...  w_t_ratio  t_w_ratio  chull_surface_area   sieve  \
0      11.309  12.542  ...      1.118      0.895             599.801  11.984   
1      11.091  13.021  ...      1.166      0.858             550.880  12.042   
2       9.584  11.033  ...      1.255      0.797             563.687  10.646   
3      10.197  12.019  ...      1.183      0.845             526.194  11.732   
4       8.187   8.187  ...      1.000      1.000             406.845   8.301   

   angularity  ellipticity  fiber_length  fiber_width  krumbein_rnd  \
0  

In [None]:
def split_dataset(data, train_size, random_state):
    """
    Split the input dataset into a random train dataset and a test dataset with required size

    Parameters
    ----------
    data : pandas.DataFrame
        the dataset that needs to be split
    train_size : int
        the number of rows wanted in the training dataset; it has to be
        smaller than the number of rows in `data` (a fractional value is
        truncated to a whole number of rows)
    random_state : int, RandomState instance or None
        the random seed forwarded to `train_test_split`, so that the same
        seed gives the same split again

    Returns
    -------
    tuple of pandas.DataFrame
        (train_data, test_data): the training dataset with `train_size` rows
        and the test dataset with all remaining rows

    Raises
    ------
    AssertionError
        if `train_size` is not smaller than the number of rows in `data`
    """
    n_rows = data.shape[0]
    # check if the train_size is correct
    assert n_rows > train_size, 'The train dataset size should be smaller than the full dataset'
    # Pass the absolute row count: `train_test_split` takes an int train_size
    # as a number of samples. Converting it to a proportion first can lose a
    # row, because the proportion is multiplied back and floored and
    # floating-point rounding may land just below the requested count
    # (e.g. 1 / 49 * 49 == 0.9999999999999999).
    train_data, test_data = train_test_split(data, train_size=int(train_size), random_state=random_state)
    return train_data, test_data

In [None]:
# try the function out: 62890 rows go to the train dataset, the rest to the test dataset
train_data, test_data = split_dataset(
    full_data,
    train_size=62890,
    random_state=1,
)

In [None]:
# preview the first five rows of the train dataset
train_data.head()

Unnamed: 0,id,img_id,da,dp,fwidth,flength,fthickness,elength,ethickness,ewidth,...,w_t_ratio,t_w_ratio,chull_surface_area,sieve,angularity,ellipticity,fiber_length,fiber_width,krumbein_rnd,stone_soil
47597,11680,7936,0.159,0.179,0.112,0.221,0.112,0.213,0.119,0.119,...,1.0,1.0,0.08,0.112,0.0,1.794,0.0,0.0,1.0,soil
25198,45478,15130,0.206,0.228,0.173,0.282,0.173,0.265,0.16,0.16,...,1.0,1.0,0.135,0.173,66.0,1.657,0.0,0.0,1.0,soil
40876,16623,8959,0.166,0.176,0.13,0.208,0.13,0.203,0.132,0.132,...,1.0,1.0,0.087,0.13,68.0,1.544,0.0,0.0,1.0,soil
24267,40689,13711,0.209,0.232,0.174,0.29,0.174,0.278,0.158,0.158,...,1.0,1.0,0.138,0.174,68.0,1.757,0.213,0.162,1.0,soil
47100,5106,5866,0.159,0.179,0.112,0.221,0.112,0.213,0.119,0.119,...,1.0,1.0,0.08,0.112,0.0,1.794,0.0,0.0,1.0,soil


In [None]:
# print the shape of the train dataset as (rows, columns); expected: (62890, 49)
print(train_data.shape)

(62890, 49)


In [None]:
# preview the first five rows of the test dataset
test_data.head()

Unnamed: 0,id,img_id,da,dp,fwidth,flength,fthickness,elength,ethickness,ewidth,...,w_t_ratio,t_w_ratio,chull_surface_area,sieve,angularity,ellipticity,fiber_length,fiber_width,krumbein_rnd,stone_soil
50656,30066,12106,0.156,0.177,0.119,0.236,0.119,0.208,0.115,0.115,...,1.0,1.0,0.077,0.119,113.333,1.811,0.0,0.0,1.0,soil
8667,3678,5479,0.321,0.415,0.213,0.578,0.213,0.573,0.189,0.189,...,1.0,1.0,0.347,0.213,50.0,3.041,0.451,0.179,1.0,soil
46626,59061,20704,0.159,0.179,0.112,0.221,0.112,0.213,0.119,0.119,...,1.0,1.0,0.08,0.112,0.0,1.794,0.0,0.0,1.0,soil
53320,35039,12826,0.153,0.17,0.112,0.212,0.112,0.199,0.117,0.117,...,1.0,1.0,0.075,0.112,0.0,1.698,0.0,0.0,1.0,soil
28333,1615,4906,0.195,0.22,0.137,0.283,0.137,0.275,0.137,0.137,...,1.0,1.0,0.12,0.137,68.0,2.008,0.243,0.124,1.0,soil


In [None]:
# print the shape of the test dataset as (rows, columns); expected: (15722, 49)
print(test_data.shape)

(15722, 49)


In [None]:
# store data
# The columns at positions 30-36 (0-based) are left out of every saved file.
# `.columns[30:37]` hands `drop` the column labels themselves instead of a
# seven-column slice of the data; index=False keeps the row index out of the csv.
full_data.drop(columns=full_data.columns[30:37]).to_csv('data/cumulative_data.csv', index=False)
train_data.drop(columns=train_data.columns[30:37]).to_csv('data/train_data.csv', index=False)
test_data.drop(columns=test_data.columns[30:37]).to_csv('data/test_data.csv', index=False)