# Compose and Split

In [10]:
# import packages
import numpy as np
import pandas as pd
import janitor # conda install -c conda-forge pyjanitor
from sklearn.model_selection import train_test_split

In [11]:
# Load both raw particle tables and normalise their column names.

def _load_raw_particles(csv_path):
    """Read one raw CSV file and return it with cleaned column names.

    The file line at index 1 (the line directly below the header) is skipped
    -- presumably a units/description row; confirm against the raw files.
    ``clean_names`` is the DataFrame method made available by ``import janitor``.
    """
    raw_table = pd.read_csv(csv_path, sep=',', skiprows=[1])
    return raw_table.clean_names()

# archaeological soil samples (i.e. dirt)
soil_data = _load_raw_particles('data/data_raw/archaeological_soil_data.csv')
print('soil', soil_data.shape)

# lithic experimental samples (i.e. stone particles)
stone_data = _load_raw_particles('data/data_raw/lithic_experimental_data.csv')
print('stone', stone_data.shape)

soil (73313, 48)
stone (5299, 48)


In [12]:
# Tag every row with its origin so the two sources stay distinguishable
# once the tables are stacked into a single frame.
for frame, source_label in ((soil_data, 'soil'), (stone_data, 'stone')):
    frame['stone_soil'] = source_label

In [13]:
# confirm same columns in both sets
# The assert makes the check binding: pd.concat aligns frames by column label, so
# a mismatch would otherwise pass silently and surface only as NaN-filled columns.
assert soil_data.columns.equals(stone_data.columns), (
    'soil_data and stone_data must have identical columns in the same order'
)
# element-wise view of the same comparison, kept as the cell's displayed result
soil_data.columns == stone_data.columns

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True])

In [14]:
# Stack the soil rows on top of the stone rows; ignore_index=True gives the
# combined frame a fresh 0..n-1 row index instead of the two original ones.
df = pd.concat([soil_data, stone_data], ignore_index=True)

In [15]:
# sanity check: dimensions and column labels of the combined frame
for summary in (df.shape, df.columns):
    print(summary)

(78612, 49)
Index(['id', 'img_id', 'da', 'dp', 'fwidth', 'flength', 'fthickness',
       'elength', 'ethickness', 'ewidth', 'volume', 'area', 'perimeter',
       'chull_area', 'chull_perimeter', 'sphericity', 'l_t_ratio',
       't_l_aspect_ratio', 'compactness', 'roundness', 'ellipse_ratio',
       'circularity', 'solidity', 'concavity', 'convexity', 'extent', 'hash',
       'transparency', 'curvature', 'surface_area', 'filter0', 'filter1',
       'filter2', 'filter3', 'filter4', 'filter5', 'filter6', 'l_w_ratio',
       'w_l_ratio', 'w_t_ratio', 't_w_ratio', 'chull_surface_area', 'sieve',
       'angularity', 'ellipticity', 'fiber_length', 'fiber_width',
       'krumbein_rnd', 'stone_soil'],
      dtype='object')


In [19]:
# drop columns with no meaningful data (filter0 ... filter6)
# Rebinding `df` instead of using inplace=True, together with errors='ignore',
# keeps this cell idempotent: re-running it after the columns are already gone
# is a no-op rather than a KeyError.
filter_columns = ['filter0', 'filter1', 'filter2', 'filter3', 'filter4', 'filter5', 'filter6']
df = df.drop(columns=filter_columns, errors='ignore')

In [20]:
# Split the combined table 80/20 into train and test partitions; the fixed
# random_state makes the shuffle repeatable between runs.
# NOTE(review): the split is not stratified on 'stone_soil' although the two
# sources differ greatly in size -- confirm that this is intended.
split_options = dict(train_size=0.8, random_state=1)
train_data, test_data = train_test_split(df, **split_options)

In [21]:
# report (rows, columns) for the train partition, then the test partition
for partition in (train_data, test_data):
    print(partition.shape)

(62889, 42)
(15723, 42)


In [23]:
# Persist the full table and both partitions as CSV; index=False leaves the
# row index out of the written files.
csv_targets = (
    ('data/cumulative_data.csv', df),
    ('data/train_data.csv', train_data),
    ('data/test_data.csv', test_data),
)
for csv_path, table in csv_targets:
    table.to_csv(csv_path, index=False)