# Data

In [2]:
# environment documentation

In [3]:
# import packages
import numpy as np
import pandas as pd
import janitor # conda install -c conda-forge pyjanitor
from sklearn.model_selection import train_test_split


## Load and Review

In [4]:
# Load both raw CSV exports and normalise their column names.
# skiprows=[1] drops the file line directly below the header row
# (presumably a units/description row -- TODO confirm against the raw files).
# clean_names() is the DataFrame method registered by pyjanitor (`import janitor`).

# archaeological soil samples (i.e. dirt)
soil_data = (
    pd.read_csv(
        'data/data_raw/archaeological_soil_data.csv',
        sep=',',
        skiprows=[1],
    )
    .clean_names()
)
print('soil', soil_data.shape)

# lithic experimental samples (i.e. stone particles)
stone_data = (
    pd.read_csv(
        'data/data_raw/lithic_experimental_data.csv',
        sep=',',
        skiprows=[1],
    )
    .clean_names()
)
print('stone', stone_data.shape)

soil (73313, 48)
stone (5299, 48)


### Soil

In [5]:
# soil overview: (rows, columns), number of fully duplicated rows, column names
soil_duplicate_rows = soil_data.duplicated().sum()

print(soil_data.shape)
print('duplicates', soil_duplicate_rows)
print()
print(soil_data.columns)

(73313, 48)
duplicates 0

Index(['id', 'img_id', 'da', 'dp', 'fwidth', 'flength', 'fthickness',
       'elength', 'ethickness', 'ewidth', 'volume', 'area', 'perimeter',
       'chull_area', 'chull_perimeter', 'sphericity', 'l_t_ratio',
       't_l_aspect_ratio', 'compactness', 'roundness', 'ellipse_ratio',
       'circularity', 'solidity', 'concavity', 'convexity', 'extent', 'hash',
       'transparency', 'curvature', 'surface_area', 'filter0', 'filter1',
       'filter2', 'filter3', 'filter4', 'filter5', 'filter6', 'l_w_ratio',
       'w_l_ratio', 'w_t_ratio', 't_w_ratio', 'chull_surface_area', 'sieve',
       'angularity', 'ellipticity', 'fiber_length', 'fiber_width',
       'krumbein_rnd'],
      dtype='object')


In [6]:
# summary statistics for the soil columns, transposed so each column is a row
soil_data.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
id,73313.0,36657.0,21163.78448,1.0,18329.0,36657.0,54985.0,73313.0
img_id,73313.0,14235.545565,6324.077145,2936.0,9551.0,12971.0,17857.0,28051.0
da,73313.0,0.231986,0.376004,0.024,0.152,0.174,0.236,13.303
dp,73313.0,0.284808,0.467939,0.058,0.166,0.197,0.283,16.888
fwidth,73313.0,0.183764,0.372174,0.019,0.112,0.137,0.179,12.966
flength,73313.0,0.36165,0.57519,0.074,0.187,0.243,0.367,17.865
fthickness,73313.0,0.170072,0.292985,0.019,0.112,0.135,0.173,11.317
elength,73313.0,0.360535,0.581504,0.041,0.175,0.232,0.365,16.4
ethickness,73313.0,0.164455,0.283009,0.014,0.106,0.129,0.167,11.309
ewidth,73313.0,0.177981,0.360868,0.014,0.107,0.131,0.173,13.021


In [7]:
# first five soil rows, transposed so each column reads as a row
soil_data.head(n=5).transpose()

Unnamed: 0,0,1,2,3,4
id,25611,48302,32915,22866,10277
img_id,10977,15470,12616,10293,7209
da,13.303,12.578,12.534,12.242,11.012
dp,15.911,16.192,16.888,16.833,13.255
fwidth,12.651,12.966,11.852,12.716,8.301
flength,17.11,16.21,16.679,17.865,15.206
fthickness,11.317,11.119,9.44,10.748,8.301
elength,15.814,14.483,16.4,15.674,15.127
ethickness,11.309,11.091,9.584,10.197,8.187
ewidth,12.542,13.021,11.033,12.019,8.187


In [8]:
# For every column summarised by describe(), count how many of the summary
# statistics (count, mean, std, min, quartiles, max) are exactly 0, then list
# only the columns where more than one statistic matches.
# Note: this counts matching summary statistics, not matching data rows.
soil_summary = soil_data.describe()

soil_zero_stats = soil_summary.eq(0).sum()
print('prevalence of 0s')
print(soil_zero_stats.where(soil_zero_stats > 1).dropna())

# same check for summary statistics that are exactly 1
soil_one_stats = soil_summary.eq(1).sum()
print('\nprevalence of 1s')
print(soil_one_stats.where(soil_one_stats > 1).dropna())


prevalence of 0s
hash            7.0
transparency    2.0
curvature       4.0
angularity      2.0
fiber_length    2.0
fiber_width     2.0
dtype: float64

prevalence of 1s
convexity       2.0
w_t_ratio       4.0
t_w_ratio       4.0
krumbein_rnd    4.0
dtype: float64


### Stone

In [9]:
# stone overview: (rows, columns), number of fully duplicated rows, column names
stone_duplicate_rows = stone_data.duplicated().sum()

print(stone_data.shape)
print('duplicates', stone_duplicate_rows)
print()
print(stone_data.columns)

(5299, 48)
duplicates 0

Index(['id', 'img_id', 'da', 'dp', 'fwidth', 'flength', 'fthickness',
       'elength', 'ethickness', 'ewidth', 'volume', 'area', 'perimeter',
       'chull_area', 'chull_perimeter', 'sphericity', 'l_t_ratio',
       't_l_aspect_ratio', 'compactness', 'roundness', 'ellipse_ratio',
       'circularity', 'solidity', 'concavity', 'convexity', 'extent', 'hash',
       'transparency', 'curvature', 'surface_area', 'filter0', 'filter1',
       'filter2', 'filter3', 'filter4', 'filter5', 'filter6', 'l_w_ratio',
       'w_l_ratio', 'w_t_ratio', 't_w_ratio', 'chull_surface_area', 'sieve',
       'angularity', 'ellipticity', 'fiber_length', 'fiber_width',
       'krumbein_rnd'],
      dtype='object')


In [10]:
# summary statistics for the stone columns, transposed so each column is a row
stone_data.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
id,5299.0,2650.0,1529.833869,1.0,1325.5,2650.0,3974.5,5299.0
img_id,5299.0,16457.526514,1450.276239,2812.0,16376.0,16613.0,16845.0,19030.0
da,5299.0,0.408032,1.629126,0.04,0.161,0.193,0.251,30.893
dp,5299.0,0.533303,2.193727,0.058,0.198,0.25,0.33,38.251
fwidth,5299.0,0.351058,1.685466,0.021,0.107,0.138,0.194,36.878
flength,5299.0,0.731248,3.023436,0.074,0.254,0.339,0.458,57.353
fthickness,5299.0,0.220404,0.661258,0.021,0.102,0.13,0.17,16.445
elength,5299.0,0.70069,2.95037,0.041,0.238,0.315,0.4315,50.536
ethickness,5299.0,0.208487,0.623842,0.015,0.096,0.122,0.163,16.227
ewidth,5299.0,0.336963,1.629089,0.015,0.1,0.13,0.186,34.444


In [11]:
# first five stone rows, transposed so each column reads as a row
stone_data.head(n=5).transpose()

Unnamed: 0,0,1,2,3,4
id,104,19,14,1,83
img_id,10708,5682,4826,2812,9441
da,30.893,27.727,26.726,24.408,22.869
dp,38.251,33.375,36.061,36.198,29.388
fwidth,36.878,35.149,30.199,25.039,24.044
flength,46.822,40.001,46.332,57.353,39.82
fthickness,10.179,9.029,8.025,5.086,4.39
elength,44.168,39.022,47.365,50.536,38.568
ethickness,10.102,7.978,7.753,4.904,3.948
ewidth,34.444,34.108,28.23,23.943,22.535


In [12]:
# For every column summarised by describe(), count how many of the summary
# statistics (count, mean, std, min, quartiles, max) are exactly 0, then list
# only the columns where more than one statistic matches.
# Note: this counts matching summary statistics, not matching data rows.
stone_summary = stone_data.describe()

stone_zero_stats = stone_summary.eq(0).sum()
print('prevalence of 0s')
print(stone_zero_stats.where(stone_zero_stats > 1).dropna())

# same check for summary statistics that are exactly 1
stone_one_stats = stone_summary.eq(1).sum()
print('\nprevalence of 1s')
print(stone_one_stats.where(stone_one_stats > 1).dropna())


prevalence of 0s
hash            7.0
curvature       4.0
fiber_length    2.0
fiber_width     2.0
dtype: float64

prevalence of 1s
w_t_ratio       4.0
t_w_ratio       4.0
krumbein_rnd    4.0
dtype: float64


## Missingness

In [13]:
# explicit NAs: total number of missing cells in each dataset (soil, then stone)
for dataset in (soil_data, stone_data):
    print(dataset.isna().sum().sum())

0
0


In [14]:
# Order the soil rows by ascending krumbein_rnd and display the first 7,500,
# to eyeball that only about 7K rows have a value other than 1.
# Note: this cell only displays rows; it does not compute that count.
soil_data.sort_values('krumbein_rnd').iloc[:7500]

Unnamed: 0,id,img_id,da,dp,fwidth,flength,fthickness,elength,ethickness,ewidth,...,w_l_ratio,w_t_ratio,t_w_ratio,chull_surface_area,sieve,angularity,ellipticity,fiber_length,fiber_width,krumbein_rnd
3,22866,10293,12.242,16.833,12.716,17.865,10.748,15.674,10.197,12.019,...,0.712,1.183,0.845,526.194,11.732,45.047,1.537,29.682,3.588,0.168
1,48302,15470,12.578,16.192,12.966,16.210,11.119,14.483,11.091,13.021,...,0.800,1.166,0.858,550.880,12.042,37.366,1.306,24.508,5.025,0.178
0,25611,10977,13.303,15.911,12.651,17.110,11.317,15.814,11.309,12.542,...,0.739,1.118,0.895,599.801,11.984,28.000,1.398,17.259,8.683,0.192
5,26142,11069,10.427,12.246,10.584,12.633,9.223,12.157,9.038,10.237,...,0.838,1.148,0.871,364.671,9.904,31.866,1.345,13.949,6.765,0.204
2,32915,12616,12.534,16.888,11.852,16.679,9.440,16.400,9.584,11.033,...,0.711,1.255,0.797,563.687,10.646,33.278,1.711,23.274,4.844,0.209
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49237,12530,8124,0.158,0.167,0.129,0.190,0.129,0.188,0.130,0.130,...,0.682,1.000,1.000,0.079,0.129,87.500,1.448,0.000,0.000,1.000
49527,24470,10632,0.157,0.181,0.110,0.234,0.110,0.227,0.105,0.105,...,0.472,1.000,1.000,0.080,0.110,68.000,2.157,0.183,0.106,1.000
50259,39317,13426,0.156,0.169,0.121,0.201,0.121,0.196,0.120,0.120,...,0.603,1.000,1.000,0.078,0.121,87.500,1.632,0.146,0.131,1.000
51330,38436,13260,0.155,0.170,0.114,0.218,0.114,0.205,0.113,0.113,...,0.524,1.000,1.000,0.075,0.114,68.000,1.814,0.000,0.000,1.000


In [15]:
# Check the object columns (filter0..filter6) for missing values: print, per
# column, how many stone_data rows hold the value 'Reject'. A count equal to
# the number of rows means every row is 'Reject'.
# NOTE(review): only stone_data is checked in this cell, not soil_data.
for filter_col in ('filter0', 'filter1', 'filter2', 'filter3',
                   'filter4', 'filter5', 'filter6'):
    print((stone_data[filter_col] == 'Reject').sum())

5299
5299
5299
5299
5299
5299
5299


In [16]:
# hash is all 0's: print the fraction of rows where hash == 0 in each dataset
# (1.0 means every row is 0).
# The boolean mask is averaged with the vectorised Series.mean() rather than
# Python's built-in sum(), which walks the Series one element at a time.
print(soil_data['hash'].eq(0).mean())
print(stone_data['hash'].eq(0).mean())

1.0
1.0
