## 10-load-data

> In this file, we load the two datasets and standardize the variable names. The first dataset consists of soil samples while the second dataset consists of the lithic (stone) microdebitage samples. We will be training our model to detect microdebitage particles that are mixed within a much larger number of soil particles.

In [None]:
# import packages
import pandas as pd
import janitor
import numpy as np
from sklearn.model_selection import train_test_split
# the `janitor` module is provided by the pyjanitor package; install it with the shell command:
# 'conda install -c conda-forge pyjanitor'

### Loading Data and Cleaning Column Names

In [None]:
# Load the two raw particle tables and clean up their column names.
# skiprows=[1] drops the second line of each file (the line directly below
# the header) -- presumably a non-data row such as units; TODO confirm.

# Soil particles: examples taken from the archaeological soil samples.
soil_data = (
    pd.read_csv('data_raw/archaeological_soil_data.csv', sep=',', skiprows=[1])
    .clean_names()
)

# Stone particles: examples taken from the lithic experimental samples.
stone_data = (
    pd.read_csv('data_raw/lithic_experimental_data.csv', sep=',', skiprows=[1])
    .clean_names()
)

In [None]:
# Soil table summary: dimensions first, then the per-column dtypes.
# The recorded run reports 73,313 rows x 48 columns.
print(soil_data.shape, soil_data.dtypes, sep='\n')

(73313, 48)
id                      int64
img_id                  int64
da                    float64
dp                    float64
fwidth                float64
flength               float64
fthickness            float64
elength               float64
ethickness            float64
ewidth                float64
volume                float64
area                  float64
perimeter             float64
chull_area            float64
chull_perimeter       float64
sphericity            float64
l_t_ratio             float64
t_l_aspect_ratio      float64
compactness           float64
roundness             float64
ellipse_ratio         float64
circularity           float64
solidity              float64
concavity             float64
convexity             float64
extent                float64
hash                    int64
transparency          float64
curvature             float64
surface_area          float64
filter0                object
filter1                object
filter2                object

In [None]:
# soil preview
# A bare DataFrame as the cell's last expression is rendered by the notebook;
# the recorded output shows the first and last rows with the middle elided ("...").
soil_data

Unnamed: 0,id,img_id,da,dp,fwidth,flength,fthickness,elength,ethickness,ewidth,...,w_l_ratio,w_t_ratio,t_w_ratio,chull_surface_area,sieve,angularity,ellipticity,fiber_length,fiber_width,krumbein_rnd
0,25611,10977,13.303,15.911,12.651,17.110,11.317,15.814,11.309,12.542,...,0.739,1.118,0.895,599.801,11.984,28.000,1.398,17.259,8.683,0.192
1,48302,15470,12.578,16.192,12.966,16.210,11.119,14.483,11.091,13.021,...,0.800,1.166,0.858,550.880,12.042,37.366,1.306,24.508,5.025,0.178
2,32915,12616,12.534,16.888,11.852,16.679,9.440,16.400,9.584,11.033,...,0.711,1.255,0.797,563.687,10.646,33.278,1.711,23.274,4.844,0.209
3,22866,10293,12.242,16.833,12.716,17.865,10.748,15.674,10.197,12.019,...,0.712,1.183,0.845,526.194,11.732,45.047,1.537,29.682,3.588,0.168
4,10277,7209,11.012,13.255,8.301,15.206,8.301,15.127,8.187,8.187,...,0.546,1.000,1.000,406.845,8.301,25.041,1.848,13.992,6.806,0.265
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
73308,56682,19419,0.037,0.089,0.019,0.135,0.019,0.044,0.016,0.016,...,0.145,1.000,1.000,0.006,0.019,0.000,2.705,0.000,0.000,1.000
73309,71921,26912,0.035,0.064,0.036,0.080,0.036,0.077,0.015,0.015,...,0.447,1.000,1.000,0.007,0.036,85.000,5.253,0.000,0.000,1.000
73310,64014,24046,0.035,0.073,0.039,0.087,0.039,0.092,0.018,0.018,...,0.448,1.000,1.000,0.009,0.039,0.000,4.990,0.000,0.000,1.000
73311,67033,25158,0.029,0.075,0.040,0.084,0.040,0.095,0.024,0.024,...,0.481,1.000,1.000,0.008,0.040,113.333,3.906,0.000,0.000,1.000


In [None]:
# Stone table summary: dimensions first, then the per-column dtypes.
# The recorded run reports 5,299 rows x 48 columns.
print(stone_data.shape, stone_data.dtypes, sep='\n')

(5299, 48)
id                      int64
img_id                  int64
da                    float64
dp                    float64
fwidth                float64
flength               float64
fthickness            float64
elength               float64
ethickness            float64
ewidth                float64
volume                float64
area                  float64
perimeter             float64
chull_area            float64
chull_perimeter       float64
sphericity            float64
l_t_ratio             float64
t_l_aspect_ratio      float64
compactness           float64
roundness             float64
ellipse_ratio         float64
circularity           float64
solidity              float64
concavity             float64
convexity             float64
extent                float64
hash                    int64
transparency          float64
curvature             float64
surface_area          float64
filter0                object
filter1                object
filter2                object

In [None]:
# stone preview
# A bare DataFrame as the cell's last expression is rendered by the notebook;
# the recorded output shows the first and last rows with the middle elided ("...").
stone_data

Unnamed: 0,id,img_id,da,dp,fwidth,flength,fthickness,elength,ethickness,ewidth,...,w_l_ratio,w_t_ratio,t_w_ratio,chull_surface_area,sieve,angularity,ellipticity,fiber_length,fiber_width,krumbein_rnd
0,104,10708,30.893,38.251,36.878,46.822,10.179,44.168,10.102,34.444,...,0.788,3.623,0.276,3211.655,23.529,19.392,4.372,49.768,22.823,0.214
1,19,5682,27.727,33.375,35.149,40.001,9.029,39.022,7.978,34.108,...,0.879,3.893,0.257,2547.278,22.089,20.688,4.891,37.445,5.128,0.214
2,14,4826,26.726,36.061,30.199,46.332,8.025,47.365,7.753,28.230,...,0.652,3.763,0.266,2571.497,19.112,21.250,6.110,52.384,17.379,0.219
3,1,2812,24.408,36.198,25.039,57.353,5.086,50.536,4.904,23.943,...,0.437,4.923,0.203,2001.165,15.063,18.233,10.304,51.467,17.096,0.251
4,83,9441,22.869,29.388,24.044,39.820,4.390,38.568,3.948,22.535,...,0.604,5.477,0.183,1745.237,14.217,17.086,9.769,36.426,3.135,0.276
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5294,4469,16993,0.040,0.118,0.071,0.141,0.071,0.041,0.024,0.024,...,0.500,1.000,1.000,0.020,0.071,120.000,1.732,0.178,0.007,1.000
5295,17,4867,0.040,0.086,0.045,0.112,0.045,0.041,0.024,0.024,...,0.400,1.000,1.000,0.010,0.045,110.000,1.732,0.125,0.010,1.000
5296,5194,18045,0.040,0.099,0.035,0.141,0.035,0.041,0.024,0.024,...,0.250,1.000,1.000,0.010,0.035,87.500,1.732,0.147,0.008,1.000
5297,5196,18046,0.040,0.099,0.035,0.141,0.035,0.041,0.024,0.024,...,0.250,1.000,1.000,0.010,0.035,116.667,1.732,0.147,0.008,1.000
