## 13-compose-full-dataset

In [None]:
# import packages
import pandas as pd
import janitor
import numpy as np

In [None]:
# Load the two raw CSV files and normalise their column names with clean_names().
# skiprows=[1] drops the line directly below the header in each file.
# NOTE(review): presumably that line holds units/metadata rather than a particle — confirm.

# archaeological soil samples: the set of example soil particles
soil_data = (
    pd.read_csv('data_raw/archaeological_soil_data.csv', sep=',', skiprows=[1])
    .clean_names()
)

# lithic experimental samples: the set of example stone particles
stone_data = (
    pd.read_csv('data_raw/lithic_experimental_data.csv', sep=',', skiprows=[1])
    .clean_names()
)

In [None]:
# Tag every row with the csv it came from ('soil' or 'stone') so the origin
# is still known once the two dataframes are stacked together
soil_data = soil_data.assign(stone_soil='soil')
stone_data = stone_data.assign(stone_soil='stone')

In [None]:
# Confirm that all the columns are the same (names and order) before combining the datasets.
# The assert makes a mismatch fail loudly instead of relying on someone reading the
# boolean array; Index.equals also copes with a differing number of columns, where the
# elementwise == comparison below would raise a length error.
assert soil_data.columns.equals(stone_data.columns), (
    "soil and stone columns differ: "
    f"{sorted(set(soil_data.columns) ^ set(stone_data.columns))}"
)
# Still display the per-column comparison as the cell output
soil_data.columns == stone_data.columns

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True])

In [None]:
# Stack the soil rows on top of the stone rows into one big dataset, stone_soil_data.
# Row labels are carried over from each input frame unchanged at this point.
stone_soil_data = pd.concat(
    objs=[soil_data, stone_data],
    axis=0,
    ignore_index=False,
)

In [None]:
# Sanity check: show the shape first, then the dataset itself.
# Expected columns: 49 = the original 48 + the soil/stone designation column.
# Expected rows: 78612 = the rows of the two datasets combined.
print(stone_soil_data.shape, stone_soil_data, sep='\n')

(78612, 49)
         id  img_id      da      dp  fwidth  flength  fthickness  elength  \
0     25611   10977  13.303  15.911  12.651   17.110      11.317   15.814   
1     48302   15470  12.578  16.192  12.966   16.210      11.119   14.483   
2     32915   12616  12.534  16.888  11.852   16.679       9.440   16.400   
3     22866   10293  12.242  16.833  12.716   17.865      10.748   15.674   
4     10277    7209  11.012  13.255   8.301   15.206       8.301   15.127   
...     ...     ...     ...     ...     ...      ...         ...      ...   
5294   4469   16993   0.040   0.118   0.071    0.141       0.071    0.041   
5295     17    4867   0.040   0.086   0.045    0.112       0.045    0.041   
5296   5194   18045   0.040   0.099   0.035    0.141       0.035    0.041   
5297   5196   18046   0.040   0.099   0.035    0.141       0.035    0.041   
5298   4251   16913   0.040   0.086   0.045    0.112       0.045    0.041   

      ethickness  ewidth  ...  w_t_ratio  t_w_ratio  chull_surf

In [None]:
# fix indices
# After the concat each source frame still carries its own row labels (the printed
# frame above ends at label 5298 although it has 78612 rows), so renumber the rows 0..n-1.
# drop=True discards the old labels instead of keeping them as an extra column.
stone_soil_data=stone_soil_data.reset_index(drop=True)

In [None]:
# create a function that performs the tasks that we have just done in steps
# takes the file locations of our two csvs and returns the full dataframe
def combine_data(soil_csv_location, stone_csv_location):
    """Load the soil and stone particle CSVs and stack them into one dataframe.

    Each file is read with ``skiprows=[1]`` (the line directly below the header
    is dropped), its column names are normalised with ``clean_names()``, and a
    ``stone_soil`` column recording which file the row came from is added.

    Parameters
    ----------
    soil_csv_location : str or path-like
        Location of the soil CSV; its rows are labelled ``'soil'``.
    stone_csv_location : str or path-like
        Location of the stone CSV; its rows are labelled ``'stone'``.

    Returns
    -------
    pandas.DataFrame
        The soil rows followed by the stone rows, indexed 0..n-1.

    Raises
    ------
    ValueError
        If the two files do not share the same set of column names after
        cleaning. Without this check ``pd.concat`` would silently pad the
        missing columns with NaN.
    """

    def _load(csv_location, label):
        # Read one CSV, clean its headers and tag every row with its origin.
        # clean_names() is the DataFrame method registered by the notebook's
        # `import janitor`.
        return (
            pd.read_csv(csv_location, sep=',', skiprows=[1])
            .clean_names()
            .assign(stone_soil=label)
        )

    soil_data = _load(soil_csv_location, 'soil')
    stone_data = _load(stone_csv_location, 'stone')

    # Same check the notebook performs by hand before combining. Only the set
    # of names matters here because concat aligns columns by name, not position.
    mismatched = set(soil_data.columns) ^ set(stone_data.columns)
    if mismatched:
        raise ValueError(
            "soil and stone files have different columns: "
            f"{sorted(map(str, mismatched))}"
        )

    # ignore_index=True renumbers the rows 0..n-1 in the same step,
    # replacing the separate reset_index(drop=True) call.
    return pd.concat([soil_data, stone_data], ignore_index=True)

In [None]:
# Try the function on the two files we have been working with;
# the result should be the same full dataframe built step by step above
combine_data(
    soil_csv_location='data_raw/archaeological_soil_data.csv',
    stone_csv_location='data_raw/lithic_experimental_data.csv',
)

Unnamed: 0,id,img_id,da,dp,fwidth,flength,fthickness,elength,ethickness,ewidth,...,w_t_ratio,t_w_ratio,chull_surface_area,sieve,angularity,ellipticity,fiber_length,fiber_width,krumbein_rnd,stone_soil
0,25611,10977,13.303,15.911,12.651,17.110,11.317,15.814,11.309,12.542,...,1.118,0.895,599.801,11.984,28.000,1.398,17.259,8.683,0.192,soil
1,48302,15470,12.578,16.192,12.966,16.210,11.119,14.483,11.091,13.021,...,1.166,0.858,550.880,12.042,37.366,1.306,24.508,5.025,0.178,soil
2,32915,12616,12.534,16.888,11.852,16.679,9.440,16.400,9.584,11.033,...,1.255,0.797,563.687,10.646,33.278,1.711,23.274,4.844,0.209,soil
3,22866,10293,12.242,16.833,12.716,17.865,10.748,15.674,10.197,12.019,...,1.183,0.845,526.194,11.732,45.047,1.537,29.682,3.588,0.168,soil
4,10277,7209,11.012,13.255,8.301,15.206,8.301,15.127,8.187,8.187,...,1.000,1.000,406.845,8.301,25.041,1.848,13.992,6.806,0.265,soil
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
78607,4469,16993,0.040,0.118,0.071,0.141,0.071,0.041,0.024,0.024,...,1.000,1.000,0.020,0.071,120.000,1.732,0.178,0.007,1.000,stone
78608,17,4867,0.040,0.086,0.045,0.112,0.045,0.041,0.024,0.024,...,1.000,1.000,0.010,0.045,110.000,1.732,0.125,0.010,1.000,stone
78609,5194,18045,0.040,0.099,0.035,0.141,0.035,0.041,0.024,0.024,...,1.000,1.000,0.010,0.035,87.500,1.732,0.147,0.008,1.000,stone
78610,5196,18046,0.040,0.099,0.035,0.141,0.035,0.041,0.024,0.024,...,1.000,1.000,0.010,0.035,116.667,1.732,0.147,0.008,1.000,stone
