#  exploreSA-Gawler: Data Preparation for the SARIG Data Package

  - Group: __TriPandas__
  - Members: __Hugh Ouyang__, __Liang Chen__, __Wei Liu__

This code set includes four files: "1. RS_Data_Processing.ipynb", "2. Drillhole_Data_Processing.ipynb", "3. Field_Observation_Data_Processing.ipynb" and "4. MD_Data_Processing.ipynb". These notebooks are meant to be run in the order indicated by their file names.

It will automatically download the dataset from https://unearthed-exploresa.s3-ap-southeast-2.amazonaws.com/Unearthed_5_SARIG_Data_Package.zip and save the processed files to a directory named after the selected element.

A sample of the resulting dataset is shared at https://drive.google.com/drive/folders/18xBsRCAXP0cGsYl8L9PrtHkgy62cGRPL?usp=sharing

If you have any questions or comments, please post them on the forum. We will update this code gradually as our data preparation and data engineering progress.

## RS data processing

### load data
This section imports the required packages, downloads the dataset from the website and lists the file names. As shown below, data_loc is where the dataset will be saved. To save storage space, the dataset is not extracted into a folder; the data processing is done by reading the corresponding files directly from the zip file.

In [None]:
# import required packages
import pandas as pd
import os
import pickle
import sys
import ipywidgets as widgets
from IPython.display import display
pd.options.display.width=None
pd.options.display.max_columns=None


# Bind the name `zipfile` to a ZipFile *class* on every interpreter, because
# the rest of the notebook calls it as `zipfile(path, 'r')`.
if sys.version_info >= (3, 6):
    from zipfile import ZipFile as zipfile
else:
    # The zipfile36 backport exposes the same ZipFile class; importing the
    # module itself here would make `zipfile(...)` fail (a module is not callable).
    from zipfile36 import ZipFile as zipfile
    
# Where the SARIG data package lives online and where the local copy is kept.
url = "https://unearthed-exploresa.s3-ap-southeast-2.amazonaws.com/Unearthed_5_SARIG_Data_Package.zip"
# enter the directory to save data
data_loc = 'D:/GitFolder/WorkBench/exploreSA-Gawler/data'
file_name = 'Unearthed_5_SARIG_Data_Package.zip'
zip_path = os.path.join(data_loc, file_name)

if os.path.isfile(zip_path):
    print("File exists, No need to download.")
else:
    # Imported here so that this cell brings in everything the download needs.
    from shutil import copyfileobj
    from urllib.request import urlopen

    os.makedirs(data_loc, exist_ok=True)
    # Download to a temporary name first: an interrupted transfer must not
    # leave a truncated archive that the existence check above would accept.
    partial_path = zip_path + '.part'
    with urlopen(url) as response, open(partial_path, 'wb') as output:
        # stream in blocks instead of holding the whole archive in memory
        copyfileobj(response, output)
    # Save under data_loc, the same place every later cell reads from.
    os.replace(partial_path, zip_path)

# List the members of the archive without extracting it.
with zipfile(zip_path, 'r') as archive:
    files_in_dataset = [member.filename for member in archive.filelist]

files_in_dataset



 For this part of data cleaning, we will only use the following files: 
 - 'SARIG_Data_Package/sarig_rs_biostr_analys_exp.csv',
 - 'SARIG_Data_Package/sarig_rs_biostr_results_exp.csv',
 - 'SARIG_Data_Package/sarig_rs_chem_exp.csv',
 - 'SARIG_Data_Package/sarig_rs_chem_isotope_exp.csv',
 - 'SARIG_Data_Package/sarig_rs_details_exp.csv',
 - 'SARIG_Data_Package/sarig_rs_geochron_ages_exp.csv',
 - 'SARIG_Data_Package/sarig_rs_geochron_reslt_exp.csv',
 - 'SARIG_Data_Package/sarig_rs_petrology_exp.csv',
 - 'SARIG_Data_Package/sarig_rs_reference_exp.csv'

### record identifier


In [None]:
# Load the reference table straight from the zip archive. The context
# managers close the archive and the member handle once the table is read.
with zipfile(os.path.join(data_loc, file_name), 'r') as archive:
    with archive.open('SARIG_Data_Package/sarig_rs_reference_exp.csv', 'r') as csv_file:
        sarig_rs_reference_exp = pd.read_csv(csv_file, sep=',', encoding='latin1')
# Nullable integer dtype keeps missing SAMREF_CNO values as <NA> instead of floats.
sarig_rs_reference_exp['SAMREF_CNO'] = sarig_rs_reference_exp['SAMREF_CNO'].astype('Int64')
sarig_rs_reference_exp['PUBLICATION_DATE'] = pd.to_datetime(sarig_rs_reference_exp['PUBLICATION_DATE'])
sarig_rs_reference_exp.sample(5)

In [None]:
# Print the column dtypes and non-null counts of the reference table.
sarig_rs_reference_exp.info()

SAMPLE_NO and SITE_NO contain no nulls, and they have a many-to-one relationship: for one site there are one or more samples used for experiments. The combination of SITE_NO and SAMPLE_NO can therefore serve as the record identifier.

In [None]:
# Drop the reference table to release its memory before the larger tables are loaded.
del sarig_rs_reference_exp

### Select chemical code of interest

First we need to standardize the units in the dataset, as the UNIT column contains "%", "ppb" and other units.
The VALUE column also contains different cases: a plain numeric value, "<number" and ">number". We cast "<number" to 0 and ">number" to the number itself.

In [None]:
# define two functions to handle the cases of VALUE and UNIT
def change_type(x):
    '''Normalise one raw VALUE entry to a plain numeric string.

    The input is converted with str() first. An entry containing '<'
    becomes '0'; an entry containing '>' is reduced to the text following
    the first '>'; anything else is returned as its string form.
    '''
    text = str(x)
    if '<' in text:
        return '0'
    if '>' in text:
        # keep only what follows the first '>' sign
        return text.split('>')[1]
    return text
    

def change_units(x):
    '''Convert one (VALUE, UNIT) pair to parts per million, in place.

    x is a two-item mutable sequence (a list, or a dataframe row as passed
    by DataFrame.apply(axis=1)) holding the numeric value at position 0 and
    the unit string at position 1. The same object is returned.

    Conversions applied:
      - '%'    -> value * 10000  (1 % = 10 000 ppm)
      - 'ppb'  -> value / 1000   (1 ppm = 1 000 ppb)
      - 'ug/l' -> value / 1000   (treated as ppb, i.e. assuming ~1 kg per litre)
      - 'ppm'  -> unchanged

    Any other unit is left untouched: relabelling an unconverted value as
    'ppm' would silently corrupt the data.
    '''
    # Rows coming from DataFrame.apply are label-indexed Series; go through
    # .iloc so positions 0/1 are always positional. Plain lists are used as is.
    pair = x.iloc if hasattr(x, 'iloc') else x
    unit = pair[1]
    if unit == '%':
        pair[0] = pair[0] * 10000
    elif unit in ('ppb', 'ug/l'):
        pair[0] = pair[0] / 1000
    elif unit != 'ppm':
        # unknown / non-concentration unit: keep value and unit as they are
        return x
    pair[1] = 'ppm'
    return x
        

In [None]:
# First pass over sarig_rs_chem_exp: read it in chunks (the file is too large
# to load at once) and collect every distinct CHEM_CODE.
chem_code_set = set()
with zipfile(os.path.join(data_loc, file_name), 'r') as archive:
    with archive.open('SARIG_Data_Package/sarig_rs_chem_exp.csv', 'r') as csv_file:
        sarig_rs_chem_exp_chunks = pd.read_csv(
            csv_file, sep=',', encoding='latin1', chunksize=100000)
        for chunk in sarig_rs_chem_exp_chunks:
            # dropna: a missing code is not a chemical and would break the
            # later sorting of the codes (NaN cannot be compared with str)
            chem_code_set.update(chunk['CHEM_CODE'].dropna().unique())
# `chunk` still refers to the last chunk read and is inspected in the next cell.

In [None]:
# Look at up to 1000 random records of the last chunk read. The size is
# capped because the final chunk of the file may hold fewer than 1000 rows,
# in which case sample(1000) would raise a ValueError.
chunk.sample(min(1000, len(chunk)))

In [None]:
# Group the chemical codes by element: chem_elements maps a group name to the
# list of CHEM_CODE values whose records belong to that group.

# Compound / species code -> group it is merged into.
compound_to_element = {
    'Al2O3': 'Al', 'BaO': 'Ba', 'CO2': 'C', 'CO3': 'C',
    'CaCO3': 'Ca', 'CaO': 'Ca', 'CaSO4': 'Ca', 'CoO': 'Co', 'Cr2O3': 'Cr',
    # 'Fe3' and 'K2O' used to be removed without being merged anywhere, which
    # silently dropped their records from the Fe and K extractions.
    'Fe2': 'Fe', 'Fe2O3': 'Fe', 'Fe3': 'Fe', 'FeO': 'Fe', 'FeS2': 'Fe',
    'H2O_minus': 'H2O', 'H2O_plus': 'H2O', 'K2O': 'K',
    'MgCO3': 'Mg', 'MgO': 'Mg', 'MnO': 'Mn', 'Na2O': 'Na', 'NaCl': 'Na',
    'Nb2O5': 'Nb', 'NiO': 'Ni', 'P2O5': 'P', 'SO3': 'S', 'SO4': 'S',
    'SiO2': 'Si', 'Sr87_86': 'Sr', 'SrO': 'Sr', 'Ta2O5': 'Ta', 'ThO2': 'Th',
    'TiO2': 'Ti', 'U3O8': 'U', 'V2O3': 'V', 'V2O5': 'V', 'WO3': 'W',
    'ZnO': 'Zn', 'ZrO2': 'Zr',
}
# Codes removed from the selection without a parent group.
dropped_codes = ['NO3', 'Total']

# Start with every code in a group of its own (keys in sorted order) ...
chem_elements = {code: [code] for code in sorted(chem_code_set)}

# ... then fold each compound into its parent element's group.
for compound, element in compound_to_element.items():
    if compound not in chem_elements:
        # code absent from this version of the dataset: nothing to merge
        continue
    chem_elements.setdefault(element, [element]).append(compound)
    del chem_elements[compound]

for code in dropped_codes:
    chem_elements.pop(code, None)

# Every code that no longer has a group of its own.
modified = list(compound_to_element) + dropped_codes

print(chem_elements.keys())

Printing the set of CHEM_CODE values with print(*sorted(list(chem_code_set)), sep = ", ") gives:

[Ag, Al, Al2O3, As, Au, B, Ba, BaO, Be, Bi, Br, C, CO2, CO3, CPS_gamma, Ca, CaCO3, CaO, CaSO4, Cd, Ce, Cl, Co, CoO, Cr, Cr2O3, Cs, Cu, Dy, EC, Er, Eu, F, Fe, Fe2, Fe2O3, Fe3, FeO, FeS2, GPSM, Ga, Gd, Ge, GoI, H2O, H2O_minus, H2O_plus, HCO3, HMIN, Hf, Hg, Ho, I, In, Insol, Ir, K, K2O, LOI, La, Li, Lu, Mg, MgCO3, MgO, Mn, MnO, Mo, NO3, Na, Na2O, NaCl, Nb, Nb2O5, Nd, Ni, NiO, O18, Os, P, P2O5, Pb, Pd, Pr, Pt, RADBK, RADTC, Rb, Re, Rh, Ru, S, SO3, SO4, Sb, Sc, Se, Si, SiO2, Sm, Sn, Sr, Sr87_86, SrO, TOC, TOT/C, TOT/S, Ta, Ta2O5, Tb, Te, Th, ThO2, Ti, TiO2, Tl, Tm, Total, U, U3O8, V, V2O3, V2O5, W, WO3, Y, Yb, Zn, ZnO, Zr, ZrO2, pH]

Oxides are merged into the group of their relevant element, so their records go into that element's file. The same is done for some other compounds.

Then we can select the chemicals from the list below to extract data.
['Ag', 'Al', 'As', 'Au', 'B', 'Ba', 'Be', 'Bi', 'Br', 'C', 'CPS_gamma', 'Ca', 'Cd', 'Ce', 'Cl', 'Co', 'Cr', 'Cs', 'Cu', 'Dy', 'EC', 'Er', 'Eu', 'F', 'Fe', 'GPSM', 'Ga', 'Gd', 'Ge', 'GoI', 'H2O', 'HCO3', 'HMIN', 'Hf', 'Hg', 'Ho', 'I', 'In', 'Insol', 'Ir', 'K', 'LOI', 'La', 'Li', 'Lu', 'Mg', 'Mn', 'Mo', 'Na', 'Nb', 'Nd', 'Ni', 'O18', 'Os', 'P', 'Pb', 'Pd', 'Pr', 'Pt', 'RADBK', 'RADTC', 'Rb', 'Re', 'Rh', 'Ru', 'S', 'Sb', 'Sc', 'Se', 'Si', 'Sm', 'Sn', 'Sr', 'TOC', 'TOT/C', 'TOT/S', 'Ta', 'Tb', 'Te', 'Th', 'Ti', 'Tl', 'Tm', 'U', 'V', 'W', 'Y', 'Yb', 'Zn', 'Zr', 'pH']

Also, we are only interested in some of the features (columns), so the remaining ones are ignored.

__Select the element from the dropdown list below for which you want to extract data. Then, "Run All Below" at the tab "Cell".__ 

In [None]:
# Dropdown listing every element group, plus an Output widget that echoes
# the most recent selection.
dropdown_element = widgets.Dropdown(options=list(chem_elements))
output_element = widgets.Output()


def dropdown_element_eventhandler(change):
    '''Replace the content of output_element with the newly selected value.'''
    output_element.clear_output()
    with output_element:
        display(change.new)


# Only react to changes of the widget's 'value' trait.
dropdown_element.observe(dropdown_element_eventhandler, names='value')
display(dropdown_element)


In [None]:
# Read the selection directly from the widget. Parsing the echoed text in
# output_element fails with an IndexError when the dropdown was never changed
# (nothing has been echoed yet); `.value` always holds the current option.
element_selected = dropdown_element.value
element_selected

In [None]:
# chemical codes that belong to the element picked in the dropdown
interested_chemicals = chem_elements[element_selected]

# columns of sarig_rs_chem_exp that carry useful information
selected_chem_exp_cols = [
    'SAMPLE_NO', 'SAMPLE_SOURCE_CODE', 'ROCK_GROUP_CODE', 'LITHO_CODE',
    'MAP_SYMBOL', 'DRILLHOLE_NUMBER', 'DH_DEPTH_FROM', 'DH_DEPTH_TO',
    'SITE_NO', 'EASTING_GDA2020', 'NORTHING_GDA2020', 'ZONE_GDA2020',
    'LONGITUDE_GDA2020', 'LATITUDE_GDA2020', 'LONGITUDE_GDA94',
    'LATITUDE_GDA94', 'SAMPLE_ANALYSIS_NO', 'CHEM_CODE', 'VALUE', 'UNIT',
    'CHEM_METHOD_CODE',
]

# empty dataframe with those columns, to be filled with the matching records
interested_rs_chem_exp = pd.DataFrame(columns=selected_chem_exp_cols)
interested_rs_chem_exp.head()

In [None]:
# Show the chemical codes that will be extracted for the selected element.
interested_chemicals

As 'sarig_rs_chem_exp.csv' is a large file, it cannot be loaded into memory on most personal computers unless they have more than 16 GB of RAM. Here, we read it chunk by chunk and apply the functions to each chunk, so as to reduce the number of times this file has to be read.

In [None]:
# Second pass over sarig_rs_chem_exp: read it in chunks, keep only the rows of
# the selected chemical codes, clean VALUE/UNIT and gather everything into
# interested_rs_chem_exp.
selected_chunks = []
with zipfile(os.path.join(data_loc, file_name), 'r') as archive:
    with archive.open('SARIG_Data_Package/sarig_rs_chem_exp.csv', 'r') as csv_file:
        sarig_rs_chem_exp_chunks = pd.read_csv(
            csv_file, sep=',', encoding='latin1', chunksize=100000)
        for chunk in sarig_rs_chem_exp_chunks:
            # .copy() so the assignments below act on an independent frame
            # rather than on a slice of `chunk`
            chunk_selected = chunk.loc[
                chunk['CHEM_CODE'].isin(interested_chemicals),
                selected_chem_exp_cols].copy()
            if chunk_selected.empty:
                # this chunk holds none of the selected chemicals
                continue
            # '<d' -> 0, '>d' -> d, then make the column numeric
            chunk_selected['VALUE'] = chunk_selected['VALUE'].apply(change_type).astype('float32')
            # bring every (VALUE, UNIT) pair to a common unit, row by row
            chunk_selected.loc[:, ['VALUE', 'UNIT']] = chunk_selected.loc[:, ['VALUE', 'UNIT']].apply(change_units, axis=1)
            selected_chunks.append(chunk_selected)

# DataFrame.append was removed in pandas 2.0; concatenating once at the end is
# also cheaper than growing the frame chunk by chunk. The result is rebuilt
# from scratch, so re-running this cell does not duplicate rows.
if selected_chunks:
    interested_rs_chem_exp = pd.concat(selected_chunks)
else:
    interested_rs_chem_exp = pd.DataFrame(columns=selected_chem_exp_cols)

interested_rs_chem_exp.head()

__Export the SITE_NO and SAMPLE_NO to make it convenient to select the data from the other groups of datasets.__

In [None]:
# Unique (SITE_NO, SAMPLE_NO) pairs of the extracted chemistry records.
selected_rs_chem_site_sample_num = interested_rs_chem_exp[['SITE_NO', 'SAMPLE_NO']].drop_duplicates()
element_path = './data/{}'.format(element_selected)
# makedirs + exist_ok: works when ./data does not exist yet, when the cell is
# re-run, and for group names containing '/' (e.g. 'TOT/C'), all of which
# make a plain os.mkdir fail.
os.makedirs(element_path, exist_ok=True)
selected_rs_chem_site_sample_num.to_csv(
    './data/{}/rs_chem_site_sample_num.csv'.format(element_selected),
    sep=',', header=True)

Now, we can obtain the list of SITE_NO and SAMPLE_NO corresponding to the records of selected CHEM_CODE

### Load other data table and obtain the records corresponding to the CHEM_CODE selected.

In [None]:
# Load the biostratigraphic analysis table from the archive; the context
# managers close the archive and the member handle once the table is read.
with zipfile(os.path.join(data_loc, file_name), 'r') as archive:
    with archive.open('SARIG_Data_Package/sarig_rs_biostr_analys_exp.csv', 'r') as csv_file:
        sarig_rs_biostr_analys_exp = pd.read_csv(csv_file, sep=',', encoding='latin1')
# Nullable integer dtype keeps missing drillhole numbers as <NA> instead of floats.
sarig_rs_biostr_analys_exp['DRILLHOLE_NO'] = sarig_rs_biostr_analys_exp['DRILLHOLE_NO'].astype('Int64')
sarig_rs_biostr_analys_exp['ANALYSIS_DATE'] = pd.to_datetime(sarig_rs_biostr_analys_exp['ANALYSIS_DATE'])
sarig_rs_biostr_analys_exp.sample(5)

In [None]:
# Print the column dtypes and non-null counts of the biostratigraphic analysis table.
sarig_rs_biostr_analys_exp.info()

This table contains many nulls, and several of its columns do not carry much useful information for our prediction, so we discard some columns.

In [None]:
# Columns of the biostratigraphic analysis table that are kept.
rs_biostr_analys_exp_cols = [
    'SAMPLE_NO', 'SAMPLE_ANALYSIS_NO', 'OTHER_SAMPLE_NO',
    'DRILLHOLE_NO', 'DRILLHOLE_NAME', 'DEPTH_FROM', 'DEPTH_TO',
    'SAMPLE_SOURCE_CODE', 'MAP_SYMBOL', 'LITHOLOGY_CODE',
    'YOUNGEST_AGE', 'OLDEST_AGE', 'PALEO_ENVIRONMENT',
    'SITE_NO', 'SVY_METHOD_CODE',
]

# Restrict the table to those columns and preview five random rows.
interested_rs_biostr_analys_exp = sarig_rs_biostr_analys_exp[rs_biostr_analys_exp_cols]
interested_rs_biostr_analys_exp.sample(5)

__Merge with the biostratigraphic analysis data corresponding to the selected CHEM_CODE.__

In [None]:
# Left-join the biostratigraphic analysis columns onto the chemistry records
# by (SITE_NO, SAMPLE_NO); clashing column names get '_chem' / '_biostr'.
extracted_rs_data = pd.merge(
    interested_rs_chem_exp,
    interested_rs_biostr_analys_exp,
    how='left',
    on=['SITE_NO', 'SAMPLE_NO'],
    suffixes=('_chem', '_biostr'),
)
# the column subset is no longer needed once merged
del interested_rs_biostr_analys_exp
extracted_rs_data.sample(5)


### Load biostratigraphic results

In [None]:
# Load the biostratigraphic results table from the archive; the context
# managers close the archive and the member handle once the table is read.
with zipfile(os.path.join(data_loc, file_name), 'r') as archive:
    with archive.open('SARIG_Data_Package/sarig_rs_biostr_results_exp.csv', 'r') as csv_file:
        sarig_rs_biostr_results_exp = pd.read_csv(csv_file, sep=',', encoding='latin1')
sarig_rs_biostr_results_exp.sample(5)

In [None]:
# Print the column dtypes and non-null counts of the biostratigraphic results table.
sarig_rs_biostr_results_exp.info()

In [None]:
# List the column names, to choose which ones to merge below.
sarig_rs_biostr_results_exp.columns

In [None]:
# Left-join the selected biostratigraphic result columns by
# (SITE_NO, SAMPLE_NO); clashing names from the right side get '_result'.
extracted_rs_data = pd.merge(
    extracted_rs_data,
    sarig_rs_biostr_results_exp[[
        'SAMPLE_NO', 'SAMPLE_ANALYSIS_NO', 'ANALYSIS_TYPE',
        'BIOSTRAT_RESULT', 'BIOSTRAT_RESULT_TYPE', 'TYPE_SPECIMEN',
        'FIGURED_SPECIMEN', 'COMMENTS', 'SITE_NO',
    ]],
    how='left',
    on=['SITE_NO', 'SAMPLE_NO'],
    suffixes=('', '_result'),
)
# re-apply the nullable integer dtype to DRILLHOLE_NO after the merges
extracted_rs_data['DRILLHOLE_NO'] = extracted_rs_data['DRILLHOLE_NO'].astype('Int64')
del sarig_rs_biostr_results_exp
extracted_rs_data.sample(5)


In [None]:
# Count the rows per UNIT value to check the outcome of the unit standardisation.
extracted_rs_data['UNIT'].value_counts()

### Load the Chemical isotope data

In [None]:
# Load the chemical isotope table from the archive (this cell reads it as
# UTF-8, unlike the other tables); the context managers close the archive and
# the member handle once the table is read.
with zipfile(os.path.join(data_loc, file_name), 'r') as archive:
    with archive.open('SARIG_Data_Package/sarig_rs_chem_isotope_exp.csv', 'r') as csv_file:
        sarig_rs_chem_isotope_exp = pd.read_csv(csv_file, sep=',', encoding='utf-8')
sarig_rs_chem_isotope_exp['ANALYSIS_DATE'] = pd.to_datetime(sarig_rs_chem_isotope_exp['ANALYSIS_DATE'])
sarig_rs_chem_isotope_exp.sample(5)

Here, the 'UNIT' is not unique, so the 'VALUE' has to be adjusted when we unify the UNIT. Further study is needed to unify the units if possible; otherwise, these two columns will be split.

In [None]:
# Print the column dtypes and non-null counts of the chemical isotope table.
sarig_rs_chem_isotope_exp.info()

In [None]:
# Columns of the chemical isotope table that are kept.
rs_chem_isotope_exp_cols = [
    'SAMPLE_NO', 'SAMPLE_ANALYSIS_NO', 'OTHER_ANALYSIS_ID',
    'ANALYSIS_TYPE', 'SPECIMEN_TYPE', 'MIN_SEPARATE_MINERAL',
    'STRATIGRAPHIC_UNIT', 'MAP_SYMBOL', 'ROCK_GROUP', 'LITHOLOGY',
    'ANALYSIS_POINT_NO', 'ANALYSIS_POINT_ID', 'POINT_MINERAL',
    'ANALYTE', 'VALUE', 'UNIT', 'UNCERTAINTY_VALUE', 'UNCERTAINTY_UNIT',
    'ANALYSIS_METHOD_CODE', 'SITE_NO',
]

# Restrict the table to those columns and preview five random rows.
interested_rs_chem_isotope_exp = sarig_rs_chem_isotope_exp[rs_chem_isotope_exp_cols]
interested_rs_chem_isotope_exp.sample(5)

In [None]:
# Left-join the isotope columns by (SITE_NO, SAMPLE_NO); clashing names from
# the right side get the '_isotope' suffix.
extracted_rs_data = pd.merge(
    extracted_rs_data,
    interested_rs_chem_isotope_exp,
    how='left',
    on=['SITE_NO', 'SAMPLE_NO'],
    suffixes=('', '_isotope'),
)
# the column subset is no longer needed once merged
del interested_rs_chem_isotope_exp
extracted_rs_data.head()


### load the rs_details data

In [None]:
# Load the rock sample details table from the archive; the context managers
# close the archive and the member handle once the table is read.
with zipfile(os.path.join(data_loc, file_name), 'r') as archive:
    with archive.open('SARIG_Data_Package/sarig_rs_details_exp.csv', 'r') as csv_file:
        sarig_rs_details_exp = pd.read_csv(csv_file, sep=',', encoding='latin1')
sarig_rs_details_exp['COLLECTED_DATE'] = pd.to_datetime(sarig_rs_details_exp['COLLECTED_DATE'])
# Nullable integer dtype keeps missing drillhole numbers as <NA> instead of floats.
sarig_rs_details_exp['DRILLHOLE_NUMBER'] = sarig_rs_details_exp['DRILLHOLE_NUMBER'].astype('Int64')
sarig_rs_details_exp.sample(5)

In [None]:
# Print the column dtypes and non-null counts of the rock sample details table.
sarig_rs_details_exp.info()

In [None]:
# Columns of the rock sample details table that are kept.
rs_details_exp_cols = [
    'SAMPLE_NO', 'SAMPLE_SOURCE_CODE', 'ROCK_GROUP_CODE', 'ROCK_GROUP',
    'LITHO_CODE', 'LITHO_MODIFIER', 'MAP_SYMBOL', 'PARENT_SAMPLE_NO',
    'RS_NUMBER', 'DRILLHOLE_NUMBER', 'DH_UNIT_NO', 'DH_DEPTH_FROM',
    'DH_DEPTH_TO', 'GEOCHEMISTRY', 'PETROLOGY', 'BIOSTRATIGRAPHY',
    'IMAGE', 'MAP_250000', 'MAP_100000', 'MAP_50000', 'SITE_NO',
    'SURVEY_METHOD_CODE',
]

# Restrict the table to those columns and preview five random rows.
interested_rs_details_exp = sarig_rs_details_exp[rs_details_exp_cols]
interested_rs_details_exp.sample(5)

In [None]:
# Left-join the sample detail columns by (SITE_NO, SAMPLE_NO); clashing names
# from the right side get the '_details' suffix.
extracted_rs_data = pd.merge(
    extracted_rs_data,
    interested_rs_details_exp,
    how='left',
    on=['SITE_NO', 'SAMPLE_NO'],
    suffixes=('', '_details'),
)
# the column subset is no longer needed once merged
del interested_rs_details_exp
extracted_rs_data.head()

### Load the geochronological age data

In [None]:
# Load the geochronology ages table from the archive; the context managers
# close the archive and the member handle once the table is read.
with zipfile(os.path.join(data_loc, file_name), 'r') as archive:
    with archive.open('SARIG_Data_Package/sarig_rs_geochron_ages_exp.csv', 'r') as csv_file:
        sarig_rs_geochron_ages_exp = pd.read_csv(csv_file, sep=',', encoding='latin1')
sarig_rs_geochron_ages_exp['INTERPRETATION_DATE'] = pd.to_datetime(sarig_rs_geochron_ages_exp['INTERPRETATION_DATE'])
sarig_rs_geochron_ages_exp.sample(5)

In [None]:
# Print the column dtypes and non-null counts of the geochronology ages table.
sarig_rs_geochron_ages_exp.info()

In [None]:
# Columns of the geochronology ages table that are kept.
rs_geochron_ages_exp_cols = [
    'SAMPLE_NO', 'INTERPRETATION_NO', 'INTERPRETATION_GROUP_NO',
    'DATING_METHOD', 'AGE', 'AGE_UNIT', 'AGE_ERROR_MIN', 'AGE_ERROR_MAX',
    'AGE_ERROR_UNIT', 'GEOLOGICAL_ATTRIBUTION', 'AGE_TYPE', 'MSWD',
    'PROBABILITY_OF_FIT', 'N_OF_ANALYSIS', 'STRATIGRAPHIC_UNIT',
    'MAP_SYMBOL', 'ROCK_GROUP', 'LITHOLOGY', 'SITE_NO',
]

# Restrict the table to those columns and preview five random rows.
interested_rs_geochron_ages_exp = sarig_rs_geochron_ages_exp[rs_geochron_ages_exp_cols]
interested_rs_geochron_ages_exp.sample(5)

In [None]:
# Left-join the geochronology age columns by (SITE_NO, SAMPLE_NO); clashing
# names from the right side get the '_ages' suffix.
extracted_rs_data = pd.merge(
    extracted_rs_data,
    interested_rs_geochron_ages_exp,
    how='left',
    on=['SITE_NO', 'SAMPLE_NO'],
    suffixes=('', '_ages'),
)
# the column subset is no longer needed once merged
del interested_rs_geochron_ages_exp
extracted_rs_data.head()

### Load the geochronological results data

In [None]:
# Load the geochronology results table from the archive; the context managers
# close the archive and the member handle once the table is read.
with zipfile(os.path.join(data_loc, file_name), 'r') as archive:
    with archive.open('SARIG_Data_Package/sarig_rs_geochron_reslt_exp.csv', 'r') as csv_file:
        sarig_rs_geochron_reslt_exp = pd.read_csv(csv_file, sep=',', encoding='latin1')
sarig_rs_geochron_reslt_exp['ANALYSIS_DATE'] = pd.to_datetime(sarig_rs_geochron_reslt_exp['ANALYSIS_DATE'])
sarig_rs_geochron_reslt_exp.sample(5)

In [None]:
# Print the column dtypes and non-null counts of the geochronology results table.
sarig_rs_geochron_reslt_exp.info()

In [None]:
# Columns of the geochronology results table that are kept.
rs_geochron_reslt_exp_cols = [
    'INTERPRETATION_NO', 'INTERPRETATION_GROUP_NO', 'SAMPLE_NO',
    'SAMPLE_ANALYSIS_NO', 'ANALYSIS_TYPE', 'SPECIMEN_TYPE',
    'MIN_SEPARATE_MINERAL', 'STRATIGRAPHIC_UNIT', 'MAP_SYMBOL',
    'ROCK_GROUP', 'LITHOLOGY', 'ANALYSIS_POINT_NO', 'ANALYSIS_POINT_ID',
    'POINT_MINERAL', 'ANALYTE', 'VALUE', 'UNIT', 'UNCERTAINTY_VALUE',
    'UNCERTAINTY_UNIT', 'ANALYSIS_METHOD_CODE', 'SITE_NO',
]

# Restrict the table to those columns and preview five random rows.
interested_rs_geochron_reslt_exp = sarig_rs_geochron_reslt_exp[rs_geochron_reslt_exp_cols]
interested_rs_geochron_reslt_exp.sample(5)

In [None]:
# Spread VALUE over one column per UNIT for every (SITE_NO, SAMPLE_NO) pair.
# pivot_table aggregates repeated pairs with its default aggregation (mean).
unit_pivot_geochron_reslt_exp = (
    sarig_rs_geochron_reslt_exp[['SITE_NO', 'SAMPLE_NO', 'VALUE', 'UNIT']]
    .pivot_table(values='VALUE', index=['SITE_NO', 'SAMPLE_NO'], columns=['UNIT'])
    .reset_index()
)

# Swap the original VALUE/UNIT columns for the per-unit columns.
interested_rs_geochron_reslt_exp = pd.merge(
    interested_rs_geochron_reslt_exp.drop(['VALUE', 'UNIT'], axis=1),
    unit_pivot_geochron_reslt_exp,
    how='inner',
    on=['SITE_NO', 'SAMPLE_NO'],
)
del sarig_rs_geochron_reslt_exp

In [None]:
# Left-join the geochronology result columns by (SITE_NO, SAMPLE_NO);
# clashing names from the right side get the '_res' suffix.
extracted_rs_data = pd.merge(
    extracted_rs_data,
    interested_rs_geochron_reslt_exp,
    how='left',
    on=['SITE_NO', 'SAMPLE_NO'],
    suffixes=('', '_res'),
)
# the column subset is no longer needed once merged
del interested_rs_geochron_reslt_exp
extracted_rs_data.head()

### Load the petrology data

In [None]:
# Load the petrology table from the archive; the context managers close the
# archive and the member handle once the table is read.
with zipfile(os.path.join(data_loc, file_name), 'r') as archive:
    with archive.open('SARIG_Data_Package/sarig_rs_petrology_exp.csv', 'r') as csv_file:
        sarig_rs_petrology_exp = pd.read_csv(csv_file, sep=',', encoding='latin1')

sarig_rs_petrology_exp.sample(5)

In [None]:
# Print the column dtypes and non-null counts of the petrology table.
sarig_rs_petrology_exp.info()

In [None]:
# Columns of the petrology table that are kept.
rs_petrology_exp_cols = [
    'SAMPLE_NO', 'SAMPLE_ANALYSIS_NO', 'THIN_SECTION_NO',
    'HISTORICAL_PETROLOGY_NO', 'ROCK_TYPE_CODE',
    'MINERALS_ABUNDANT', 'MINERALS_MAJOR_ABUNDANCE',
    'MINERALS_MINOR_ABUNDANCE', 'MINERALS_TRACE_ABUNDANCE',
    'MINERALS_RARE_ABUNDANCE', 'MINERALS_UNKNOWN_ABUNDANCE',
    'SITE_NO',
]

# Restrict the table to those columns and preview five random rows.
interested_rs_petrology_exp = sarig_rs_petrology_exp[rs_petrology_exp_cols]
interested_rs_petrology_exp.sample(5)

In [None]:
# Left-join the petrology columns by (SITE_NO, SAMPLE_NO); clashing names
# from the right side get the '_petro' suffix.
extracted_rs_data = pd.merge(
    extracted_rs_data,
    interested_rs_petrology_exp,
    how='left',
    on=['SITE_NO', 'SAMPLE_NO'],
    suffixes=('', '_petro'),
)
# the column subset is no longer needed once merged
del interested_rs_petrology_exp
extracted_rs_data.sample(5)

### Export the merged rs data

In [None]:
# Write the merged table to the folder of the selected element. The folder is
# created if it is missing so this cell also works when run on its own.
os.makedirs('./data/{}'.format(element_selected), exist_ok=True)
# header=True is the documented way to write the column names
# ('infer' is a read_csv option, not a to_csv one).
extracted_rs_data.to_csv(
    './data/{}/extracted_rs_data.csv'.format(element_selected),
    sep=',', header=True)