## Surface Observation Data Processing

In [None]:
# import required packages
import os
import pickle
import sys
from urllib.request import urlopen

import pandas as pd
# Display configuration: no fixed output width and no cap on the number of
# columns shown when a DataFrame is rendered.
pd.set_option('display.width', None)
pd.set_option('display.max_columns', None)


# Bind the name `zipfile` to the ZipFile *class* on every interpreter version.
# The rest of the notebook calls `zipfile(path, 'r')` directly, so the fallback
# must import the class as well, not the zipfile36 module itself.
if sys.version_info >= (3, 6):
    from zipfile import ZipFile as zipfile
else:
    from zipfile36 import ZipFile as zipfile
    
# Remote location of the SARIG data package archive.
url = "https://unearthed-exploresa.s3-ap-southeast-2.amazonaws.com/Unearthed_5_SARIG_Data_Package.zip"
# enter the directory to save data
data_loc = './data'
file_name = 'Unearthed_5_SARIG_Data_Package.zip'
archive_path = os.path.join(data_loc, file_name)

if os.path.isfile(archive_path):
    # The archive is already on disk; skip the download.
    print ("File exist")
else:
    # The target directory may not exist on a fresh checkout.
    os.makedirs(data_loc, exist_ok=True)
    # Download under a temporary name and rename once complete, so an
    # interrupted transfer never leaves a truncated archive that the
    # existence check above would accept on the next run.
    partial_path = archive_path + '.part'
    with urlopen(url) as response, open(partial_path, 'wb') as output:  # binary mode: "wb"
        output.write(response.read())
    os.replace(partial_path, archive_path)
    
# List the names of all members stored in the downloaded archive.
# The archive is opened in a `with` block so its file handle is closed
# as soon as the names have been read.
with zipfile(os.path.join(data_loc, file_name), 'r') as archive:
    files_in_dataset = archive.namelist()

files_in_dataset



For this part of data cleaning, we will only use the following files: 
 - 'SARIG_Data_Package/sarig_fieldobs_exp.csv',
 - 'SARIG_Data_Package/sarig_fieldobs_litho_exp.csv',
 - 'SARIG_Data_Package/sarig_fieldobs_note_exp.csv',
 - 'SARIG_Data_Package/sarig_fieldobs_struct_exp.csv'

### Load Field Observation Data

In [None]:
# load the field observation summary data
# The archive and the member stream are both opened in `with` blocks so the
# file handles are released once the CSV has been parsed.
with zipfile(os.path.join(data_loc, file_name), 'r') as archive:
    with archive.open('SARIG_Data_Package/sarig_fieldobs_exp.csv', 'r') as csv_file:
        sarig_fieldobs_exp = pd.read_csv(csv_file, sep=',', encoding='latin1')
sarig_fieldobs_exp.sample(5)

In [None]:
# Columns of the field observation summary table that are carried forward
# into the merged dataset.
fieldobs_columns = [
    'SITE_NO', 'FIELD_ID',
    'LANDFORM_CODE', 'MAP_250000', 'MAP_100000', 'MAP_50000',
    'EASTING_GDA2020', 'NORTHING_GDA2020', 'ZONE_GDA2020',
    'LONGITUDE_GDA2020', 'LATITUDE_GDA2020',
    'LONGITUDE_GDA94', 'LATITUDE_GDA94',
    'HORIZ_ACCRCY_M', 'ELEVATION_M', 'VERT_ACCRCY_M',
    'SURVEY_METHOD_CODE',
]
interested_fieldobs_exp = sarig_fieldobs_exp[fieldobs_columns]
interested_fieldobs_exp.sample(5)

In [None]:
# Show every column label of the raw field observation summary table.
sarig_fieldobs_exp.columns

In [None]:
# Print the per-column dtype and non-null summary of the raw summary table.
sarig_fieldobs_exp.info()

This data table includes the summary of the contents of the field observation dataset. 

### Load the Field Observation Lithology Data

In [None]:
# Load the field observation lithology table from the archive.
# Both handles are opened in `with` blocks so they are closed after parsing.
with zipfile(os.path.join(data_loc, file_name), 'r') as archive:
    with archive.open('SARIG_Data_Package/sarig_fieldobs_litho_exp.csv', 'r') as csv_file:
        sarig_fieldobs_litho_exp = pd.read_csv(csv_file, sep=',', encoding='latin1')
# 'Int64' is pandas' nullable integer dtype, so missing values are kept as <NA>.
sarig_fieldobs_litho_exp['STRAT_UNIT_NO'] = sarig_fieldobs_litho_exp['STRAT_UNIT_NO'].astype('Int64')
# Parse the observation dates into datetime64 values.
sarig_fieldobs_litho_exp['OBSERVATION_DATE'] = pd.to_datetime(sarig_fieldobs_litho_exp['OBSERVATION_DATE'])
sarig_fieldobs_litho_exp.sample(5)

In [None]:
# Print the per-column dtype and non-null summary of the lithology table.
sarig_fieldobs_litho_exp.info()

In [None]:
# Lithology columns that are carried forward into the merged dataset.
litho_columns = [
    'SITE_NO', 'LITHOLOGY_NO',
    'MAP_SYMBOL', 'GIS_CODE', 'STRAT_UNIT_NO',
    'ROCK_GROUP_CODE',
    'LITHO_CODE', 'LITHO_CONFIDENCE', 'LITHO_MODIFIER',
]
interested_fieldobs_litho_exp = sarig_fieldobs_litho_exp[litho_columns]
interested_fieldobs_litho_exp.sample(5)

### Load Field Observation Notes

In [None]:
# Load the field observation notes table from the archive.
# Both handles are opened in `with` blocks so they are closed after parsing.
with zipfile(os.path.join(data_loc, file_name), 'r') as archive:
    with archive.open('SARIG_Data_Package/sarig_fieldobs_note_exp.csv', 'r') as csv_file:
        sarig_fieldobs_note_exp = pd.read_csv(csv_file, sep=',', encoding='latin1')
# The date conversion here previously targeted the lithology table (a
# copy-paste slip); it now applies to the notes table loaded in this cell.
# NOTE(review): guarded because the notes schema is not visible here -
# confirm the CSV actually carries an OBSERVATION_DATE column.
if 'OBSERVATION_DATE' in sarig_fieldobs_note_exp.columns:
    sarig_fieldobs_note_exp['OBSERVATION_DATE'] = pd.to_datetime(sarig_fieldobs_note_exp['OBSERVATION_DATE'])
sarig_fieldobs_note_exp.sample(5)

The FIELD_NOTES column contains free text, which would require text mining to use, and these notes do not give much insight into the ores. So, we ignore this table.

### Load Field Observation Geological Structure Data 

In [None]:
# Load the field observation geological structure table from the archive.
# Both handles are opened in `with` blocks so they are closed after parsing.
with zipfile(os.path.join(data_loc, file_name), 'r') as archive:
    with archive.open('SARIG_Data_Package/sarig_fieldobs_struct_exp.csv', 'r') as csv_file:
        sarig_fieldobs_struct_exp = pd.read_csv(csv_file, sep=',', encoding='latin1')
sarig_fieldobs_struct_exp.sample(5)

In [None]:
# Print the per-column dtype and non-null summary of the structure table.
sarig_fieldobs_struct_exp.info()

In [None]:
# Structure columns that are carried forward into the merged dataset.
# NOTE(review): 'STRUCTRE_CODE' is kept exactly as written - confirm this
# spelling matches the CSV header.
struct_columns = [
    'SITE_NO', 'LITHOLOGY_NO', 'STRUCTURE_NO', 'STRUCTRE_CODE',
    'LOCAL_GENERATION', 'OROGENY', 'REGIONAL_GENERATION',
    'AZIMUTH', 'AZIMUTH_ACCURACY', 'AZIMUTH_MEASUREMENT_TYPE',
    'INCLINATION', 'INCLINATION_ACCURACY',
]
interest_fieldobs_struct_exp = sarig_fieldobs_struct_exp[struct_columns]

### Merge Data Tables

In [None]:
# Step 1: inner-join the site summary with its lithology records on SITE_NO.
site_litho = pd.merge(
    interested_fieldobs_exp,
    interested_fieldobs_litho_exp,
    how='inner',
    on='SITE_NO',
)
# Step 2: inner-join the structure measurements on (SITE_NO, LITHOLOGY_NO).
merged_fieldobs_exp = pd.merge(
    site_litho,
    interest_fieldobs_struct_exp,
    how='inner',
    on=['SITE_NO', 'LITHOLOGY_NO'],
)

In [None]:
# Preview ten randomly chosen rows of the merged table.
merged_fieldobs_exp.sample(10)

In [None]:
# Print the per-column dtype and non-null summary of the merged table.
merged_fieldobs_exp.info()

### Export Merged Dataset

In [None]:
# Write the merged table as 'merged_fieldobs_exp.csv' into every
# sub-directory of the data folder.
# The path is built with os.path.join so it resolves on POSIX as well as on
# Windows (the hard-coded '.\\data' only worked on Windows).
# NOTE(review): if the data folder holds no sub-directories, nothing is
# written - confirm that is the intended behaviour.
path = os.path.join('.', 'data')
for directory in os.listdir(path):
    new_path = os.path.join(path, directory)
    # Only directories can receive the export; skip plain files such as the zip.
    if not os.path.isdir(new_path):
        continue
    merged_fieldobs_exp.to_csv(
        os.path.join(new_path, 'merged_fieldobs_exp.csv'),
        sep=',',
        header=True)  # write column names; 'infer' is a read_csv value, not a to_csv one