# Inspect data and add geometry

This is the most important step of all, since a bad geometry will invalidate every subsequent processing step.

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import rsf.api as sf

# Render matplotlib figures inline in the notebook
%matplotlib inline

# Set a custom DATAPATH to Madagascar
# NOTE(review): this is an absolute, machine-specific path; point it to a
# directory that exists on your own system before running the notebook.
%env DATAPATH=/home/rodrigo/Projetos/Viking/DATAPATH/

In [None]:
# Use seaborn's 'ticks' style for the figures drawn in this notebook
sns.set_style('ticks')

In [None]:
# Input SEG-Y file; the sfsegyread calls in this notebook read from it
raw_data = 'seismic.segy'

## Read a data sample of 200 traces

In [None]:
# RSF file that receives the quick-look sample
data_sample = 'sample_agc.rsf'

# Read the trace data (read=d) of 200 traces only (n2=200) and pipe it
# through sfagc (gain control) so the sample is easy to display
!sfsegyread n2=200 read=d < {raw_data} | sfagc > {data_sample}

In [None]:
# Quick check of the file just written: sfin reports its basic RSF parameters
!sfin {data_sample}

In [None]:
# Open the sample RSF file for reading from Python.
# NOTE: this rebinds data_sample from the file name (str) to the sf.Input
# handle; re-running this cell on its own would pass the handle, not the
# file name, so re-run the cell that defines the file name first.
data_sample = sf.Input(data_sample)

In [None]:
# Axis description of the sample, taken from the RSF header:
# lengths of axes 1 and 2 (n1, n2) plus sampling and origin of axis 1 (d1, o1)
n1, n2 = (data_sample.int(name) for name in ('n1', 'n2'))
d1, o1 = (data_sample.float(name) for name in ('d1', 'o1'))

In [None]:
# Preallocate the buffer that will receive the sample: n2 rows of n1 values
data = np.zeros((n2, n1), dtype=np.float32)

In [None]:
# Fill the preallocated buffer with the contents of the sample RSF file
data_sample.read(data)

In [None]:
# Time axis built from the origin and sampling interval of axis 1
t = o1 + d1 * np.arange(n1)

# Clip level for the display: 99th percentile of the sample amplitudes
perc = np.percentile(data.ravel(), 99)

# Traces along x, time along y with the first sample (o1) at the top
extent = [0, n2, t[-1], o1]

fig, ax = plt.subplots(figsize=(4, 6))
ax.imshow(data.clip(-perc, perc).T, aspect='auto', extent=extent)

ax.set_ylabel('t (s)')
ax.set_xlabel('traces')
ax.set_title('raw data sample w/ AGC')

sns.despine()
plt.tight_layout()

We can see the data above really well; every shot seems to be around 120 traces long. We also see coherent events, so we know that our data file is not corrupted.

## Inspect the trace headers and the text header

In [None]:
# Output files for the trace headers (RSF) and for the text header
header_raw = 'line_12_header_raw.rsf'
text_header = 'line_12_text_header.txt'

# Read the headers only (read=h): tfile receives the trace headers and hfile
# the text header; the standard output is not needed here and is discarded
!sfsegyread read=h < {raw_data} tfile={header_raw} hfile={text_header} > /dev/null

In [None]:
!cat {text_header} | fold -w 80

In [None]:
# Summarise the trace-header keywords of the whole line
!sfheaderattr < {header_raw}

As I suspected, we have 120 traces per shot (tracf). The geometry also seems to be fairly complete, with cdp, shotpoint and offset all set. The text header does not provide us with any additional information.

In [None]:
# We need to convert the header to floating point before reading with python
# (the buffer it is read into further down is a float32 array)
header_float = 'line_12_header_float.rsf'

!sfdd type=float < {header_raw} > {header_float}

In [None]:
# Open the floating-point header file for reading from Python
header = sf.Input(header_float)

In [None]:
# Dimensions of the header table.
# Note: n1 and n2 are rebound here; they no longer describe the data sample.
n1, n2 = (header.int(name) for name in ('n1', 'n2'))

In [None]:
# Preallocate the buffer for the header table (n2 rows of n1 values).
# Note: this rebinds `data`, which previously held the trace sample.
data = np.zeros((n2, n1), dtype=np.float32)

In [None]:
# Fill the preallocated buffer with the header values from the RSF file
header.read(data)

In [None]:
import pandas as pd

In [None]:
# Build the SEG-Y keyword -> header index mapping from 'keys.txt',
# a space-separated table with "key" and "id" columns
key_table = pd.read_csv('keys.txt', sep=' ')
keys = dict(zip(key_table['key'], key_table['id']))

Now let's create a DataFrame with the line header. Pandas DataFrames are easier to manipulate.

In [None]:
# One DataFrame column per SEG-Y keyword, each taken from the matching
# column of the header array (this rebinds `header` from the RSF handle)
header = pd.DataFrame({name: data[:, column] for name, column in keys.items()})

header[['offset', 'ep', 'cdp', 'sx', 'gx']].head()

### Investigating the line geometry with some QC plots

In [None]:
# QC plot: group and source coordinates against the shotpoint number
fig, ax = plt.subplots()
ax.plot(header.ep, header.gx, '.', label='group')
ax.plot(header.ep, header.sx, '.', label='source')
ax.legend(loc='upper left')

ax.set_xlabel('shotpoint number')
ax.set_ylabel('distance along the line (m)')
ax.axis('tight')

sns.despine(offset=5)
plt.tight_layout()

In [None]:
# QC plot: offset of every trace against the shotpoint number
fig, ax = plt.subplots()
ax.plot(header.ep, header.offset, ',')

ax.set_xlabel('shotpoint number')
ax.set_ylabel('offset (m)')
ax.axis('tight')

sns.despine(offset=5)
plt.tight_layout()

To properly manipulate the line geometry we need some extra keywords, like a sequential shotpoint, a proper cdpt, the group number and the trace number within the group ensemble.

In [None]:
# QC plot: shotpoint number against the cdp number of every trace
fig, ax = plt.subplots()
ax.plot(header.cdp, header.ep, ',')

ax.set_xlabel('cdp')
ax.set_ylabel('ep')
ax.axis('tight')

sns.despine(offset=5)
plt.tight_layout()

### Recovering the original group number

This survey is a stack array, since 262 m / 25 m ≈ 10.5, so we can recover the group number with the following expression:

In [None]:
# Field group number: tracf == 120 sits 10.5 stations behind the shotpoint,
# and every lower tracf is one more station away
near_gap = 10.5
header['grnofr'] = header.ep - near_gap - (header.tracf - 120).abs()

In [None]:
# QC plot: shotpoint number against the recovered group number
fig, ax = plt.subplots()
ax.plot(header.grnofr, header.ep, ',')

ax.set_xlabel('grnofr')
ax.set_ylabel('ep')
ax.axis('tight')

sns.despine(offset=5)
plt.tight_layout()

In [None]:
# Station distance between shot and group, converted with the factor 25
station_offset = (header['ep'] - header['grnofr']) * 25

# 0.5 m is the difference between the real minimum offset and the offset
# that would make this survey a perfect stack array; all offsets are
# negative, hence the sign flip
offset = -(station_offset - 0.5)

# Does the offset stored in the header match the calculated one?
np.allclose(offset, header['offset'])

In [None]:
# Let's check whether the sequential CDP stored in the header matches the
# one implied by the geometry: the midpoint station is the mean of the
# shotpoint and group stations.

cdp = (header.ep + header.grnofr)/2

# np.digitize needs monotonically ordered bins, but Series.unique() returns
# the values in order of appearance. np.unique() returns them sorted, so the
# sequential numbering does not depend on the trace order in the file.
bins = np.unique(cdp)
np.allclose(header.cdp, np.digitize(cdp, bins))

In [None]:
# Now let's create a sequential group number for sorting purposes.
# The bins are sorted with np.unique(): np.digitize needs monotonic bins,
# while Series.unique() returns the values in order of appearance.
bins = np.unique(header.grnofr)
header['grnors'] = np.digitize(header.grnofr, bins)

In [None]:
# We also need an equivalent to CDPT for common group sorting: the 0-based
# trace number within each receiver-group ensemble, ordered by offset_bin.

header['offset_bin'] = header.grnofr - header.ep
# rank() gives each trace its position within the group and keeps the
# original row index, so the assignment aligns row by row.
# groupby(...).apply(np.argsort) returned the sorting permutation rather than
# the rank (the two only agree for already sorted or exactly reversed
# groups), and the index of its result depends on the pandas version.
header['grnlof'] = (
    header.groupby('grnors').offset_bin.rank(method='first').astype(int) - 1
)

In [None]:
# Since there are skips in ep, we also need a sequential shotpoint;
# let's recycle the fldr keyword for it.
# The bins are sorted with np.unique(): np.digitize needs monotonic bins,
# while Series.unique() returns the values in order of appearance.
bins = np.unique(header.ep)
fldr = np.digitize(header.ep, bins)
header['fldr'] = fldr

## Recalculate a proper CDPT

The original CDPT has skips in it. We need a truly sequential CDPT, otherwise the data in the CDP domain will be filled with many zeroed traces.

In [None]:
# 0-based trace number within each CDP ensemble, ordered by the signed
# station distance between group and shot. The distance is recomputed here
# so the cell can be re-run after the helper column has been dropped.
offset_bin = header.grnofr - header.ep
# rank() keeps the original row index, so the assignment aligns row by row;
# groupby(...).apply(np.argsort) returned the sorting permutation rather
# than the rank, and the index of its result depends on the pandas version.
header['cdpt'] = (
    offset_bin.groupby(header.cdp).rank(method='first').astype(int) - 1
)
# offset_bin was only a helper column; remove it if it is still present
header.drop('offset_bin', axis=1, inplace=True, errors='ignore')

In [None]:
# QC plot: recalculated cdpt against the cdp number of every trace
fig, ax = plt.subplots()
ax.plot(header.cdp, header.cdpt, ',')

ax.set_xlabel('cdp')
ax.set_ylabel('cdpt')
ax.axis('tight')

sns.despine(offset=5)
plt.tight_layout()

## Geometry summary

With the keywords listed below properly set we can sort the data to any domain we want (i.e. Common Mid Point Gather, Common Shotpoint Gather, Common Receiver Group Gather).

| keyword | usage |
|------|------|
| fldr | sequential shotpoint |
| ep | field shotpoint |
| tracf | trace number within common shot ensemble |
| cdp | sequential common mid point number |
| cdpt | trace number within common mid point ensemble |
| grnofr | field receiver group number |
| grnors | sequential receiver group number |
| grnlof | trace number within common receiver group ensemble |

Now all we need to do is write an RSF file with the improved header and read all the trace data. There is no need to resample the data, since it is already at a 4 ms sampling rate.

In [None]:
# Copy every keyword column of the DataFrame back into the matching column
# of the header array
for name, column in keys.items():
    data[:, column] = header[name].values

header_complete = 'line_12_header_complete.rsf'

# Write the array to a new RSF file with the dimensions of the header table
out = sf.Output(header_complete)
for axis, size in (('n1', n1), ('n2', n2)):
    out.put(axis, size)

out.write(data)
out.close()

In [None]:
# Quick check of the file just written: sfin reports its basic RSF parameters
!sfin {header_complete}

In [None]:
# Now we just need to convert to int since Madagascar's sorting programs
# only work with integer headers
# NOTE(review): grnofr is built with a 10.5 station shift, so it may hold
# non-integer values; confirm how sfdd type=int rounds them.
header_int = 'line_12_header_int.rsf'

!sfdd type=int < {header_complete} > {header_int}

# Convert full trace data to RSF format

In [None]:
# RSF file that receives the full trace data
common_shot = 'line_12_csg_raw.rsf'

# Read all trace data (read=d) and set the axis lengths with sfput:
# n2=120 matches the 120 traces per shot found in the headers.
# NOTE(review): n3=1001 is hard-coded, presumably the number of shots;
# confirm it against the header summary.
!sfsegyread read=d < {raw_data} | sfput n2=120 n3=1001 > {common_shot}

In [None]:
# Quick check of the file just written: sfin reports its basic RSF parameters
!sfin {common_shot}