In [1]:
import numpy as np
import pandas as pd
import xarray as xr
import matplotlib.pyplot as plt
import easy_coloc

In [2]:
from math import radians, cos, sin, asin, sqrt
def haversine(lon1, lat1, lon2, lat2):
    """
    Calculate the great circle distance between two points 
    on the earth (specified in decimal degrees)
    """
    # convert decimal degrees to radians 
    lon1, lat1, lon2, lat2 = map(radians, [lon1, lat1, lon2, lat2])
    # haversine formula 
    dlon = lon2 - lon1 
    dlat = lat2 - lat1 
    a = sin(dlat/2)**2 + cos(lat1) * cos(lat2) * sin(dlon/2)**2
    c = 2 * asin(sqrt(a)) 
    # Radius of earth in kilometers is 6371
    km = 6371* c
    return km

In [3]:
from math import radians, degrees, cos, sin, asin, sqrt, atan2

def bearing(lon1, lat1, lon2, lat2):

    # convert decimal degrees to radians
    lon1, lat1, lon2, lat2 = map(radians, [lon1, lat1, lon2, lat2])
    # formula for bearing
    dlon = lon1-lon2
    
    x = cos(lat2) * sin(dlon)

    y = cos(lat1) * sin(lat2) - sin(lat1) * cos(lat2) * cos(dlon)

    beta = atan2(x,y)

    return degrees(beta)

In [4]:
def bearing_between_stations(lat,lon):
    
    count=0

    theta = np.zeros(lat.shape)

    for lat_f,lon_f in zip(lat.values[1:],lon.values[1:]):

        theta[count+1] = bearing(lon.values[count], lat.values[count], lon_f, lat_f)

        count += 1
    
    theta = pd.DataFrame(pd.Series(np.ravel(theta),index=lon.index),columns=['bearing'])
        
    return theta

In [5]:
def distance_between_stations(lat,lon):
    
    count=0

    dx = np.zeros(lat.shape)

    for lat_f,lon_f in zip(lat.values[1:],lon.values[1:]):

        dx[count+1] = haversine(lon.values[count], lat.values[count], lon_f, lat_f)

        count += 1
    
    dx = pd.DataFrame(pd.Series(np.ravel(dx),index=lon.index),columns=['dx'])
        
    return dx

In [6]:
def section_lenth(lat,lon):
    
    dx = distance_between_stations(lat,lon)
    
    return np.cumsum(dx)

In [7]:
def identify_breaks(lat,lon,thresh=120):
    
    dx = distance_between_stations(lat,lon)
    
#     tem = pd.DataFrame({'dx':np.ravel(dx)},index=lon.index)
#     break_inds = tem[tem.dx > np.percentile(tem.dx,pct)].index.values
    break_inds = dx[dx.dx > thresh].index.values    
    return break_inds

In [8]:
def clean_section(lat,lon,thresh=120):
    
    break_inds = identify_breaks(lat,lon,thresh)
#     lat = lat.reset_index()
#     lon = lon.reset_index()

    return lat.drop(break_inds-1).reset_index().drop('index', axis=1),lon.drop(break_inds-1).reset_index().drop('index', axis=1)

In [9]:
glodap = pd.read_csv('GLODAPv2.2019_WOCE_GOSHIP.csv')
expc = pd.read_csv('FILTERED_GLODAP_EXPOCODE.csv')

In [10]:
expc

Unnamed: 0.1,Unnamed: 0,ID,EXPOCODE,LINE
0,14,15,06AQ20050122,A12
1,17,18,06AQ20071128,A12
2,18,19,06AQ20080210,A12
3,19,20,06AQ20101128,A12 SR04
4,22,23,06GA19960613,AR19
5,44,45,06MT19970815,AR07E
6,48,49,06MT20010507,A02
7,50,51,06MT20010717,A02
8,67,68,09AR19960822,SR03
9,68,69,09AR19980228,SR03


In [11]:
expc[expc.LINE.isin(['AO1W OVIDE'])]

Unnamed: 0.1,Unnamed: 0,ID,EXPOCODE,LINE


In [12]:
profiles = [
    '18SN19940724',
    '58GS20060721',
    '58JH19990615',
    '74AB19910614',
    '41SS19940301',
]

no_qc_needed = [
    '35PK20140515',
    '06AQ20071128',
    '64TR19900417',
    '29HE20010305',
    '29HE20020304',
    '29HE20130320',
    '06MT19970815',
    '06GA19960613',
    '58GS20060721',
    '58JH19990615',
    '33RR20090320',
    '74AB20020301',
    '49UP20150724',
    '325019971101',
]

In [13]:
coords_new = pd.DataFrame()

print('Lines that do not need QC:')
for cruise_id,line,expocode in zip(expc['ID'],expc['LINE'],expc['EXPOCODE']):
                       
    cruise = glodap[glodap['cruise']==cruise_id]
    cruise = cruise.groupby('station').mean()
    
    year = cruise.reset_index().year
    month = cruise.reset_index().month
    lon = cruise.reset_index().longitude
    lat = cruise.reset_index().latitude
    cid = cruise.reset_index().cruise
    
    if (expocode in no_qc_needed):
        print(expocode)
        new_lat,new_lon = lat,lon
        
    else:
        new_lat,new_lon = clean_section(lat,lon)
    
    theta = bearing_between_stations(new_lat,new_lon)
    dx = distance_between_stations(new_lat,new_lon)
    
    cid = cid.loc[0:len(new_lat)-1]
    new_lat.index = cid.index
    new_lon.index = cid.index
    
    if (expocode in profiles): #some cruises (i.g. IR01W) are well defined sections. Drop these from the section list
        
        cid = pd.DataFrame()
        new_lat = pd.DataFrame()
        new_lon = pd.DataFrame()
        theta = pd.DataFrame()
        dx = pd.DataFrame()
    
    tem = pd.concat([cid,new_lat,new_lon,dx,theta,month,year],axis=1,sort=True).reset_index()
    coords_new = pd.concat([coords_new,tem])

Lines that do not need QC:
06AQ20071128
06GA19960613
06MT19970815


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.




29HE20010305
29HE20020304
29HE20130320
325019971101
33RR20090320
58GS20060721
58JH19990615
64TR19900417
74AB20020301
74AB20020301
35PK20140515
49UP20150724


In [14]:
coords_new = coords_new.reset_index()

In [15]:
coords_new = coords_new.drop(['level_0','index'],axis=1)

In [16]:
coords_new.to_csv('GLODAPv2.2019_COORDS.csv')

In [18]:
coords_new

Unnamed: 0,bearing,cruise,dx,latitude,longitude,month,year
0,0.000000,15.0,0.000000,-53.00800,0.03060,1.0,2005.0
1,178.251596,15.0,54.844928,-53.50100,0.00530,1.0,2005.0
2,179.993223,15.0,55.263879,-53.99800,0.00520,1.0,2005.0
3,-178.246834,15.0,57.181294,-54.51200,0.03230,1.0,2005.0
4,178.652229,15.0,54.500775,-55.00200,0.01220,2.0,2005.0
5,179.655321,15.0,55.487285,-55.50100,0.00690,2.0,2005.0
6,179.666518,15.0,7.561383,-55.56900,0.00620,2.0,2005.0
7,179.578260,15.0,48.148722,-56.00200,0.00050,2.0,2005.0
8,-179.865827,15.0,55.041642,-56.49700,0.00260,2.0,2005.0
9,-178.361315,15.0,51.727059,-56.96200,0.02700,2.0,2005.0
