In [1]:
import collections
import inspect

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

plt.style.use('ggplot')

import graphiclog
from graphiclog import utils, io

In [39]:
transforms = {c : utils.string2array for c in ['depth_m', 'grain_size_mm']}

beds = pd.read_csv('../data/AllBedsWithProfiles.csv', converters=transforms)
beds.head()

Unnamed: 0,name,count,collection,eod,eodnum,tops,th,gs_tops_mm,snd_shl,mean_gs_mm,max_gs_mm,ng,ar,depth_m,grain_size_mm
0,Marnoso 1,1,Marnoso-Arenacea,basin plain,0,22.81684,0.31082,0.115051,1.0,0.116073,0.117046,0.520514,0.021978,"[22.8168, 22.703, 22.5337, 22.506]","[0.1151, 0.1157, 0.117, 0.117]"
1,Marnoso 1,1,Marnoso-Arenacea,basin plain,0,22.50602,0.60931,0.001,0.0,0.001,0.001,0.520514,0.021978,"[22.506, 21.8967]","[0.001, 0.001]"
2,Marnoso 1,1,Marnoso-Arenacea,basin plain,0,21.89671,0.10463,0.12538,1.0,0.12538,0.12538,0.520514,0.021978,"[21.8967, 21.7921]","[0.1254, 0.1254]"
3,Marnoso 1,1,Marnoso-Arenacea,basin plain,0,21.79208,0.11694,0.001,0.0,0.001,0.001,0.520514,0.021978,"[21.7921, 21.6751]","[0.001, 0.001]"
4,Marnoso 1,1,Marnoso-Arenacea,basin plain,0,21.67514,0.13232,0.134306,1.0,0.150073,0.16042,0.520514,0.021978,"[21.6751, 21.6351, 21.5674, 21.5428]","[0.1343, 0.1447, 0.1604, 0.1604]"


In [13]:
beds[beds.gs_tops_mm.isnull()].th.sum()

1331.0010378194631

In [14]:
beds[beds.snd_shl.isnull()].th.sum()

1348.2524256812137

In [19]:
beds.snd_shl.value_counts()

1.0    28765
0.0    27253
Name: snd_shl, dtype: int64

In [22]:
beds[beds.snd_shl == 1].th.sum()

12735.29191211368

In [23]:
beds[beds.snd_shl == 0].th.sum()

5652.595944427258

In [17]:
2**-8, 2**-4

(0.00390625, 0.0625)

In [36]:
count = {eod : 0 for eod in beds.eod.unique()}
meters = {eod : 0 for eod in beds.eod.unique()}

for _, grp in beds.groupby('count'):
    count[grp['eod'].values[0]] += 1
    meters[grp['eod'].values[0]] += grp.th.sum()
    
avgs = {k : meters[k] / count[k] for k in count.keys()}

In [37]:
avgs

{'basin plain': 27.07570545450642,
 'fan': 33.39089495971464,
 'levee': 15.039543988376764,
 'slopechannel': 80.6872487607637}

In [38]:
beds.th.mean()

0.346765181098518

In [6]:
weird = {problem : [] for problem in ['has_null', 'th_mismatch', 'samples_mismatch']}

for name, seq in beds.groupby('count'):
    
    if seq.isnull().any().any():
        weird['has_null'].append(name)
    
    _, elev_good = io.check_thicknesses(seq, 'tops', 'th', 'elevation')
    _, depth_good = io.check_thicknesses(seq, 'tops', 'th', 'depth')
    if not (elev_good or depth_good):
        weird['th_mismatch'].append(name)
        
    if not io.check_samples(seq, 'depth_m', 'grain_size_mm'):
        weird['samples_mismatch'].append(name)
        
weird, {k : len(v) for k, v in weird.items()}


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  df[basecol] = bases


({'has_null': [12,
   17,
   18,
   24,
   25,
   32,
   36,
   43,
   47,
   51,
   52,
   54,
   55,
   56,
   57,
   58,
   63,
   71,
   75,
   76,
   85,
   86,
   106,
   108,
   115,
   123,
   124,
   139,
   146,
   148,
   149,
   150,
   152,
   155,
   157,
   161,
   163,
   176,
   179,
   180,
   182,
   183,
   184,
   185,
   186,
   190,
   214,
   224,
   227,
   229,
   230,
   233,
   234,
   236,
   237,
   238,
   239,
   240,
   250,
   269,
   275,
   276,
   277,
   284,
   285,
   286,
   288,
   289,
   290,
   292,
   293,
   295,
   296,
   297,
   299,
   300,
   301,
   302,
   303,
   305,
   306,
   309,
   310,
   311,
   313,
   314,
   315,
   317,
   318,
   319,
   320,
   321,
   323,
   324,
   325,
   326,
   327,
   328,
   329,
   331,
   332,
   333,
   334,
   335,
   336,
   340,
   348,
   385,
   388,
   403,
   408,
   409,
   412,
   413,
   419,
   421,
   429,
   430,
   440,
   442,
   447,
   448,
   451,
   453,
   454],
  'th_mis

In [7]:
beds[beds.depth_m.apply(utils.safelen) != beds.grain_size_mm.apply(utils.safelen)]

Unnamed: 0,name,count,collection,eod,eodnum,tops,th,gs_tops_mm,snd_shl,mean_gs_mm,max_gs_mm,ng,ar,depth_m,grain_size_mm


In [8]:
weird, {k : len(v) for k, v in weird.items()}

({'has_null': [12,
   17,
   18,
   24,
   25,
   32,
   36,
   43,
   47,
   51,
   52,
   54,
   55,
   56,
   57,
   58,
   63,
   71,
   75,
   76,
   85,
   86,
   106,
   108,
   115,
   123,
   124,
   139,
   146,
   148,
   149,
   150,
   152,
   155,
   157,
   161,
   163,
   176,
   179,
   180,
   182,
   183,
   184,
   185,
   186,
   190,
   214,
   224,
   227,
   229,
   230,
   233,
   234,
   236,
   237,
   238,
   239,
   240,
   250,
   269,
   275,
   276,
   277,
   284,
   285,
   286,
   288,
   289,
   290,
   292,
   293,
   295,
   296,
   297,
   299,
   300,
   301,
   302,
   303,
   305,
   306,
   309,
   310,
   311,
   313,
   314,
   315,
   317,
   318,
   319,
   320,
   321,
   323,
   324,
   325,
   326,
   327,
   328,
   329,
   331,
   332,
   333,
   334,
   335,
   336,
   340,
   348,
   385,
   388,
   403,
   408,
   409,
   412,
   413,
   419,
   421,
   429,
   430,
   440,
   442,
   447,
   448,
   451,
   453,
   454],
  'th_mis

In [9]:
for name in weird['th_mismatch']:
    seq = beds[beds.name == name]
    print(name)
    print(seq[['tops', 'th']])

67
Empty DataFrame
Columns: [tops, th]
Index: []


In [10]:
magellan = beds[beds['count'] == 67]
magellan['depth_size'] = magellan.depth_m.apply(utils.safelen)
magellan['gs_size'] = magellan.grain_size_mm.apply(utils.safelen)
magellan

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,name,count,collection,eod,eodnum,tops,th,gs_tops_mm,snd_shl,mean_gs_mm,max_gs_mm,ng,ar,depth_m,grain_size_mm,depth_size,gs_size
3379,Magellan 1320A,67,Gulf of Mexico,basin plain,0,0.00000,2.16803,0.001000,0.0,0.001000,0.001000,0.284765,0.025316,"[0.0, 2.168]","[0.001, 0.001]",2,2
3380,Magellan 1320A,67,Gulf of Mexico,basin plain,0,2.26382,0.37711,0.009893,0.0,0.009893,0.009893,0.284765,0.025316,"[2.2638, 2.6409]","[0.0099, 0.0099]",2,2
3381,Magellan 1320A,67,Gulf of Mexico,basin plain,0,2.64093,3.34637,0.001000,0.0,0.001000,0.001000,0.284765,0.025316,"[2.6409, 5.9873]","[0.001, 0.001]",2,2
3382,Magellan 1320A,67,Gulf of Mexico,basin plain,0,5.98730,1.22544,0.009467,0.0,0.009467,0.009467,0.284765,0.025316,"[5.9873, 7.2127]","[0.0095, 0.0095]",2,2
3383,Magellan 1320A,67,Gulf of Mexico,basin plain,0,7.21274,7.35263,0.123396,1.0,0.123396,0.123396,0.284765,0.025316,"[7.2127, 14.5654]","[0.1234, 0.1234]",2,2
3384,Magellan 1320A,67,Gulf of Mexico,basin plain,0,14.56537,0.32992,0.001000,0.0,0.001000,0.001000,0.284765,0.025316,"[14.5654, 14.8953]","[0.001, 0.001]",2,2
3385,Magellan 1320A,67,Gulf of Mexico,basin plain,0,14.89529,0.47126,0.124883,1.0,0.124883,0.124883,0.284765,0.025316,"[14.8953, 15.3666]","[0.1249, 0.1249]",2,2
3386,Magellan 1320A,67,Gulf of Mexico,basin plain,0,15.36655,0.28280,0.001000,0.0,0.001000,0.001000,0.284765,0.025316,"[15.3666, 15.6494]","[0.001, 0.001]",2,2
3387,Magellan 1320A,67,Gulf of Mexico,basin plain,0,15.64935,0.42423,0.123396,1.0,0.123396,0.123396,0.284765,0.025316,"[15.6494, 16.0736]","[0.1234, 0.1234]",2,2
3388,Magellan 1320A,67,Gulf of Mexico,basin plain,0,16.07358,0.32992,0.001000,0.0,0.001000,0.001000,0.284765,0.025316,"[16.0736, 16.4035]","[0.001, 0.001]",2,2


In [11]:
#beds['grain_size_psi'] = beds['grain_size_mm'].apply(graphiclog.wentworth.gs2psi)
#beds['mean_gs_psi'] = beds['mean_gs_mm'].apply(graphiclog.wentworth.gs2psi)

weird = {}


for i, row in beds.iterrows():
    len_ds = len(row['depth_m'])
    len_gs = len(row['grain_size_psi'])
    
    if len_ds != len_gs:
        if len_ds > 0:
            print(f'Weird row: {row}')
        else:
            beds.at[i, 'depth_m'] = np.array([row['tops'], beds.at[i+1, 'tops']])

beds

KeyError: 'grain_size_psi'

In [None]:
beds['mean_gs_psi'].isna().any()

In [None]:
seqs = []

bad_names = []

for i, (name, grp) in enumerate(beds.groupby('name')):
    if grp['mean_gs_psi'].isna().any():
        bad_names.append(name)
        continue
    
    bedseq = graphiclog.BedSequence.from_dataframe(grp, topcol='tops', thickcol='th',
                                         component_map=graphiclog.defaults.DEFAULT_COMPONENT_MAP,
                                         datacols=['th', 'mean_gs_psi', 'depth_m', 'grain_size_psi'],
                                         metacols=['name', 'collection', 'eod', 'eodnum'], eps=1e-2)
    
    seqs.append(bedseq)

In [None]:
bad_names

In [None]:
def match_metacol(seqs, value, metacol='eodnum'):
    return list(filter(lambda s: s.metadata[metacol] == value, seqs))

eod0 = match_metacol(seqs, 0)
eod1 = match_metacol(seqs, 1)
eod2 = match_metacol(seqs, 2)
eod3 = match_metacol(seqs, 3)

[len(e) for e in [eod0, eod1, eod2, eod3]]

In [None]:
import random

for i, seqs in enumerate([eod0, eod1, eod2, eod3]):
    selected = random.sample(seqs, 10)
    fig, axes = plt.subplots(ncols=10, figsize=(70, 30))
    for ax, seq in zip(axes, seqs):
        #seq.resample_data('depth_m', 0.01)
        seq.plot(legend=graphiclog.defaults.litholegend, width_field='grain_size_psi', depth_field='depth_m', ax=ax)
    
    plt.show()
    

In [None]:
eod0[1].metadata

In [None]:
x = beds[beds.name == 'Magellan 1320A']
x #[x.th.between(2.1, 2.2)]