In [1]:
import pandas as pd
import dew_gwdata as gd

In [145]:
def erupt_dict(frame, key):
    main_frame = frame.drop([key], axis=1)
    erupted_frame = frame[key].apply(pd.Series).rename(columns=lambda x: key + "." + str(x))
    return pd.concat([main_frame, erupted_frame], axis=1)

def erupt(f):
    for col in f.columns:
        series = f[col]
        series_valid = series.dropna()
        if len(series_valid) > 0:
            example = series_valid.iloc[0]
            if isinstance(example, dict):
                return erupt(erupt_dict(f, col))
    return f

def erupt_and_clean(f):
    f = erupt(f)
    f = cleanup_numeric_unit_cols(f)
    f.columns = [c.replace("DischargeSummary.", "Summ.") for c in f.columns]
    return f

def explode(f, col):
    if col in f:
        f = f.explode(col)
    return f

def explode_erupt_and_clean(f, col):
    if col in f:
        f = f.explode(col)
    f = erupt_and_clean(f)
    return f

def cleanup_numeric_unit_cols(f):
    f = f[[c for c in f.columns if not c.endswith(".0") and not c.endswith(".Unit")]]
    f.columns = [c.replace(".Numeric", "") for c in f.columns]
    return f

def fixdf(f):
    for col in f.columns:
        series = f[col]
        if len(series) > 0:
            example = series.iloc[0]
            if isinstance(example, list):
                if len(example) > 0:
                    return fixdf(f.explode(col))
                else:
                    return fixdf(f.drop([col], axis=1))
            elif isinstance(example, dict):
                return fixdf(erupt_dict(f, col))
    return f

In [85]:
aq = gd.DEWAquarius("AQTS Prod")

In [86]:
r = aq.publish.get("GetFieldVisitDescriptionList", params={"LocationIdentifier": "A5050502"})



In [87]:
def fetch_field_visits_for_single_location(self, locid):
    r = self.publish.get("GetFieldVisitDescriptionList", params={"LocationIdentifier": locid})
    df = pd.io.json.json_normalize(r.json()["FieldVisitDescriptions"])
    df = df.rename(columns={"Identifier": "FieldVisitIdentifier"})
    return df

In [88]:
%%time
df = fetch_field_visits_for_single_location(aq, "A5050502")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 141 entries, 0 to 140
Data columns (total 15 columns):
FieldVisitIdentifier                       141 non-null object
LocationIdentifier                         141 non-null object
StartTime                                  141 non-null object
EndTime                                    141 non-null object
Party                                      128 non-null object
IsValid                                    141 non-null bool
LastModified                               141 non-null object
CompletedWork.BiologicalSampleTaken        141 non-null bool
CompletedWork.GroundWaterLevelPerformed    141 non-null bool
CompletedWork.LevelsPerformed              141 non-null bool
CompletedWork.OtherSampleTaken             141 non-null bool
CompletedWork.RecorderDataCollected        141 non-null bool
CompletedWork.SedimentSampleTaken          141 non-null bool
CompletedWork.SafetyInspectionPerformed    141 non-null bool
CompletedWork.WaterQualitySam



In [148]:
%%time
def fetch_field_visit_data_for_single_location(self, locid, cleanup_columns=True):
    r = self.publish.get("GetFieldVisitDataByLocation", params={"LocationIdentifier": locid})
    df = pd.io.json.json_normalize(r.json()["FieldVisitData"])
#     df = df.rename(columns={"Identifier": "FieldVisitIdentifier"})    
    for dt_col in ["StartTime", "EndTime", "LastModified"]:
        df.loc[:, dt_col] = pd.to_datetime(df[dt_col].str[:19], format="%Y-%m-%dT%H:%M:%S")
    df = df[[c for c in df.columns if not c.startswith("CompletedWork.")]]
    df = df.drop(["Attachments"], axis=1)
    return df

df = fetch_field_visit_data_for_single_location(aq, "A5050502", cleanup_columns=False)
for old, new in (
    ("DischargeActivities", "DISCHARGE"),
    ("InspectionActivity", "INSPECT"),
    ("CrossSectionSurveyActivity", "XSECT"),
    ("LevelSurveyActivity", "LEVELS")
):
    df.columns = [c.replace(old, new) for c in df.columns]



Wall time: 1.74 s


In [152]:
for c in df.columns:
    for col in ["LEVELS", "INSPECT", "XSECT"]:
        if c.startswith(col):
            df = df.drop([c], axis=1)
df = explode_erupt_and_clean(df, "DISCHARGE")
df.columns = [c.replace("DischargeActivities.", ".") for c in df.columns]
# df = explode(df, "XSECT")
# df = explode(df, "INSPECT.Readings")
# df = explode(df, "LEVELS.LevelMeasurements")
# DISCHARGE.Adcp
# DISCHARGE.EngineeredStructure
# DISCHARGE.OtherMethod
# DISCHARGE.PointVelocity
# DISCHARGE.Volumetric
# df = df.explode("DISCHARGE.Summ.GageHeightReadings")
# df = df.pipe(erupt_and_clean)
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 179 entries, 0 to 140
Data columns (total 40 columns):
Identifier                                                    179 non-null object
LocationIdentifier                                            179 non-null object
StartTime                                                     179 non-null datetime64[ns]
EndTime                                                       179 non-null datetime64[ns]
Party                                                         166 non-null object
IsValid                                                       179 non-null bool
LastModified                                                  179 non-null datetime64[ns]
Approval.ApprovalLevel                                        179 non-null int64
Approval.LevelDescription                                     179 non-null object
DISCHARGE.AdcpDischargeActivities                             162 non-null object
DISCHARGE.EngineeredStructureDischargeActivities      

KeyError: 0

In [127]:
df["DISCHARGE.Summ.GageHeightReadings"]

0      {'IsUsed': True, 'ReadingTime': '2017-09-08T10...
0      {'IsUsed': True, 'ReadingTime': '2017-09-08T11...
1      {'IsUsed': True, 'ReadingTime': '2015-06-11T13...
1      {'IsUsed': True, 'ReadingTime': '2015-06-11T13...
2      {'IsUsed': True, 'ReadingTime': '2013-10-15T09...
                             ...                        
138    {'IsUsed': True, 'ReadingTime': '1973-08-28T12...
139    {'IsUsed': True, 'ReadingTime': '1973-07-30T15...
139    {'IsUsed': True, 'ReadingTime': '1973-07-30T15...
140    {'IsUsed': True, 'ReadingTime': '1973-04-03T11...
140    {'IsUsed': True, 'ReadingTime': '1973-04-03T11...
Name: DISCHARGE.Summ.GageHeightReadings, Length: 581, dtype: object

In [123]:
df.expl"DISCHARGE.Summ.GageHeightReadings"]

0      [{'IsUsed': True, 'ReadingTime': '2017-09-08T1...
1      [{'IsUsed': True, 'ReadingTime': '2015-06-11T1...
2      [{'IsUsed': True, 'ReadingTime': '2013-10-15T0...
3      [{'IsUsed': True, 'ReadingTime': '2013-07-16T1...
4                                                    NaN
                             ...                        
136    [{'IsUsed': True, 'ReadingTime': '1974-05-31T1...
137    [{'IsUsed': True, 'ReadingTime': '1974-02-01T1...
138    [{'IsUsed': True, 'ReadingTime': '1973-08-28T1...
139    [{'IsUsed': True, 'ReadingTime': '1973-07-30T1...
140    [{'IsUsed': True, 'ReadingTime': '1973-04-03T1...
Name: DISCHARGE.Summ.GageHeightReadings, Length: 179, dtype: object

In [109]:
yal.to_csv("test.csv")

In [104]:
yal.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 304 entries, 0 to 140
Data columns (total 56 columns):
Attachments                                                   304 non-null object
Identifier                                                    304 non-null object
LocationIdentifier                                            304 non-null object
StartTime                                                     304 non-null datetime64[ns]
EndTime                                                       304 non-null datetime64[ns]
Party                                                         291 non-null object
IsValid                                                       304 non-null bool
LastModified                                                  304 non-null datetime64[ns]
Approval.ApprovalLevel                                        304 non-null int64
Approval.LevelDescription                                     304 non-null object
INSPECT.Readings                                      

In [105]:
yal.pipe(erupt)

Unnamed: 0,Attachments,Identifier,LocationIdentifier,StartTime,EndTime,Party,IsValid,LastModified,Approval.ApprovalLevel,Approval.LevelDescription,...,DISCHARGE.Summ.Reviewer,XSECT.Stage,DISCHARGE.Summ.Adjustment.AdjustmentType,DISCHARGE.Summ.Adjustment.ReasonForAdjustment,DISCHARGE.Summ.DifferenceDuringVisit,DISCHARGE.Summ.Discharge,DISCHARGE.Summ.DischargeUncertainty.ActiveUncertaintyType,DISCHARGE.Summ.DischargeUncertainty.QualitativeUncertainty,DISCHARGE.Summ.DurationInHours,DISCHARGE.Summ.MeanGageHeight
0,"[{'AttachmentType': 'FieldDataPlugin', 'Attach...",de6edd37-2dc0-4b98-a774-d95499250b4e,A5050502,2017-09-08 10:28:00,2017-09-08 11:04:00,DCR/PB,True,2020-03-18 14:59:29,1200,Approved,...,,,Unknown,Unknown,-0.002,1.3700,,Unknown,0.600000,1.690
1,"[{'AttachmentType': 'FieldDataPlugin', 'Attach...",e565d270-69ff-4e95-a768-45c4cb3bd633,A5050502,2015-06-11 13:34:00,2015-06-11 13:45:00,TS DCD,True,2020-03-18 14:59:29,1200,Approved,...,,,Unknown,Unknown,0.000,0.0124,,Unknown,0.183333,1.032
2,"[{'AttachmentType': 'FieldDataPlugin', 'Attach...",a43d1b06-bafe-4cdb-99eb-30cd0794f294,A5050502,2013-10-15 09:13:00,2013-10-15 09:35:00,DR / TS,True,2020-03-18 14:59:29,1200,Approved,...,,,Unknown,Unknown,0.000,0.0637,,Unknown,0.366667,1.104
3,"[{'AttachmentType': 'FieldDataPlugin', 'Attach...",295182fb-a704-4341-974d-d5d030dcd703,A5050502,2013-07-16 14:15:00,2013-07-16 15:01:00,DCR/KS,True,2020-03-18 14:59:29,1200,Approved,...,,,Unknown,Unknown,-0.002,0.2671,,Unknown,0.766667,1.300
4,"[{'AttachmentType': 'FieldDataPlugin', 'Attach...",a5157e2d-ff6e-4893-918a-5dcf60aedd9c,A5050502,2013-04-29 00:00:00,2013-04-29 03:00:00,PHANSEN,True,2020-03-18 14:59:29,1200,Approved,...,,0.0,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
136,"[{'AttachmentType': 'FieldDataPlugin', 'Attach...",ecdbe2b0-2a1d-4da8-acea-26246e17e1f1,A5050502,1974-05-31 12:20:00,1974-05-31 12:20:00,TF,True,2020-03-18 14:59:29,1200,Approved,...,,,Unknown,Unknown,0.000,0.4010,,Unknown,0.000000,1.357
137,"[{'AttachmentType': 'FieldDataPlugin', 'Attach...",e7e9d6a1-eee8-4d24-b51f-8e3c1472ce2f,A5050502,1974-02-01 15:40:00,1974-02-01 15:40:00,JM,True,2020-03-18 14:59:29,1200,Approved,...,,,Unknown,Unknown,0.000,28.8100,,Unknown,0.000000,2.603
138,"[{'AttachmentType': 'FieldDataPlugin', 'Attach...",7c07d9d3-cc9f-470a-88c8-fdc226ad875c,A5050502,1973-08-28 12:15:00,1973-08-28 12:15:00,JLW,True,2020-03-18 14:59:29,1200,Approved,...,,,Unknown,Unknown,0.000,4.0900,,Unknown,0.000000,1.893
139,"[{'AttachmentType': 'FieldDataPlugin', 'Attach...",7aacbe1f-0fd1-4d7e-88b1-b716ee512e1d,A5050502,1973-07-30 15:45:00,1973-07-30 15:45:00,JM,True,2020-03-18 14:59:29,1200,Approved,...,,,Unknown,Unknown,0.000,0.5760,,Unknown,0.000000,1.472


In [24]:
dfx = pd.io.json.json_normalize(r.json()["FieldVisitData"])

In [26]:
dfxx = fixdf(dfx)

Unnamed: 0,Identifier,LocationIdentifier,StartTime,EndTime,Party,IsValid,LastModified,Approval.ApprovalLevel,Approval.LevelDescription,CompletedWork.BiologicalSampleTaken,...,DischargeActivities.PointVelocityDischargeActivities.Width.Numeric,DischargeActivities.PointVelocityDischargeActivities.Width.Unit,DischargeActivities.DischargeSummary.DischargeUncertainty.QuantitativeUncertainty.0,DischargeActivities.DischargeSummary.GageHeightReadings.GageHeight.0,DischargeActivities.DischargeSummary.GageHeightReadings.GageHeight.Numeric,DischargeActivities.PointVelocityDischargeActivities.DischargeChannelMeasurement.Discharge.0,DischargeActivities.PointVelocityDischargeActivities.DischargeChannelMeasurement.Discharge.Numeric,DischargeActivities.PointVelocityDischargeActivities.DischargeChannelMeasurement.Discharge.Unit,DischargeActivities.PointVelocityDischargeActivities.DischargeChannelMeasurement.DistanceToGage.0,DischargeActivities.PointVelocityDischargeActivities.DischargeChannelMeasurement.DistanceToGage.Unit
0,de6edd37-2dc0-4b98-a774-d95499250b4e,A5050502,2017-09-08T10:28:00.0000000+09:30,2017-09-08T11:04:00.0000000+09:30,DCR/PB,True,2020-03-18T14:59:29.3880024+09:30,1200,Approved,False,...,8.267,m,,,1.691,,1.3700,m^3/s,,m
0,de6edd37-2dc0-4b98-a774-d95499250b4e,A5050502,2017-09-08T10:28:00.0000000+09:30,2017-09-08T11:04:00.0000000+09:30,DCR/PB,True,2020-03-18T14:59:29.3880024+09:30,1200,Approved,False,...,8.267,m,,,1.689,,1.3700,m^3/s,,m
1,e565d270-69ff-4e95-a768-45c4cb3bd633,A5050502,2015-06-11T13:34:00.0000000+09:30,2015-06-11T13:45:00.0000000+09:30,TS DCD,True,2020-03-18T14:59:29.3880024+09:30,1200,Approved,False,...,0.000,m,,,1.032,,0.0124,m^3/s,,m
1,e565d270-69ff-4e95-a768-45c4cb3bd633,A5050502,2015-06-11T13:34:00.0000000+09:30,2015-06-11T13:45:00.0000000+09:30,TS DCD,True,2020-03-18T14:59:29.3880024+09:30,1200,Approved,False,...,0.000,m,,,1.032,,0.0124,m^3/s,,m
2,a43d1b06-bafe-4cdb-99eb-30cd0794f294,A5050502,2013-10-15T09:13:00.0000000+09:30,2013-10-15T09:35:00.0000000+09:30,DR / TS,True,2020-03-18T14:59:29.3880024+09:30,1200,Approved,False,...,4.500,m,,,1.104,,0.0637,m^3/s,,m
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
138,7c07d9d3-cc9f-470a-88c8-fdc226ad875c,A5050502,1973-08-28T12:15:00.0000000+09:30,1973-08-28T12:15:00.0000000+09:30,JLW,True,2020-03-18T14:59:29.3880024+09:30,1200,Approved,False,...,0.000,m,,,1.893,,4.0900,m^3/s,,m
139,7aacbe1f-0fd1-4d7e-88b1-b716ee512e1d,A5050502,1973-07-30T15:45:00.0000000+09:30,1973-07-30T15:45:00.0000000+09:30,JM,True,2020-03-18T14:59:29.3880024+09:30,1200,Approved,False,...,0.000,m,,,1.472,,0.5760,m^3/s,,m
139,7aacbe1f-0fd1-4d7e-88b1-b716ee512e1d,A5050502,1973-07-30T15:45:00.0000000+09:30,1973-07-30T15:45:00.0000000+09:30,JM,True,2020-03-18T14:59:29.3880024+09:30,1200,Approved,False,...,0.000,m,,,1.472,,0.5760,m^3/s,,m
140,4f592b26-bafb-4bbf-bce0-3b99d0cb6e47,A5050502,1973-04-03T11:10:00.0000000+09:30,1973-04-03T11:10:00.0000000+09:30,JM,True,2020-03-18T14:59:29.3880024+09:30,1200,Approved,False,...,0.000,m,,,1.098,,0.0550,m^3/s,,m


In [15]:
e = d.explode("DischargeSummary.GageHeightReadings")
e.columns = [c.replace("DischargeSummary", "DS") for c in e.columns]
print(e.T)

                                                                                                0  \
VolumetricDischargeActivities                                                                  []   
EngineeredStructureDischargeActivities                                                         []   
PointVelocityDischargeActivities                [{'DischargeChannelMeasurement': {'Channel': '...   
OtherMethodDischargeActivities                                                                 []   
AdcpDischargeActivities                                                                        []   
DS.MeasurementStartTime                                         2017-09-08T10:28:00.0000000+09:30   
DS.MeasurementEndTime                                           2017-09-08T11:04:00.0000000+09:30   
DS.MeasurementTime                                              2017-09-08T10:46:00.0000000+09:30   
DS.Party                                                                                   

In [19]:
import excellentpandas

In [21]:
excellentpandas.show_in_excel(fixdf(e))

In [121]:
for col in out.columns:
    series = out[col]
    print(col)
    print(type(series.iloc[0]))

VolumetricDischargeActivities
<class 'list'>
EngineeredStructureDischargeActivities
<class 'list'>
OtherMethodDischargeActivities
<class 'list'>
AdcpDischargeActivities
<class 'list'>
DS.MeasurementStartTime
<class 'str'>
DS.MeasurementEndTime
<class 'str'>
DS.MeasurementTime
<class 'str'>
DS.Party
<class 'str'>
DS.BaseFlow
<class 'str'>
DS.Adjustment.AdjustmentType
<class 'str'>
DS.Adjustment.ReasonForAdjustment
<class 'str'>
DS.AlternateRatingDischarge.Unit
<class 'str'>
DS.Discharge.Unit
<class 'str'>
DS.Discharge.Numeric
<class 'numpy.float64'>
DS.DischargeMethod
<class 'str'>
DS.MeanGageHeight.Unit
<class 'str'>
DS.MeanGageHeight.Numeric
<class 'numpy.float64'>
DS.MeanGageHeightMethod
<class 'str'>
DS.MeanIndexVelocity.Unit
<class 'str'>
DS.DischargeMeasurementReason
<class 'str'>
DS.Comments
<class 'str'>
DS.GageHeightCalculation
<class 'str'>
DS.DifferenceDuringVisit.Numeric
<class 'numpy.float64'>
DS.DurationInHours.Numeric
<class 'numpy.float64'>
DS.QualityAssuranceComments
<c

In [90]:
e.drop(["DS.GageHeightReadings"], axis=1)

Unnamed: 0,VolumetricDischargeActivities,EngineeredStructureDischargeActivities,PointVelocityDischargeActivities,OtherMethodDischargeActivities,AdcpDischargeActivities,DS.MeasurementStartTime,DS.MeasurementEndTime,DS.MeasurementTime,DS.Party,DS.BaseFlow,...,DS.DurationInHours.Numeric,DS.QualityAssuranceComments,DS.DischargeUncertainty.ActiveUncertaintyType,DS.DischargeUncertainty.QualitativeUncertainty,DS.MeasurementGrade,DS.GradeCode,DS.MeasurementId,DS.Reviewer,DS.IsValid,DS.Publish
0,[],[],[{'DischargeChannelMeasurement': {'Channel': '...,[],[],2017-09-08T10:28:00.0000000+09:30,2017-09-08T11:04:00.0000000+09:30,2017-09-08T10:46:00.0000000+09:30,DCR/PB,Unknown,...,0.6,,,Unknown,Unknown,30,162,,True,True
0,[],[],[{'DischargeChannelMeasurement': {'Channel': '...,[],[],2017-09-08T10:28:00.0000000+09:30,2017-09-08T11:04:00.0000000+09:30,2017-09-08T10:46:00.0000000+09:30,DCR/PB,Unknown,...,0.6,,,Unknown,Unknown,30,162,,True,True


In [95]:
e["DS.GageHeightReadings"].apply(pd.Series).rename(columns=lambda x: "DS.GageHeightReadings." + str(x))

Unnamed: 0,DS.GageHeightReadings.IsUsed,DS.GageHeightReadings.ReadingTime,DS.GageHeightReadings.GageHeight
0,True,2017-09-08T10:28:00.0000000+09:30,{'Numeric': 1.691}
0,True,2017-09-08T11:04:00.0000000+09:30,{'Numeric': 1.689}


In [96]:
pd.concat([Out[90], Out[95]], axis=1).T

Unnamed: 0,0,0.1
VolumetricDischargeActivities,[],[]
EngineeredStructureDischargeActivities,[],[]
PointVelocityDischargeActivities,[{'DischargeChannelMeasurement': {'Channel': '...,[{'DischargeChannelMeasurement': {'Channel': '...
OtherMethodDischargeActivities,[],[]
AdcpDischargeActivities,[],[]
DS.MeasurementStartTime,2017-09-08T10:28:00.0000000+09:30,2017-09-08T10:28:00.0000000+09:30
DS.MeasurementEndTime,2017-09-08T11:04:00.0000000+09:30,2017-09-08T11:04:00.0000000+09:30
DS.MeasurementTime,2017-09-08T10:46:00.0000000+09:30,2017-09-08T10:46:00.0000000+09:30
DS.Party,DCR/PB,DCR/PB
DS.BaseFlow,Unknown,Unknown


In [50]:
import collections

def flatten(d, parent_key='', sep='_'):
    items = []
    for k, v in d.items():
        new_key = parent_key + sep + k if parent_key else k
        if isinstance(v, collections.MutableMapping):
            items.extend(flatten(v, new_key, sep=sep).items())
        else:
            items.append((new_key, v))
    return dict(items)

In [51]:
flatten(a, sep=".")

AttributeError: 'list' object has no attribute 'items'