In [25]:
import random
import json
import numpy as np
import pandas as pd
from tqdm import tqdm
from typing import Dict, List
from transformers import AutoTokenizer
from tqdm import tqdm
import scml
from scml import nlp as snlp
import mylib

In [26]:
# Quantiles passed to describe() when summarising numeric columns.
percentiles = [0.01, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.95, 0.99]
# Pandas options, applied one by one in the order listed.
for option_name, option_value in (
    ("use_inf_as_na", True),
    ("max_info_columns", 9999),
    ("display.max_columns", 9999),
    ("display.max_rows", 9999),
    ('max_colwidth', 9999),
):
    pd.set_option(option_name, option_value)
# Register the progress_* methods on pandas objects.
tqdm.pandas()
scml.seed_everything()

In [27]:
# Load the tokenizer from `pretrained_dir` with model_max_length set explicitly,
# then show its repr and the input names it produces.
model_max_length = 512
pretrained_dir = "pretrained/google/electra-small-discriminator"
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_dir,
    model_max_length=model_max_length,
)
print(repr(tokenizer), tokenizer.model_input_names, sep="\n")

PreTrainedTokenizerFast(name_or_path='pretrained/google/electra-small-discriminator', vocab_size=30522, model_max_len=512, is_fast=True, padding_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})
['input_ids', 'token_type_ids', 'attention_mask']


In [28]:
# Read the training label table and print its schema.
train = pd.read_csv(
    "input/train.csv",
    low_memory=False,
    engine="c",
)
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19661 entries, 0 to 19660
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Id             19661 non-null  object
 1   pub_title      19661 non-null  object
 2   dataset_title  19661 non-null  object
 3   dataset_label  19661 non-null  object
 4   cleaned_label  19661 non-null  object
dtypes: object(5)
memory usage: 768.1+ KB


In [29]:
# Group labels by publication Id, preserving the row order of `train`:
#   gt_map: Id -> list of cleaned_label values
#   dl_map: Id -> list of dataset_label values
gt_map: Dict[str, List[str]] = {}
dl_map: Dict[str, List[str]] = {}
for record in tqdm(train.itertuples()):
    gt_map.setdefault(record.Id, []).append(record.cleaned_label)
    dl_map.setdefault(record.Id, []).append(record.dataset_label)

19661it [00:00, 327397.43it/s]


In [30]:
rows = []
sep = "|"
m = 1000  # characters of context kept on each side of an answer; windows start 2 * m wide
max_negative_examples = 20
max_length = 502  # upper bound on len(tokenizer(context)["input_ids"])
step = 50  # characters trimmed from a window edge per shrink iteration
max_negative_attempts = 100  # random draws allowed per requested negative example


def _count_tokens(chunk):
    """Return how many input ids the tokenizer emits for `chunk`."""
    return len(tokenizer(chunk)["input_ids"])


def _fit_window(text, a, b, keep_start, keep_end, step_a, step_b):
    """Shrink text[a:b] until it tokenizes to at most `max_length` ids.

    Each iteration moves the left edge right by `step_a` and the right edge
    left by `step_b`, but neither edge may cross into
    text[keep_start:keep_end], so the protected span is never truncated.

    Returns:
        (a, b, n_tokens) for the final window. n_tokens can still exceed
        `max_length` when the protected span alone is too long; callers
        must check before using the window.
    """
    n_tokens = _count_tokens(text[a:b])
    while n_tokens > max_length:
        new_a = min(a + step_a, keep_start)
        new_b = max(b - step_b, keep_end)
        if (new_a, new_b) == (a, b):
            # Both edges already touch the protected span; cannot shrink further.
            break
        a, b = new_a, new_b
        n_tokens = _count_tokens(text[a:b])
    return a, b, n_tokens


def _make_row(base, is_impossible, answer_start, answer_end, context, n_tokens):
    """Build one example dict.

    Keys are always inserted in the same order so the resulting DataFrame has
    the same column order whether the first row is a positive or a negative.
    """
    row = dict(base)
    row["is_impossible"] = is_impossible
    row["answer_start"] = answer_start
    row["answer_end"] = answer_end
    row["context"] = context
    row["context_token_length"] = n_tokens
    return row


for rid, gts in tqdm(gt_map.items()):
    # Per-document fields shared by every example generated from it.
    base = {
        "Id": rid,
        "is_multi": int(len(gts) > 1),
        "ground_truth": sep.join(sorted(gts)),
        "dataset_labels": sep.join(sorted(dl_map[rid])),
    }
    # JSON is UTF-8 by specification; do not rely on the platform default.
    with open(f"input/train/{rid}.json", encoding="utf-8") as in_file:
        sections = json.load(in_file)
    text = " ".join(section["text"] for section in sections).strip()
    text = snlp.to_ascii_str(text)

    # Positive examples: three windows per occurrence of each dataset label.
    spans = []
    for dl in dl_map[rid]:
        if not dl:
            # An empty label would match at every offset and never advance.
            continue
        search_from = 0
        while True:
            p = text.find(dl, search_from)
            if p == -1:
                break
            q = p + len(dl)
            spans.append((p, q))
            search_from = q

            # Right context: answer is on the left edge of window
            a, b, n_tokens = _fit_window(text, p, p + 2 * m, p, q, 0, 2 * step)
            if n_tokens <= max_length:
                rows.append(_make_row(base, 0, 0, len(dl), text[a:b], n_tokens))

            # Left context: answer is on the right edge of window
            a, b, n_tokens = _fit_window(
                text, max(0, q - 2 * m), q, p, q, 2 * step, 0
            )
            if n_tokens <= max_length:
                rows.append(_make_row(base, 0, p - a, q - a, text[a:b], n_tokens))

            # Left and right context: answer is in the middle of window
            a, b, n_tokens = _fit_window(text, max(0, p - m), q + m, p, q, step, step)
            if n_tokens <= max_length:
                context = text[a:b]
                # text[p:q] is guaranteed to be inside the window, so index()
                # cannot raise. As before, the first occurrence of the label in
                # the window is the one recorded.
                answer_start = context.index(dl)
                rows.append(
                    _make_row(
                        base, 0, answer_start, answer_start + len(dl), context, n_tokens
                    )
                )

    # Negative examples: random windows that touch no label occurrence.
    negative_examples = min(
        max_negative_examples, len(text) // (2 * m) - len(spans)
    )
    # Clamp so randint never receives an empty range (len(text) == 2 * m).
    last_start = max(0, len(text) - 1 - (2 * m))
    # Bound the number of draws: when every window overlaps a span, rejection
    # sampling would otherwise spin forever.
    attempts_left = max_negative_attempts * max(0, negative_examples)
    n_found = 0
    while n_found < negative_examples and attempts_left > 0:
        attempts_left -= 1
        a = random.randint(0, last_start)
        b = a + 2 * m
        # Closed-interval intersection; also rejects a span that fully
        # contains the window.
        if any(start <= b and end >= a for start, end in spans):
            continue
        centre = a + m
        a, b, n_tokens = _fit_window(text, a, b, centre, centre, step, step)
        rows.append(_make_row(base, 1, -1, -1, text[a:b], n_tokens))
        n_found += 1

  0%|          | 69/14316 [00:01<05:42, 41.65it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (530 > 512). Running this sequence through the model will result in indexing errors
100%|██████████| 14316/14316 [08:08<00:00, 29.33it/s] 


In [31]:
# Assemble the examples into a frame and downcast the integer columns.
df = pd.DataFrame.from_records(rows)
for cols, dtype in (
    (["is_multi", "is_impossible"], np.int8),
    (["answer_start", "answer_end", "context_token_length"], np.int16),
):
    df[cols] = df[cols].astype(dtype)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 360000 entries, 0 to 359999
Data columns (total 9 columns):
 #   Column                Non-Null Count   Dtype 
---  ------                --------------   ----- 
 0   Id                    360000 non-null  object
 1   is_multi              360000 non-null  int8  
 2   ground_truth          360000 non-null  object
 3   dataset_labels        360000 non-null  object
 4   is_impossible         360000 non-null  int8  
 5   answer_start          360000 non-null  int16 
 6   answer_end            360000 non-null  int16 
 7   context               360000 non-null  object
 8   context_token_length  360000 non-null  int16 
dtypes: int16(3), int8(2), object(4)
memory usage: 13.7+ MB


In [32]:
# Relative frequency of each is_multi value across the generated examples.
df.is_multi.value_counts(normalize=True)

0    0.524258
1    0.475742
Name: is_multi, dtype: float64

In [33]:
# Relative frequency of each is_impossible value across the generated examples.
df.is_impossible.value_counts(normalize=True)

0    0.50835
1    0.49165
Name: is_impossible, dtype: float64

In [34]:
token_lengths = df["context_token_length"]
# No context may tokenize to more than max_length ids.
assert token_lengths.le(max_length).all()
token_lengths.describe(percentiles=percentiles)

count    360000.000000
mean        418.751139
std          58.558810
min           4.000000
1%          134.000000
5%          353.000000
10%         370.000000
20%         387.000000
30%         399.000000
40%         411.000000
50%         422.000000
60%         434.000000
70%         447.000000
80%         463.000000
90%         482.000000
95%         491.000000
99%         500.000000
max         502.000000
Name: context_token_length, dtype: float64

In [35]:
# Eyeball 20 randomly drawn examples.
df.sample(n=20)

Unnamed: 0,Id,is_multi,ground_truth,dataset_labels,is_impossible,answer_start,answer_end,context,context_token_length
280982,fd4203de-fbf8-49ef-bbd4-430ec454cd68,1,coastal change analysis program|noaa c cap,Coastal Change Analysis Program|NOAA C-CAP,1,-1,-1,"mats could be found on eelgrass beds and bare substrate without distinguishable spectral variation, creating a broader range of spectral signatures found within the bay. Furthermore, there are two species of eelgrass that thrive in Willapa Bay, each with a unique signature. North American eelgrass (Zostera Marina) is larger leafed than Asian eelgrass (Zostera Japonica) and tends to occupy the inter-tidal zone (Backman, 1991;Phillips, 1984). The lighter hue of Asian eelgrass could easily be detected on the tidal flats, but could not be clearly distinguished from North American eelgrass in the inter-tidal zone. The combination of the two eelgrass species in the inter-tidal zone provided a unique spectral signature of its own, leading to three different signatures of eelgrass. There were other unidentifiable spectral signatures for which no source could be located. Several regions in the northern end of the study area had received sediment deposition following the flight mission and consequently no source for reflectance could be determined. The use of preliminary data gathered to measure spectral signatures was critical for the accurate interpretation of the Willapa Bay eelgrass photo set and will prove to be an integral component to future eelgrass distribution studies in Willapa Bay. Determining the size of the minimum mapping unit for this study proved to be problematic due to the presence of a non-native rush, Spartina spp., throughout Willapa Bay. C-CAP guidelines suggest that a minimum mapping unit of .03 hectares on the ground is appropriate for SRV distribution studies under ideal conditions. 
However, the prolific growth of Spartina spp., throughout Willapa Bay created a situation where countless island polygons of Spartina spp., clones would need to be excluded from eelgrass beds if the .03 hectare minimum mapping unit was used. To avoid mapping Spartina spp. rather than eelgrass, the minimum mapping unit size was increased to 1/10"" or 2.54 millimeters on the",407
320698,76d0526a-4dd6-478e-bd6e-d0f331e41cd3,0,slosh model,SLOSH model,1,-1,-1,"y in Moses Bayou is also impaired from bacteria, dioxin in edible tissue, and PCBs in edible tissue. Total maximum daily loads (TMDLs) in Moses Lake and Bayou for dioxin in edible tissue and PCBs in edible tissue are planned. Additional data is being collected before a management strategy is selected for the bacteria impairment.\nThere are two major tidal inlets into Corpus Christi Bay. Aransas Pass (Corpus Christi Ship Channel), between Mustang Island and San Jose Island, which accounts for the majority of the tidal exchange between the bay and the Gulf of Mexico. Packery Channel, between the southwestern end of Mustang Island and North Padre Island, is manmade inlet that supplies a lesser amount of the bay's tidal exchange. Overall, the natural depth of the bay is relatively shallow, with an average depth of approximately 9 feet. Tides in Corpus Christi Bay under normal conditions are very small in amplitude, usually less than 3 feet between low and high tide. Wind speed and direction within Corpus Christi Bay plays an important role in affecting tide elevation. It can dampen or enhance the height of waves as well as their potential energy. Prevailing winds are from the southeast, with occasional strong northerly winds that are associated with passing cold fronts. Winds combined with seasonal tide events can greatly exacerbate the tidal range as well as move the range up or down by 1 or 2 feet. Storm tides during Category 4 or 5 hurricanes could be as high as 15-20 feet above normal water levels according to NOAA's Sea, Lake, and Overland Surge from Hurricanes (SLOSH) Model. The project site is located along Indian Point in Upper Corpus Christi Bay. Indian Point is a small peninsula that extends from the northeastern shore and extends towards Rincon Point on the southwestern shore. The two peninsulas separate Nueces Bay from Upper Corpus Christi Bay. 
Conditions within project area are primarily influenced by Corpus Christi Bay. The hydrology of the area is affected",400
31928,1126d384-a39e-4e9a-9570-d11df665f77e,1,ibtracs|international best track archive for climate stewardship,IBTrACS|International Best Track Archive for Climate Stewardship,0,0,7,"IBTrACS data for any data source that reported both MSW and MCP. While this fitting method deviates from recommendations by KZ07 regarding how the fit should be performed (i.e., observations should be binned and then fit), the focus here is on interagency and interannual differences rather than the absolute accuracy of any one formula. Such an analysis shows how operational procedures might have changed over time. The WPR parameters are shown in Fig. 3 as a time series along with the root-mean-square (RMS) error of the empirical fit and V 920, the wind speed corresponding to MCP 5 920 hPa. The RMS values provide insights into how much an agency followed any WPR. For instance, AR ended in 1987, thus forcing agencies to estimate intensity from satellites more often, as is done in Dvorak (1984) . The result is that after 1987, the maximum RMS is small (about 6 kt) because both wind and pressure were derived from the same satellite estimate. Conversely, RMS values in the early record (e.g., before 1970) show RMS values exceeding 15 kt, implying a consistent WPR was not routinely used to constrain wind to pressure or vice versa. It is of note that the RMS rarely shows a step change in any time series. The RMSs for TD-9635 and CMA gradually drift from more than 10 kt to near 5 kt in the 1980s. An exception is the JTWC switch to a different WPR in 2007.\nThe impact of any change in procedures can be seen in the V 920 time series (Fig. 3, bottom) . Given the small bias compared to AR pressure data, any change in this value implies that there is likely a temporal bias in the reported wind speeds that could preclude climatic analysis (i.e., direct comparisons between years with vastly different V 920 ). The CMA and TD-9635 show a drift from 140-160 kt in the early record to 120 kt in the 1970s. 
It is likely a change in operational practice caused this gradual change. Given the previous validation of MCP from CMA (cf. Fig. 2) , it is possible that winds for CMA before 1970 are",470
333702,b96f4d84-14bf-46b5-b777-5dd3b40e8acc,0,census of agriculture,Census of Agriculture,1,-1,-1,"les; these two tasks require different types of capital, production inputs, and labor skills, which inhibit the farms' ability to diversify across these product categories.\nOnly 6.64% of the farms in the sample were located in an urban area. The concept of a CSA program is grounded in linking consumers to their food source and farmers. As a result, CSA programs sometimes require members to ""volunteer"" on the farm. Some volunteers might help with the planting, others with weeding during the growing season and others with the harvest. Approximately 15% of the CSA farms in our sample required members to work on the farm a mandatory number of hours. Proposition 1 predicts that the price difference between yield shares and weight shares will be determined by two potentially competing factors, the yield premium and the risk premium.\nGiven the anecdotal evidence discussed above, we expect that the yield premium will be near zero (though this is not testable directly in our data set) and that the risk premium will be positive (weight shares cost more) due to negative covariance of prices and yields. We conduct a regression analysis to test the resulting prediction that weight shares will be more costly, on average, and to estimate the extra cost under the assumption that the yield premium is zero.\nSpecifically, we model the price of a share as a function of the share type while controlling for other factors that affect share price, including the products produced by the farm and offered through the CSA program, the farming practices, the pick-up location, the state where the CSA farm is located, and other outlets through which the farm sells its products. We estimate the following equation using ordinary least squares (OLS). Our econometric model is a basic hedonic pricing approach. 
Firm-level data, in this case the prices charged by the CSA farms, can only be used to estimate hedonic models if price-cost markups are first eliminated and the competitiveness of the market is",399
722,21084fb9-ec0a-4da5-81dc-8863c9920f98,1,education longitudinal study|national education longitudinal study,Education Longitudinal Study|National Education Longitudinal Study,1,-1,-1,"ristics that are relatively changeable and observable to students and faculty. Global school characteristics were included to adequately assess the effect of attending a Catholic school by controlling for other important global school characteristics.\nVariables in this category were Catholic school status, enrollment size, average pre-test scores, average parental SES, percentage of minority students, and institutional location (urban, suburban, or rural) . Aggregate pre-test scores and mean SES were treated as global school characteristics because these variables must be controlled to assess the effects of Catholic schools. Internal school characteristics were included to understand the reasons for any effects of Catholic schools. Examining internal characteristics can also help us determine the kind of school policy or environment that can positively or negatively affect students' academic development. The internal school variables were monitoring of academic progress, strictness of school rules, extent of school's encouragement for parental support and involvement, teachers' morale, students' morale, and teacher-student ratios. See Appendix A for the list of all the variables and their coding schemes. We began the analysis by generating descriptive statistics such as means, standard deviations, and correlations. Table 1 presents the means and standard deviations of variables included in HLM analysis, as well as the correlation coefficients between the variables and Catholic schools. 
Except for Catholic school, mean pretests, student's perception of each subject's usefulness, and parental education, the listed variables had significant positive or negative effects on at least one of the outcomes when included with other predictors in the HLM models.\nTo test the null hypothesis that there is no significant difference in development of academic achievement in reading, mathematics, history/social studies, and science between Catholic and non-Catholic private secondar",352
187483,df236d27-6538-40ca-b25c-eb98ffcb1f03,1,adni|alzheimer s disease neuroimaging initiative adni,ADNI|Alzheimer's Disease Neuroimaging Initiative (ADNI),0,0,4,"ADNI: www.adni-info.org). A further limitation of the present work is the significantly younger age of the control group in comparison to the patient cohort. However, this aspect might only have contributed to the discrimination between dementia patients and control subjects but not to the high classification accuracy of AD and FTLD patients as these were very similar in their age range. If age contributed to the classification accuracy there should be lower classification accuracy for young dementia patients and older control subjects as they did not differ in age. For comparison of both types of dementia patients and control subjects the classification accuracy did not differ for younger and older dementia patients although half of the patients were in the same age range as the control group. In AD group all patients were classified correctly. In FTLD group one younger and one older patient were misclassified. Independently of age, all control subjects were classified correctly for both comparisons. These results indicate that the slight mean age differences is not the decisive factor for the high discrimination accuracy using combined information from FDG-PET and MRI. Furthermore, if age still slightly contributed to the high discrimination of dementia patients and control subjects this contribution was also present in all other single modality and multimodal whole-brain and ROI-based SVM classifications applied in this study. Therefore, age cannot account for increased differentiation accuracies when combined ROI information from FDG-PET and MRI are used for differentiation of dementia patients and control subjects.\nAnother point is that subjects in the control group in our study reported subjective cognitive complaints which might have limited the interpretation of the results of our study. 
However, only subjects were included whose cognitive complaints were not confirmed by comprehensive neuropsychological evaluation. The CDR is a semi-structural interview and",357
219837,52ddbae0-b9d5-4090-b00c-f24a2d29e7cb,0,optimum interpolation sea surface temperature,Optimum Interpolation Sea Surface Temperature,0,1955,2000,"rk's tag with profiles of water temperature from IMOS floats in the same region where the shark was resident. The maximum depth of descent was assumed to be the greatest depth in the water temperature profile where the minimum temperatures reported by the tag and those of the water temperature profile were the same. For the track with multiple home range cores (Shark 5), movement patterns were categorised as within and outside the 25% utilisation distribution. We then used generalised linear models with a binomial distribution and a logit link function to assess the relationship between the probability of the shark being in a home range core and water temperature, bathymetry and region of the WA coast (north-latitude < 24S and south-latitude >24S). We were not able to fit all three explanatory variables in one model as there was no overlap of the temperature ranges between the two regions. Consequently, we fitted a model to examine the probability of being in a home range core in relation to bathymetry and region and two separate models (using the data from north and south coasts, respectively) to examine the probability of the shark being in a home range core in relation to sea surface temperature. To address the autocorrelation present in the data we used a matched-block bootstrap sampling for all models with replacement procedure [71, 72] that resampled blocks of data randomly and then recombined them in a random order, creating a bootstrapped dataset that minimized the effect of autocorrelation [71] [72] [73] . Model fitting was applied to 100 bootstrapped samples and model selection used the sample-corrected Akaike's information criterion (AIC c ), AIC c weight ( w AIC c ), and percent deviance explained (%DE) [74, 75] . 
Bathymetry data with a grid resolution of 2' from ETOPO1 database hosted by the NOAA was obtained by the R software package marmap [76] . Daily Sea Surface Temperature was obtained through the daily Optimum Interpolation Sea Surface Temperature",437
229950,8cb297a6-ad60-4761-824e-96f846b74c93,0,national assessment of education progress,National Assessment of Education Progress,1,-1,-1,"at the 1% level of significance thereby supporting the Alternate Hypothesis that the integration of economics in the curriculum has a statistically significant impact on economics outcome scores. Taking this analysis one step further we determined, again at the 1% level of significance, that in 1994, school systems that had integrated economics education in the curriculum scored on average 6 points higher than other school systems both at grade level 5 and 8; and 4 points higher at grade level 3. In 1995, it was deter-\nmined, again at the 1% level of significance that school systems that had integrated economics in the curriculum scored on average 8 points higher than other school systems at grade level 8; and 6 points higher at grade level 3 and 5. In 1997, it was determined, again at the 1% level of significance that school systems that had integrated economics in the curriculum scored on average 9 points higher than other school systems at grade level 3, 5, and 8. Finally, in 2000, it was determined that school systems that had integrated economics in the curriculum scored 4, 8, and 7 points higher than other school systems at grade level 3, 5, and 8, respectively. With mean economics outcome scores on the MSPAP often ranging in the twenties and thirties, a 4-9 point differential is significant. School systems where integration of economics in the curriculum is at one or less grade levels. Notes: * Significant at the 1% level Yes1: School systems with systematic integration of economics in the curriculum at 2 or more grade levels. No2 :\nSchool systems where integration of economics in the curriculum is at one or less grade levels. School systems where integration of economics in the curriculum is at one or less grade levels. 
Although the hypothesis testing undertaken in the previous section established a statistically significant correlation between economics outcome scores and the level of economics instruction it does not explain this relationship.\nIn the case",378
100677,7a9e289d-de6c-4b41-a0e1-336f92324b20,1,adni|alzheimer s disease neuroimaging initiative adni,ADNI|Alzheimer's Disease Neuroimaging Initiative (ADNI),0,856,860,"VEL correction are more enriched for brain regions known to undergo structural changes in AD. Finally, we show that the average hippocampal intensity after RAVEL correction performs better than intensity-normalized-only images in discriminating between AD patients and healthy controls, and between MCI patients and healthy controls. This shows that RAVEL-corrected T1-w intensities are more biologically meaningful than intensity-normalized-only images for group comparisons, and therefore potentially promising for the development of biomarkers.\nAlthough we apply RAVEL in the context of T1-w MRI of the brain, our method is generalizable to many imaging modalities. In addition, the flexibility in the choice of the control voxels makes RAVEL applicable to any disease or pathology. Our dataset consists of a subset of 917 subjects downloaded from the ADNI database (adni.loni.usc. edu). For each subject, we selected a study visit at random. We obtained 506, 184 and 227 subjects from the ADNI, ADNI-2 and ADNI-GO phases, respectively. We present summary statistics of the study population in Table 1 . The selected scans were acquired at 83 different imaging sites, with a median number of 10 patients per site. The scans were also well-balanced for disease status across sites. We considered T1-w imaging acquired on 1.5 and 3 T scanners according to the ADNI standardized protocol . All analysis was performed in R [R Core Team, 2014] , using the packages oro.nifti [Whitcher et al., 2011] , fslr , ANTsR [Avants et al., 2015] and WhiteStripe [Shinohara and Muschelli, 2015] .\nWe applied the N4 inhomogeneity correction algorithm [Tustison et al., 2010 ] to each image. 
We nonlinearly registered all T1-w images to a high-resolution T1-w image atlas [Oishi et al., 2010] , using the symmetric diffeomorphic image registration algorithm [Avants et al., 2008] implemented in the ANTs suite. We use non-linear registration in order to define a brain control region aligned across subjects and to fi",468
22818,ee7ce475-bb68-4d7e-a239-eb4c7bc77ee0,0,noaa tide gauge,NOAA Tide Gauge,1,-1,-1,"ential neighborhoods and some commercial development. Though some areas of the creek are open to shell fishing (at the lower reaches of the creek), Pages Creek has experienced a decline in the overall water quality. This creek has also seen an increase in bacteria levels during and immediately after rain events, indicating the impacts of storm water runoff from impervious surfaces [23] . As a result, Pages Creek is routinely monitored and regulated through a joint effort between federal and state agencies. [5] method may provide improvement to bathymetric mapping.\nPages Creek is located in southeastern North Carolina, United States, and drains approximately 4100 acres into the Atlantic Intracoastal Waterway (ICW). Approximately 4500 people live within the boundary of this watershed (Figure 1 ). The Pages Creek watershed consists mostly of residential neighborhoods and some commercial development. Though some areas of the creek are open to shell fishing (at the lower reaches of the creek), Pages Creek has experienced a decline in the overall water quality. This creek has also seen an increase in bacteria levels during and immediately after rain events, indicating the impacts of storm water runoff from impervious surfaces [23] . As a result, Pages Creek is routinely monitored and regulated through a joint effort between federal and state agencies. The application of remote sensing analyses to map marsh habitats and shallow water bathymetry has not been done using this combination of fieldwork, WV-2 imagery, and LiDAR data. \nThe application of remote sensing analyses to map marsh habitats and shallow water bathymetry has not been done using this combination of fieldwork, WV-2 imagery, and LiDAR data. 
Similar studies have successfully conducted mapping of marsh habitats using various combinations of imagery and classification methods; however, it has been difficult to distinguish between salt marsh species because of spectral similarities in small geographic areas [2,",375


In [36]:
# Write the examples to parquet without the index.
df.to_parquet(
    "output/train.parquet",
    index=False,
)