In [1]:
from arguseyes.retrospective import PipelineRun, DataLeakageRetrospective

# Handle for a single pipeline run, looked up by its run id.
# NOTE(review): presumably this refers to a previously recorded execution; where the
# run is stored and how the id is resolved is decided inside arguseyes — confirm.
run = PipelineRun(run_id='bc07e7b4c8c54ee694078030860649b2')

In [2]:
# Render the source code associated with this run (shown in the output below).
run.show_source_code()

```Python
# https://github.com/mlflow/mlp-regression-example/
import logging
from typing import Any, Dict, Optional

import pandas as pd
from pandas import DataFrame
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler, FunctionTransformer

# Module-level logger; not referenced by any of the code visible in this listing.
_logger = logging.getLogger(__name__)
# Turns off pandas' chained-assignment (SettingWithCopy) warning for the whole process.
pd.options.mode.chained_assignment = None


def load_file_as_dataframe(file_path: str, file_format: str) -> DataFrame:
    """Read a tabular file into a pandas DataFrame.

    Args:
        file_path: Location of the file to read.
        file_format: Either ``"csv"`` or ``"parquet"`` (compared exactly).

    Returns:
        The file contents as a DataFrame.

    Raises:
        NotImplementedError: If ``file_format`` is neither ``"csv"`` nor ``"parquet"``.
    """
    # The module already imports pandas as ``pd``; no need to re-import it per branch.
    if file_format == "csv":
        return pd.read_csv(file_path)
    if file_format == "parquet":
        return pd.read_parquet(file_path)
    # Name the rejected format so the caller can see what went wrong.
    raise NotImplementedError(
        f"Unsupported file format {file_format!r}; expected 'csv' or 'parquet'"
    )


def filter_dataset(dataset: DataFrame):
    """Return the complete rows of ``dataset`` whose fare and distance are within bounds.

    Rows containing any missing value are discarded first. Of the remainder, only
    rows with ``0 < fare_amount < 1000`` and ``0 < trip_distance < 400`` are kept.
    The input frame itself is left untouched.
    """
    complete_rows = dataset.dropna()
    fare = complete_rows["fare_amount"]
    distance = complete_rows["trip_distance"]
    # All bounds are strict, so values exactly on a limit are excluded.
    within_bounds = (fare > 0) & (distance < 400) & (distance > 0) & (fare < 1000)
    return complete_rows[within_bounds]


def calculate_features(df: DataFrame):
    """Derive pickup day-of-week, pickup hour and trip duration from the trip timestamps.

    The returned frame has ``pickup_dow``, ``pickup_hour`` and ``trip_duration``
    (in minutes) added, the ``tpep_pickup_datetime`` / ``tpep_dropoff_datetime``
    columns removed, and every other ``datetime64`` column converted to ``str``.

    Unlike an in-place implementation, the caller's frame is left unmodified, so the
    function can be applied to the same frame more than once (e.g. when a fitted
    pipeline is re-run on its training data).

    Args:
        df: Frame containing datetime columns ``tpep_pickup_datetime`` and
            ``tpep_dropoff_datetime``.

    Returns:
        A new DataFrame with the derived feature columns.

    Raises:
        KeyError: If either timestamp column is missing.
    """
    pickup = df["tpep_pickup_datetime"]
    dropoff = df["tpep_dropoff_datetime"]

    # ``drop`` returns a new frame, so all writes below stay off the caller's data.
    features = df.drop(columns=["tpep_pickup_datetime", "tpep_dropoff_datetime"])
    features["pickup_dow"] = pickup.dt.dayofweek
    features["pickup_hour"] = pickup.dt.hour
    # Vectorised timedelta -> minutes instead of a per-row Python lambda.
    features["trip_duration"] = (dropoff - pickup).dt.total_seconds() / 60

    # Any datetime column that is still present is passed on as plain strings.
    remaining_datetime_columns = list(features.select_dtypes(include=['datetime64']).columns)
    for column in remaining_datetime_columns:
        features[column] = features[column].astype(str)
    return features


def _dense_one_hot_encoder():
    """Build a OneHotEncoder with dense output, using whichever keyword this scikit-learn accepts."""
    try:
        # scikit-learn >= 1.2 names the flag ``sparse_output``.
        return OneHotEncoder(categories="auto", sparse_output=False)
    except TypeError:
        # Older releases only accept the original ``sparse`` keyword.
        return OneHotEncoder(categories="auto", sparse=False)


def transformer_fn():
    """Build the (unfitted) featurization pipeline.

    Steps:
        1. ``calculate_time_and_duration_features`` — applies ``calculate_features``
           through a ``FunctionTransformer``.
        2. ``encoder`` — a ``ColumnTransformer`` that one-hot encodes ``pickup_hour``
           and ``pickup_dow`` (dense output) and standard-scales ``trip_distance``
           and ``trip_duration``.

    Returns:
        A scikit-learn ``Pipeline`` ready to be fitted.
    """
    column_encoder = ColumnTransformer(
        transformers=[
            ("hour_encoder", _dense_one_hot_encoder(), ["pickup_hour"]),
            ("day_encoder", _dense_one_hot_encoder(), ["pickup_dow"]),
            ("std_scaler", StandardScaler(), ["trip_distance", "trip_duration"]),
        ]
    )
    return Pipeline(
        steps=[
            ("calculate_time_and_duration_features", FunctionTransformer(calculate_features)),
            ("encoder", column_encoder),
        ]
    )


def estimator_fn(estimator_params: Optional[Dict[str, Any]] = None):
    """Create the (unfitted) regressor used as the final pipeline step.

    Args:
        estimator_params: Extra keyword arguments forwarded to ``SGDRegressor``.
            ``None`` (the default) or an empty dict means "no extra arguments".
            ``random_state`` is always set to 42 here and must not be passed.

    Returns:
        An ``SGDRegressor`` with ``random_state=42`` plus the given parameters.
    """
    from sklearn.linear_model import SGDRegressor
    # A ``None`` default avoids a shared mutable ``{}`` default and makes an explicit None valid.
    return SGDRegressor(random_state=42, **(estimator_params or {}))


# Load the NYC taxi sample and keep only the rows that pass filter_dataset.
data = load_file_as_dataframe('datasets/nyc-taxi/sample.parquet', 'parquet')
filtered_data = filter_dataset(data)

# Date used to split the trips into train and test by dropoff date.
temporal_split_date = pd.to_datetime('2016-02-15')

# NOTE(review): both comparisons are inclusive (`<=` and `>=`), so every trip whose
# dropoff date equals the split date satisfies both conditions and is placed in the
# training data AND the test data.
train_data = filtered_data[filtered_data['tpep_dropoff_datetime'].dt.date <= temporal_split_date]
test_data = filtered_data[filtered_data['tpep_dropoff_datetime'].dt.date >= temporal_split_date]

# Featurization and learner chained into a single pipeline.
model = Pipeline([
    ('featurization', transformer_fn()),
    ('learner', estimator_fn())
])

# Fit on the training split, then print the pipeline's score on the test split.
# The frames passed as X still contain the 'fare_amount' target column.
model.fit(train_data, train_data['fare_amount'])
print(model.score(test_data, test_data['fare_amount']))

```

In [3]:
# Open the interactive "Pipeline Data Explorer" widget for this run.
run.explore_data()

# Pipeline Data Explorer

HBox(children=(CytoscapeWidget(cytoscape_layout={'name': 'dagre'}, cytoscape_style=[{'selector': 'node', 'css'…

In [4]:
# Data-leakage retrospective for the run; queried for leaked tuples in the next cell.
retrospective = DataLeakageRetrospective(run)

In [5]:
# Ask the retrospective for the tuples it considers leaked and display them.
# NOTE(review): the exact definition of "leaked" lives in arguseyes — presumably rows
# that occur in both the training and the test data; confirm.
leaked_data = retrospective.compute_leaked_tuples()
leaked_data

Unnamed: 0,tpep_pickup_datetime,tpep_dropoff_datetime,trip_distance,fare_amount,pickup_zip,dropoff_zip
37,2016-02-15 01:45:11,2016-02-15 01:48:40,0.60,4.5,10153,10065
57,2016-02-15 08:51:02,2016-02-15 09:06:27,5.50,17.0,11371,11379
73,2016-02-15 23:03:19,2016-02-15 23:38:15,11.40,35.0,11371,10011
77,2016-02-15 16:41:54,2016-02-15 17:38:20,18.43,52.0,11422,10011
144,2016-02-15 12:44:51,2016-02-15 13:07:39,3.48,16.5,10011,10022
...,...,...,...,...,...,...
9016,2016-02-15 21:02:47,2016-02-15 21:05:27,1.06,5.0,10035,10029
9045,2016-02-15 18:21:15,2016-02-15 18:32:51,1.51,9.0,10119,10103
9084,2016-02-15 06:46:32,2016-02-15 06:48:29,0.47,3.5,10044,10021
9113,2016-02-15 11:21:50,2016-02-15 11:27:38,0.84,5.5,10119,10199


In [6]:
# Summary statistics of trip_distance over the leaked tuples.
leaked_data.trip_distance.describe()

count    177.000000
mean       3.016497
std        3.913846
min        0.300000
25%        1.000000
50%        1.600000
75%        2.870000
max       19.010000
Name: trip_distance, dtype: float64

In [7]:
# Number of leaked tuples per pickup date.
leaked_data.tpep_pickup_datetime.dt.date.value_counts()

2016-02-15    175
2016-02-14      2
Name: tpep_pickup_datetime, dtype: int64

In [8]:
# Number of leaked tuples per dropoff date (the column the pipeline splits on).
leaked_data.tpep_dropoff_datetime.dt.date.value_counts()

2016-02-15    177
Name: tpep_dropoff_datetime, dtype: int64