In [1]:
from arguseyes import ArgusEyes
import logging

# Configure the root logger to emit INFO-and-above records as "<LEVEL>:<message>",
# which is the format of the progress messages recorded under the next cell.
logging.basicConfig(level=logging.INFO, format='%(levelname)s:%(message)s')

# NOTE(review): the two positional arguments look like an experiment/run label and the
# path of an MLflow tracking directory — confirm against the ArgusEyes constructor.
eyes = ArgusEyes('demo-bojan2', '../../mlruns')

In [2]:
# Run the pipeline script through ArgusEyes and keep a handle to the captured pipeline.
# According to the log recorded for this cell, the call executes the script instrumented
# with mlinspect, redirects the script's stdout to arguseyes-pipeline-output.txt, and
# identifies the fact and dimension tables that feed the training data.
pipeline = eyes.classification_pipeline_from_py_file('./amazon-reviews.py')

INFO:Patching sys.argv with ['eyes']
INFO:Created run 6de4416c249e4b859bc54f98f6ec3a33 for this invocation
INFO:Executing instrumented user pipeline with mlinspect
INFO:Redirecting the pipeline's stdout to arguseyes-pipeline-output.txt
INFO:Identifying training sources
INFO:Found dimension table from operator 1 with 10936 records and the following attributes: ['product_id', 'product_parent', 'product_title', 'category_id', 'mlinspect_lineage']
INFO:Found dimension table from operator 2 with 2 records and the following attributes: ['id', 'category', 'mlinspect_lineage']
INFO:Found dimension table from operator 3 with 246560 records and the following attributes: ['review_id', 'star_rating', 'helpful_votes', 'total_votes', 'mlinspect_lineage']
INFO:Found fact table from operator 0 with 246560 records and the following attributes: ['marketplace', 'customer_id', 'review_id', 'product_id', 'vine', 'verified_purchase', 'review_headline', 'review_body', 'review_date', 'mlinspect_lineage']
INFO

In [3]:
# Display the dimensions of the training features captured from the pipeline
# (presumably rows x feature columns — confirm against the ArgusEyes pipeline API).
pipeline.X_train.shape

(24214, 104)

In [4]:
# Display the dimensions of the test labels captured from the pipeline.
# NOTE(review): the previous cell inspects the *train* features while this one inspects
# the *test* labels; check whether X_test / y_train should be shown as well.
pipeline.y_test.shape

(2861, 1)

In [5]:
# Print one entry per training input: its source type followed by the column index
# of the underlying data frame.
for source in pipeline.train_sources:
    role = source.source_type
    print(role, source.data.columns)

SourceType.DIMENSION Index(['product_id', 'product_parent', 'product_title', 'category_id',
       'mlinspect_lineage'],
      dtype='object')
SourceType.DIMENSION Index(['id', 'category', 'mlinspect_lineage'], dtype='object')
SourceType.DIMENSION Index(['review_id', 'star_rating', 'helpful_votes', 'total_votes',
       'mlinspect_lineage'],
      dtype='object')
SourceType.FACTS Index(['marketplace', 'customer_id', 'review_id', 'product_id', 'vine',
       'verified_purchase', 'review_headline', 'review_body', 'review_date',
       'mlinspect_lineage'],
      dtype='object')


In [6]:
# Display the data of the training source at position 3, which is the entry reported as
# SourceType.FACTS by the listing printed in the previous cell.
# NOTE(review): the index is positional — assumes the order of train_sources is stable
# between runs; TODO confirm, or select the source by its source_type instead.
pipeline.train_sources[3].data

Unnamed: 0,marketplace,customer_id,review_id,product_id,vine,verified_purchase,review_headline,review_body,review_date,mlinspect_lineage
0,US,21269168,RSH1OZ87OYK92,B013PURRZW,N,N,A slight improvement from last year.,I keep buying madden every year hoping they ge...,2015-08-31,"{LineageId(operator_id=0, row_id=0)}"
1,US,133437,R1WFOQ3N9BO65I,B00F4CEHNK,N,Y,Five Stars,Awesome,2015-08-31,"{LineageId(operator_id=0, row_id=1)}"
2,US,45765011,R3YOOS71KM5M9,B00DNHLFQA,N,Y,Hail to the great Yuri!,If you are prepping for the end of the world t...,2015-08-31,"{LineageId(operator_id=0, row_id=2)}"
3,US,113118,R3R14UATT3OUFU,B004RMK5QG,N,Y,Five Stars,Perfect,2015-08-31,"{LineageId(operator_id=0, row_id=3)}"
4,US,22151364,RV2W9SGDNQA2C,B00G9BNLQE,N,Y,Five Stars,Awesome!,2015-08-31,"{LineageId(operator_id=0, row_id=4)}"
...,...,...,...,...,...,...,...,...,...,...
246555,US,41754720,R19OFJV91M7D8X,B000YMR61A,N,N,"Easy to use, 1 comment 1 serious problem",I chose the deluxe version CD because of mortg...,2008-02-11,"{LineageId(operator_id=0, row_id=246555)}"
246556,US,51669529,R1I6G894K5AGG5,B000YMR61A,N,N,Schedule C IS for business- figures it would ...,"Schedule C IS for business, so figures it wou...",2008-02-08,"{LineageId(operator_id=0, row_id=246556)}"
246557,US,24731012,R17OE43FFEP81I,B000YMR5X4,N,N,Hassel to download,I wish that companies can test several scenari...,2008-02-05,"{LineageId(operator_id=0, row_id=246557)}"
246558,US,16049580,R15MGDDK63B52Z,B000YMR61A,N,N,beware of vista,i just installed turbotax deluxe 2007. If you ...,2008-02-05,"{LineageId(operator_id=0, row_id=246558)}"


In [7]:
from arguseyes.issues import LabelShift

# Run the LabelShift check on the captured pipeline; the returned Issue carries an
# `is_present` flag plus `threshold` and `p_value` details.
# NOTE(review): in the recorded result the p_value (~0.0123) is only marginally above the
# threshold (0.01), so `is_present=False` is a borderline outcome — presumably the issue
# is flagged when p_value < threshold; confirm against the LabelShift implementation.
pipeline.detect_issue(LabelShift())

Issue(id='label_shift', is_present=False, details={'threshold': 0.01, 'p_value': 0.012333776748022897})

In [8]:
from arguseyes.issues import TrainTestOverlap

# Run the TrainTestOverlap check; the returned Issue reports `num_overlapping_records`
# in its details alongside the `is_present` flag.
pipeline.detect_issue(TrainTestOverlap())

Issue(id='traintest_overlap', is_present=False, details={'num_overlapping_records': 0})

In [9]:
from arguseyes.refinements import DataValuation

# Compute the DataValuation refinement. The result exposes a `.data` frame that the next
# cell sorts by its `__arguseyes__shapley_value` column.
# NOTE(review): this is presumably the expensive step of the notebook — consider timing it.
valued_source = pipeline.compute(DataValuation())


In [10]:
# Display the valued records ordered by Shapley value. sort_values defaults to ascending
# order, so the most negative values come first and the largest positive values last.
valued_source.data.sort_values(by='__arguseyes__shapley_value')

Unnamed: 0,marketplace,customer_id,review_id,product_id,vine,verified_purchase,review_headline,review_body,review_date,mlinspect_lineage,__arguseyes__shapley_value
6623,US,32645745,ROMK0DESUAGXM,B00F4CEOS8,N,Y,Bummer,You can't use this 4 xbox live,2015-06-30,"{LineageId(operator_id=0, row_id=6623)}",-0.081228
22267,US,1873730,R1GTSX1T67W9VT,B00F4CEWOY,N,Y,Five Stars,It works,2015-02-16,"{LineageId(operator_id=0, row_id=22267)}",-0.015819
13206,US,4884720,R3LKV31LDXZ80E,B00F4CFBUS,N,Y,One Star,Didn't come,2015-04-26,"{LineageId(operator_id=0, row_id=13206)}",-0.014562
6103,US,51193051,RLW7D0JVKSTFW,B00R6HA3XY,N,Y,"I don't know why, but this game is totally awe...","First of all, I want to admit that I am a Mort...",2015-07-06,"{LineageId(operator_id=0, row_id=6103)}",-0.011098
5727,US,37240099,R4B0L5TUQJHNM,B00F4CEWOY,N,Y,Easy Peasy,Quick and simple; it beats the heck out of hav...,2015-07-09,"{LineageId(operator_id=0, row_id=5727)}",-0.011067
...,...,...,...,...,...,...,...,...,...,...,...
28463,US,6762254,R21ADNR2NHLKXO,B00F4CF39C,N,Y,said it was use,didnt work,2015-01-05,"{LineageId(operator_id=0, row_id=28463)}",0.018728
17084,US,1078691,RU9CMHFAHGLA6,B00F4CF4PU,N,Y,,I bought this for an Xbox live deal but you ca...,2015-03-25,"{LineageId(operator_id=0, row_id=17084)}",0.018728
27490,US,12907607,RJ50WZWSQURCI,B001KC03ZE,N,Y,A Great Game,"I like Jewel Quest III, although not as well a...",2015-01-11,"{LineageId(operator_id=0, row_id=27490)}",0.022424
10525,US,12355424,R1GDH9VAISUPMZ,B007TY85PC,N,Y,Great,Great,2015-05-23,"{LineageId(operator_id=0, row_id=10525)}",0.022432


In [12]:
from mlinspect.inspections._inspection_input import OperatorType
from arguseyes.utils.dag_extraction import find_dag_node_by_type

# Mapping from DAG nodes of the captured pipeline to their lineage-annotated data frames.
dag_node_to_lineage_df = pipeline.dag_node_to_lineage_df

# Pick the DAG node whose operator type is TRAIN_DATA among the mapping's keys.
candidate_nodes = dag_node_to_lineage_df.keys()
features_op = find_dag_node_by_type(OperatorType.TRAIN_DATA, candidate_nodes)

# Display the frame recorded for that node.
dag_node_to_lineage_df[features_op]

Unnamed: 0,array,mlinspect_lineage
0,"[-2.047682888564837, 1.0, 1.0, 1.0, 0.20412414...","{LineageId(operator_id=3, row_id=3193), Lineag..."
1,"[0.635357883280886, 1.0, 1.0, 1.0, 0.0, 0.0, 0...","{LineageId(operator_id=1, row_id=1), LineageId..."
2,"[0.635357883280886, 1.0, 1.0, 1.0, 0.0, 0.0, 0...","{LineageId(operator_id=2, row_id=0), LineageId..."
3,"[0.635357883280886, 1.0, 1.0, 1.0, 0.0, 0.0, 0...","{LineageId(operator_id=1, row_id=1), LineageId..."
4,"[0.635357883280886, 1.0, 1.0, 1.0, 0.088388347...","{LineageId(operator_id=2, row_id=0), LineageId..."
...,...,...
24209,"[0.635357883280886, 1.0, 1.0, 1.0, 0.0, -0.182...","{LineageId(operator_id=1, row_id=3943), Lineag..."
24210,"[-2.047682888564837, 1.0, 1.0, 1.0, 0.0, 0.0, ...","{LineageId(operator_id=2, row_id=0), LineageId..."
24211,"[-0.03540230968054473, 1.0, 1.0, 1.0, 0.0, -0....","{LineageId(operator_id=2, row_id=0), LineageId..."
24212,"[-0.7061625026419754, 1.0, 1.0, 1.0, 0.0, 0.17...","{LineageId(operator_id=2, row_id=0), LineageId..."


In [10]:
import mlflow
# End the currently active MLflow run so that it is not left open after the notebook
# finishes (presumably the run reported as "Created run ..." when the pipeline was
# executed — confirm that ArgusEyes does not close it itself).
mlflow.end_run()