In [1]:
from io import BytesIO,StringIO
import datetime

from nanoHUB.application import Application
from nanoHUB.pipeline.geddes.data import get_default_s3_client
import botocore.client as s3client

import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)  # or 1000
pd.set_option('display.max_rows', None)  # or 1000
pd.set_option('display.max_colwidth', None)  # or 199

application = Application.get_instance()
s3_client = get_default_s3_client(application)

bucket_name_raw = 'nanohub.raw'
bucket_name_processed = 'nanohub.processed'
buckets = ['nanohub.processed']

alg_types = ['mike', 'xufeng']

[1mnanoHUB - Serving Students, Researchers & Instructors[0m


In [2]:
class DateParser:
    def create_time_probe(self, from_date: datetime, to_date: datetime) -> str:
        raise NotImplemented

        
class UnderscoredDateParser(DateParser):
    def create_time_probe(self, from_date: datetime, to_date: datetime) -> str:
        return from_date.strftime("%Y-%m-%d") + '_' + to_date.strftime("%Y-%m-%d") 
    
    
class S3FileReader:
    def __init__(self, client: s3client, bucket: str):
        self.client = client
        self.bucket = bucket
    
    def read(self, file_path: str) -> pd.DataFrame:
        for key in self.client.list_objects(Bucket=self.bucket, Prefix=file_path)['Contents']:
            obj = self.client.get_object(Bucket=self.bucket, Key=key['Key'])
            if key['Key'].endswith('.parquet.gzip'):
                df = pd.read_parquet(BytesIO(obj['Body'].read()))
            else:
                df = pd.read_csv(BytesIO(obj['Body'].read()))
            
        return df
        
    
class DataFrameMapperBySemester:
    def __init__(self, reader: S3FileReader, date_parser: DateParser):
        self.reader = reader
        self.date_parser = date_parser
        
        
    def map_for(self, alg_name: str, from_date: datetime, to_date: datetime) -> pd.DataFrame:
        time_probe = self.date_parser.create_time_probe(from_date, to_date)
        file_path = 'clusters/%s/by_semester/%s.csv' % (alg_name, time_probe)
        
        return self.reader.read(file_path)
              

def get_cluster_overlap(df1: pd.DataFrame, df2: pd.DataFrame):

    m_id = 1
    x_id = 1

    overlap_list = []
    for m_line in m_csv:
        m_set = set(m_line)
        for x_line in x_csv:
            x_set = set(x_line)
            both = m_set & x_set
            combined = m_set | x_set
            m_only = m_set - x_set
            x_only = x_set - m_set
            if len(both) > 0:
                overlap_list.append(
                    [m_id, x_id, len(m_line), len(x_line), len(both), list(both), len(m_only), list(m_only),
                     len(x_only), list(x_only), len(combined), list(combined)])
            x_id += 1
        x_id = 1
        m_id += 1

    overlap_list.sort(key=lambda x: (x[0], -x[3]))
    return overlap_list

In [3]:
year = 2008

s3_reader = S3FileReader(get_default_s3_client(application), 'nanohub.processed')
mapper = DataFrameMapperBySemester(s3_reader, UnderscoredDateParser())

df_m = mapper.map_for('mike', datetime.date(year, 7, 2), datetime.date(year, 12, 31))
df_x = mapper.map_for('xufeng', datetime.date(year, 7, 2), datetime.date(year, 12, 31))

In [4]:
display(len(df_m))
display(df_m.head(2))

45

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,128,129,130,131,132,133,134,135,136,137,138,139,140,141,142,143,144,145,146,147,148,149,150,151,152,153,154,155,156,157,158,159,160,161,162,163,164,165,166,167,168,169,170,171,172,173,174,175,176,177,178,179,180,181,182,183,184,185,186,187,188,189,190,191,192,193,194,195,196,197,198,199
0,ilin,jjund,s3suggs,schirumamilla,tzack25,xxu,martabme,nashida,brianchang,nlomotan,ltsai,acolletti,rpatel,hyehan,mbenner,gthippeswamy,jle703,strikex2,ceck,svadner,dbasco,dchun,jsoohoo,devish,eklebanov,prajeetreddy,ejen,athompsonharvey,pagrawal10,fch308,ammerocks,achadha,bmaganti,thebombboi123,dtchen,mkk749,cmpark,dannyboy8m,sgrindy,jchu90,nicholas,eko,bsh319,ndavidson,ashah,lywang,nseetharaman1,ngm654,jennie,jkaplan,jglass,rchen,tslayman,aconroy,alberthong,cberg,dmorris,past10,mkramer,tsa309,runfree909,jshepherd,carlherndon2011,jkrupp,alee,emilywirtz,mhuff,relwood,alex_625,msriram,mayabhardwaj,amalik,jtao,ktp618,kbandi,mme118,ltong,bambielli,ywang,tsaraiya,ngg760,ajl131,mbourgeois,rachelrosen,mwise,mchen2012,lixinxin,mgroner,alcinalidder,falcon36,phirbreethr,dkiebala1,ahassan515,mcee2374,shaikal,apeterson,mwalimujim,athuluvath,mshen2012,kjoshi91,noheneadu,jcasady,mongoosehuman,callee,npetlakh,dbtice,lnur,sfedorikhina,krich55,svs470,aogonek,mlangille,hstafford,txu,wchang,skrilov,bveldk06,swolff,cnoble,gpedrick,mwang,tejmotiwala,jacquelinehwang,msimms,ddkim,emj321,jschiaffino,tkl821,dwalco,ktsang,kwd761,annab,jsharp,abc897,mpineda,kgrimestad,astockdale,dcpearsall,luichanco,ecadoff,jerrylin12,yxu,tstanis,kperkins,zschlieder,gwm166,sshaaban,amwang,hkawashima,kwang11,ahuang,spokala,shk2990,shp279,sarahhhong,mellis,bsegal,cmd366,irenezhanglin,avinokur,maroon_09,athompson,wtong,kmobilia,anderpants,craak,camila8,chinkmama,kshepherd,cyarka,fculver,yleong,matosziuk,khaghighi,dianemok,myu,njohnson463,jtrop,thegeoffmeister,kgovindaraju,dpetracovici,laurennelson07,maffi,jkastan,dramadurai,iezzoni21,asklar,skang,allegramount,mdawg777,karlind,hpatel11,mannamalai,rkay,jfang,blindbard393,acoates,ymalina,omnelson,pcarlson
1,jjconnol,bmschult,coxcj,acimarol,mcwatty,bishopad,lbehrens,mjablons,mokomi89,torstenmschreiber89,kirst,mille273,sjnichol,brennanm,jquinlan,bphester1,ipayne,boiler19,mwjensen,bluegreentwo,asankar,jfbrenna,wilsonn,jmknowle,tepowers,christaj,cmcclear982,swan12,zrgeorge,rlass1,meg5994,fashnicked87,jswarner,jones118,rfkoehle,alimiadi,jroush,ssehr,mbenadum,efohl,sajordan,jthancoc,mtoomey,hsale,griffijl,dchung88,nheymer,drgrove,floydm,alentner12,akremer,mduffey,svpeders,bcnewman,clyons,heginator16,tfigg,holta,duktaip26,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [5]:
dft_m = df_m.transpose()

#display(dft_m)

In [6]:
display(len(df_x))
display(df_x.head(2))

90

Unnamed: 0,nvargo,sdhuang,psengupta,simonxu,kuntalroy,jrwilcox,cplacek,makang,ddatta,spalit,zainu,jtsmith,cinedemian,yulin,wgriffin,masud001,cooperj,gaoyunfei,lundstro,niladrinm,kim604,fbowen,hmarkand,liveletlive,sokrates,biswajit025,xfong,dhanmaria,Unnamed: 28,Unnamed: 29,Unnamed: 30,Unnamed: 31,Unnamed: 32,Unnamed: 33,Unnamed: 34,Unnamed: 35,Unnamed: 36,Unnamed: 37,Unnamed: 38,Unnamed: 39,Unnamed: 40,Unnamed: 41,Unnamed: 42,Unnamed: 43,Unnamed: 44,Unnamed: 45,Unnamed: 46,Unnamed: 47,Unnamed: 48,Unnamed: 49,Unnamed: 50,Unnamed: 51,Unnamed: 52,Unnamed: 53,Unnamed: 54,Unnamed: 55,Unnamed: 56,Unnamed: 57,Unnamed: 58,Unnamed: 59,Unnamed: 60,Unnamed: 61,Unnamed: 62,Unnamed: 63,Unnamed: 64,Unnamed: 65,Unnamed: 66,Unnamed: 67,Unnamed: 68,Unnamed: 69,Unnamed: 70,Unnamed: 71,Unnamed: 72,Unnamed: 73,Unnamed: 74,Unnamed: 75,Unnamed: 76,Unnamed: 77,Unnamed: 78,Unnamed: 79,Unnamed: 80,Unnamed: 81,Unnamed: 82,Unnamed: 83,Unnamed: 84,Unnamed: 85,Unnamed: 86,Unnamed: 87,Unnamed: 88,Unnamed: 89,Unnamed: 90,Unnamed: 91,Unnamed: 92,Unnamed: 93,Unnamed: 94,Unnamed: 95,Unnamed: 96,Unnamed: 97,Unnamed: 98,Unnamed: 99,Unnamed: 100,Unnamed: 101,Unnamed: 102,Unnamed: 103,Unnamed: 104,Unnamed: 105,Unnamed: 106,Unnamed: 107,Unnamed: 108,Unnamed: 109,Unnamed: 110,Unnamed: 111,Unnamed: 112,Unnamed: 113,Unnamed: 114,Unnamed: 115,Unnamed: 116,Unnamed: 117,Unnamed: 118,Unnamed: 119,Unnamed: 120,Unnamed: 121,Unnamed: 122,Unnamed: 123,Unnamed: 124,Unnamed: 125,Unnamed: 126,Unnamed: 127,Unnamed: 128,Unnamed: 129,Unnamed: 130,Unnamed: 131,Unnamed: 132,Unnamed: 133,Unnamed: 134,Unnamed: 135,Unnamed: 136,Unnamed: 137,Unnamed: 138,Unnamed: 139,Unnamed: 140,Unnamed: 141,Unnamed: 142,Unnamed: 143,Unnamed: 144,Unnamed: 145,Unnamed: 146,Unnamed: 147,Unnamed: 148,Unnamed: 149,Unnamed: 150,Unnamed: 151,Unnamed: 152,Unnamed: 153,Unnamed: 154,Unnamed: 155,Unnamed: 156,Unnamed: 157,Unnamed: 158,Unnamed: 159,Unnamed: 160,Unnamed: 161,Unnamed: 162,Unnamed: 163,Unnamed: 164,Unnamed: 165,Unnamed: 166,Unnamed: 167,Unnamed: 168,Unnamed: 169,Unnamed: 170,Unnamed: 171,Unnamed: 172,Unnamed: 173,Unnamed: 174,Unnamed: 175,Unnamed: 176,Unnamed: 177,Unnamed: 178,Unnamed: 179,Unnamed: 180,Unnamed: 181,Unnamed: 182,Unnamed: 183,Unnamed: 184,Unnamed: 185,Unnamed: 186,Unnamed: 187,Unnamed: 188,Unnamed: 189,Unnamed: 190,Unnamed: 191,Unnamed: 192,Unnamed: 193,Unnamed: 194,Unnamed: 195,Unnamed: 196,Unnamed: 197,Unnamed: 198,Unnamed: 199,Unnamed: 200,Unnamed: 201,Unnamed: 202,Unnamed: 203,Unnamed: 204
0,onurucler,cinedemian,pachen,afrankl,shen17,sunnyleekr,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,nvargo,sdhuang,psengupta,jiuning,simonxu,awhaas,kuntalroy,jrwilcox,cplacek,makang,spalit,gpanagop,zainu,wizjeong,pachen,lundstro,shen17,sunnyleekr,kim604,hmarkand,pjha1,tanyaohua,ssalamat,biswajit025,xfong,sokrates,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [7]:
dft_x = df_x.transpose()

#display(dft_x)

In [8]:
ds_m = set(map(tuple, dft_m.values))
ds_x = set(map(tuple, dft_x.values))

In [9]:
display(pd.DataFrame(list(ds_x.intersection(ds_m))))

In [10]:
display(pd.DataFrame(list(ds_m.difference(ds_x))))
display(pd.DataFrame(list(ds_x.difference(ds_m))))

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44
0,emilywirtz,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,thebombboi123,jones118,ee21yin,ejimenez,kjs1000,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,anderpants,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,yxu,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,bsegal,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
5,xxu,bishopad,towster2,jelopez2,mathew,hkb,s052429,shwethakuruganti,chucknorris,dianap1986,dmei,modave,tigranb,patterso09,warz00,gnudi,pythais,xwang308,sdidla,divsnvik,jakester2481,quarkn,krishnan_badri,sanvampire74,triky,sshah7,wwz12,kurstjen,OsRitmos,potax,darora7,superego,clintone,Pourghaz,,,,,,,,,,,
6,carlherndon2011,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
7,ilin,jjconnol,msachtl2,amiramontes,bniu,smeagul,marsmand,mohan,philippschaefers,danybaret,mkmk2009,redorange1984,jtli3,yootuka,psavkar,tmacelwee,fatihertinaz,kevinhaghighi2012,AllWires,divya,acwatkins88,yqlong,hemanthaithal,askiahill,cinzia_nanna,mattfred,adolfo,juandako,ageggles,jdustin01,hpvbs,anestisk,axavier,pawelkaczmarek,alayi,vcl,yoshiki_kamata,sougatapahari,brandonmc,ajasja,sathyans,jlbaker4,saeideh,xnlepro,libtech
8,nicholas,sajordan,hrshah2,ruls_7,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
9,jacquelinehwang,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89
0,,,,,,,,,,ndavidson,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,dannyboy8m,,,,,,,,,,,,,,,,mlangille,,,,,,,,,,,,,,,,,,,,,,
1,,,,,,,,,,npetlakh,,,,,,,,,,,,bniu,,,,,holta,,,,,,,dhanmaria,,,,,,liveletlive,,,,,,,,,,,,bsegal,,,,,,,,,,,,,,,,bsegal,,,,,,,,,,,,,,,,dvarghe,,,raseong,,,
2,,,,,,,,,,lywang,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,blindbard393,,,,,,,,,,,,,,,,mayabhardwaj,,,,,,,,,,,,,,,,,,,,,,
3,,,,,,,,,,jschiaffino,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,fch308,,,,,,,,,,,,,,,,fch308,,,,,,,,,,,,,,,,,,,kennell,,,
4,,,,,,,,,,mdawg777,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,kgrimestad,,,,,,,,,,,,,,,,kgrimestad,,,,,,,,,,,,,,,,,,,park43,,,
5,,,,,,,,,,zschlieder,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,ammerocks,,,,,,,,,,,,,,,,ammerocks,,,,,,,,,,,,,,,,,,,purdude,,,
6,,,,,,,,,,wchang,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,jschiaffino,,,,,,,,,,,,,,,,jschiaffino,,,,,,,,,,,,,,,,,,,xni,,,
7,,,,,,,,,,bsh319,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,dcho0132,,,,,,,,,,,,,,,,,,,,,,
8,,,,,,,,,,jcasady,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,jacquelinehwang,,,,,,,,,,,,,,,,,,,,,,
9,,,,,,,,,,gthippeswamy,,,,,,,,,,,,pkoswatt,,,,,jswarner,,,,,,,,,,,,,go,,,,,,,,,,,,cberg,,,,,,,,,,,,,,,,cberg,,,,,,,,,,,,,,,,ricktest,,,johnsosd,,,


In [11]:
display(pd.DataFrame(list(ds_m.union(ds_x))))

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89
0,,,,,,,,,,ndavidson,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,dannyboy8m,,,,,,,,,,,,,,,,mlangille,,,,,,,,,,,,,,,,,,,,,,
1,,,,,,,,,,jschiaffino,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,fch308,,,,,,,,,,,,,,,,fch308,,,,,,,,,,,,,,,,,,,kennell,,,
2,anderpants,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,bsegal,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,,,,,,,,,,zschlieder,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,ammerocks,,,,,,,,,,,,,,,,ammerocks,,,,,,,,,,,,,,,,,,,purdude,,,
5,,,,,,,,,,wchang,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,jschiaffino,,,,,,,,,,,,,,,,jschiaffino,,,,,,,,,,,,,,,,,,,xni,,,
6,ilin,jjconnol,msachtl2,amiramontes,bniu,smeagul,marsmand,mohan,philippschaefers,danybaret,mkmk2009,redorange1984,jtli3,yootuka,psavkar,tmacelwee,fatihertinaz,kevinhaghighi2012,AllWires,divya,acwatkins88,yqlong,hemanthaithal,askiahill,cinzia_nanna,mattfred,adolfo,juandako,ageggles,jdustin01,hpvbs,anestisk,axavier,pawelkaczmarek,alayi,vcl,yoshiki_kamata,sougatapahari,brandonmc,ajasja,sathyans,jlbaker4,saeideh,xnlepro,libtech,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
7,nicholas,sajordan,hrshah2,ruls_7,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
8,phirbreethr,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
9,jshepherd,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [12]:


time_probe = create_time_probe(from_date, to_date)
print(time_probe)

for alg_type in alg_types:
    file_path = 'clusters/%s/by_semester/%s.csv' % (alg_type, time_probe)
    for key in s3_client.list_objects(Bucket='nanohub.processed', Prefix=file_path)['Contents']:
        obj = s3_client.get_object(Bucket='nanohub.processed', Key=key['Key'])
        if key['Key'].endswith('.parquet.gzip'):
            df = pd.read_parquet(BytesIO(obj['Body'].read()))
        else:
            df = pd.read_csv(BytesIO(obj['Body'].read()), header=None)
    
    display(alg_type)
    display(df.head())
    df.columns = np.arange(0,len(df.columns))
    display(df.head())

print('#########################################################################################################')
print('#########################################################################################################')

NameError: name 'create_time_probe' is not defined