In [2]:
import pandas as pd

In [5]:
def prep(file):
    df = pd.read_csv(file)
    df=df.rename(columns={"inds of samples containing concept":"iou"})
    df = df.drop(["formula", "num samples containing concept"], axis=1)
    df=df.drop_duplicates()
    return df

def get_percent_dif(run1, run2, run3):
    pairs = [(run1, run2), (run1, run3), (run2, run3)]
    index2key={0:"Dif Between Run 0 and 1", 1:"Dif Between Run 1 and 2", 2:"Dif Between Run 2 and 3" }
    dif_percent = {"Dif Between Run 0 and 1":0 , "Dif Between Run 1 and 2":0, "Dif Between Run 2 and 3":0}
    for i,pair in enumerate(pairs):
        int_runArunB_df = pd.merge(pair[0], pair[1])
        union_runArunB_df = pd.concat([pair[0], pair[1]])
        int_runArunB_df=int_runArunB_df.drop_duplicates()
        union_runArunB_df=union_runArunB_df.drop_duplicates()
        sim=len(int_runArunB_df)/len(union_runArunB_df)
        dif_percent[index2key[i]] = (1-sim)*100
    return dif_percent


run_0_clus_1 = prep("Run0Cluster1IOUs.csv")
run_1_clus_1 = prep("Run1Cluster1IOUs.csv")
run_2_clus_1 = prep("Run2Cluster1IOUs.csv")
run_0_clus_2 = prep("Run0Cluster2IOUs.csv")
run_1_clus_2 = prep("Run1Cluster2IOUs.csv")
run_2_clus_2 = prep("Run2Cluster2IOUs.csv")
run_0_clus_3 = prep("Run0Cluster3IOUs.csv")
run_1_clus_3 = prep("Run1Cluster3IOUs.csv")
run_2_clus_3 = prep("Run2Cluster3IOUs.csv")
run_0_clus_4 = prep("Run0Cluster4IOUs.csv")
run_1_clus_4 = prep("Run1Cluster4IOUs.csv")
run_2_clus_4 = prep("Run2Cluster4IOUs.csv")


print("Cluster 1")
l = get_percent_dif(run_0_clus_1, run_1_clus_1, run_2_clus_1)
print(l)
print("Cluster 2")
l = get_percent_dif(run_0_clus_2, run_1_clus_2, run_2_clus_2)
print(l)
print("Cluster 3")
l = get_percent_dif(run_0_clus_3, run_1_clus_3, run_2_clus_3)
print(l)
print("Cluster 4")
l = get_percent_dif(run_0_clus_4, run_1_clus_4, run_2_clus_4)
print(l)


Cluster 1
{'Dif Between Run 0 and 1': 4.271806640491405, 'Dif Between Run 1 and 2': 4.271806640491405, 'Dif Between Run 2 and 3': 0.0}
Cluster 2
{'Dif Between Run 0 and 1': 0.0, 'Dif Between Run 1 and 2': 0.0, 'Dif Between Run 2 and 3': 0.0}
Cluster 3
{'Dif Between Run 0 and 1': 0.0, 'Dif Between Run 1 and 2': 0.0, 'Dif Between Run 2 and 3': 0.0}
Cluster 4
{'Dif Between Run 0 and 1': 0.0, 'Dif Between Run 1 and 2': 0.0, 'Dif Between Run 2 and 3': 0.0}


In [19]:
def get_avg_max(p):
    sum = 0
    for iou in p['inds of samples containing concept']:
      sum += iou

    avg = sum/len(p['inds of samples containing concept'])
    m = max(p['inds of samples containing concept'])
    return avg, m

In [26]:
df= pd.read_csv("Cluster2IOUs.csv")
avg, m  = get_avg_max(df)
avg

0.06714021754129847

In [25]:
df= pd.read_csv("Cluster2IOUs_withoutAnySpecTok.csv")
avg, m  = get_avg_max(df)
avg

0.0717349084723516

In [17]:
files = ["Cluster2IOUs_withoutAnySpecTok.csv", "Cluster3IOUs.csv","Cluster4IOUs.csv", "Cluster5IOUs.csv"]
df= pd.read_csv(file[1])
df.unit.unique()

array([  15,  515,  615,  715, 1023])

In [12]:
df= pd.read_csv(file[2])
df.unit.unique()

array([  15,  615,  715, 1023])

In [6]:
df= pd.read_csv("Run0Cluster1IOUs.csv")
print("Cluster 1: ", df['unit'].unique())
df= pd.read_csv("Run0Cluster2IOUs.csv")
print("Cluster 2: ", df['unit'].unique())
df= pd.read_csv("Run0Cluster3IOUs.csv")
print("Cluster 3: ", df['unit'].unique())
df= pd.read_csv("Run0Cluster4IOUs.csv")
print("Cluster 4: ", df['unit'].unique())

Cluster 1:  [  15  215  275  375  395  435  495  515  575  615  675  715  775  895
 1023]
Cluster 2:  [  15  375  395  435  495  515  575  615  675  715  775 1023]
Cluster 3:  [  15  375  395  515  575  615  715  775 1023]
Cluster 4:  [  15  375  575  615 1023]


In [9]:
df= pd.read_csv("Run1Cluster1IOUs.csv")
print("Cluster 1: ", df['unit'].unique())

Cluster 1:  [  15  215  275  375  395  435  495  515  575  615  675  715  775  895
 1023]


In [21]:
import csv
with open("IOUAnalysis.csv", "w") as fp:
    wr = csv.writer(fp, dialect='excel')
    wr.writerow(["units", "avg_iou", "max_iou"])
    for file in files:
        df = pd.read_csv(file)
        units = df.unit.unique()
        avg, m  = get_avg_max(df)
        wr.writerow([str(units), avg, m])
    