In [8]:
from collections import defaultdict, OrderedDict
from itertools import product

testd = dict(test=[3], bla=[4], blabla=[42, 43, 44])
for i in product(*testd.values()):
    print(i)

(3, 4, 42)
(3, 4, 43)
(3, 4, 44)


In [9]:
testd.values()

dict_values([[3], [4], [42, 43, 44]])

In [11]:
from dataclasses import dataclass
from typing import List

@dataclass
class TestSettings:
    n_comps: List[int]
    mix_ratio: List[float]
    umap_metric: List[str]
    min_cluster_size: List[int]
        
standard_test = TestSettings([3, 15, 45, 200], [0.0, 0.3, 0.15, 0.4], ["cosine"], [15, 45, 90])

standard_test

TestSettings(n_comps=[3, 15, 45, 200], mix_ratio=[0.0, 0.3, 0.15, 0.4], umap_metric=['cosine'], min_cluster_size=[15, 45, 90])

In [28]:
standard_test.__dict__.values()

dict_values([[3, 15, 45, 200], [0.0, 0.3, 0.15, 0.4], ['cosine'], [15, 45, 90]])

In [30]:
for i in product(*standard_test.__dict__.values()):
    print(i)

(3, 0.0, 'cosine', 15)
(3, 0.0, 'cosine', 45)
(3, 0.0, 'cosine', 90)
(3, 0.3, 'cosine', 15)
(3, 0.3, 'cosine', 45)
(3, 0.3, 'cosine', 90)
(3, 0.15, 'cosine', 15)
(3, 0.15, 'cosine', 45)
(3, 0.15, 'cosine', 90)
(3, 0.4, 'cosine', 15)
(3, 0.4, 'cosine', 45)
(3, 0.4, 'cosine', 90)
(15, 0.0, 'cosine', 15)
(15, 0.0, 'cosine', 45)
(15, 0.0, 'cosine', 90)
(15, 0.3, 'cosine', 15)
(15, 0.3, 'cosine', 45)
(15, 0.3, 'cosine', 90)
(15, 0.15, 'cosine', 15)
(15, 0.15, 'cosine', 45)
(15, 0.15, 'cosine', 90)
(15, 0.4, 'cosine', 15)
(15, 0.4, 'cosine', 45)
(15, 0.4, 'cosine', 90)
(45, 0.0, 'cosine', 15)
(45, 0.0, 'cosine', 45)
(45, 0.0, 'cosine', 90)
(45, 0.3, 'cosine', 15)
(45, 0.3, 'cosine', 45)
(45, 0.3, 'cosine', 90)
(45, 0.15, 'cosine', 15)
(45, 0.15, 'cosine', 45)
(45, 0.15, 'cosine', 90)
(45, 0.4, 'cosine', 15)
(45, 0.4, 'cosine', 45)
(45, 0.4, 'cosine', 90)
(200, 0.0, 'cosine', 15)
(200, 0.0, 'cosine', 45)
(200, 0.0, 'cosine', 90)
(200, 0.3, 'cosine', 15)
(200, 0.3, 'cosine', 45)
(200, 0.3, 'co

In [39]:
@dataclass
class TestData:
    path: str
    name: str
    fraction: List[float]
    contamination: List[float]
    seed: List[int]
        
imdb_20news_3splits = TestData(
    ["/home/philipp/projects/dad4td/data/processed/20_news_imdb.pkl"], ["imdb_20news"], [0.15], [0.1], [42, 43, 44])

for i in zip(imdb_20news_3splits.__dict__,imdb_20news_3splits.__dict__.values()):
    print(i)

('path', ['/home/philipp/projects/dad4td/data/processed/20_news_imdb.pkl'])
('name', ['imdb_20news'])
('fraction', [0.15])
('contamination', [0.1])
('seed', [42, 43, 44])


In [35]:
for i in product(*imdb_20news_3splits.__dict__.values()):
    print(i)

('/home/philipp/projects/dad4td/data/processed/20_news_imdb.pkl', 'imdb_20news', 0.15, 0.1, 42)
('/home/philipp/projects/dad4td/data/processed/20_news_imdb.pkl', 'imdb_20news', 0.15, 0.1, 43)
('/home/philipp/projects/dad4td/data/processed/20_news_imdb.pkl', 'imdb_20news', 0.15, 0.1, 44)


In [57]:
Training data%%time
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

def get_outlier_data(oe_path, n_oe, seed):
    df_oe = pd.read_pickle(oe_path)
    df_oe = df_oe.iloc[np.random.RandomState(
        seed=seed).permutation(len(df_oe))].head(n_oe)
    df_oe["label"], df_oe["outlier_label"], df_oe["scorable"] = 0, -1, 0
    return df_oe


def label_data(df, seed, labeled_data, outlier_classes):
    df = df[["text", "target", "vecs"]]
    df["scorable"] = 1
    # get all 20 news data
    df = df.where(df.target != -1).dropna()
    # set everything except one class to inlier
    print("Create column outlier_label")
    df["outlier_label"] = -1
    df.loc[~df.target.isin(outlier_classes), "outlier_label"] = 1
    # create labels for UMAP and ivis that
    # are 0 and 1 (derived from the just created outlier labels)
    df["label"] = (df["outlier_label"]+1)/2
    # stratified sample and set unlabeled data based on labeled_data variable
    print("Sample labeled data df")
    df_unlabeled = df[['target']].groupby('target', group_keys=False).apply(
        lambda x: x.sample(frac=1-labeled_data, random_state=seed))
    
    print(df_unlabeled.index)
    df.loc[df_unlabeled.index, "label"] = -1
    print("Data before split:\n")
    print(df.groupby(['label', 'outlier_label']).size(
    ).reset_index().rename(columns={0: 'count'}), "\n")
    return df


def prepare_data(df, outliers, inliers, seed, fixed_cont,
                 labeled_data, n_oe, test_size, oe_path, doc2vec_model, **kwargs):
    print("Only use classes that are in inliers or outliers")
    df = df.where(df.target.isin(
        outliers+inliers)).dropna()
    # label data as inliers and outliers (for scoring) and whether
    # they have labels or not (semi-supervised)
    df = label_data(df, seed, labeled_data, outliers)

    if fixed_cont:
        df = sample_data(df, 1.0, fixed_cont, seed)
        print("Data after adjusting for fixed contamination:\n")
        print(df.groupby(['label', 'outlier_label']).size(
        ).reset_index().rename(columns={0: 'count'}), "\n")

    if n_oe:
        df_oe = get_outlier_data(oe_path, n_oe, seed)
        #df_oe["vecs"] = doc2vec_model.vectorize(df_oe["text"])

    # split train test
    df, df_test = train_test_split(df,
                                   test_size=test_size, random_state=seed,
                                   stratify=df["outlier_label"])
    if n_oe:
        df = df.append(df_oe)

    if -1 in df.label.unique() and df.label.value_counts()[-1] != df.shape[0]:
        if df[(df.label == 0) & (df.outlier_label == -1)].shape[0] == 0:
            print("Adding missing sample for labeled outlier")
            df.loc[((df.label == -1) & (df.outlier_label == -1)
                    ).idxmax(), 'label'] = 0

    print("Training data:\n", df.groupby(['label', 'outlier_label']).size(
    ).reset_index().rename(columns={0: 'count'}), "\n\n")
    print("Test data:\n", df_test.groupby(['label', 'outlier_label']).size(
    ).reset_index().rename(columns={0: 'count'}), "\n\n")

    return df, df_test

def sample_data(df, fraction, contamination, seed):
    X_n = int(df[df.outlier_label==1].shape[0] * fraction)
    y_n = int(X_n * contamination)

    df = df.iloc[np.random.RandomState(seed=seed).permutation(len(df))]
    df = df[df["outlier_label"] == 1].head(X_n).append(
        df[df["outlier_label"] == -1].head(y_n))
    df = df.reset_index(drop=True)
    return df

#data_path = "/home/philipp/projects/dad4td/data/processed/20_news_imdb_vec.pkl"
data_path = "/home/philipp/projects/dad4td/data/raw/QS-OCR-Large/rvl_cdip.pkl"

n_classes = [20000]
n_class = n_classes[0]

params = dict(
    seed=42,
    test_size=0.2,
    labeled_data=0.5,
    fixed_cont=0.1,
    n_oe=False,
    use_nn=True,
    inliers=[0, 1, 2, 11],
    outliers=  [3, 4, 5, 6, 7, 8, 9, 10, 12, 13, 14, 15]
)

# load data and get the doc2vec vectors for all of the data used
df_full = pd.read_pickle(data_path)

df_full = df_full.groupby('target', group_keys=False).apply(
    lambda df: df.sample(n=max(n_classes), random_state=42))

df_full["vecs"] = df_full.target.map(lambda x: np.array([1, 2, 3]))
df_full.vecs = df_full.vecs.apply(tuple)


# sample only a portion of the data
df_partial = df_full.groupby('target', group_keys=False).apply(
    lambda df: df.sample(n=n_class, random_state=42))

df, df_test = prepare_data(
                df_partial, oe_path=oe_path, doc2vec_model=None, **params)

df.target.value_counts()

Only use classes that are in inliers or outliers
Create column outlier_label
Sample labeled data df
Merge labeled data df with full df


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  from ipykernel import kernelapp as app


Label the outliers as -1 in output df
Data before split:

   label  outlier_label  count
0   -1.0             -1  11469
1   -1.0              1   3987 

Data after adjusting for fixed contamination:

   label  outlier_label  count
0   -1.0             -1    398
1   -1.0              1   3987 

Training data:
    label  outlier_label  count
0   -1.0             -1    318
1   -1.0              1   3190 


Test data:
    label  outlier_label  count
0   -1.0             -1     80
1   -1.0              1    797 


CPU times: user 2min 7s, sys: 18.9 s, total: 2min 26s
Wall time: 4min 21s


2     808
1     807
0     795
11    780
8      34
14     33
3      31
9      28
10     26
4      26
13     24
12     24
7      24
6      24
15     23
5      21
Name: target, dtype: int64

In [34]:
Training datadf_test.target.value_counts()

1     211
0     201
2     196
11    192
9      12
8      12
13      9
4       8
15      7
5       7
14      6
12      5
7       5
10      4
3       3
6       2
Name: target, dtype: int64

In [5]:
import pandas as pd

data_path = "/home/philipp/projects/dad4td/data/raw/QS-OCR-Large/rvl_cdip.pkl"
df = pd.read_pickle(data_path)

df

Unnamed: 0,filename,target,split,text
0,imagesr/r/g/e/rge31d00/503210033+-0034.txt,3,test,\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n
1,imagesc/c/e/j/cej80d00/517306722+-6724.txt,3,test,\nZ Like to Have the Zippo Gia)\n~ SS Pack in ...
2,imagesm/m/r/r/mrr36d00/50603620-3621.txt,14,test,CO-INVESTIGATOR\n\nNAME POSITION/TITLE BIRTHDA...
3,imagesg/g/t/u/gtu29c00/2084573574a.txt,2,test,"Original Message\nFrom Wile, Vivian\nMonday, A..."
4,imagesh/h/o/f/hof08d00/2071783492.txt,9,test,04/18/98 09:37 G1 7+338-8886 MCKAY FRIED Boooz...
...,...,...,...,...
39995,imageso/o/u/k/ouk93f00/0013006838.txt,10,val,The Tobacco Institute\nIndustry Support of Bio...
39996,imagesf/f/f/b/ffb52c00/2074103881.txt,11,val,‘a.coweames\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\...
39997,imagesg/g/h/b/ghb11f00/0001251052.txt,15,val,ATION\nINTIAL MINNESOTA TOBACCO LITIG\nCONFIDE...
39998,imagesl/l/c/k/lck71f00/2016003416.txt,9,val,"\nTHE WASHINGTON STAR\nSaturday, August 3, 196..."


In [6]:
%%time
df["text_len"] = df.text.map(lambda x:len(x))

CPU times: user 174 ms, sys: 3.98 ms, total: 178 ms
Wall time: 177 ms


In [24]:
df.where(df.text_len.between(0,300)).dropna().target.value_counts()

8     24783
4     18870
3     15963
12     9733
2      6934
10     5902
5      5288
9      4645
11     4644
1      3295
13     3225
6      1895
15     1658
7      1372
0      1333
14      596
Name: target, dtype: int64

In [34]:
print(df.shape[0])
df.where(df.text_len.between(0,50)).dropna().reset_index(drop=True)

399999


Unnamed: 0,filename,target,split,text,text_len
0,imagesr/r/g/e/rge31d00/503210033+-0034.txt,3,test,\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n,22.0
1,imagesx/x/a/b/xab71f00/1002977593_1002977622.txt,6,test,\n,1.0
2,imagesj/j/t/o/jto61f00/2050283643.txt,8,test,"Suisy? Wals\n\n\n2050283643,",25.0
3,imageso/o/f/x/ofx31e00/85330046.txt,8,test,,0.0
4,imageso/o/g/y/ogy01d00/517234514+-4517.txt,3,test,vise ezers\n\n\n\n,14.0
...,...,...,...,...,...
46185,imagesy/y/t/p/ytp33f00/0013040253.txt,10,val,\ntesak\n\n536101556\n\nerst aros\n,29.0
46186,imagesn/n/l/x/nlx33e00/2044743126.txt,8,val,,0.0
46187,imagesu/u/x/x/uxx79e00/0000182112.txt,13,val,,0.0
46188,imagesx/x/y/l/xyl40f00/0000340757.txt,4,val,\n285520069,10.0


Unnamed: 0,filename,target,split,text,text_len
0,imagesr/r/g/e/rge31d00/503210033+-0034.txt,3,test,\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n,22
1,imagesc/c/e/j/cej80d00/517306722+-6724.txt,3,test,\nZ Like to Have the Zippo Gia)\n~ SS Pack in ...,160
2,imagesm/m/r/r/mrr36d00/50603620-3621.txt,14,test,CO-INVESTIGATOR\n\nNAME POSITION/TITLE BIRTHDA...,2039
4,imagesh/h/o/f/hof08d00/2071783492.txt,9,test,04/18/98 09:37 G1 7+338-8886 MCKAY FRIED Boooz...,376
5,imagesx/x/a/b/xab71f00/1002977593_1002977622.txt,6,test,\n,1
...,...,...,...,...,...
39994,imagesw/w/n/k/wnk17e00/2031318247.txt,7,val,\n\n\n\n\n\n\n? poursyruty Affective Date: Oct...,1183
39995,imageso/o/u/k/ouk93f00/0013006838.txt,10,val,The Tobacco Institute\nIndustry Support of Bio...,1027
39997,imagesg/g/h/b/ghb11f00/0001251052.txt,15,val,ATION\nINTIAL MINNESOTA TOBACCO LITIG\nCONFIDE...,859
39998,imagesl/l/c/k/lck71f00/2016003416.txt,9,val,"\nTHE WASHINGTON STAR\nSaturday, August 3, 196...",1672
