-
Notifications
You must be signed in to change notification settings - Fork 0
/
generate_subsets_for_common_x.py
75 lines (61 loc) · 2.71 KB
/
generate_subsets_for_common_x.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
import time
import sys
from os.path import isfile, join
import numpy as np
import pandas as pd
from data_keeper import get_data_keeper
from generate_subsets import SubsetGenerator
from wrappers import SubsetGeneratorWrapper
# File name of the CSV snapshot of the filtered X matrix, written just before
# subset generation and read back when a stored generator is reloaded.
RAW_X_BEFORE_SUBSET_GENERATION_PATH = "raw_X_before_subsets_generation.csv"
# File name under which the SubsetGenerator stores / loads its own state.
POSSIBLE_COMPLEX_FEATURES_PATH = "possible_complex_features.txt"
# Module-level cache: holds the (generator, X) tuple once
# get_ready_generator_inner has loaded or computed it; None until then.
get_generator_result = None
def make_new_generator():
    """Build a SubsetGenerator from the common X matrix and persist the results.

    Steps, in order:

    1. Fetch the matrix with ``get_data_keeper().get_common_x()``.
    2. Binarise it in place: every cell that is not equal to 1 becomes 0.
    3. Drop every column whose sum is at least ``rows // 2`` or is below 3.
    4. Write the filtered frame to ``RAW_X_BEFORE_SUBSET_GENERATION_PATH``.
    5. Run ``SubsetGenerator.generate_and_set`` on the uint8 array of the
       frame and store the generator to ``POSSIBLE_COMPLEX_FEATURES_PATH``.

    Both paths are bare file names, so the files land in the current working
    directory. Progress and elapsed time are printed to stdout.

    Returns:
        tuple: ``(generator, X)`` where ``X`` is the filtered DataFrame.
    """
    start_time = time.time()
    X = get_data_keeper().get_common_x()
    # Single-argument print with %-formatting produces the same text as the
    # old print statement and is valid on both Python 2 and Python 3.
    print("matrix shape before: %s" % (X.shape,))

    # NOTE(review): this assignment mutates the frame returned by the data
    # keeper; confirm get_common_x() hands out a copy if that matters.
    X[X != 1] = 0

    # Column sums are needed for both filters, so compute them once.
    column_sums = X.sum(axis=0)
    # Floor division keeps the integer threshold the original Python 2 "/"
    # produced, independent of interpreter version or __future__ imports.
    half_of_rows = X.shape[0] // 2
    to_drop = (column_sums >= half_of_rows) | (column_sums < 3)
    to_drop = to_drop[to_drop].index
    X = X.drop(to_drop, axis=1)

    X.to_csv(RAW_X_BEFORE_SUBSET_GENERATION_PATH)
    print("matrix shape after: %s" % (X.shape,))
    sys.stdout.flush()

    generator = SubsetGenerator()
    # DataFrame.as_matrix() is deprecated and removed in pandas 1.0;
    # .values yields the same ndarray on every pandas version.
    generator.generate_and_set(X.values.astype(np.uint8))
    print("generating done, time from start spent: %s" % (time.time() - start_time,))

    generator.store(POSSIBLE_COMPLEX_FEATURES_PATH)
    print("storing done, time from start spent: %s" % (time.time() - start_time,))
    return generator, X
def get_ready_generator_inner(compute_if_not_found=True, folder=None):
    """Return the cached ``(generator, X)`` pair, loading or computing it once.

    On the first successful call the result is stored in the module-level
    ``get_generator_result`` and every later call returns that same object,
    regardless of the arguments it is given.

    Args:
        compute_if_not_found: When the stored files are missing, call
            ``make_new_generator()`` if true; otherwise leave the cache empty.
        folder: Directory to look for the two stored files in. ``None`` means
            the bare file names are used as-is.

    Returns:
        tuple or None: ``(generator, X)``, or ``None`` when nothing was cached,
        the files were not found and ``compute_if_not_found`` is false.
    """
    global get_generator_result

    # Fast path: something was already loaded or computed in this process.
    if get_generator_result is not None:
        return get_generator_result

    if folder is None:
        raw_x_path = RAW_X_BEFORE_SUBSET_GENERATION_PATH
        complex_features_path = POSSIBLE_COMPLEX_FEATURES_PATH
    else:
        raw_x_path = join(folder, RAW_X_BEFORE_SUBSET_GENERATION_PATH)
        complex_features_path = join(folder, POSSIBLE_COMPLEX_FEATURES_PATH)

    # Both files must be present; one without the other is treated as a miss.
    if isfile(raw_x_path) and isfile(complex_features_path):
        generator = SubsetGenerator()
        generator.load(complex_features_path)
        X = pd.read_csv(raw_x_path, index_col=0)
        # DataFrame.as_matrix() is deprecated and removed in pandas 1.0;
        # .values yields the same ndarray on every pandas version.
        generator.set_raw_matrix(X.values.astype(np.uint8))
        get_generator_result = generator, X
    elif compute_if_not_found:
        # NOTE(review): make_new_generator() writes to the default file names,
        # not into `folder`, so a later lookup in `folder` will miss again.
        get_generator_result = make_new_generator()

    return get_generator_result
class GeneratorGetter(object):
    """Callable that remembers arguments for ``get_ready_generator_inner``.

    Calling the instance forwards the remembered positional and keyword
    arguments and returns the first element of the result.
    """

    def __init__(self, *args, **kwargs):
        """Remember the arguments to forward on each call."""
        self._kwargs = kwargs
        self._args = args

    def __call__(self):
        """Return element 0 of ``get_ready_generator_inner(*args, **kwargs)``."""
        result = get_ready_generator_inner(*self._args, **self._kwargs)
        return result[0]

    def __getstate__(self):
        """Return a shallow copy of the instance dictionary."""
        snapshot = self.__dict__.copy()
        return snapshot

    def __setstate__(self, state):
        """Replace the instance dictionary with a shallow copy of ``state``."""
        restored = state.copy()
        self.__dict__ = restored
def get_ready_generator(*args, **kwargs):
    """Return a wrapped generator getter together with the X DataFrame.

    All arguments are forwarded unchanged to ``get_ready_generator_inner``
    (``compute_if_not_found`` and ``folder``).

    Returns:
        tuple: ``(SubsetGeneratorWrapper(getter), X)`` where ``getter`` is a
        ``GeneratorGetter`` bound to the same arguments.

    Raises:
        TypeError: if ``get_ready_generator_inner`` returns ``None`` (files
            not found and ``compute_if_not_found`` false), because ``None``
            cannot be unpacked.
    """
    # Resolve eagerly: this fills the module-level cache and yields X.
    _, X = get_ready_generator_inner(*args, **kwargs)
    # Bind the caller's arguments to the getter. Previously it was created
    # without them, so a getter invoked where the module cache is empty
    # (presumably after unpickling in another process - verify against
    # SubsetGeneratorWrapper) silently fell back to the default folder and
    # to compute_if_not_found=True.
    return SubsetGeneratorWrapper(GeneratorGetter(*args, **kwargs)), X
# Only get_ready_generator is exported by a star-import of this module.
__all__ = ['get_ready_generator']