In [21]:
# Imports
import pandas as pd
import numpy as np
from sklearn.datasets import load_iris

In [20]:
%%time
# ========================
# Generating test datasets
# ========================
# X: feature table built from sklearn's iris data. Rows are labelled
# "iris_0", "iris_1", ...; column names are the sklearn feature names with the
# " (cm)" suffix dropped and spaces replaced by underscores.
iris = load_iris()
X_iris = pd.DataFrame(
    data=iris.data,
    index=[f"iris_{i}" for i in range(iris.data.shape[0])],
    columns=[name.split(" (cm)")[0].replace(" ", "_") for name in iris.feature_names],
)

# y: integer targets mapped to their names via iris.target_names, aligned to X_iris.index
y_iris = pd.Series(iris.target, index=X_iris.index, name="species").map(lambda i: iris.target_names[i])

# ========================
# Save data
# ========================
# Single place to change the output directory for every fixture written below.
TEST_DATA_DIR = "../soothsayer/tests/data"

# tsv
X_iris.to_csv(f"{TEST_DATA_DIR}/X_iris.tsv", sep="\t")
# gzipped tsv (labels)
y_iris.to_frame().to_csv(f"{TEST_DATA_DIR}/y_iris.labels.tsv.gz", sep="\t", compression="gzip")

# gzipped tsv
X_iris.to_csv(f"{TEST_DATA_DIR}/X_iris.tsv.gz", sep="\t", compression="gzip")
# bz2 pickle
X_iris.to_pickle(f"{TEST_DATA_DIR}/X_iris.pbz2", compression="bz2")
# gzipped pickle
X_iris.to_pickle(f"{TEST_DATA_DIR}/X_iris.pgz", compression="gzip")
# pickle
X_iris.to_pickle(f"{TEST_DATA_DIR}/X_iris.pkl", compression=None)
# excel: one workbook with X and y on separate sheets.
# The context manager writes and closes the file on exit (ExcelWriter.save()
# was removed in pandas 2.0) and releases the handle even if a sheet write fails.
with pd.ExcelWriter(f"{TEST_DATA_DIR}/iris.xlsx") as writer:
    X_iris.to_excel(writer, sheet_name="X_iris")
    y_iris.to_frame().to_excel(writer, sheet_name="y_iris")

CPU times: user 97.6 ms, sys: 4.11 ms, total: 102 ms
Wall time: 104 ms


In [33]:
%%time
# Add noise data
def add_noise_to_data(X, n_attrs_total=100, random_state=0, concat=True, **kws):
    """
    Generate Gaussian noise columns so that `X` plus noise has `n_attrs_total` columns.

    Parameters
    ----------
    X : pd.DataFrame
        Table to pad. Its index is reused for the noise columns.
    n_attrs_total : int, default 100
        Total number of columns wanted (original + noise). Must be strictly
        greater than `X.shape[1]`.
    random_state : default 0
        Seed passed to `np.random.RandomState`, making the noise reproducible.
    concat : bool, default True
        If True, return `X` with the noise columns appended on the right;
        otherwise return only the noise columns.
    **kws
        Forwarded to `RandomState.normal` (e.g. `loc`, `scale`).

    Returns
    -------
    pd.DataFrame
        Shape `(X.shape[0], n_attrs_total)` when `concat` is True, else
        `(X.shape[0], n_attrs_total - X.shape[1])`. Noise columns are named
        "noise_0", "noise_1", ...

    Raises
    ------
    AssertionError
        If `n_attrs_total` is not greater than the number of columns in `X`.
    """
    n_samples, n_attrs_original = X.shape
    # The check also fails on equality, so the message states the requirement
    # rather than a bare "<".
    assert n_attrs_total > n_attrs_original, (
        f"n_attrs_total = {n_attrs_total} must be greater than X.shape[1] = {n_attrs_original}"
    )
    n_attrs_to_add = n_attrs_total - n_attrs_original
    # A fresh RandomState per call keeps the output independent of global numpy RNG state.
    rng = np.random.RandomState(random_state)
    df_noise = pd.DataFrame(
        data=rng.normal(size=(n_samples, n_attrs_to_add), **kws),
        index=X.index,
        columns=[f"noise_{j}" for j in range(n_attrs_to_add)],
    )
    if concat:
        return pd.concat([X, df_noise], axis=1)
    return df_noise
    
# 100 total attributes: the original iris columns plus generated noise columns.
# .abs() is applied to the whole concatenated table, so every value
# (including the noise) is non-negative.
X_iris_noise = add_noise_to_data(X_iris).abs()
# Write the noise-augmented table; the original code wrote X_iris here, so the
# "noise_100" file contained no noise columns and X_iris_noise went unused.
X_iris_noise.to_csv("../soothsayer/tests/data/X_iris.noise_100.tsv.gz", sep="\t", compression="gzip")

CPU times: user 6.08 ms, sys: 2.24 ms, total: 8.32 ms
Wall time: 7.62 ms
