In [1]:
import requests
from sklearn.datasets import load_svmlight_file
import os
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
import numpy as np
import arff
import bz2

### DNA

In [6]:
# Merge the LIBSVM "dna" splits (train / val / test) into one dataset:
# x_total holds the 180 features per sample, y_total the one-hot class labels.
urls_dict = {"train":"https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/multiclass/dna.scale.tr",
                "val":"https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/multiclass/dna.scale.val",
                "test":"https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/multiclass/dna.scale.t"}
x_parts, y_parts = [], []
for split_name, url in urls_dict.items():
    file_path = f"dna/dna_{split_name}_raw"
    # Fetch the raw split only when it is not on disk yet, so the cell also
    # runs on a fresh checkout without re-downloading existing files.
    if not os.path.exists(file_path):
        response = requests.get(url, timeout=60)
        response.raise_for_status()  # never store an HTTP error page as data
        os.makedirs(os.path.dirname(file_path), exist_ok=True)
        with open(file_path, "wb") as f:
            f.write(response.content)
    features, labels = load_svmlight_file(file_path, n_features=180)
    x_parts.append(np.asarray(features.todense(), dtype=np.float32))
    y_parts.append(labels.reshape(-1, 1).astype(np.float32))

# Stack all splits once instead of growing the arrays inside the loop.
# The cast keeps the float64 dtype the merged array had before.
x_total = np.vstack(x_parts).astype(np.float64)
# pre-process the labels: fit the encoder once on the labels of every split so
# all rows share the same class -> column mapping, even if a split lacks a class
encoder = OneHotEncoder(sparse_output=False)
y_total = encoder.fit_transform(np.vstack(y_parts))
assert y_total.shape[1] == 3, "expected exactly 3 classes in the dna labels"
assert x_total.shape[0] == y_total.shape[0]

In [7]:
# Sanity check: report the merged feature and label matrix shapes, one per line.
print(x_total.shape, y_total.shape, sep="\n")

(3186, 180)
(3186, 3)


In [9]:
# Persist the merged DNA arrays under the keys "x" (features) and "y" (labels).
dna_arrays = {"x": x_total, "y": y_total}
np.savez("dna/dna.npz", **dna_arrays)

In [10]:
# Read the saved archive back as float32 arrays to verify the round trip.
# The archive only stores plain numeric arrays, so pickle support is not
# needed; leaving allow_pickle at its default avoids executing pickled
# objects if the file is ever replaced by an untrusted one.
with np.load("dna/dna.npz") as file:
    x = file["x"].astype(np.float32)
    y = file["y"].astype(np.float32)

In [11]:
# Shapes of the arrays read back from the archive, features first.
for reloaded in (x, y):
    print(reloaded.shape)

(3186, 180)
(3186, 3)


### Splice

In [2]:
# Download the LIBSVM "splice" splits (train / test) and merge them into one
# dataset: x_total holds the 60 min-max scaled features, y_total the one-hot
# encoded class labels.
urls_dict = {"train":"https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/splice",
            "test": "https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/splice.t"}

x_parts, y_parts = [], []
for split_name, url in urls_dict.items():
    file_path = f"splice/splice_{split_name}_raw"
    # Request first and fail loudly on HTTP errors, so a failed download can
    # neither leave an empty file behind nor store an error page as data.
    response = requests.get(url, timeout=60)
    response.raise_for_status()
    os.makedirs(os.path.dirname(file_path), exist_ok=True)
    # Write the payload unchanged in binary mode (no decode/encode round trip).
    with open(file_path, "wb") as f:
        f.write(response.content)
    features, labels = load_svmlight_file(file_path, n_features=60)
    x_parts.append(np.asarray(features.todense(), dtype=np.float32))
    y_parts.append(labels.reshape(-1, 1).astype(np.float32))

# pre-process the labels: fit the encoder once on the labels of every split so
# all rows share the same class -> column mapping
encoder = OneHotEncoder(sparse_output=False)
y_total = encoder.fit_transform(np.vstack(y_parts))
# pre-process the features: fit the scaler once on the merged data so every
# row is mapped with the same per-feature min/max (a per-split fit would scale
# the splits inconsistently). The cast keeps the previous float64 dtype.
scaler = MinMaxScaler()
x_total = scaler.fit_transform(np.vstack(x_parts)).astype(np.float64)
assert y_total.shape[1] == 2, "expected exactly 2 classes in the splice labels"
assert x_total.shape[0] == y_total.shape[0]

In [3]:
# Report the merged splice feature / label shapes, one per line.
for merged in (x_total, y_total):
    print(merged.shape)

(3175, 60)
(3175, 2)


In [4]:
# Store the merged splice arrays under the keys "x" (features) and "y" (labels).
splice_arrays = dict(x=x_total, y=y_total)
np.savez("splice/splice.npz", **splice_arrays)

### Protein

In [15]:
# Merge the LIBSVM "protein" splits into one dataset: x_total holds the 357
# min-max scaled features, y_total the one-hot encoded class labels.
# Each key is the suffix of the local file name (protein/protein.<key>) and
# maps to the matching remote archive; key order fixes the row order.
urls_dict = {"t":"https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/multiclass/protein.t.bz2",
            "tr": "https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/multiclass/protein.tr.bz2",
            "val": "https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/multiclass/protein.val.bz2"}

x_parts, y_parts = [], []
for split_name, url in urls_dict.items():
    file_path = f"protein/protein.{split_name}"
    # Fetch and decompress the split only when it is not on disk yet, so the
    # cell also runs on a fresh checkout without re-downloading existing files.
    if not os.path.exists(file_path):
        response = requests.get(url, timeout=120)
        response.raise_for_status()  # never store an HTTP error page as data
        os.makedirs(os.path.dirname(file_path), exist_ok=True)
        # The remote files are bz2 archives; the loader is given the plain
        # (extension-less) path, so store the decompressed content.
        with open(file_path, "wb") as f:
            f.write(bz2.decompress(response.content))
    features, labels = load_svmlight_file(file_path, n_features=357)
    x_parts.append(np.asarray(features.todense(), dtype=np.float32))
    y_parts.append(labels.reshape(-1, 1).astype(np.float32))

# pre-process the labels: fit the encoder once on the labels of every split so
# all rows share the same class -> column mapping
encoder = OneHotEncoder(sparse_output=False)
y_total = encoder.fit_transform(np.vstack(y_parts))
# pre-process the features: fit the scaler once on the merged data so every
# row is mapped with the same per-feature min/max (a per-split fit would scale
# the splits inconsistently). The cast keeps the previous float64 dtype.
scaler = MinMaxScaler()
x_total = scaler.fit_transform(np.vstack(x_parts)).astype(np.float64)
assert y_total.shape[1] == 3, "expected exactly 3 classes in the protein labels"
assert x_total.shape[0] == y_total.shape[0]

In [16]:
# Report the merged protein feature / label shapes, one per line.
print("\n".join(str(merged.shape) for merged in (x_total, y_total)))

(24387, 357)
(24387, 3)


In [17]:
# Store the merged protein arrays under the keys "x" (features) and "y" (labels).
protein_arrays = {"x": x_total, "y": y_total}
np.savez("protein/protein.npz", **protein_arrays)

### Electricity dataset

In [19]:
# Electrical fault detection data: drop the two unnamed columns, call the
# 'Output (S)' column 'target', min-max scale every other column and save
# features / target as an .npz archive.
df = (
    pd.read_csv("electricalFault/detect_dataset.csv", header=0, index_col=None)
    .drop(columns=['Unnamed: 7', 'Unnamed: 8'])
    .rename(columns={'Output (S)': 'target'})
)
# All columns except the target; computed once and reused below.
feature_columns = df.columns.difference(['target'])
scaler = MinMaxScaler()
df[feature_columns] = scaler.fit_transform(df[feature_columns])
np.savez(
    "electricalFault/detect.npz",
    x=df[feature_columns].values,
    y=df['target'].values,
)

### pokerdataset

In [21]:
# Poker hand data: call the 'Poker Hand' column 'target' and save the
# remaining columns as features (unscaled) next to the target.
poker_csv = pd.read_csv("pokerdataset/poker-hand-training.csv", header=0, index_col=None)
df = poker_csv.rename(columns={'Poker Hand': 'target'})
# NOTE(review): Index.difference sorts its result by default, so the feature
# columns are presumably stored in sorted order rather than CSV order; confirm
# this is what downstream code expects.
feature_columns = df.columns.difference(['target'])
np.savez(
    "pokerdataset/poker.npz",
    x=df[feature_columns].values,
    y=df['target'].values,
)

### phoneme

The aim of this dataset is to distinguish between nasal (class 0) and oral (class 1) sounds. Five attributes were chosen to characterize each vowel: the amplitudes of the first five harmonics, AHi, normalised by the total energy Ene (integrated over all frequencies), i.e. AHi/Ene. The phonemes are transcribed as follows: sh as in she, dcl as in dark, iy as the vowel in she, aa as the vowel in dark, and ao as the first vowel in water.

In [13]:
# Parse the phoneme ARFF file; the rows are taken from its 'data' entry.
with open('phoneme/phoneme.arff', 'r') as f:
    phoneme_arff = arff.load(f)

# The last column is the class label, every column before it is a feature.
dataset = np.array(phoneme_arff['data'])
label_col = dataset.shape[1] - 1
x = dataset[:, :label_col].astype(np.float32)
y = dataset[:, label_col].astype(np.int32)
# Min-max scale the features, then store features and labels together.
scaler = MinMaxScaler().fit(x)
x = scaler.transform(x)
np.savez("phoneme/phoneme.npz", x=x, y=y)

### ozone-level-8hr

In [18]:
# Parse the ozone-level-8hr ARFF file and keep only its data rows.
with open('ozone-level-8hr/ozone-level-8hr.arff', 'r') as f:
    dataset = arff.load(f)['data']

dataset = np.array(dataset)
# Features are all columns but the last; the last column is the class label.
features, labels = dataset[:, :-1], dataset[:, -1]
y = labels.astype(np.int32)
# Min-max scale the float32 features before saving.
scaler = MinMaxScaler()
x = scaler.fit_transform(features.astype(np.float32))
np.savez("ozone-level-8hr/ozone-level-8hr.npz", x=x, y=y)

### spambase

The "spam" concept is diverse: advertisements for products/websites, make money fast schemes, chain letters, pornography... Our collection of spam e-mails came from our postmaster and individuals who had filed spam. Our collection of non-spam e-mails came from filed work and personal e-mails, and hence the word 'george' and the area code '650' are indicators of non-spam. These are useful when constructing a personalized spam filter. One would either have to blind such non-spam indicators or get a very wide collection of non-spam to generate a general purpose spam filter.



In [22]:
# Parse the spambase ARFF file; the rows are taken from its 'data' entry.
with open('spambase/spambase.arff', 'r') as f:
    spambase_arff = arff.load(f)
dataset = np.array(spambase_arff['data'])
# Integer class labels come from the last column ...
y = dataset[:, -1].astype(np.int32)
# ... and the min-max scaled float32 features from all columns before it.
scaler = MinMaxScaler()
scaler.fit(dataset[:, :-1].astype(np.float32))
x = scaler.transform(dataset[:, :-1].astype(np.float32))
np.savez("spambase/spambase.npz", x=x, y=y)

### kc1


In [29]:
# Load the kc1 ARFF file, min-max scale the features and encode the textual
# class label as an integer (1 for 'true', 0 for 'false').
with open('kc1/kc1.arff', 'r') as f:
    dataset = arff.load(f)
dataset = np.array(dataset['data'])
x = dataset[:, :-1].astype(np.float32)
# The file is read in text mode, so the parsed labels are expected to be str
# rather than bytes; comparing them against b'true' would never match and
# every sample would silently get label 0. Normalise each label to lower-case
# text first (bytes are decoded) and compare against 'true'.
labels = np.array(
    [
        (value.decode() if isinstance(value, bytes) else str(value)).strip().lower()
        for value in dataset[:, -1]
    ],
    dtype=str,
)
# Fail loudly on unexpected label values instead of mapping them to 0.
unexpected = set(labels.tolist()) - {"true", "false"}
if unexpected:
    raise ValueError(f"unexpected kc1 class labels: {sorted(unexpected)}")
y = (labels == "true").astype(np.int32)
scaler = MinMaxScaler()
x = scaler.fit_transform(x)
np.savez("kc1/kc1.npz", x=x, y=y)