# Cleaning up of datasets for analysis

In [1]:
import os

# Master switch: when True, the cleaned datasets below are written to ./data/.
save_files = True

# `os` is imported unconditionally because later cells call os.getcwd() and
# os.path.isfile() whether or not files are being saved.
if save_files:
    # exist_ok=True keeps the cell idempotent and avoids a check-then-create race.
    os.makedirs("./data/", exist_ok=True)

    assert os.path.isdir("./data/")

## 1. Solar Flare: N = 1389, L = 3, d = 10

In [2]:
# Install the `ucimlrepo` package, which provides `fetch_ucirepo` used in the cells below.
%pip install ucimlrepo

Note: you may need to restart the kernel to use updated packages.


In [3]:
from ucimlrepo import fetch_ucirepo

# Fetch the "Solar Flare" dataset by name through ucimlrepo.
solar_flare = fetch_ucirepo(name="Solar Flare")

# Feature and target tables (pandas DataFrames).
X = solar_flare.data.features
y = solar_flare.data.targets

# Dataset-level metadata.
print(solar_flare.metadata)

# Per-variable information.
print(solar_flare.variables)

None
{'uci_id': 89, 'name': 'Solar Flare', 'repository_url': 'https://archive.ics.uci.edu/dataset/89/solar+flare', 'data_url': 'https://archive.ics.uci.edu/static/public/89/data.csv', 'abstract': 'Each class attribute counts the number of solar flares of a certain class that occur in a 24 hour period', 'area': 'Physics and Chemistry', 'tasks': ['Regression'], 'characteristics': ['Multivariate'], 'num_instances': 1389, 'num_features': 10, 'feature_types': ['Categorical'], 'demographics': [], 'target_col': ['common flares', 'moderate flares', 'severe flares'], 'index_col': None, 'has_missing_values': 'no', 'missing_values_symbol': None, 'year_of_dataset_creation': 1989, 'last_updated': 'Wed Feb 14 2024', 'dataset_doi': '10.24432/C5530G', 'creators': [], 'intro_paper': None, 'additional_info': {'summary': 'Notes:\r\n\r\n   -- The database contains 3 potential classes, one for the number of times a certain type of solar flare occured in a 24 hour period.\r\n   -- Each instance represents c

In [4]:
from sklearn.preprocessing import LabelEncoder

# A single encoder instance, re-fitted for every object-dtype column below.
label_encoder = LabelEncoder()

# Encode into a copy so the fetched frame is never modified in place.
encoded = X.copy()
object_columns = [name for name in X.columns if X[name].dtype == 'object']
for name in object_columns:
    encoded[name] = label_encoder.fit_transform(X[name])

# Rebind X to the encoded table and drop the temporary name.
X = encoded.copy()
del encoded

X, y

(      modified Zurich class  largest spot size  ...  area  area of largest spot
 0                         1                  4  ...     1                     2
 1                         2                  4  ...     1                     2
 2                         1                  4  ...     1                     1
 3                         2                  4  ...     1                     2
 4                         2                  0  ...     1                     2
 ...                     ...                ...  ...   ...                   ...
 1384                      5                  4  ...     1                     1
 1385                      5                  4  ...     1                     1
 1386                      1                  4  ...     1                     1
 1387                      5                  3  ...     1                     1
 1388                      0                  5  ...     1                     1
 
 [1389 rows x 10 columns],

In [5]:
# Write the Solar Flare features and targets to ./data/ without the row index.
if save_files:
    for frame, csv_path in ((X, './data/1-FLARE_X.csv'), (y, './data/1-FLARE_y.csv')):
        frame.to_csv(csv_path, index=False)

## 0.2. Bridges: N = 105, L = 6, d = 7 (not used)

In [6]:
import os

# Show the working directory; the dataset paths in the cells below are built from it.
print(os.getcwd())

/Users/rom/Documents/Education/Bachelor/Thesis/MCTS_ClassifierChain


In [7]:
import os

import pandas as pd
from scipy.io import arff

# Raw Bridges ARFF file, resolved against the current working directory.
bridges_raw_file_name = os.getcwd() + "/data/" + "./datasets/bridges.arff"
assert os.path.isfile(bridges_raw_file_name)

# Read the ARFF file; the result unpacks into the data records and the file metadata.
data, meta = arff.loadarff(bridges_raw_file_name)

In [8]:
# Build a DataFrame from the loaded records, turning every bytes cell into a
# UTF-8 string and leaving all other cells untouched.
df = pd.DataFrame(data).map(
    lambda cell: cell.decode('utf-8') if isinstance(cell, bytes) else cell
)

df.head(), df.columns

(  IDENTIF RIVER LOCATION ERECTED   PURPOSE  ...   T-OR-D MATERIAL   SPAN REL-L  TYPE
 0      E1     M        3  CRAFTS   HIGHWAY  ...  THROUGH     WOOD  SHORT     S  WOOD
 1      E2     A       25  CRAFTS   HIGHWAY  ...  THROUGH     WOOD  SHORT     S  WOOD
 2      E3     A       39  CRAFTS  AQUEDUCT  ...  THROUGH     WOOD      ?     S  WOOD
 3      E5     A       29  CRAFTS   HIGHWAY  ...  THROUGH     WOOD  SHORT     S  WOOD
 4      E6     M       23  CRAFTS   HIGHWAY  ...  THROUGH     WOOD      ?     S  WOOD
 
 [5 rows x 13 columns],
 Index(['IDENTIF', 'RIVER', 'LOCATION', 'ERECTED', 'PURPOSE', 'LENGTH', 'LANES',
        'CLEAR-G', 'T-OR-D', 'MATERIAL', 'SPAN', 'REL-L', 'TYPE'],
       dtype='object'))

In [9]:
from sklearn.preprocessing import LabelEncoder

# A single encoder instance, re-fitted for each object-dtype feature column.
label_encoder = LabelEncoder()

# Features: every column except TYPE.
X = df.drop(columns=["TYPE"])
object_columns = [name for name in X.columns if X[name].dtype == 'object']
for name in object_columns:
    X[name] = label_encoder.fit_transform(X[name])

# Targets: one indicator column per distinct TYPE value.
y = pd.get_dummies(df["TYPE"], prefix="TYPE")

X, y

(     IDENTIF  RIVER  LOCATION  ERECTED  ...  T-OR-D  MATERIAL  SPAN  REL-L
 0          0      1        22        0  ...       2         2     3      2
 1         17      0        17        0  ...       2         2     3      2
 2         28      0        33        0  ...       2         2     0      2
 3         50      0        21        0  ...       2         2     3      2
 4         61      1        15        0  ...       2         2     0      2
 ..       ...    ...       ...      ...  ...     ...       ...   ...    ...
 100       90      0        26        3  ...       1         1     2      3
 101       89      1        53        3  ...       1         1     1      1
 102       88      0        16        3  ...       2         1     2      1
 103       96      2        41        3  ...       2         1     1      1
 104       95      1        51        3  ...       2         1     1      1
 
 [105 rows x 12 columns],
      TYPE_ARCH  TYPE_CANTILEV  ...  TYPE_SUSPEN  TYPE_WOOD


In [10]:
# Export is intentionally disabled by the trailing `and False`.
# index=False matches the CSV exports of the datasets that are actually saved.
if save_files and False:  # and False since we are not using bridges
    X.to_csv("./data/2-BRIDGES_X.csv", index=False)
    y.to_csv("./data/2-BRIDGES_y.csv", index=False)

## 0.3. Parkinson's: L = 1, Not usable

In [12]:
from ucimlrepo import fetch_ucirepo

# Fetch UCI dataset id 174.
parkinsons = fetch_ucirepo(id=174)

# Feature and target tables (pandas DataFrames).
X, y = parkinsons.data.features, parkinsons.data.targets

# Print the dataset metadata first, then the per-variable information.
for section in (parkinsons.metadata, parkinsons.variables):
    print(section)

{'uci_id': 174, 'name': 'Parkinsons', 'repository_url': 'https://archive.ics.uci.edu/dataset/174/parkinsons', 'data_url': 'https://archive.ics.uci.edu/static/public/174/data.csv', 'abstract': "Oxford Parkinson's Disease Detection Dataset", 'area': 'Health and Medicine', 'tasks': ['Classification'], 'characteristics': ['Multivariate'], 'num_instances': 197, 'num_features': 22, 'feature_types': ['Real'], 'demographics': [], 'target_col': ['status'], 'index_col': ['name'], 'has_missing_values': 'no', 'missing_values_symbol': None, 'year_of_dataset_creation': 2007, 'last_updated': 'Wed Feb 14 2024', 'dataset_doi': '10.24432/C59C74', 'creators': ['Max Little'], 'intro_paper': {'title': 'Exploiting Nonlinear Recurrence and Fractal Scaling Properties for Voice Disorder Detection', 'authors': 'Max A. Little, P. McSharry, S. Roberts, D. Costello, I. Moroz', 'published_in': 'BioMedical Engineering OnLine', 'year': 2007, 'url': 'https://www.semanticscholar.org/paper/27e1dcd0d64bfc9d936e597d4f29b8

In [13]:
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()

# Drop the "MDVP:Jitter" / "MDVP:Shimmer" columns, then re-add each name using
# only the first column of what X[name] selects.
# NOTE(review): this presumably handles duplicated column names - confirm against the dataset.
repeated_names = ["MDVP:Jitter", "MDVP:Shimmer"]
clean_X = X.drop(columns=repeated_names).copy()
for name in repeated_names:
    clean_X[name] = X[name].values.T[0]

# Integer-encode any object-dtype feature column.
for name in [col for col in clean_X.columns if clean_X[col].dtype == 'object']:
    clean_X[name] = label_encoder.fit_transform(clean_X[name])

# Same treatment for the targets, on a copy of y.
clean_y = y.copy()
for name in [col for col in clean_y.columns if clean_y[col].dtype == 'object']:
    clean_y[name] = label_encoder.fit_transform(clean_y[name])

clean_X, clean_y

(     MDVP:Fo  MDVP:Fhi  MDVP:Flo  ...       PPE  MDVP:Jitter  MDVP:Shimmer
 0    119.992   157.302    74.997  ...  0.284654      0.00784       0.04374
 1    122.400   148.650   113.819  ...  0.368674      0.00968       0.06134
 2    116.682   131.111   111.555  ...  0.332634      0.01050       0.05233
 3    116.676   137.871   111.366  ...  0.368975      0.00997       0.05492
 4    116.014   141.781   110.655  ...  0.410335      0.01284       0.06425
 ..       ...       ...       ...  ...       ...          ...           ...
 190  174.188   230.978    94.261  ...  0.133050      0.00459       0.04087
 191  209.516   253.017    89.488  ...  0.168895      0.00564       0.02751
 192  174.688   240.005    74.287  ...  0.131728      0.01360       0.02308
 193  198.764   396.961    74.904  ...  0.123306      0.00740       0.02296
 194  214.289   260.277    77.973  ...  0.148569      0.00567       0.01884
 
 [195 rows x 20 columns],
      status
 0         1
 1         1
 2         1
 3      

In [14]:
# Export is intentionally disabled by the trailing `and False`.
# index=False matches the CSV exports of the datasets that are actually saved.
if save_files and False:  # and False since we are not using parkinsons
    clean_X.to_csv("./data/3-PARKINS_X.csv", index=False)
    clean_y.to_csv("./data/3-PARKINS_y.csv", index=False)

## 0.4. Thyroid: Missing labels

In [15]:
import os

import pandas as pd

# Raw hypothyroid CSV, resolved against the current working directory.
thyroid_csv_file = os.getcwd() + "/data/" + "./datasets/hypothyroid.csv"
# Guard on the file this cell reads, not on the Bridges path from an earlier cell.
assert os.path.isfile(thyroid_csv_file)
df = pd.read_csv(thyroid_csv_file)

df.head(), df.columns

(  age sex on thyroxine  ... TBG referral source binaryClass
 0  41   F            f  ...   ?            SVHC           P
 1  23   F            f  ...   ?           other           P
 2  46   M            f  ...   ?           other           P
 3  70   F            t  ...   ?           other           P
 4  70   F            f  ...   ?             SVI           P
 
 [5 rows x 30 columns],
 Index(['age', 'sex', 'on thyroxine', 'query on thyroxine',
        'on antithyroid medication', 'sick', 'pregnant', 'thyroid surgery',
        'I131 treatment', 'query hypothyroid', 'query hyperthyroid', 'lithium',
        'goitre', 'tumor', 'hypopituitary', 'psych', 'TSH measured', 'TSH',
        'T3 measured', 'T3', 'TT4 measured', 'TT4', 'T4U measured', 'T4U',
        'FTI measured', 'FTI', 'TBG measured', 'TBG', 'referral source',
        'binaryClass'],
       dtype='object'))

In [16]:
# "binaryClass" is the label column; every other column is a feature.
label_name = "binaryClass"
y = df[label_name]
X = df.drop(columns=[label_name])

In [17]:
# Distinct values found in each of the two TBG columns.
tuple(set(X[name]) for name in ("TBG measured", "TBG"))  # Can drop

({'f'}, {'?'})

In [18]:
# Remove the two TBG columns from the thyroid features.
tbg_columns = ["TBG measured", "TBG"]
clean_X = X.drop(columns=tbg_columns)

clean_X

Unnamed: 0,age,sex,on thyroxine,query on thyroxine,on antithyroid medication,sick,pregnant,thyroid surgery,I131 treatment,query hypothyroid,query hyperthyroid,lithium,goitre,tumor,hypopituitary,psych,TSH measured,TSH,T3 measured,T3,TT4 measured,TT4,T4U measured,T4U,FTI measured,FTI,referral source
0,41,F,f,f,f,f,f,f,f,f,f,f,f,f,f,f,t,1.3,t,2.5,t,125,t,1.14,t,109,SVHC
1,23,F,f,f,f,f,f,f,f,f,f,f,f,f,f,f,t,4.1,t,2,t,102,f,?,f,?,other
2,46,M,f,f,f,f,f,f,f,f,f,f,f,f,f,f,t,0.98,f,?,t,109,t,0.91,t,120,other
3,70,F,t,f,f,f,f,f,f,f,f,f,f,f,f,f,t,0.16,t,1.9,t,175,f,?,f,?,other
4,70,F,f,f,f,f,f,f,f,f,f,f,f,f,f,f,t,0.72,t,1.2,t,61,t,0.87,t,70,SVI
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3767,30,F,f,f,f,f,f,f,f,f,f,f,f,t,f,f,f,?,f,?,f,?,f,?,f,?,other
3768,68,F,f,f,f,f,f,f,f,f,f,f,f,f,f,f,t,1,t,2.1,t,124,t,1.08,t,114,SVI
3769,74,F,f,f,f,f,f,f,f,f,t,f,f,f,f,f,t,5.1,t,1.8,t,112,t,1.07,t,105,other
3770,72,M,f,f,f,f,f,f,f,f,f,f,f,f,f,f,t,0.7,t,2,t,82,t,0.94,t,87,SVI


In [19]:
# Placeholder export cell: nothing is written here, and the `and False` makes
# the branch unreachable regardless of `save_files`.
if save_files and False:
    pass

## 2. Music: N = 593, L = 6, d = 72

In [20]:
import os

import pandas as pd

# 2-EMOT CSV, resolved against the current working directory.
# `os` is imported here so the cell does not depend on an import made in an earlier cell.
emot_csv_file = os.getcwd() + "/data/" + "./datasets/2-EMOT.csv"
df = pd.read_csv(emot_csv_file)

df.head(), df.columns

(   amazed-suprised  happy-pleased  relaxing-clam  ...    BHSUM1    BHSUM2    BHSUM3
 0                0              1              1  ...  0.245457  0.105065  0.405399
 1                1              0              0  ...  0.343547  0.276366  0.710924
 2                0              1              0  ...  0.188693  0.045941  0.457372
 3                0              0              1  ...  0.102839  0.241934  0.351009
 4                0              0              0  ...  0.195196  0.310801  0.683817
 
 [5 rows x 78 columns],
 Index(['amazed-suprised', 'happy-pleased', 'relaxing-clam', 'quiet-still',
        'sad-lonely', 'angry-aggresive', 'Mean_Acc1298_Mean_Mem40_Centroid',
        'Mean_Acc1298_Mean_Mem40_Rolloff', 'Mean_Acc1298_Mean_Mem40_Flux',
        'Mean_Acc1298_Mean_Mem40_MFCC_0', 'Mean_Acc1298_Mean_Mem40_MFCC_1',
        'Mean_Acc1298_Mean_Mem40_MFCC_2', 'Mean_Acc1298_Mean_Mem40_MFCC_3',
        'Mean_Acc1298_Mean_Mem40_MFCC_4', 'Mean_Acc1298_Mean_Mem40_MFCC_5',
        

In [21]:
# The label columns are taken to be the first six columns of the table.
targets = df.columns[0:6]
targets

Index(['amazed-suprised', 'happy-pleased', 'relaxing-clam', 'quiet-still',
       'sad-lonely', 'angry-aggresive'],
      dtype='object')

In [22]:
# Labels are the `targets` columns; features are all remaining columns.
y = df[targets]
X = df.drop(targets, axis="columns")

X, y

(     Mean_Acc1298_Mean_Mem40_Centroid  ...    BHSUM3
 0                            0.034741  ...  0.405399
 1                            0.081374  ...  0.710924
 2                            0.110545  ...  0.457372
 3                            0.042481  ...  0.351009
 4                            0.074550  ...  0.683817
 ..                                ...  ...       ...
 588                          0.027142  ...  1.149211
 589                          0.094829  ...  0.335371
 590                          0.035169  ...  0.476993
 591                          0.054276  ...  1.255820
 592                          0.073194  ...  0.451701
 
 [593 rows x 72 columns],
      amazed-suprised  happy-pleased  ...  sad-lonely  angry-aggresive
 0                  0              1  ...           0                0
 1                  1              0  ...           0                1
 2                  0              1  ...           0                1
 3                  0              0  ..

In [23]:
# Write the 2-EMOT features and labels to ./data/ without the row index.
if save_files:
    for frame, csv_path in ((X, "./data/2-EMOT_X.csv"), (y, "./data/2-EMOT_y.csv")):
        frame.to_csv(csv_path, index=False)

## 3. Scene

In [24]:
import os

import pandas as pd

# 3-SCENE CSV, resolved against the current working directory.
# `os` is imported here so the cell does not depend on an import made in an earlier cell.
scene_csv_file = os.getcwd() + "/data/" + "./datasets/3-SCENE.csv"
df = pd.read_csv(scene_csv_file)

df.head(), df.columns

(   beach  sunset  foliage  field  ...    Att291    Att292    Att293    Att294
 0      1       0        0      0  ...  0.157332  0.247298  0.014025  0.029709
 1      1       0        0      0  ...  0.251454  0.137833  0.082672  0.036320
 2      1       0        0      0  ...  0.017166  0.051125  0.112506  0.083924
 3      1       0        0      0  ...  0.019267  0.031290  0.049780  0.090959
 4      1       0        0      0  ...  0.198151  0.238796  0.164270  0.184290
 
 [5 rows x 300 columns],
 Index(['beach', 'sunset', 'foliage', 'field', 'mountain', 'urban', 'Att1',
        'Att2', 'Att3', 'Att4',
        ...
        'Att285', 'Att286', 'Att287', 'Att288', 'Att289', 'Att290', 'Att291',
        'Att292', 'Att293', 'Att294'],
       dtype='object', length=300))

In [25]:
# Columns whose name starts with "Att" are features; all other columns are labels.
is_feature = df.columns.str.startswith('Att')
columns = df.columns[is_feature]

y = df.drop(columns=columns)
X = df[columns]

X, y

(          Att1      Att2      Att3  ...    Att292    Att293    Att294
 0     0.646467  0.666435  0.685047  ...  0.247298  0.014025  0.029709
 1     0.770156  0.767255  0.761053  ...  0.137833  0.082672  0.036320
 2     0.793984  0.772096  0.761820  ...  0.051125  0.112506  0.083924
 3     0.938563  0.949260  0.955621  ...  0.031290  0.049780  0.090959
 4     0.512130  0.524684  0.520020  ...  0.238796  0.164270  0.184290
 ...        ...       ...       ...  ...       ...       ...       ...
 2402  0.875782  0.901653  0.926227  ...  0.279607  0.254413  0.134350
 2403  0.657706  0.669877  0.692338  ...  0.199491  0.048747  0.041638
 2404  0.952281  0.944987  0.905556  ...  0.031900  0.017547  0.019734
 2405  0.883990  0.899004  0.901019  ...  0.256158  0.226332  0.223070
 2406  0.974915  0.866425  0.818144  ...  0.005131  0.025059  0.004033
 
 [2407 rows x 294 columns],
       beach  sunset  foliage  field  mountain  urban
 0         1       0        0      0         1      0
 1        

In [26]:
# Write the 3-SCENE features and labels to ./data/ without the row index.
if save_files:
    for frame, csv_path in ((X, "./data/3-SCENE_X.csv"), (y, "./data/3-SCENE_y.csv")):
        frame.to_csv(csv_path, index=False)

## 4. Flags

In [27]:
import os

import pandas as pd
from scipy.io import arff

# Raw Flags ARFF file, resolved against the current working directory.
flags_raw_file_name = os.getcwd() + "/data/" + "./datasets/flags.arff"
assert os.path.isfile(flags_raw_file_name)

# Read the ARFF file; the result unpacks into the data records and the file metadata.
data, meta = arff.loadarff(flags_raw_file_name)

# Build the table, turning every bytes cell into a UTF-8 string and leaving
# all other cells untouched.
df = pd.DataFrame(data).map(
    lambda cell: cell.decode('utf-8') if isinstance(cell, bytes) else cell
)

df.head(), df.columns

(  landmass zone    area  population language  ... blue  yellow  white  black  orange
 0        5    1   648.0        16.0       10  ...    0       1      1      1       0
 1        3    1    29.0         3.0        6  ...    0       1      0      1       0
 2        4    1  2388.0        20.0        8  ...    0       0      1      0       0
 3        6    3     0.0         0.0        1  ...    1       1      1      0       1
 4        3    1     0.0         0.0        6  ...    1       1      0      0       0
 
 [5 rows x 26 columns],
 Index(['landmass', 'zone', 'area', 'population', 'language', 'religion',
        'bars', 'stripes', 'colours', 'circles', 'crosses', 'saltires',
        'quarters', 'sunstars', 'crescent', 'triangle', 'icon', 'animate',
        'text', 'red', 'green', 'blue', 'yellow', 'white', 'black', 'orange'],
       dtype='object'))

In [28]:
# The seven colour columns are the labels; every other column is a feature.
columns = ["red", "green", "blue", "yellow", 'white', 'black', "orange"]

y = df[columns]
X = df.drop(columns, axis="columns")

X, y

(    landmass zone    area  population  ... triangle icon  animate  text
 0          5    1   648.0        16.0  ...        0    1        0     0
 1          3    1    29.0         3.0  ...        0    0        1     0
 2          4    1  2388.0        20.0  ...        0    0        0     0
 3          6    3     0.0         0.0  ...        1    1        1     0
 4          3    1     0.0         0.0  ...        0    0        0     0
 ..       ...  ...     ...         ...  ...      ...  ...      ...   ...
 189        6    3     3.0         0.0  ...        0    0        0     0
 190        3    1   256.0        22.0  ...        0    0        0     0
 191        4    2   905.0        28.0  ...        0    1        1     0
 192        4    2   753.0         6.0  ...        0    0        1     0
 193        4    2   391.0         8.0  ...        1    1        1     0
 
 [194 rows x 19 columns],
     red green blue yellow white black orange
 0     1     1    0      1     1     1      0
 1  

In [29]:
# Write the 4-FLAGS features and labels to ./data/ without the row index.
if save_files:
    for frame, csv_path in ((X, "./data/4-FLAGS_X.csv"), (y, "./data/4-FLAGS_y.csv")):
        frame.to_csv(csv_path, index=False)

## 5. Foodtruck: N = 407, L = 12, d = 21

In [30]:
import os

import pandas as pd
from scipy.io import arff

# Raw Foodtruck ARFF file, resolved against the current working directory.
# NOTE(review): the variable is named `mediamill_...` although it points at
# foodtruck.arff; the name is kept because other cells may refer to it.
mediamill_raw_file_name = os.getcwd() + "/data/" + "./datasets/foodtruck.arff"
assert os.path.isfile(mediamill_raw_file_name)

# Read the ARFF file; the result unpacks into the data records and the file metadata.
data, meta = arff.loadarff(mediamill_raw_file_name)

# Build the table, turning every bytes cell into a UTF-8 string and leaving
# all other cells untouched.
df = pd.DataFrame(data).map(
    lambda cell: cell.decode('utf-8') if isinstance(cell, bytes) else cell
)

df.head(), df.columns

(   frequency       time  expenses  ... healthy_food  fitness_food  sweets_desserts
 0        2.0     dinner      30.0  ...            0             0                1
 1        0.0     dinner      20.0  ...            0             0                1
 2        1.0  afternoon      15.0  ...            0             0                0
 3        0.0      lunch      40.0  ...            0             0                0
 4        0.0     dinner      15.0  ...            0             0                0
 
 [5 rows x 33 columns],
 Index(['frequency', 'time', 'expenses', 'motivation', 'taste', 'hygiene',
        'menu', 'presentation', 'attendance', 'ingredients', 'place.to.sit',
        'takeaway', 'variation', 'stop.strucks', 'schedule', 'gender',
        'age.group', 'scholarity', 'average.income', 'has.work',
        'marital.status', 'street_food', 'gourmet', 'italian_food',
        'brazilian_food', 'mexican_food', 'chinese_food', 'japanese_food',
        'arabic_food', 'snacks', 'healt

In [31]:
# The twelve food-category columns listed here are the labels; the rest are features.
columns = [
    'gourmet', 'snacks', 'street_food', 'italian_food',
    'brazilian_food', 'mexican_food', 'chinese_food', 'japanese_food',
    'arabic_food', 'healthy_food', 'fitness_food', 'sweets_desserts',
]

y = df[columns]
X = df.drop(columns, axis="columns")

X, y

(     frequency       time  expenses  ... average.income  has.work  marital.status
 0          2.0     dinner      30.0  ...            4.0       0.0          single
 1          0.0     dinner      20.0  ...            5.0       1.0         married
 2          1.0  afternoon      15.0  ...            4.0       1.0          single
 3          0.0      lunch      40.0  ...            6.0       1.0          single
 4          0.0     dinner      15.0  ...            4.0       1.0          single
 ..         ...        ...       ...  ...            ...       ...             ...
 402        0.0     dinner      30.0  ...            3.0       1.0          single
 403        0.0     dinner      30.0  ...            4.0       0.0          single
 404        0.0     dinner      30.0  ...            1.0       1.0          single
 405        1.0     dinner      30.0  ...            2.0       1.0         married
 406        0.0     dinner      15.0  ...            1.0       0.0          single
 
 [

In [32]:
# Write the 5-FOODTRUCK features and labels to ./data/ without the row index.
if save_files:
    for frame, csv_path in ((X, "./data/5-FOODTRUCK_X.csv"), (y, "./data/5-FOODTRUCK_y.csv")):
        frame.to_csv(csv_path, index=False)

## 6. Yeast

In [33]:
import os

import pandas as pd
from scipy.io import arff

# Raw Yeast ARFF file, resolved against the current working directory.
yeast_raw_file_name = os.getcwd() + "/data/" + "./datasets/yeast.arff"
assert os.path.isfile(yeast_raw_file_name)

# Read the ARFF file; the result unpacks into the data records and the file metadata.
data, meta = arff.loadarff(yeast_raw_file_name)

# Build the table, turning every bytes cell into a UTF-8 string and leaving
# all other cells untouched.
df = pd.DataFrame(data).map(
    lambda cell: cell.decode('utf-8') if isinstance(cell, bytes) else cell
)

df

Unnamed: 0,Att1,Att2,Att3,Att4,Att5,Att6,Att7,Att8,Att9,Att10,Att11,Att12,Att13,Att14,Att15,Att16,Att17,Att18,Att19,Att20,Att21,Att22,Att23,Att24,Att25,Att26,Att27,Att28,Att29,Att30,Att31,Att32,Att33,Att34,Att35,Att36,Att37,Att38,Att39,Att40,...,Att78,Att79,Att80,Att81,Att82,Att83,Att84,Att85,Att86,Att87,Att88,Att89,Att90,Att91,Att92,Att93,Att94,Att95,Att96,Att97,Att98,Att99,Att100,Att101,Att102,Att103,Class1,Class2,Class3,Class4,Class5,Class6,Class7,Class8,Class9,Class10,Class11,Class12,Class13,Class14
0,0.004168,-0.170975,-0.156748,-0.142151,0.058781,0.026851,0.197719,0.041850,0.066938,-0.056617,-0.027230,-0.137411,0.067776,0.047175,0.155671,0.050766,0.102557,-0.020259,-0.200512,-0.095371,-0.081940,-0.103735,0.093299,0.105475,0.148560,0.085925,0.107879,0.108075,0.085388,0.124026,-0.003650,-0.127376,0.039394,-0.018364,0.050378,0.157190,0.203563,0.111552,0.017907,-0.001126,...,-0.175325,-0.133636,0.005524,-0.014981,-0.031946,-0.015114,-0.047175,0.003829,0.010967,-0.006062,-0.027560,-0.019866,-0.024046,-0.025153,-0.009261,-0.025539,0.006166,-0.012976,-0.014259,-0.015024,-0.010747,0.000411,-0.032056,-0.018312,0.030126,0.124722,0,0,0,0,0,0,1,1,0,0,0,1,1,0
1,-0.103956,0.011879,-0.098986,-0.054501,-0.007970,0.049113,-0.030580,-0.077933,-0.080529,-0.016267,-0.215304,-0.009885,-0.155843,-0.059522,-0.098836,-0.071141,-0.023494,-0.071200,0.027767,0.003091,-0.003761,0.074600,0.053080,-0.008138,0.001794,-0.111704,-0.140291,-0.063347,0.066767,-0.167073,-0.095567,-0.047209,0.082206,0.144445,0.086581,-0.111850,-0.086560,0.024942,-0.131539,0.080062,...,-0.001249,-0.020209,-0.077359,-0.045139,-0.074738,0.051846,0.009323,0.184332,0.420424,-0.090224,-0.090718,-0.035266,-0.046729,0.000575,-0.066023,-0.051916,0.007680,0.027719,-0.085811,0.111123,0.050541,0.027565,-0.063569,-0.041471,-0.079758,0.017161,0,0,1,1,0,0,0,0,0,0,0,0,0,0
2,0.509949,0.401709,0.293799,0.087714,0.011686,-0.006411,-0.006255,0.013646,-0.040666,-0.024447,-0.040576,0.014326,-0.074968,0.141365,-0.015182,0.013691,0.006893,0.003736,-0.020726,-0.044104,-0.052959,-0.085572,-0.061547,-0.029578,0.027700,-0.094310,-0.047721,-0.081589,-0.022846,-0.106684,-0.068873,-0.105225,-0.065414,-0.047722,-0.070723,-0.057425,-0.042024,-0.034122,-0.049606,0.015137,...,-0.002432,0.001711,-0.083572,-0.096943,0.148457,-0.007413,0.130691,-0.032325,0.028612,-0.023051,-0.092214,-0.103336,0.138232,-0.100351,0.140423,0.110074,0.096277,-0.044932,-0.089470,-0.009162,-0.012010,0.308378,-0.028053,0.026710,-0.066565,-0.122352,0,1,1,0,0,0,0,0,0,0,0,1,1,0
3,0.119092,0.004412,-0.002262,0.072254,0.044512,-0.051467,0.074686,-0.007670,0.079438,0.062184,-0.013027,0.045538,0.080412,-0.010042,0.013029,-0.071975,0.089818,-0.016129,0.033105,0.024275,0.040428,0.064248,0.225613,0.176576,0.015501,0.009491,-0.013684,-0.017633,0.085007,-0.056274,-0.088925,-0.062951,0.227151,0.165897,0.150224,0.065105,0.110891,0.048451,0.114726,0.020393,...,-0.111806,-0.154732,0.302807,0.340027,-0.093332,-0.057848,-0.010558,-0.039194,-0.041628,-0.077455,-0.008553,-0.022404,-0.106131,-0.103067,-0.083059,-0.089064,-0.083809,0.200354,-0.075716,0.196605,0.152758,-0.028484,-0.074207,-0.089227,-0.049913,-0.043893,0,0,1,1,0,0,0,0,0,0,0,0,0,0
4,0.042037,0.007054,-0.069483,0.081015,-0.048207,0.089446,-0.004947,0.064456,-0.133387,0.068878,-0.139371,0.041487,-0.058531,0.021264,-0.101382,0.021015,0.096572,-0.005136,0.111104,-0.008323,0.020210,-0.003967,0.039762,0.006744,-0.041730,-0.174533,-0.101343,-0.115674,0.328511,-0.108945,-0.160748,-0.120290,-0.148308,-0.082882,-0.127218,-0.167186,-0.143210,-0.118028,-0.297516,-0.160082,...,0.108388,0.095516,0.015942,0.087354,0.176911,-0.062311,0.117205,-0.048277,-0.053679,0.014850,-0.066453,-0.067962,-0.083653,-0.081130,-0.061469,0.023662,-0.060467,0.044351,-0.057209,0.028047,0.029661,-0.050026,0.023248,-0.061539,-0.035160,0.067834,0,0,1,1,1,1,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2412,-0.119784,0.001259,-0.123645,-0.015513,-0.059683,0.091032,-0.043302,0.229219,-0.071498,0.182709,-0.169902,0.254843,-0.179968,0.173563,-0.060754,0.111926,0.055960,0.293560,0.017478,-0.081646,0.000432,-0.079165,-0.030758,0.057605,-0.069617,-0.152909,-0.013818,-0.134586,0.035602,-0.111456,-0.013309,-0.169705,-0.116210,-0.088351,-0.059824,0.055180,-0.015347,-0.054320,-0.158766,-0.038536,...,0.057122,0.103497,-0.064997,-0.019628,0.012507,0.209633,-0.081401,-0.057052,-0.077347,-0.076410,-0.077983,-0.021983,0.053034,0.191556,0.183931,0.065281,0.024084,-0.055915,-0.055593,-0.049642,0.018571,0.068742,-0.061001,-0.081132,-0.065844,0.001267,0,1,1,0,0,0,0,0,0,0,0,0,0,0
2413,0.085327,0.058590,0.085268,-0.020897,0.068972,0.030125,0.078056,0.011346,0.052618,0.066093,0.028501,0.037778,0.056401,0.073084,0.054199,0.010155,0.033694,0.022873,0.075112,0.092810,0.098595,0.059712,0.089367,0.056306,-0.010929,0.029214,0.028160,0.017634,0.005513,0.047156,0.100123,0.102521,-0.038055,-0.092468,-0.096875,-0.026086,0.048201,0.062117,0.052804,0.005024,...,-0.030045,-0.049208,-0.061023,-0.073127,-0.054131,0.230720,-0.054853,0.137628,0.150380,-0.029207,0.198999,0.240646,-0.102721,-0.099789,-0.078345,-0.084716,-0.079992,-0.075444,0.294987,-0.076379,-0.076293,-0.072451,-0.052258,-0.040026,0.342176,-0.169668,1,1,0,0,0,0,0,0,0,0,0,1,1,0
2414,0.082526,-0.095571,-0.022019,-0.046793,-0.038360,0.041084,0.056509,0.011749,-0.029657,-0.012198,-0.008540,-0.013902,-0.068013,0.042327,0.052293,0.050993,0.082892,-0.009390,0.029446,0.053463,0.074409,0.091427,0.059711,-0.020000,-0.020922,-0.073768,-0.061061,-0.091906,-0.046665,-0.041211,-0.047039,0.040556,-0.071408,-0.162032,-0.163358,0.012145,0.076758,-0.005729,-0.026470,-0.108322,...,-0.083398,-0.059522,-0.004905,-0.069757,0.293519,0.164906,0.172683,-0.024597,-0.056481,0.086025,-0.070759,-0.076122,-0.076408,-0.064713,-0.040290,-0.077142,-0.006624,-0.036850,-0.064831,-0.068696,-0.068521,-0.039841,0.274575,-0.066957,0.260121,-0.125303,0,0,0,0,0,1,1,1,0,0,0,1,1,0
2415,-0.130830,0.008868,-0.009457,-0.058930,-0.041224,0.042269,0.117717,0.037388,-0.085563,0.136649,-0.255284,-0.334406,-0.194436,-0.046137,0.049138,0.014249,0.063691,-0.065423,-0.084182,0.013208,-0.043259,-0.122727,-0.119400,-0.082374,-0.033362,-0.012996,0.014870,-0.023433,-0.023071,0.020958,0.035999,-0.025947,-0.079952,-0.137046,0.058524,0.093744,0.109146,0.013801,-0.008219,0.063381,...,-0.023212,0.000583,0.038050,-0.113469,-0.130450,-0.109596,0.032658,0.147271,0.108450,0.023597,0.195494,-0.093824,-0.072476,-0.034463,-0.000725,-0.114834,0.085087,0.033166,-0.012710,0.135359,0.213512,-0.107561,-0.081925,-0.122332,-0.022453,0.001953,0,0,0,0,0,0,0,0,0,0,0,1,1,0


In [34]:
# Columns whose name starts with "Att" are features; all other columns are labels.
is_feature = df.columns.str.startswith('Att')
columns = df.columns[is_feature]

y = df.drop(columns=columns)
X = df[columns]

X, y

(          Att1      Att2      Att3  ...    Att101    Att102    Att103
 0     0.004168 -0.170975 -0.156748  ... -0.018312  0.030126  0.124722
 1    -0.103956  0.011879 -0.098986  ... -0.041471 -0.079758  0.017161
 2     0.509949  0.401709  0.293799  ...  0.026710 -0.066565 -0.122352
 3     0.119092  0.004412 -0.002262  ... -0.089227 -0.049913 -0.043893
 4     0.042037  0.007054 -0.069483  ... -0.061539 -0.035160  0.067834
 ...        ...       ...       ...  ...       ...       ...       ...
 2412 -0.119784  0.001259 -0.123645  ... -0.081132 -0.065844  0.001267
 2413  0.085327  0.058590  0.085268  ... -0.040026  0.342176 -0.169668
 2414  0.082526 -0.095571 -0.022019  ... -0.066957  0.260121 -0.125303
 2415 -0.130830  0.008868 -0.009457  ... -0.122332 -0.022453  0.001953
 2416 -0.171578 -0.066536  0.168206  ... -0.083342 -0.063135  0.018810
 
 [2417 rows x 103 columns],
      Class1 Class2 Class3 Class4 Class5  ... Class10 Class11 Class12 Class13 Class14
 0         0      0      0      

In [35]:
# Write the 6-YEAST features and labels to ./data/ without the row index.
if save_files:
    for frame, csv_path in ((X, "./data/6-YEAST_X.csv"), (y, "./data/6-YEAST_y.csv")):
        frame.to_csv(csv_path, index=False)

## 7. Birds

In [36]:
import os

import pandas as pd

# 7-BIRDS CSV, resolved against the current working directory.
# `os` is imported here so the cell does not depend on an import made in an earlier cell.
birds_csv_file = os.getcwd() + "/data/" + "./datasets/7-BIRDS.csv"
df = pd.read_csv(birds_csv_file)

df.columns

Index(['L1', 'L2', 'L3', 'L4', 'L5', 'L6', 'L7', 'L8', 'L9', 'L10',
       ...
       'A251', 'A252', 'A253', 'A254', 'A255', 'A256', 'A257', 'A258', 'A259',
       'A260'],
      dtype='object', length=279)

In [37]:
# Columns whose name starts with 'A' are features; those starting with 'L' are labels.
feature_names = df.columns[df.columns.str.startswith('A')]
label_names = df.columns[df.columns.str.startswith('L')]

X = df[feature_names]
y = df[label_names]

X, y

(           A1        A2        A3  ...         A258  A259  A260
 0    0.016521  0.039926  0.089632  ...  1761.802180     1     1
 1    0.006600  0.035984  0.089956  ...     0.000000     0     1
 2    0.006894  0.017722  0.048062  ...   113.137085     1     1
 3    0.031046  0.127675  0.221428  ...     0.000000     0     1
 4    0.064721  0.226644  0.304482  ...     0.000000     0     1
 ..        ...       ...       ...  ...          ...   ...   ...
 640  0.065968  0.005699  0.009809  ...     0.000000     0    17
 641  0.037432  0.010440  0.021009  ...     0.000000     0    17
 642  0.200058  0.054787  0.137048  ...     0.000000     0    17
 643  0.064331  0.012261  0.022449  ...     0.000000     0    17
 644  0.008697  0.012031  0.021212  ...     0.000000     0    17
 
 [645 rows x 260 columns],
      L1  L2  L3  L4  L5  L6  L7  L8  ...  L12  L13  L14  L15  L16  L17  L18  L19
 0     0   0   0   0   0   0   0   0  ...    1    1    0    0    0    0    0    0
 1     0   0   0   0   0   

In [38]:
# Write the 7-BIRDS features and labels to ./data/ without the row index.
if save_files:
    for frame, csv_path in ((X, "./data/7-BIRDS_X.csv"), (y, "./data/7-BIRDS_y.csv")):
        frame.to_csv(csv_path, index=False)

## 8. Genbase

In [39]:
import os

import pandas as pd
from scipy.io import arff

# Raw Genbase ARFF file, resolved against the current working directory.
genbase_raw_file_name = os.getcwd() + "/data/" + "./datasets/genbase.arff"
assert os.path.isfile(genbase_raw_file_name)

# Read the ARFF file; the result unpacks into the data records and the file metadata.
data, meta = arff.loadarff(genbase_raw_file_name)

# Build the table, turning every bytes cell into a UTF-8 string and leaving
# all other cells untouched.
df = pd.DataFrame(data).map(
    lambda cell: cell.decode('utf-8') if isinstance(cell, bytes) else cell
)

df.head(), df.columns

(  protein PS00010 PS00011 PS00012  ... PDOC50199 PDOC00660 PDOC00653 PDOC00030
 0  O00060      NO      NO      NO  ...         0         0         0         0
 1  O00139      NO      NO      NO  ...         0         0         0         0
 2  O02741      NO      NO      NO  ...         0         0         0         0
 3  O08424      NO      NO      NO  ...         0         0         0         0
 4  O12984      NO      NO      NO  ...         0         0         0         0
 
 [5 rows x 1213 columns],
 Index(['protein', 'PS00010', 'PS00011', 'PS00012', 'PS00014', 'PS00017',
        'PS00018', 'PS00019', 'PS00020', 'PS00021',
        ...
        'PDOC00662', 'PDOC00018', 'PDOC50001', 'PDOC00014', 'PDOC00750',
        'PDOC50196', 'PDOC50199', 'PDOC00660', 'PDOC00653', 'PDOC00030'],
       dtype='object', length=1213))

In [40]:
# 'PDOC…' columns form y; every other column forms X.
columns = df.columns[df.columns.str.startswith('PDOC')]

y = df[columns]
X = df.drop(columns=columns)

# Integer-encode each object-dtype column of X; the shared `label_encoder`
# is refit on every column it is applied to.
object_columns = [name for name in X.columns if X[name].dtype == 'object']
for name in object_columns:
    X[name] = label_encoder.fit_transform(X[name])

X, y

(     protein  PS00010  PS00011  PS00012  ...  PS50827  PS50829  PS50830  PS60000
 0          0        0        0        0  ...        0        0        0        0
 1          1        0        0        0  ...        0        0        0        0
 2          2        0        0        0  ...        0        0        0        0
 3          3        0        0        0  ...        0        0        0        0
 4          4        0        0        0  ...        0        0        0        0
 ..       ...      ...      ...      ...  ...      ...      ...      ...      ...
 657      657        0        0        0  ...        0        0        0        0
 658      658        0        0        0  ...        0        0        0        0
 659      659        0        0        0  ...        0        0        0        0
 660      660        0        0        0  ...        0        0        0        0
 661      661        0        0        0  ...        0        0        0        0
 
 [662 rows x 1

In [41]:
# When saving is enabled, write the current X and y tables to CSV without the row index.
if save_files:
    for table, destination in ((X, "./data/8-GENBASE_X.csv"), (y, "./data/8-GENBASE_y.csv")):
        table.to_csv(destination, index=False)

## 9. Medical

In [42]:
# `os` is imported in this cell (as the ARFF-loading cells do) so the cell
# does not depend on an earlier cell having brought it into scope.
import os

import pandas as pd

# Raw Medical CSV lives under <working directory>/data/datasets/.
medc_csv_file = os.path.join(os.getcwd(), "data", "datasets", "9-MEDC.csv")
df = pd.read_csv(medc_csv_file)

df.head(), df.columns

(   Class-0-593_70  Class-1-079_99  Class-2-786_09  ...  yesterday  zithromax  zone
 0               0               0               0  ...          0          0     0
 1               0               0               0  ...          0          0     0
 2               0               0               0  ...          0          0     0
 3               0               0               0  ...          0          0     0
 4               1               0               0  ...          0          0     0
 
 [5 rows x 1494 columns],
 Index(['Class-0-593_70', 'Class-1-079_99', 'Class-2-786_09', 'Class-3-759_89',
        'Class-4-753_0', 'Class-5-786_2', 'Class-6-V72_5', 'Class-7-511_9',
        'Class-8-596_8', 'Class-9-599_0',
        ...
        'x2', 'x5', 'xray', 'year', 'year-old', 'yearly', 'years', 'yesterday',
        'zithromax', 'zone'],
       dtype='object', length=1494))

In [43]:
# Columns whose name starts with 'Class' form y; all remaining columns form X.
class_columns = [name for name in df.columns if name.startswith('Class')]

y = df[class_columns]
X = df.drop(columns=class_columns)

X, y

(     -  /  0  00  04  0;  ...  year-old  yearly  years  yesterday  zithromax  zone
 0    0  0  0   0   0   0  ...         0       0      0          0          0     0
 1    1  0  0   0   0   0  ...         0       0      0          0          0     0
 2    1  0  0   0   0   0  ...         0       0      0          0          0     0
 3    1  0  0   0   0   0  ...         0       0      0          0          0     0
 4    0  0  0   0   0   0  ...         0       0      0          0          0     0
 ..  .. .. ..  ..  ..  ..  ...       ...     ...    ...        ...        ...   ...
 973  0  0  0   0   0   0  ...         0       0      1          0          0     0
 974  1  0  0   0   0   0  ...         1       0      0          0          0     0
 975  1  0  0   0   0   0  ...         1       0      0          0          0     0
 976  0  0  0   0   0   0  ...         0       0      0          0          0     0
 977  1  0  0   0   0   0  ...         0       0      0          0          

In [44]:
# When saving is enabled, write the current X and y tables to CSV without the row index.
if save_files:
    for table, destination in ((X, "./data/9-MEDC_X.csv"), (y, "./data/9-MEDC_y.csv")):
        table.to_csv(destination, index=False)

## 10. Enron

In [45]:
# `os` is imported in this cell (as the ARFF-loading cells do) so the cell
# does not depend on an earlier cell having brought it into scope.
import os

import pandas as pd

# Raw Enron CSV lives under <working directory>/data/datasets/.
enron_csv_file = os.path.join(os.getcwd(), "data", "datasets", "10-ENRON.csv")
df = pd.read_csv(enron_csv_file)

df.head(), df.columns

(   A.A8  C.C9  B.B12  C.C11  C.C5  ...  www  year  years  yesterday  york
 0     0     0      0      0     0  ...    0     0      0          0     0
 1     0     0      0      0     0  ...    0     0      0          0     0
 2     0     0      0      0     0  ...    0     0      0          0     0
 3     0     0      0      0     0  ...    0     0      0          0     0
 4     0     0      0      0     0  ...    0     1      0          0     0
 
 [5 rows x 1054 columns],
 Index(['A.A8', 'C.C9', 'B.B12', 'C.C11', 'C.C5', 'C.C7', 'B.B2', 'B.B3',
        'D.D16', 'A.A7',
        ...
        'workers', 'working', 'world', 'writer', 'writers', 'www', 'year',
        'years', 'yesterday', 'york'],
       dtype='object', length=1054))

In [46]:
# Columns whose name begins with an uppercase character form y;
# dropping those same columns from the frame gives X.
filtered_columns = [name for name in df.columns if name[0].isupper()]

y = df[filtered_columns]
X = df.drop(columns=filtered_columns)

X, y

(      0  00  000  01  02  03  ...  writers  www  year  years  yesterday  york
 0     0   0    0   0   0   0  ...        0    0     0      0          0     0
 1     0   0    0   1   0   0  ...        0    0     0      0          0     0
 2     0   0    0   0   0   0  ...        0    0     0      0          0     0
 3     0   0    0   0   0   0  ...        0    0     0      0          0     0
 4     0   0    0   0   0   0  ...        0    0     1      0          0     0
 ...  ..  ..  ...  ..  ..  ..  ...      ...  ...   ...    ...        ...   ...
 1697  0   0    0   0   0   0  ...        0    0     1      1          0     0
 1698  0   0    0   0   1   0  ...        0    1     1      1          1     0
 1699  0   0    0   0   0   0  ...        0    0     1      0          0     0
 1700  0   0    0   0   1   0  ...        0    1     1      1          1     0
 1701  0   0    0   0   1   0  ...        0    1     1      1          1     0
 
 [1702 rows x 1001 columns],
       A.A8  C.C9  B.

In [47]:
# When saving is enabled, write the current X and y tables to CSV without the row index.
if save_files:
    for table, destination in ((X, "./data/10-ENRON_X.csv"), (y, "./data/10-ENRON_y.csv")):
        table.to_csv(destination, index=False)

## 11. MediaMill

In [51]:
import os

import pandas as pd
from scipy.io import arff

mediamill_raw_file_name = os.getcwd() + "/data/" + "./datasets/mediamill.arff"
assert os.path.isfile(mediamill_raw_file_name)

# Parse the ARFF file into its records and accompanying metadata
data, meta = arff.loadarff(mediamill_raw_file_name)

# Build the frame, turning any bytes cell into a UTF-8 string and leaving
# every other cell untouched
df = pd.DataFrame(data).map(
    lambda cell: cell if not isinstance(cell, bytes) else cell.decode('utf-8')
)

df.head(), df.columns

(       Att1      Att2      Att3      Att4  ...  Class98  Class99  Class100  Class101
 0  0.380877  0.494079  0.540009  0.422926  ...        0        0         0         0
 1  0.508613  0.505837  0.437155  0.490723  ...        0        0         0         0
 2  0.449571  0.460490  0.453469  0.410779  ...        0        0         0         0
 3  0.416800  0.548996  0.520850  0.465410  ...        0        0         0         0
 4  0.501986  0.480820  0.435543  0.432002  ...        0        0         0         0
 
 [5 rows x 221 columns],
 Index(['Att1', 'Att2', 'Att3', 'Att4', 'Att5', 'Att6', 'Att7', 'Att8', 'Att9',
        'Att10',
        ...
        'Class92', 'Class93', 'Class94', 'Class95', 'Class96', 'Class97',
        'Class98', 'Class99', 'Class100', 'Class101'],
       dtype='object', length=221))

In [52]:
# Columns named 'Att…' go to X and columns named 'Class…' go to y.
attribute_mask = df.columns.str.startswith('Att')
class_mask = df.columns.str.startswith('Class')

X = df[df.columns[attribute_mask]]
y = df[df.columns[class_mask]]

X, y

(           Att1      Att2      Att3  ...    Att118    Att119    Att120
 0      0.380877  0.494079  0.540009  ...  0.317983  0.547807  0.393778
 1      0.508613  0.505837  0.437155  ...  0.348174  0.584991  0.422205
 2      0.449571  0.460490  0.453469  ...  0.323834  0.571487  0.397564
 3      0.416800  0.548996  0.520850  ...  0.346506  0.589601  0.430145
 4      0.501986  0.480820  0.435543  ...  0.325957  0.578370  0.398771
 ...         ...       ...       ...  ...       ...       ...       ...
 43902  0.426864  0.528629  0.532957  ...  0.488400  0.624835  0.527380
 43903  0.344094  0.537319  0.516101  ...  0.410659  0.564902  0.453231
 43904  0.329709  0.453422  0.523771  ...  0.408484  0.570778  0.453345
 43905  0.470031  0.489588  0.488227  ...  0.422255  0.585876  0.464987
 43906  0.350638  0.548779  0.554297  ...  0.448091  0.589267  0.497985
 
 [43907 rows x 120 columns],
       Class1 Class2 Class3 Class4  ... Class98 Class99 Class100 Class101
 0          0      0      0    

In [53]:
# When saving is enabled, write the current X and y tables to CSV without the row index.
if save_files:
    for table, destination in ((X, "./data/11-MEDIAMILL_X.csv"), (y, "./data/11-MEDIAMILL_y.csv")):
        table.to_csv(destination, index=False)