# Cleaning up of datasets for analysis

In [1]:
import os

# Master switch: when True, the sections below write their cleaned X / y
# frames as CSV files under ./data/.
save_files = True

if save_files:
    # exist_ok=True makes the cell safe to re-run and removes the separate
    # "check, then create" step.
    os.makedirs("./data/", exist_ok=True)

    # Fail here, rather than in a later to_csv call, if the output directory
    # is not usable.
    assert os.path.isdir("./data/")

## 1. Solar Flare: N = 1389, L = 3, d = 10

In [2]:
!pip install ucimlrepo



In [3]:
from ucimlrepo import fetch_ucirepo, list_available_datasets

# Download the dataset from the UCI repository, looked up by name.
solar_flare = fetch_ucirepo(name='Solar Flare')

# Prints None in the recorded run of this cell.
print(solar_flare.data.version2)

# Features and targets as exposed by ucimlrepo (pandas dataframes).
X = solar_flare.data.features
y = solar_flare.data.targets

# Dataset-level metadata first, then the per-variable description.
print(solar_flare.metadata)
print(solar_flare.variables)

None
{'uci_id': 89, 'name': 'Solar Flare', 'repository_url': 'https://archive.ics.uci.edu/dataset/89/solar+flare', 'data_url': 'https://archive.ics.uci.edu/static/public/89/data.csv', 'abstract': 'Each class attribute counts the number of solar flares of a certain class that occur in a 24 hour period', 'area': 'Physics and Chemistry', 'tasks': ['Regression'], 'characteristics': ['Multivariate'], 'num_instances': 1389, 'num_features': 10, 'feature_types': ['Categorical'], 'demographics': [], 'target_col': ['common flares', 'moderate flares', 'severe flares'], 'index_col': None, 'has_missing_values': 'no', 'missing_values_symbol': None, 'year_of_dataset_creation': 1989, 'last_updated': 'Wed Feb 14 2024', 'dataset_doi': '10.24432/C5530G', 'creators': [], 'intro_paper': None, 'additional_info': {'summary': 'Notes:\r\n\r\n   -- The database contains 3 potential classes, one for the number of times a certain type of solar flare occured in a 24 hour period.\r\n   -- Each instance represents c

In [4]:
from sklearn.preprocessing import LabelEncoder

# A single encoder instance, refitted for every column it is applied to.
label_encoder = LabelEncoder()

# Encode into a copy so the frame produced by the fetch cell is only read from.
clean_X = X.copy()
for x in X.columns:
    if X[x].dtype != 'object':
        continue  # only object-dtype columns are replaced by integer codes
    clean_X[x] = label_encoder.fit_transform(X[x])

# Rebind X to the encoded frame and discard the temporary name.
X = clean_X.copy()
del clean_X

X, y

(      modified Zurich class  largest spot size  spot distribution  activity  \
 0                         1                  4                  2         1   
 1                         2                  4                  2         1   
 2                         1                  4                  2         1   
 3                         2                  4                  2         1   
 4                         2                  0                  2         1   
 ...                     ...                ...                ...       ...   
 1384                      5                  4                  3         1   
 1385                      5                  4                  3         2   
 1386                      1                  4                  2         1   
 1387                      5                  3                  3         1   
 1388                      0                  5                  2         1   
 
       evolution  previous 24 hour fla

In [5]:
# Export the encoded Solar Flare features and the targets.
if save_files:
    X.to_csv(path_or_buf='./data/1-FLARE_X.csv')
    y.to_csv(path_or_buf='./data/1-FLARE_y.csv')

## 0.2. Bridges: N = 105, L = 6, d = 7 (not used)

In [6]:
from scipy.io import arff
import pandas as pd
import os

# Location of the raw Bridges ARFF file; stop right away if it is missing.
bridges_raw_file_name = './bridges/bridges.arff'
assert os.path.isfile(bridges_raw_file_name)

# loadarff returns the records together with the ARFF metadata.
data, meta = arff.loadarff(bridges_raw_file_name)

In [7]:
df = pd.DataFrame(data)

# Decode bytes values to str and leave every other value untouched.
# Series.map is applied column by column instead of DataFrame.applymap,
# which is deprecated since pandas 2.1; this form works on older and newer
# pandas alike.
df = df.apply(
    lambda column: column.map(
        lambda value: value.decode('utf-8') if isinstance(value, bytes) else value
    )
)

df.head(), df.columns

(  IDENTIF RIVER LOCATION ERECTED   PURPOSE  LENGTH LANES CLEAR-G   T-OR-D  \
 0      E1     M        3  CRAFTS   HIGHWAY       ?     2       N  THROUGH   
 1      E2     A       25  CRAFTS   HIGHWAY  MEDIUM     2       N  THROUGH   
 2      E3     A       39  CRAFTS  AQUEDUCT       ?     1       N  THROUGH   
 3      E5     A       29  CRAFTS   HIGHWAY  MEDIUM     2       N  THROUGH   
 4      E6     M       23  CRAFTS   HIGHWAY       ?     2       N  THROUGH   
 
   MATERIAL   SPAN REL-L  TYPE  
 0     WOOD  SHORT     S  WOOD  
 1     WOOD  SHORT     S  WOOD  
 2     WOOD      ?     S  WOOD  
 3     WOOD  SHORT     S  WOOD  
 4     WOOD      ?     S  WOOD  ,
 Index(['IDENTIF', 'RIVER', 'LOCATION', 'ERECTED', 'PURPOSE', 'LENGTH', 'LANES',
        'CLEAR-G', 'T-OR-D', 'MATERIAL', 'SPAN', 'REL-L', 'TYPE'],
       dtype='object'))

In [8]:
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()

# Every column except TYPE is treated as a feature.
X = df.drop(columns=["TYPE"])
for x in X.columns:
    if X[x].dtype != 'object':
        continue
    # Object-dtype columns are replaced by integer codes.
    X[x] = label_encoder.fit_transform(X[x])

# TYPE is expanded into one indicator column per value, prefixed with "TYPE".
y = pd.get_dummies(df["TYPE"], prefix="TYPE")

X, y

(     IDENTIF  RIVER  LOCATION  ERECTED  PURPOSE  LENGTH  LANES  CLEAR-G  \
 0          0      1        22        0        1       0      1        2   
 1         17      0        17        0        1       2      1        2   
 2         28      0        33        0        0       0      0        2   
 3         50      0        21        0        1       2      1        2   
 4         61      1        15        0        1       0      1        2   
 ..       ...    ...       ...      ...      ...     ...    ...      ...   
 100       90      0        26        3        1       3      2        1   
 101       89      1        53        3        1       1      2        1   
 102       88      0        16        3        1       3      3        1   
 103       96      2        41        3        1       1      3        1   
 104       95      1        51        3        1       3      3        1   
 
      T-OR-D  MATERIAL  SPAN  REL-L  
 0         2         2     3      2  
 1        

In [9]:
# `and False` keeps this export switched off, since Bridges is not used.
if save_files and False:
    X.to_csv(path_or_buf="./data/2-BRIDGES_X.csv")
    y.to_csv(path_or_buf="./data/2-BRIDGES_y.csv")

## 0.3. Parkinson's: L = 1, Not usable

In [10]:
!pip install ucimlrepo



In [11]:
from ucimlrepo import fetch_ucirepo

# Download the dataset from the UCI repository, looked up by its id.
parkinsons = fetch_ucirepo(id=174)

# Features and targets as exposed by ucimlrepo (pandas dataframes).
X = parkinsons.data.features
y = parkinsons.data.targets

# Dataset-level metadata first, then the per-variable description.
print(parkinsons.metadata)
print(parkinsons.variables)

{'uci_id': 174, 'name': 'Parkinsons', 'repository_url': 'https://archive.ics.uci.edu/dataset/174/parkinsons', 'data_url': 'https://archive.ics.uci.edu/static/public/174/data.csv', 'abstract': "Oxford Parkinson's Disease Detection Dataset", 'area': 'Health and Medicine', 'tasks': ['Classification'], 'characteristics': ['Multivariate'], 'num_instances': 197, 'num_features': 22, 'feature_types': ['Real'], 'demographics': [], 'target_col': ['status'], 'index_col': ['name'], 'has_missing_values': 'no', 'missing_values_symbol': None, 'year_of_dataset_creation': 2007, 'last_updated': 'Wed Feb 14 2024', 'dataset_doi': '10.24432/C59C74', 'creators': ['Max Little'], 'intro_paper': {'title': 'Exploiting Nonlinear Recurrence and Fractal Scaling Properties for Voice Disorder Detection', 'authors': 'Max A. Little, P. McSharry, S. Roberts, D. Costello, I. Moroz', 'published_in': 'BioMedical Engineering OnLine', 'year': 2007, 'url': 'https://www.semanticscholar.org/paper/27e1dcd0d64bfc9d936e597d4f29b8

In [12]:
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()

# NOTE(review): `.values.T[0]` implies each of these selections is 2-D, i.e.
# the name presumably occurs more than once in X - confirm. Only the first
# such column is kept, and it is appended after the remaining columns.
clean_X = X.drop(columns=["MDVP:Jitter", "MDVP:Shimmer"]).copy()
for x in ("MDVP:Jitter", "MDVP:Shimmer"):
    clean_X[x] = X[x].values.T[0]

# Replace object-dtype feature columns by integer codes.
for x in clean_X.columns:
    if clean_X[x].dtype != 'object':
        continue
    clean_X[x] = label_encoder.fit_transform(clean_X[x])

# Same treatment for the targets, on a copy.
clean_y = y.copy()
for yc in clean_y.columns:
    if clean_y[yc].dtype != 'object':
        continue
    clean_y[yc] = label_encoder.fit_transform(clean_y[yc])

clean_X, clean_y

(     MDVP:Fo  MDVP:Fhi  MDVP:Flo  MDVP:RAP  MDVP:PPQ  Jitter:DDP  \
 0    119.992   157.302    74.997   0.00370   0.00554     0.01109   
 1    122.400   148.650   113.819   0.00465   0.00696     0.01394   
 2    116.682   131.111   111.555   0.00544   0.00781     0.01633   
 3    116.676   137.871   111.366   0.00502   0.00698     0.01505   
 4    116.014   141.781   110.655   0.00655   0.00908     0.01966   
 ..       ...       ...       ...       ...       ...         ...   
 190  174.188   230.978    94.261   0.00263   0.00259     0.00790   
 191  209.516   253.017    89.488   0.00331   0.00292     0.00994   
 192  174.688   240.005    74.287   0.00624   0.00564     0.01873   
 193  198.764   396.961    74.904   0.00370   0.00390     0.01109   
 194  214.289   260.277    77.973   0.00295   0.00317     0.00885   
 
      Shimmer:APQ3  Shimmer:APQ5  MDVP:APQ  Shimmer:DDA      NHR     HNR  \
 0         0.02182       0.03130   0.02971      0.06545  0.02211  21.033   
 1         0.03134

In [13]:
# Write next to the other exports in ./data/ (created by the first cell)
# instead of the notebook's working directory.
# NOTE(review): this section is headed "Not usable" - confirm that the export
# is wanted at all.
if save_files:
    clean_X.to_csv("./data/3-PARKINS_X.csv")
    clean_y.to_csv("./data/3-PARKINS_y.csv")

## 0.4. Thyroid: Missing labels

In [14]:
import pandas as pd

# Read the raw hypothyroid table from disk.
thyroid_csv_file = './thyroid/hypothyroid.csv'
df = pd.read_csv(filepath_or_buffer=thyroid_csv_file)

df.head(), df.columns

(  age sex on thyroxine query on thyroxine on antithyroid medication sick  \
 0  41   F            f                  f                         f    f   
 1  23   F            f                  f                         f    f   
 2  46   M            f                  f                         f    f   
 3  70   F            t                  f                         f    f   
 4  70   F            f                  f                         f    f   
 
   pregnant thyroid surgery I131 treatment query hypothyroid  ... TT4 measured  \
 0        f               f              f                 f  ...            t   
 1        f               f              f                 f  ...            t   
 2        f               f              f                 f  ...            t   
 3        f               f              f                 f  ...            t   
 4        f               f              f                 f  ...            t   
 
    TT4 T4U measured   T4U FTI measured  F

In [15]:
# binaryClass is taken as the target; the remaining columns are the features.
X = df.drop("binaryClass", axis="columns")
y = df.loc[:, "binaryClass"]

In [16]:
# Distinct values of the two TBG columns; the recorded output shows a single
# value in each, so both can be dropped.
{*X["TBG measured"]}, {*X["TBG"]}

({'f'}, {'?'})

In [17]:
# Remove the two TBG columns; X itself is left as it was.
clean_X = X.drop(["TBG measured", "TBG"], axis="columns")

clean_X

Unnamed: 0,age,sex,on thyroxine,query on thyroxine,on antithyroid medication,sick,pregnant,thyroid surgery,I131 treatment,query hypothyroid,...,TSH,T3 measured,T3,TT4 measured,TT4,T4U measured,T4U,FTI measured,FTI,referral source
0,41,F,f,f,f,f,f,f,f,f,...,1.3,t,2.5,t,125,t,1.14,t,109,SVHC
1,23,F,f,f,f,f,f,f,f,f,...,4.1,t,2,t,102,f,?,f,?,other
2,46,M,f,f,f,f,f,f,f,f,...,0.98,f,?,t,109,t,0.91,t,120,other
3,70,F,t,f,f,f,f,f,f,f,...,0.16,t,1.9,t,175,f,?,f,?,other
4,70,F,f,f,f,f,f,f,f,f,...,0.72,t,1.2,t,61,t,0.87,t,70,SVI
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3767,30,F,f,f,f,f,f,f,f,f,...,?,f,?,f,?,f,?,f,?,other
3768,68,F,f,f,f,f,f,f,f,f,...,1,t,2.1,t,124,t,1.08,t,114,SVI
3769,74,F,f,f,f,f,f,f,f,f,...,5.1,t,1.8,t,112,t,1.07,t,105,other
3770,72,M,f,f,f,f,f,f,f,f,...,0.7,t,2,t,82,t,0.94,t,87,SVI


## 2. Music: N = 593, L = 6, d = 72

In [18]:
import pandas as pd

# Read ./datasets/2-EMOT.csv into a frame.
df = pd.read_csv(filepath_or_buffer='./datasets/2-EMOT.csv')

df.head(), df.columns

(   amazed-suprised  happy-pleased  relaxing-clam  quiet-still  sad-lonely  \
 0                0              1              1            0           0   
 1                1              0              0            0           0   
 2                0              1              0            0           0   
 3                0              0              1            0           0   
 4                0              0              0            1           0   
 
    angry-aggresive  Mean_Acc1298_Mean_Mem40_Centroid  \
 0                0                          0.034741   
 1                1                          0.081374   
 2                1                          0.110545   
 3                0                          0.042481   
 4                0                          0.074550   
 
    Mean_Acc1298_Mean_Mem40_Rolloff  Mean_Acc1298_Mean_Mem40_Flux  \
 0                         0.089665                      0.091225   
 1                         0.272747             

In [19]:
# The first six columns are used as the targets.
targets = df.columns[0:6]
targets

Index(['amazed-suprised', 'happy-pleased', 'relaxing-clam', 'quiet-still',
       'sad-lonely', 'angry-aggresive'],
      dtype='object')

In [20]:
# Features are everything except the target columns; y is the target columns.
X = df.drop(targets, axis="columns")
y = df.loc[:, targets]

X, y

(     Mean_Acc1298_Mean_Mem40_Centroid  Mean_Acc1298_Mean_Mem40_Rolloff  \
 0                            0.034741                         0.089665   
 1                            0.081374                         0.272747   
 2                            0.110545                         0.273567   
 3                            0.042481                         0.199281   
 4                            0.074550                         0.140880   
 ..                                ...                              ...   
 588                          0.027142                         0.047551   
 589                          0.094829                         0.204498   
 590                          0.035169                         0.065403   
 591                          0.054276                         0.238158   
 592                          0.073194                         0.140733   
 
      Mean_Acc1298_Mean_Mem40_Flux  Mean_Acc1298_Mean_Mem40_MFCC_0  \
 0                        0.

In [21]:
# Export the feature and target frames of this section.
if save_files:
    X.to_csv(path_or_buf='./data/2-EMOT_X.csv')
    y.to_csv(path_or_buf='./data/2-EMOT_y.csv')

## 3. Scene

In [22]:
import pandas as pd

# Read ./datasets/3-SCENE.csv into a frame.
df = pd.read_csv(filepath_or_buffer='./datasets/3-SCENE.csv')

df.head(), df.columns

(   beach  sunset  foliage  field  mountain  urban      Att1      Att2  \
 0      1       0        0      0         1      0  0.646467  0.666435   
 1      1       0        0      0         0      1  0.770156  0.767255   
 2      1       0        0      0         0      0  0.793984  0.772096   
 3      1       0        0      0         0      0  0.938563  0.949260   
 4      1       0        0      0         0      0  0.512130  0.524684   
 
        Att3      Att4  ...    Att285    Att286    Att287    Att288    Att289  \
 0  0.685047  0.699053  ...  0.061538  0.049615  0.068962  0.653879  0.354982   
 1  0.761053  0.745630  ...  0.114123  0.160008  0.414088  0.361843  0.303399   
 2  0.761820  0.762213  ...  0.047596  0.038082  0.079977  0.004901  0.003460   
 3  0.955621  0.966743  ...  0.027527  0.016922  0.024174  0.036799  0.007694   
 4  0.520020  0.504467  ...  0.158730  0.023177  0.129994  0.167709  0.226580   
 
      Att290    Att291    Att292    Att293    Att294  
 0  0.12407

In [23]:
# Columns named Att* are the features; all remaining columns form y.
columns = df.columns[df.columns.str.startswith('Att')]

X = df.loc[:, columns]
y = df.drop(columns, axis="columns")

X, y

(          Att1      Att2      Att3      Att4      Att5      Att6      Att7  \
 0     0.646467  0.666435  0.685047  0.699053  0.652746  0.407864  0.150309   
 1     0.770156  0.767255  0.761053  0.745630  0.742231  0.688086  0.708416   
 2     0.793984  0.772096  0.761820  0.762213  0.740569  0.734361  0.722677   
 3     0.938563  0.949260  0.955621  0.966743  0.968649  0.869619  0.696925   
 4     0.512130  0.524684  0.520020  0.504467  0.471209  0.417654  0.364292   
 ...        ...       ...       ...       ...       ...       ...       ...   
 2402  0.875782  0.901653  0.926227  0.721366  0.795826  0.867642  0.794125   
 2403  0.657706  0.669877  0.692338  0.713920  0.727374  0.750354  0.684372   
 2404  0.952281  0.944987  0.905556  0.836604  0.875916  0.957034  0.953938   
 2405  0.883990  0.899004  0.901019  0.904298  0.846402  0.858145  0.851362   
 2406  0.974915  0.866425  0.818144  0.936140  0.938583  0.935087  0.930597   
 
           Att8      Att9     Att10  ...    Att285

In [24]:
# Export the feature and target frames of this section.
if save_files:
    X.to_csv(path_or_buf='./data/3-SCENE_X.csv')
    y.to_csv(path_or_buf='./data/3-SCENE_y.csv')

## 4. Flags

In [25]:
from scipy.io import arff
import pandas as pd
import os

# Location of the raw Flags ARFF file; stop right away if it is missing.
flags_raw_file_name = './flags/flags.arff'
assert os.path.isfile(flags_raw_file_name)

# loadarff returns the records together with the ARFF metadata.
data, meta = arff.loadarff(flags_raw_file_name)

df = pd.DataFrame(data)

# Decode bytes values to str and leave every other value untouched.
# Series.map is applied column by column instead of DataFrame.applymap,
# which is deprecated since pandas 2.1; this form works on older and newer
# pandas alike.
df = df.apply(
    lambda column: column.map(
        lambda value: value.decode('utf-8') if isinstance(value, bytes) else value
    )
)

df.head(), df.columns

(  landmass zone    area  population language religion  bars  stripes  colours  \
 0        5    1   648.0        16.0       10        2   0.0      3.0      5.0   
 1        3    1    29.0         3.0        6        6   0.0      0.0      3.0   
 2        4    1  2388.0        20.0        8        2   2.0      0.0      3.0   
 3        6    3     0.0         0.0        1        1   0.0      0.0      5.0   
 4        3    1     0.0         0.0        6        0   3.0      0.0      3.0   
 
    circles  ...  icon  animate  text  red green blue yellow white black orange  
 0      0.0  ...     1        0     0    1     1    0      1     1     1      0  
 1      0.0  ...     0        1     0    1     0    0      1     0     1      0  
 2      0.0  ...     0        0     0    1     1    0      0     1     0      0  
 3      0.0  ...     1        1     0    1     0    1      1     1     0      1  
 4      0.0  ...     0        0     0    1     0    1      1     0     0      0  
 
 [5 rows x 2

In [26]:
# These seven colour columns are used as the targets; the rest are features.
columns = ['red', 'green', 'blue', 'yellow', 'white', 'black', 'orange']

X = df.drop(columns, axis="columns")
y = df.loc[:, columns]

X, y

(    landmass zone    area  population language religion  bars  stripes  \
 0          5    1   648.0        16.0       10        2   0.0      3.0   
 1          3    1    29.0         3.0        6        6   0.0      0.0   
 2          4    1  2388.0        20.0        8        2   2.0      0.0   
 3          6    3     0.0         0.0        1        1   0.0      0.0   
 4          3    1     0.0         0.0        6        0   3.0      0.0   
 ..       ...  ...     ...         ...      ...      ...   ...      ...   
 189        6    3     3.0         0.0        1        1   0.0      0.0   
 190        3    1   256.0        22.0        6        6   0.0      3.0   
 191        4    2   905.0        28.0       10        5   0.0      0.0   
 192        4    2   753.0         6.0       10        5   3.0      0.0   
 193        4    2   391.0         8.0       10        5   0.0      7.0   
 
      colours  circles  crosses  saltires  quarters  sunstars crescent  \
 0        5.0      0.0  

In [27]:
# Export the feature and target frames of this section.
if save_files:
    X.to_csv(path_or_buf='./data/4-FLAGS_X.csv')
    y.to_csv(path_or_buf='./data/4-FLAGS_y.csv')

## 5. Foodtruck: N = 407, L = 12, d = 21

In [28]:
from scipy.io import arff
import pandas as pd
import os

# NOTE(review): the variable is called "mediamill" but points at the
# foodtruck file; the name is kept unchanged in case other cells refer to it.
mediamill_raw_file_name = './foodtruck/foodtruck.arff'
assert os.path.isfile(mediamill_raw_file_name)

# loadarff returns the records together with the ARFF metadata.
data, meta = arff.loadarff(mediamill_raw_file_name)

df = pd.DataFrame(data)

# Decode bytes values to str and leave every other value untouched.
# Series.map is applied column by column instead of DataFrame.applymap,
# which is deprecated since pandas 2.1; this form works on older and newer
# pandas alike.
df = df.apply(
    lambda column: column.map(
        lambda value: value.decode('utf-8') if isinstance(value, bytes) else value
    )
)

df.head(), df.columns

(   frequency       time  expenses      motivation  taste  hygiene  menu  \
 0        2.0     dinner      30.0          friend    5.0      4.0   4.0   
 1        0.0     dinner      20.0       by_chance    5.0      2.0   4.0   
 2        1.0  afternoon      15.0       by_chance    5.0      2.0   2.0   
 3        0.0      lunch      40.0          friend    5.0      5.0   4.0   
 4        0.0     dinner      15.0  social_network    5.0      4.0   2.0   
 
    presentation  attendance  ingredients  ...  italian_food  brazilian_food  \
 0           3.0         4.0          4.0  ...             0               0   
 1           4.0         4.0          4.0  ...             1               0   
 2           5.0         3.0          5.0  ...             0               1   
 3           3.0         4.0          4.0  ...             0               0   
 4           3.0         4.0          4.0  ...             0               0   
 
    mexican_food  chinese_food  japanese_food arabic_food  s

In [29]:
# The twelve columns listed here are used as the targets; the rest are features.
columns = [
    'gourmet', 'snacks', 'street_food',
    'italian_food', 'brazilian_food', 'mexican_food',
    'chinese_food', 'japanese_food', 'arabic_food',
    'healthy_food', 'fitness_food', 'sweets_desserts',
]

X = df.drop(columns, axis="columns")
y = df.loc[:, columns]

X, y

(     frequency       time  expenses      motivation  taste  hygiene  menu  \
 0          2.0     dinner      30.0          friend    5.0      4.0   4.0   
 1          0.0     dinner      20.0       by_chance    5.0      2.0   4.0   
 2          1.0  afternoon      15.0       by_chance    5.0      2.0   2.0   
 3          0.0      lunch      40.0          friend    5.0      5.0   4.0   
 4          0.0     dinner      15.0  social_network    5.0      4.0   2.0   
 ..         ...        ...       ...             ...    ...      ...   ...   
 402        0.0     dinner      30.0          friend    5.0      5.0   5.0   
 403        0.0     dinner      30.0  social_network    5.0      5.0   4.0   
 404        0.0     dinner      30.0  social_network    5.0      5.0   4.0   
 405        1.0     dinner      30.0  social_network    5.0      4.0   4.0   
 406        0.0     dinner      15.0          friend    4.0      4.0   5.0   
 
      presentation  attendance  ingredients  ...  takeaway  va

In [30]:
# Export the feature and target frames of this section.
if save_files:
    X.to_csv(path_or_buf='./data/5-FOODTRUCK_X.csv')
    y.to_csv(path_or_buf='./data/5-FOODTRUCK_y.csv')

## 6. Yeast

In [31]:
from scipy.io import arff
import pandas as pd
import os

# Location of the raw Yeast ARFF file; stop right away if it is missing.
yeast_raw_file_name = './yeast/yeast.arff'
assert os.path.isfile(yeast_raw_file_name)

# loadarff returns the records together with the ARFF metadata.
data, meta = arff.loadarff(yeast_raw_file_name)

df = pd.DataFrame(data)

# Decode bytes values to str and leave every other value untouched.
# Series.map is applied column by column instead of DataFrame.applymap,
# which is deprecated since pandas 2.1; this form works on older and newer
# pandas alike.
df = df.apply(
    lambda column: column.map(
        lambda value: value.decode('utf-8') if isinstance(value, bytes) else value
    )
)

df

Unnamed: 0,Att1,Att2,Att3,Att4,Att5,Att6,Att7,Att8,Att9,Att10,...,Class5,Class6,Class7,Class8,Class9,Class10,Class11,Class12,Class13,Class14
0,0.004168,-0.170975,-0.156748,-0.142151,0.058781,0.026851,0.197719,0.041850,0.066938,-0.056617,...,0,0,1,1,0,0,0,1,1,0
1,-0.103956,0.011879,-0.098986,-0.054501,-0.007970,0.049113,-0.030580,-0.077933,-0.080529,-0.016267,...,0,0,0,0,0,0,0,0,0,0
2,0.509949,0.401709,0.293799,0.087714,0.011686,-0.006411,-0.006255,0.013646,-0.040666,-0.024447,...,0,0,0,0,0,0,0,1,1,0
3,0.119092,0.004412,-0.002262,0.072254,0.044512,-0.051467,0.074686,-0.007670,0.079438,0.062184,...,0,0,0,0,0,0,0,0,0,0
4,0.042037,0.007054,-0.069483,0.081015,-0.048207,0.089446,-0.004947,0.064456,-0.133387,0.068878,...,1,1,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2412,-0.119784,0.001259,-0.123645,-0.015513,-0.059683,0.091032,-0.043302,0.229219,-0.071498,0.182709,...,0,0,0,0,0,0,0,0,0,0
2413,0.085327,0.058590,0.085268,-0.020897,0.068972,0.030125,0.078056,0.011346,0.052618,0.066093,...,0,0,0,0,0,0,0,1,1,0
2414,0.082526,-0.095571,-0.022019,-0.046793,-0.038360,0.041084,0.056509,0.011749,-0.029657,-0.012198,...,0,1,1,1,0,0,0,1,1,0
2415,-0.130830,0.008868,-0.009457,-0.058930,-0.041224,0.042269,0.117717,0.037388,-0.085563,0.136649,...,0,0,0,0,0,0,0,1,1,0


In [32]:
# Columns named Att* are the features; all remaining columns form y.
columns = df.columns[df.columns.str.startswith('Att')]

X = df.loc[:, columns]
y = df.drop(columns, axis="columns")

X, y

(          Att1      Att2      Att3      Att4      Att5      Att6      Att7  \
 0     0.004168 -0.170975 -0.156748 -0.142151  0.058781  0.026851  0.197719   
 1    -0.103956  0.011879 -0.098986 -0.054501 -0.007970  0.049113 -0.030580   
 2     0.509949  0.401709  0.293799  0.087714  0.011686 -0.006411 -0.006255   
 3     0.119092  0.004412 -0.002262  0.072254  0.044512 -0.051467  0.074686   
 4     0.042037  0.007054 -0.069483  0.081015 -0.048207  0.089446 -0.004947   
 ...        ...       ...       ...       ...       ...       ...       ...   
 2412 -0.119784  0.001259 -0.123645 -0.015513 -0.059683  0.091032 -0.043302   
 2413  0.085327  0.058590  0.085268 -0.020897  0.068972  0.030125  0.078056   
 2414  0.082526 -0.095571 -0.022019 -0.046793 -0.038360  0.041084  0.056509   
 2415 -0.130830  0.008868 -0.009457 -0.058930 -0.041224  0.042269  0.117717   
 2416 -0.171578 -0.066536  0.168206  0.246831  0.079555  0.016528 -0.088908   
 
           Att8      Att9     Att10  ...     Att94

In [33]:
# Export the feature and target frames of this section.
if save_files:
    X.to_csv(path_or_buf='./data/6-YEAST_X.csv')
    y.to_csv(path_or_buf='./data/6-YEAST_y.csv')

## 7. Birds

In [34]:
import pandas as pd

# Read ./datasets/7-BIRDS.csv into a frame.
df = pd.read_csv(filepath_or_buffer='./datasets/7-BIRDS.csv')

df.columns

Index(['L1', 'L2', 'L3', 'L4', 'L5', 'L6', 'L7', 'L8', 'L9', 'L10',
       ...
       'A251', 'A252', 'A253', 'A254', 'A255', 'A256', 'A257', 'A258', 'A259',
       'A260'],
      dtype='object', length=279)

In [35]:
# Columns whose names start with 'A' become X, those starting with 'L' become y.
X = df.loc[:, df.columns.str.startswith('A')]
y = df.loc[:, df.columns.str.startswith('L')]

X, y

(           A1        A2        A3        A4        A5        A6        A7  \
 0    0.016521  0.039926  0.089632  0.134119  0.170470  0.176872  0.171546   
 1    0.006600  0.035984  0.089956  0.123214  0.172273  0.177068  0.165507   
 2    0.006894  0.017722  0.048062  0.065802  0.103443  0.091397  0.084931   
 3    0.031046  0.127675  0.221428  0.272707  0.358743  0.349389  0.316029   
 4    0.064721  0.226644  0.304482  0.274662  0.346980  0.334063  0.307223   
 ..        ...       ...       ...       ...       ...       ...       ...   
 640  0.065968  0.005699  0.009809  0.014150  0.027981  0.027554  0.028538   
 641  0.037432  0.010440  0.021009  0.025018  0.089126  0.037404  0.037024   
 642  0.200058  0.054787  0.137048  0.162441  0.192939  0.177832  0.178606   
 643  0.064331  0.012261  0.022449  0.026526  0.044141  0.040997  0.039509   
 644  0.008697  0.012031  0.021212  0.028663  0.044081  0.041791  0.044002   
 
            A8        A9       A10  ...  A251  A252       A253

In [36]:
# Export the feature and target frames of this section.
if save_files:
    X.to_csv(path_or_buf='./data/7-BIRDS_X.csv')
    y.to_csv(path_or_buf='./data/7-BIRDS_y.csv')

## 8. Genbase

In [37]:
from scipy.io import arff
import pandas as pd
import os

# Location of the raw Genbase ARFF file; stop right away if it is missing.
genbase_raw_file_name = './genbase/genbase.arff'
assert os.path.isfile(genbase_raw_file_name)

# loadarff returns the records together with the ARFF metadata.
data, meta = arff.loadarff(genbase_raw_file_name)

df = pd.DataFrame(data)

# Decode bytes values to str and leave every other value untouched.
# Series.map is applied column by column instead of DataFrame.applymap,
# which is deprecated since pandas 2.1; this form works on older and newer
# pandas alike.
df = df.apply(
    lambda column: column.map(
        lambda value: value.decode('utf-8') if isinstance(value, bytes) else value
    )
)

df.head(), df.columns

(  protein PS00010 PS00011 PS00012 PS00014 PS00017 PS00018 PS00019 PS00020  \
 0  O00060      NO      NO      NO      NO      NO      NO      NO      NO   
 1  O00139      NO      NO      NO      NO     YES      NO      NO      NO   
 2  O02741      NO      NO      NO      NO      NO      NO      NO      NO   
 3  O08424      NO      NO      NO      NO      NO      NO      NO      NO   
 4  O12984      NO      NO      NO      NO      NO      NO      NO      NO   
 
   PS00021  ... PDOC00662 PDOC00018 PDOC50001 PDOC00014 PDOC00750 PDOC50196  \
 0      NO  ...         0         0         0         0         0         0   
 1      NO  ...         0         0         0         0         0         0   
 2      NO  ...         0         0         0         0         0         0   
 3      NO  ...         0         0         0         0         0         0   
 4      NO  ...         0         0         0         0         0         0   
 
   PDOC50199 PDOC00660 PDOC00653 PDOC00030  
 0       

In [38]:
from sklearn.preprocessing import LabelEncoder

# Create the encoder in this cell instead of relying on an instance left
# behind by an earlier, unrelated section of the notebook.
label_encoder = LabelEncoder()

# Columns named PDOC* are the targets; every other column is a feature.
columns = df.columns[df.columns.str.startswith('PDOC')]

X = df.drop(columns=columns)
y = df[columns]

# Replace object-dtype feature columns by integer codes (the encoder is
# refitted for each column).
for x in X.columns:
    if X[x].dtype == 'object':
        X[x] = label_encoder.fit_transform(X[x])


X, y

(     protein  PS00010  PS00011  PS00012  PS00014  PS00017  PS00018  PS00019  \
 0          0        0        0        0        0        0        0        0   
 1          1        0        0        0        0        1        0        0   
 2          2        0        0        0        0        0        0        0   
 3          3        0        0        0        0        0        0        0   
 4          4        0        0        0        0        0        0        0   
 ..       ...      ...      ...      ...      ...      ...      ...      ...   
 657      657        0        0        0        0        0        0        0   
 658      658        0        0        0        0        1        0        0   
 659      659        0        0        0        0        0        0        0   
 660      660        0        0        0        0        0        0        0   
 661      661        0        0        0        0        1        0        0   
 
      PS00020  PS00021  ...  PS50821  

In [39]:
# Export the feature and target frames of this section.
if save_files:
    X.to_csv(path_or_buf='./data/8-GENBASE_X.csv')
    y.to_csv(path_or_buf='./data/8-GENBASE_y.csv')

## 9. Medical

In [40]:
import pandas as pd

# Read ./datasets/9-MEDC.csv into a frame.
df = pd.read_csv(filepath_or_buffer='./datasets/9-MEDC.csv')

df.head(), df.columns

(   Class-0-593_70  Class-1-079_99  Class-2-786_09  Class-3-759_89  \
 0               0               0               0               0   
 1               0               0               0               0   
 2               0               0               0               0   
 3               0               0               0               0   
 4               1               0               0               0   
 
    Class-4-753_0  Class-5-786_2  Class-6-V72_5  Class-7-511_9  Class-8-596_8  \
 0              1              0              0              0              0   
 1              1              0              0              0              0   
 2              0              0              0              0              0   
 3              1              0              0              0              0   
 4              0              0              0              0              0   
 
    Class-9-599_0  ...  x2  x5  xray  year  year-old  yearly  years  yesterday  \
 0      

In [41]:
# Columns whose names start with 'Class' become y; all other columns become X.
X = df.loc[:, ~df.columns.str.startswith('Class')]
y = df.loc[:, df.columns.str.startswith('Class')]

X, y

(     -  /  0  00  04  0;  0cm  1  1-1/2  1-1/2-year  ...  x2  x5  xray  year  \
 0    0  0  0   0   0   0    0  0      0           0  ...   0   0     0     0   
 1    1  0  0   0   0   0    0  1      0           0  ...   0   0     0     0   
 2    1  0  0   0   0   0    0  1      0           0  ...   0   0     0     0   
 3    1  0  0   0   0   0    0  0      0           0  ...   0   0     0     0   
 4    0  0  0   0   0   0    0  1      0           0  ...   0   0     0     0   
 ..  .. .. ..  ..  ..  ..  ... ..    ...         ...  ...  ..  ..   ...   ...   
 973  0  0  0   0   0   0    0  0      0           0  ...   0   0     0     0   
 974  1  0  0   0   0   0    0  0      0           0  ...   0   0     0     0   
 975  1  0  0   0   0   0    0  0      0           0  ...   0   0     0     0   
 976  0  0  0   0   0   0    0  0      0           0  ...   0   0     0     0   
 977  1  0  0   0   0   0    0  0      0           0  ...   0   0     0     0   
 
      year-old  yearly  ye

In [42]:
# Export the feature and target frames of this section.
if save_files:
    X.to_csv(path_or_buf='./data/9-MEDC_X.csv')
    y.to_csv(path_or_buf='./data/9-MEDC_y.csv')

## 10. Enron

In [43]:
import pandas as pd

# Read the raw Enron table; feature and label columns are still in one frame here.
df = pd.read_csv(filepath_or_buffer="./datasets/10-ENRON.csv")

# Preview the first rows together with the column index.
df.head(), df.columns

(   A.A8  C.C9  B.B12  C.C11  C.C5  C.C7  B.B2  B.B3  D.D16  A.A7  ...  \
 0     0     0      0      0     0     0     0     0      0     0  ...   
 1     0     0      0      0     0     0     0     0      0     0  ...   
 2     0     0      0      0     0     0     0     0      0     0  ...   
 3     0     0      0      0     0     0     0     0      0     0  ...   
 4     0     0      0      0     0     0     0     0      0     0  ...   
 
    workers  working  world  writer  writers  www  year  years  yesterday  york  
 0        0        0      0       0        0    0     0      0          0     0  
 1        0        0      0       0        0    0     0      0          0     0  
 2        0        0      0       0        0    0     0      0          0     0  
 3        0        0      0       0        0    0     0      0          0     0  
 4        0        0      0       0        0    0     1      0          0     0  
 
 [5 rows x 1054 columns],
 Index(['A.A8', 'C.C9', 'B.B12', '

In [44]:
# Columns whose name begins with an uppercase character (e.g. 'A.A8', 'C.C9') are
# treated as labels; the remaining columns (lowercase words, digits) are features.
filtered_columns = list(filter(lambda name: name[0].isupper(), df.columns))

# Split the frame: features are everything except the label columns.
y = df[filtered_columns]
X = df.drop(columns=filtered_columns)

X, y

(      0  00  000  01  02  03  04  05  06  07  ...  workers  working  world  \
 0     0   0    0   0   0   0   0   0   0   0  ...        0        0      0   
 1     0   0    0   1   0   0   0   0   0   0  ...        0        0      0   
 2     0   0    0   0   0   0   0   0   0   0  ...        0        0      0   
 3     0   0    0   0   0   0   0   0   0   0  ...        0        0      0   
 4     0   0    0   0   0   0   0   0   0   0  ...        0        0      0   
 ...  ..  ..  ...  ..  ..  ..  ..  ..  ..  ..  ...      ...      ...    ...   
 1697  0   0    0   0   0   0   0   0   0   0  ...        0        0      0   
 1698  0   0    0   0   1   0   1   0   0   0  ...        0        0      0   
 1699  0   0    0   0   0   0   0   0   0   0  ...        0        0      0   
 1700  0   0    0   0   1   0   1   0   0   1  ...        0        0      0   
 1701  0   0    0   0   1   0   1   0   1   1  ...        0        0      0   
 
       writer  writers  www  year  years  yesterda

In [45]:
# Write the Enron feature/label split to ./data/ only when saving is enabled.
if save_files:
    for _frame, _csv_path in ((X, "./data/10-ENRON_X.csv"), (y, "./data/10-ENRON_y.csv")):
        _frame.to_csv(_csv_path)

## 11. MediaMill

In [46]:
from scipy.io import arff
import pandas as pd
import os

mediamill_raw_file_name = './mediamill/mediamill.arff'
# Fail early if the raw ARFF file is not present at the expected location.
assert(os.path.isfile(mediamill_raw_file_name))

# Load ARFF file: `data` holds the records, `meta` the attribute metadata.
data, meta = arff.loadarff(mediamill_raw_file_name)

df = pd.DataFrame(data)

# Decode byte strings to regular strings where present.
# Only object-dtype columns can contain bytes, so numeric columns are skipped;
# this also avoids DataFrame.applymap, which is deprecated since pandas 2.1.
for column in df.select_dtypes(include=['object']).columns:
    df[column] = df[column].map(lambda x: x.decode('utf-8') if isinstance(x, bytes) else x)

df.head(), df.columns

(       Att1      Att2      Att3      Att4      Att5      Att6      Att7  \
 0  0.380877  0.494079  0.540009  0.422926  0.158318  0.326975  0.390861   
 1  0.508613  0.505837  0.437155  0.490723  0.262201  0.459610  0.393838   
 2  0.449571  0.460490  0.453469  0.410779  0.231759  0.402147  0.349590   
 3  0.416800  0.548996  0.520850  0.465410  0.181603  0.357255  0.389347   
 4  0.501986  0.480820  0.435543  0.432002  0.250599  0.408353  0.357816   
 
        Att8      Att9     Att10  ...  Class92  Class93  Class94  Class95  \
 0  0.527121  0.254052  0.223731  ...        0        0        0        0   
 1  0.524006  0.370391  0.329424  ...        0        0        0        0   
 2  0.536456  0.318117  0.301615  ...        0        0        0        0   
 3  0.530189  0.290942  0.244521  ...        0        0        0        0   
 4  0.499186  0.353172  0.327622  ...        0        0        0        0   
 
    Class96  Class97  Class98  Class99  Class100  Class101  
 0        0      

In [47]:
# Feature columns are named 'Att*', label columns are named 'Class*'.
feature_columns = df.columns[df.columns.str.startswith('Att')]
label_columns = df.columns[df.columns.str.startswith('Class')]

X = df[feature_columns]  # features: columns starting with 'Att'
y = df[label_columns]    # labels: columns starting with 'Class'

X, y

(           Att1      Att2      Att3      Att4      Att5      Att6      Att7  \
 0      0.380877  0.494079  0.540009  0.422926  0.158318  0.326975  0.390861   
 1      0.508613  0.505837  0.437155  0.490723  0.262201  0.459610  0.393838   
 2      0.449571  0.460490  0.453469  0.410779  0.231759  0.402147  0.349590   
 3      0.416800  0.548996  0.520850  0.465410  0.181603  0.357255  0.389347   
 4      0.501986  0.480820  0.435543  0.432002  0.250599  0.408353  0.357816   
 ...         ...       ...       ...       ...       ...       ...       ...   
 43902  0.426864  0.528629  0.532957  0.401551  0.187178  0.328457  0.363949   
 43903  0.344094  0.537319  0.516101  0.430633  0.147998  0.310974  0.373041   
 43904  0.329709  0.453422  0.523771  0.314048  0.133553  0.249196  0.296952   
 43905  0.470031  0.489588  0.488227  0.356816  0.228900  0.370288  0.327771   
 43906  0.350638  0.548779  0.554297  0.404750  0.134141  0.284098  0.361868   
 
            Att8      Att9     Att10  

In [48]:
# Write the MediaMill feature/label split to ./data/ only when saving is enabled.
if save_files:
    for _frame, _csv_path in ((X, "./data/11-MEDIAMILL_X.csv"), (y, "./data/11-MEDIAMILL_y.csv")):
        _frame.to_csv(_csv_path)