# Cleaning up datasets for analysis

In [1]:
import os

# Master switch for the whole notebook: when False, no CSV file is written
# and the output directory is not created.
save_files = True

if save_files:
    # exist_ok=True makes directory creation idempotent, replacing the manual
    # "check, then create" sequence.
    os.makedirs("./data/datasets", exist_ok=True)

    # Fail early if the output location is still not a usable directory.
    assert os.path.isdir("./data/datasets")

## 1. Solar Flare: N = 1389, L = 3, d = 10, K=5: cannot be tested

In [2]:
from ucimlrepo import fetch_ucirepo

# Download the "Solar Flare" dataset from the UCI repository by name.
solar_flare = fetch_ucirepo(name="Solar Flare")

# Keep the feature and target tables (pandas DataFrames) for the cells below.
X = solar_flare.data.features
y = solar_flare.data.targets

# Show the `version2` entry of the data bundle, then the dataset metadata
# and the per-variable description.
print(solar_flare.data.version2)
print(solar_flare.metadata)
print(solar_flare.variables)

None
{'uci_id': 89, 'name': 'Solar Flare', 'repository_url': 'https://archive.ics.uci.edu/dataset/89/solar+flare', 'data_url': 'https://archive.ics.uci.edu/static/public/89/data.csv', 'abstract': 'Each class attribute counts the number of solar flares of a certain class that occur in a 24 hour period', 'area': 'Physics and Chemistry', 'tasks': ['Regression'], 'characteristics': ['Multivariate'], 'num_instances': 1389, 'num_features': 10, 'feature_types': ['Categorical'], 'demographics': [], 'target_col': ['common flares', 'moderate flares', 'severe flares'], 'index_col': None, 'has_missing_values': 'no', 'missing_values_symbol': None, 'year_of_dataset_creation': 1989, 'last_updated': 'Wed Feb 14 2024', 'dataset_doi': '10.24432/C5530G', 'creators': [], 'intro_paper': None, 'additional_info': {'summary': 'Notes:\r\n\r\n   -- The database contains 3 potential classes, one for the number of times a certain type of solar flare occured in a 24 hour period.\r\n   -- Each instance represents c

In [3]:
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()

# Encode every object-typed feature column with the label encoder, which is
# re-fitted for each column. A copy is edited so the fetched frame is untouched.
encoded_X = X.copy()
object_columns = [name for name in X.columns if X[name].dtype == 'object']
for name in object_columns:
    encoded_X[name] = label_encoder.fit_transform(X[name])

# Rebind X to the encoded data and drop the temporary name.
X = encoded_X.copy()
del encoded_X

X, y

(      modified Zurich class  largest spot size  spot distribution  activity  \
 0                         1                  4                  2         1   
 1                         2                  4                  2         1   
 2                         1                  4                  2         1   
 3                         2                  4                  2         1   
 4                         2                  0                  2         1   
 ...                     ...                ...                ...       ...   
 1384                      5                  4                  3         1   
 1385                      5                  4                  3         2   
 1386                      1                  4                  2         1   
 1387                      5                  3                  3         1   
 1388                      0                  5                  2         1   
 
       evolution  previous 24 hour fla

In [4]:
# Replace the original column labels with positional names:
# feature columns become Att1, Att2, ... and target columns Class1, Class2, ...
new_X_cols = {col: f"Att{pos}" for pos, col in enumerate(X.columns, start=1)}
new_y_cols = {col: f"Class{pos}" for pos, col in enumerate(y.columns, start=1)}

X = X.rename(columns=new_X_cols)
y = y.rename(columns=new_y_cols)

X, y

(      Att1  Att2  Att3  Att4  Att5  Att6  Att7  Att8  Att9  Att10
 0        1     4     2     1     2     1     1     2     1      2
 1        2     4     2     1     3     1     1     2     1      2
 2        1     4     2     1     3     1     1     2     1      1
 3        2     4     2     1     3     1     1     2     1      2
 4        2     0     2     1     3     1     1     2     1      2
 ...    ...   ...   ...   ...   ...   ...   ...   ...   ...    ...
 1384     5     4     3     1     2     1     1     1     1      1
 1385     5     4     3     2     2     1     1     2     1      1
 1386     1     4     2     1     2     1     2     2     1      1
 1387     5     3     3     1     2     1     1     2     1      1
 1388     0     5     2     1     1     1     1     2     1      1
 
 [1389 rows x 10 columns],
       Class1  Class2  Class3
 0          0       0       0
 1          0       0       0
 2          0       0       0
 3          0       0       0
 4          0    

In [5]:
# The trailing `and False` keeps this export switched off regardless of
# `save_files`. The logical `and` (not bitwise `&`) is used so the guard reads
# the same as the other disabled export cells in this notebook.
if save_files and False:
    X.to_csv("./data/datasets/1-FLARE_X.csv", index=False)
    y.to_csv("./data/datasets/1-FLARE_y.csv", index=False)

## 0.2. Bridges, N = 105, L = 6 , d = 7, not used

In [6]:
from scipy.io import arff
import pandas as pd
import os

# Build the path with os.path.join instead of concatenating string fragments.
bridges_raw_file_name = os.path.join(os.getcwd(), "data", "raw_datasets", "bridges.arff")

# Report which file is missing instead of raising a bare AssertionError.
assert os.path.isfile(bridges_raw_file_name), (
    f"Raw dataset not found: {bridges_raw_file_name}"
)

# Load ARFF file
data, meta = arff.loadarff(bridges_raw_file_name)

AssertionError: 

In [None]:
# Build the frame and, in the same chain, turn every bytes cell into a
# UTF-8 string; cells of any other type pass through unchanged.
df = pd.DataFrame(data).map(
    lambda cell: cell.decode('utf-8') if isinstance(cell, bytes) else cell
)

df.head(), df.columns

In [None]:
from sklearn.preprocessing import LabelEncoder

# Every column except TYPE is a feature.
X = df.drop(columns=["TYPE"])

# Encode object-typed feature columns; the encoder is re-fitted per column.
label_encoder = LabelEncoder()

for x in X.columns:
    if X[x].dtype == 'object':
        X[x] = label_encoder.fit_transform(X[x])

# One indicator column per TYPE value. The intermediate single-column frame
# that used to be built (twice) before this call was redundant and is gone.
y = pd.get_dummies(df["TYPE"], prefix="TYPE")

X, y

In [None]:
# Positional renaming: features -> Att1..AttN, targets -> Class1..ClassM.
new_X_cols = {col: f"Att{pos}" for pos, col in enumerate(X.columns, start=1)}
new_y_cols = {col: f"Class{pos}" for pos, col in enumerate(y.columns, start=1)}

X = X.rename(columns=new_X_cols)
y = y.rename(columns=new_y_cols)

X, y

In [None]:
if save_files and False:  # and False since we are not using bridges
    # index=False matches the other exports in this notebook, so the row index
    # is not written out as an extra unnamed column.
    X.to_csv("./data/datasets/2-BRIDGES_X.csv", index=False)
    y.to_csv("./data/datasets/2-BRIDGES_y.csv", index=False)

## 0.3. Parkinson's: L = 1, Not usable

In [None]:
from ucimlrepo import fetch_ucirepo 
  
# Download UCI dataset number 174.
parkinsons = fetch_ucirepo(id=174)

# Keep the feature and target tables (pandas DataFrames) for the cells below.
X = parkinsons.data.features
y = parkinsons.data.targets

# Show the dataset metadata followed by the per-variable description.
print(parkinsons.metadata)
print(parkinsons.variables)

In [None]:
from sklearn.preprocessing import LabelEncoder

# One encoder instance is reused; fit_transform re-fits it on each column it is applied to.
label_encoder = LabelEncoder()

# Drop the two columns below, then re-add each from the first entry of
# `X[...].values.T`.
# NOTE(review): the `.values.T[0]` indexing presumably means each of these labels
# selects more than one column in X (so X[...] is 2-D) — TODO confirm against the data.
clean_X = X.drop(columns=["MDVP:Jitter", "MDVP:Shimmer"]).copy()
clean_X["MDVP:Jitter"] = X["MDVP:Jitter"].values.T[0]
clean_X["MDVP:Shimmer"] = X["MDVP:Shimmer"].values.T[0]


# Encode every object-typed feature column.
for x in clean_X.columns:
    if clean_X[x].dtype == 'object':
        clean_X[x] = label_encoder.fit_transform(clean_X[x])

# Same treatment for object-typed target columns, applied to a copy of y.
clean_y = y.copy()
for yc in clean_y.columns:
    if clean_y[yc].dtype == 'object':
        clean_y[yc] = label_encoder.fit_transform(clean_y[yc])
        
clean_X, clean_y

In [None]:
# Rename the cleaned frames, because `clean_X` / `clean_y` are what the save
# cell writes out. Renaming the raw `X` / `y` (as before) never reached the
# exported data.
new_X_cols = {col: f"Att{pos}" for pos, col in enumerate(clean_X.columns, start=1)}
new_y_cols = {col: f"Class{pos}" for pos, col in enumerate(clean_y.columns, start=1)}

clean_X = clean_X.rename(columns=new_X_cols)
clean_y = clean_y.rename(columns=new_y_cols)

clean_X, clean_y

In [None]:
if save_files and False:  # and False since we are not using parkinsons
    # Write into ./data/datasets/ without the row index, like the other
    # exports in this notebook.
    clean_X.to_csv("./data/datasets/3-PARKINS_X.csv", index=False)
    clean_y.to_csv("./data/datasets/3-PARKINS_y.csv", index=False)

## 0.4. Thyroid: Missing labels

In [None]:
import os

import pandas as pd

# Path of the raw thyroid CSV, relative to the current working directory.
thyroid_csv_file = os.path.join(os.getcwd(), "data", "raw_datasets", "hypothyroid.csv")

# Check the file this cell actually reads. The previous assertion tested the
# bridges ARFF path, left over from an earlier section.
assert os.path.isfile(thyroid_csv_file), f"Raw dataset not found: {thyroid_csv_file}"

df = pd.read_csv(thyroid_csv_file)

df.head(), df.columns

In [None]:
# Split off the single label column as a one-column frame;
# everything else in df is a feature.
y = pd.DataFrame(df["binaryClass"])
X = df.drop(columns=["binaryClass"])

X, y

In [None]:
# Distinct values of the two TBG columns, shown to decide whether they can be dropped.
tuple(set(X[name]) for name in ("TBG measured", "TBG"))

In [None]:
# Remove both TBG columns from the feature frame.
X = X.drop(["TBG measured", "TBG"], axis="columns")

X, y

In [None]:
# Positional renaming: features -> Att1..AttN, targets -> Class1..ClassM.
new_X_cols = {col: f"Att{pos}" for pos, col in enumerate(X.columns, start=1)}
new_y_cols = {col: f"Class{pos}" for pos, col in enumerate(y.columns, start=1)}

X = X.rename(columns=new_X_cols)
y = y.rename(columns=new_y_cols)

X, y

In [None]:
# Placeholder export cell: the guard is always false and the body is empty,
# so nothing is written for this dataset.
if save_files and False:
    pass

## 2. Music: N = 593, L = 6, d = 72

In [None]:
import pandas as pd
import os

# Path of the raw CSV, built with os.path.join rather than string concatenation.
emot_csv_file = os.path.join(os.getcwd(), "data", "raw_datasets", "2-EMOT.csv")

# Same existence check the ARFF-based sections perform before loading.
assert os.path.isfile(emot_csv_file), f"Raw dataset not found: {emot_csv_file}"

df = pd.read_csv(emot_csv_file)

df.head(), df.columns

In [None]:
# The leading columns of the frame are taken as the targets.
# NOTE(review): 6 presumably matches the label count in the section heading — confirm.
n_targets = 6
targets = df.columns[:n_targets]
targets

In [None]:
# Target columns go to y; the remaining columns form the feature frame X.
y = df[targets]
X = df.drop(columns=targets)

X, y

In [None]:
# Positional renaming: features -> Att1..AttN, targets -> Class1..ClassM.
new_X_cols = {col: f"Att{pos}" for pos, col in enumerate(X.columns, start=1)}
new_y_cols = {col: f"Class{pos}" for pos, col in enumerate(y.columns, start=1)}

X = X.rename(columns=new_X_cols)
y = y.rename(columns=new_y_cols)

X, y

In [None]:
# Write features and targets as CSV (without the row index) when saving is enabled.
if save_files:
    for frame, csv_path in (
        (X, "./data/datasets/2-EMOT_X.csv"),
        (y, "./data/datasets/2-EMOT_y.csv"),
    ):
        frame.to_csv(csv_path, index=False)

## 3. Scene

In [None]:
import pandas as pd
import os

# Path of the raw CSV, built with os.path.join rather than string concatenation.
scene_csv_file = os.path.join(os.getcwd(), "data", "raw_datasets", "3-SCENE.csv")

# Same existence check the ARFF-based sections perform before loading.
assert os.path.isfile(scene_csv_file), f"Raw dataset not found: {scene_csv_file}"

df = pd.read_csv(scene_csv_file)

df.head(), df.columns

In [None]:
# Columns whose name starts with 'Att' are features; all other columns are targets.
is_attribute = df.columns.str.startswith('Att')
columns = df.columns[is_attribute]

y = df.drop(columns=columns)
X = df[columns]

X, y

In [None]:
# Only the target columns are renamed here (to Class1..ClassM);
# the feature columns keep their existing names.
new_y_cols = {col: f"Class{pos}" for pos, col in enumerate(y.columns, start=1)}

y = y.rename(columns=new_y_cols)

X, y

In [None]:
# Write features and targets as CSV (without the row index) when saving is enabled.
if save_files:
    for frame, csv_path in (
        (X, "./data/datasets/3-SCENE_X.csv"),
        (y, "./data/datasets/3-SCENE_y.csv"),
    ):
        frame.to_csv(csv_path, index=False)

## 4. Flags

In [None]:
from scipy.io import arff
import pandas as pd
import os

# Locate the raw ARFF file and stop right away if it is missing.
flags_raw_file_name = os.getcwd() + "/data/" + "./raw_datasets/flags.arff"
assert os.path.isfile(flags_raw_file_name)

# Parse the ARFF file into its records and metadata.
data, meta = arff.loadarff(flags_raw_file_name)

# Build the frame and decode bytes cells to UTF-8 strings in one chain;
# cells of any other type are left as they are.
df = pd.DataFrame(data).map(
    lambda cell: cell.decode('utf-8') if isinstance(cell, bytes) else cell
)

df.head(), df.columns

In [None]:
# These seven columns are the targets; the rest of the frame is the feature set.
columns = [
    "red",
    "green",
    "blue",
    "yellow",
    "white",
    "black",
    "orange",
]

y = df[columns]
X = df.drop(columns=columns)

X, y

In [None]:
# Positional renaming: features -> Att1..AttN, targets -> Class1..ClassM.
new_X_cols = {col: f"Att{pos}" for pos, col in enumerate(X.columns, start=1)}
new_y_cols = {col: f"Class{pos}" for pos, col in enumerate(y.columns, start=1)}

X = X.rename(columns=new_X_cols)
y = y.rename(columns=new_y_cols)

X, y

In [None]:
# Write features and targets as CSV (without the row index) when saving is enabled.
if save_files:
    for frame, csv_path in (
        (X, "./data/datasets/4-FLAGS_X.csv"),
        (y, "./data/datasets/4-FLAGS_y.csv"),
    ):
        frame.to_csv(csv_path, index=False)

## 5. Foodtruck: N = 407, L = 12, d = 21

In [None]:
from scipy.io import arff
import pandas as pd
import os

# The path variable is named after the dataset loaded here. It was previously
# called `mediamill_raw_file_name`, a leftover from another section.
foodtruck_raw_file_name = os.path.join(os.getcwd(), "data", "raw_datasets", "foodtruck.arff")
assert os.path.isfile(foodtruck_raw_file_name), (
    f"Raw dataset not found: {foodtruck_raw_file_name}"
)

# Load ARFF file
data, meta = arff.loadarff(foodtruck_raw_file_name)

df = pd.DataFrame(data)

# Decode bytes cells to regular strings; other cells are left unchanged.
df = df.map(lambda x: x.decode('utf-8') if isinstance(x, bytes) else x)

df.head(), df.columns

In [None]:
# The twelve columns listed here are the targets; every other column is a feature.
columns = [
    'gourmet',
    'snacks',
    'street_food',
    'italian_food',
    'brazilian_food',
    'mexican_food',
    'chinese_food',
    'japanese_food',
    'arabic_food',
    'healthy_food',
    'fitness_food',
    'sweets_desserts',
]

y = df[columns]
X = df.drop(columns=columns)

X, y

In [None]:
from sklearn.preprocessing import LabelEncoder

# Create the encoder in this cell so it no longer depends on a
# `label_encoder` left behind by an earlier, unrelated section.
label_encoder = LabelEncoder()

# Encode every object-typed feature column on a copy of X.
clean_X = X.copy()
for x in clean_X.columns:
    if clean_X[x].dtype == 'object':
        clean_X[x] = label_encoder.fit_transform(clean_X[x])

X = clean_X.copy()
del clean_X
X, y

In [None]:
# Positional renaming: features -> Att1..AttN, targets -> Class1..ClassM.
new_X_cols = {col: f"Att{pos}" for pos, col in enumerate(X.columns, start=1)}
new_y_cols = {col: f"Class{pos}" for pos, col in enumerate(y.columns, start=1)}

X = X.rename(columns=new_X_cols)
y = y.rename(columns=new_y_cols)

X, y

In [None]:
if save_files:
    # Both files go to ./data/datasets/. The y file was previously written to
    # ./data/, unlike every other active export in this notebook.
    X.to_csv("./data/datasets/5-FOODTRUCK_X.csv", index=False)
    y.to_csv("./data/datasets/5-FOODTRUCK_y.csv", index=False)

## 6. Yeast

In [None]:
from scipy.io import arff
import pandas as pd
import os

# Locate the raw ARFF file and stop right away if it is missing.
yeast_raw_file_name = os.getcwd() + "/data/" + "./raw_datasets/yeast.arff"
assert os.path.isfile(yeast_raw_file_name)

# Parse the ARFF file into its records and metadata.
data, meta = arff.loadarff(yeast_raw_file_name)

# Build the frame and decode bytes cells to UTF-8 strings in one chain;
# cells of any other type are left as they are.
df = pd.DataFrame(data).map(
    lambda cell: cell.decode('utf-8') if isinstance(cell, bytes) else cell
)

df

In [None]:
# Columns whose name starts with 'Att' are features; all other columns are targets.
is_attribute = df.columns.str.startswith('Att')
columns = df.columns[is_attribute]

y = df.drop(columns=columns)
X = df[columns]

X, y

In [None]:
# Write features and targets as CSV (without the row index) when saving is enabled.
if save_files:
    for frame, csv_path in (
        (X, "./data/datasets/6-YEAST_X.csv"),
        (y, "./data/datasets/6-YEAST_y.csv"),
    ):
        frame.to_csv(csv_path, index=False)

## 7. Birds

In [None]:
import os

import pandas as pd

# `os` is imported here so the cell does not rely on an earlier cell's import.
birds_csv_file = os.path.join(os.getcwd(), "data", "raw_datasets", "7-BIRDS.csv")

# Same existence check the ARFF-based sections perform before loading.
assert os.path.isfile(birds_csv_file), f"Raw dataset not found: {birds_csv_file}"

df = pd.read_csv(birds_csv_file)

df.columns

In [None]:
# Feature columns are those whose name starts with 'A';
# target columns are those whose name starts with 'L'.
attribute_columns = df.columns[df.columns.str.startswith('A')]
label_columns = df.columns[df.columns.str.startswith('L')]

X = df[attribute_columns]
y = df[label_columns]

X, y

In [None]:
# Positional renaming: features -> Att1..AttN, targets -> Class1..ClassM.
new_X_cols = {col: f"Att{pos}" for pos, col in enumerate(X.columns, start=1)}
new_y_cols = {col: f"Class{pos}" for pos, col in enumerate(y.columns, start=1)}

X = X.rename(columns=new_X_cols)
y = y.rename(columns=new_y_cols)

X, y

In [None]:
# Write features and targets as CSV (without the row index) when saving is enabled.
if save_files:
    for frame, csv_path in (
        (X, "./data/datasets/7-BIRDS_X.csv"),
        (y, "./data/datasets/7-BIRDS_y.csv"),
    ):
        frame.to_csv(csv_path, index=False)

## 8. Genbase

In [None]:
from scipy.io import arff
import pandas as pd
import os

# Locate the raw ARFF file and stop right away if it is missing.
genbase_raw_file_name = os.getcwd() + "/data/" + "./raw_datasets/genbase.arff"
assert os.path.isfile(genbase_raw_file_name)

# Parse the ARFF file into its records and metadata.
data, meta = arff.loadarff(genbase_raw_file_name)

# Build the frame and decode bytes cells to UTF-8 strings in one chain;
# cells of any other type are left as they are.
df = pd.DataFrame(data).map(
    lambda cell: cell.decode('utf-8') if isinstance(cell, bytes) else cell
)

df.head(), df.columns

In [None]:
from sklearn.preprocessing import LabelEncoder

# Columns whose name starts with 'PDOC' are the targets; the rest are features.
columns = df.columns[df.columns.str.startswith('PDOC')]

X = df.drop(columns=columns)
y = df[columns]

# Create the encoder in this cell so it no longer depends on a
# `label_encoder` left behind by an earlier, unrelated section.
label_encoder = LabelEncoder()

# Encode every object-typed feature column; the encoder is re-fitted per column.
for x in X.columns:
    if X[x].dtype == 'object':
        X[x] = label_encoder.fit_transform(X[x])

X, y

In [None]:
# Positional renaming: features -> Att1..AttN, targets -> Class1..ClassM.
new_X_cols = {col: f"Att{pos}" for pos, col in enumerate(X.columns, start=1)}
new_y_cols = {col: f"Class{pos}" for pos, col in enumerate(y.columns, start=1)}

X = X.rename(columns=new_X_cols)
y = y.rename(columns=new_y_cols)

X, y

In [None]:
# Write features and targets as CSV (without the row index) when saving is enabled.
if save_files:
    for frame, csv_path in (
        (X, "./data/datasets/8-GENBASE_X.csv"),
        (y, "./data/datasets/8-GENBASE_y.csv"),
    ):
        frame.to_csv(csv_path, index=False)

## 9. Medical

In [None]:
import os

import pandas as pd

# `os` is imported here so the cell does not rely on an earlier cell's import.
medc_csv_file = os.path.join(os.getcwd(), "data", "raw_datasets", "9-MEDC.csv")

# Same existence check the ARFF-based sections perform before loading.
assert os.path.isfile(medc_csv_file), f"Raw dataset not found: {medc_csv_file}"

df = pd.read_csv(medc_csv_file)

df.head(), df.columns

In [None]:
# Columns whose name starts with 'Class' are the targets;
# every remaining column is a feature.
class_columns = df.columns[df.columns.str.startswith('Class')]

y = df[class_columns]
X = df.drop(columns=class_columns)

X, y

In [None]:
# Positional renaming: features -> Att1..AttN, targets -> Class1..ClassM.
new_X_cols = {col: f"Att{pos}" for pos, col in enumerate(X.columns, start=1)}
new_y_cols = {col: f"Class{pos}" for pos, col in enumerate(y.columns, start=1)}

X = X.rename(columns=new_X_cols)
y = y.rename(columns=new_y_cols)

X, y

In [None]:
# Write features and targets as CSV (without the row index) when saving is enabled.
if save_files:
    for frame, csv_path in (
        (X, "./data/datasets/9-MEDC_X.csv"),
        (y, "./data/datasets/9-MEDC_y.csv"),
    ):
        frame.to_csv(csv_path, index=False)

## 10. Enron

In [None]:
import os

import pandas as pd

# `os` is imported here so the cell does not rely on an earlier cell's import.
enron_csv_file = os.path.join(os.getcwd(), "data", "raw_datasets", "10-ENRON.csv")

# Same existence check the ARFF-based sections perform before loading.
assert os.path.isfile(enron_csv_file), f"Raw dataset not found: {enron_csv_file}"

df = pd.read_csv(enron_csv_file)

df.head(), df.columns

In [None]:
# Columns whose name begins with an upper-case character are treated as targets.
filtered_columns = list(filter(lambda name: name[0].isupper(), df.columns))

# Targets go to y; every other column stays in the feature frame X.
y = df[filtered_columns]
X = df.drop(columns=filtered_columns)

X, y

In [None]:
# Positional renaming: features -> Att1..AttN, targets -> Class1..ClassM.
new_X_cols = {col: f"Att{pos}" for pos, col in enumerate(X.columns, start=1)}
new_y_cols = {col: f"Class{pos}" for pos, col in enumerate(y.columns, start=1)}

X = X.rename(columns=new_X_cols)
y = y.rename(columns=new_y_cols)

X, y

In [None]:
# Write features and targets as CSV (without the row index) when saving is enabled.
if save_files:
    for frame, csv_path in (
        (X, "./data/datasets/10-ENRON_X.csv"),
        (y, "./data/datasets/10-ENRON_y.csv"),
    ):
        frame.to_csv(csv_path, index=False)

## 11. MediaMill

In [None]:
from scipy.io import arff
import pandas as pd
import os

# Locate the raw ARFF file and stop right away if it is missing.
mediamill_raw_file_name = os.getcwd() + "/data/" + "./raw_datasets/mediamill.arff"
assert os.path.isfile(mediamill_raw_file_name)

# Parse the ARFF file into its records and metadata.
data, meta = arff.loadarff(mediamill_raw_file_name)

# Build the frame and decode bytes cells to UTF-8 strings in one chain;
# cells of any other type are left as they are.
df = pd.DataFrame(data).map(
    lambda cell: cell.decode('utf-8') if isinstance(cell, bytes) else cell
)

df.head(), df.columns

In [None]:
# Feature columns are those whose name starts with 'Att';
# target columns are those whose name starts with 'Class'.
attribute_columns = df.columns[df.columns.str.startswith('Att')]
class_columns = df.columns[df.columns.str.startswith('Class')]

X = df[attribute_columns]
y = df[class_columns]

X, y

In [None]:
# Write features and targets as CSV (without the row index) when saving is enabled.
if save_files:
    for frame, csv_path in (
        (X, "./data/datasets/11-MEDIAMILL_X.csv"),
        (y, "./data/datasets/11-MEDIAMILL_y.csv"),
    ):
        frame.to_csv(csv_path, index=False)