In [None]:
import os
import pandas as pd
from random import Random

In [None]:
# Directory (relative to the notebook) where the generated dataset is written.
dataset_dir = '../datasets/sensorless-drive'

# exist_ok=True creates the directory only when it is missing, without the
# race window of a separate "exists?" check followed by makedirs.
os.makedirs(dataset_dir, exist_ok=True)

In [None]:
# 48 feature columns followed by the class label column.
col_names = ['feature_{}'.format(i) for i in range(48)]
col_names.append('class')

# Every column is loaded as str so the numeric text is kept exactly as it
# appears in the source file instead of being round-tripped through float.
df = pd.read_csv(
    'https://archive.ics.uci.edu/ml/machine-learning-databases/00325/Sensorless_drive_diagnosis.txt',
    sep=' ',
    header=None,
    names=col_names,
    dtype=str,
)

In [None]:
def split_by_class(df):
    """Split a dataframe into one dataframe per distinct 'class' value.

    Args:
        df: DataFrame with a 'class' column whose values can be parsed by int().

    Returns:
        A list of DataFrames, one per class value, ordered by the integer
        value of the class label. Each frame is re-indexed from 0.
    """
    per_class = []
    for _, group in df.groupby('class'):
        # Drop the original row labels so each class frame starts at 0.
        per_class.append(group.reset_index(drop=True))

    def class_number(frame):
        # All rows of a frame share one class label; read it from the first row.
        return int(frame['class'].iloc[0])

    # Order numerically: the labels may be strings, where '10' sorts before '2'.
    per_class.sort(key=class_number)
    return per_class

# One dataframe per class value, in numeric class order.
class_dfs = split_by_class(df)

In [None]:
def random_interleaving_sequence(source_row_counts, seed=1):
    """Build a reproducible random order in which to draw rows from several sources.

    At every step one of the sources that still has rows left is picked
    uniformly at random, until all sources are exhausted.

    Args:
        source_row_counts: Sequence of row counts, one per source. It is not
            modified. Sources with a count of zero (or less) are never drawn.
        seed: Seed for the private random generator; the same counts and seed
            always produce the same sequence.

    Returns:
        A list of source positions (indices into source_row_counts). Position
        k appears exactly source_row_counts[k] times for every positive count.
    """
    rand = Random(seed)

    # Work on a private list so the caller's counts stay untouched and any
    # sequence type (list, tuple, ...) is accepted.
    remaining_row_counts = list(source_row_counts)

    # Only sources that actually have rows may be drawn. Seeding the pool with
    # empty sources would let them be picked once and steal a slot from
    # another source.
    available_sources = [
        source
        for source, count in enumerate(remaining_row_counts)
        if count > 0
    ]

    result = []
    while available_sources:
        source = rand.choice(available_sources)
        result.append(source)
        remaining_row_counts[source] -= 1
        if remaining_row_counts[source] <= 0:
            # Source exhausted: take it out of the pool.
            available_sources.remove(source)
    return result

def random_interleave_dfs(dfs, seed=1):
    """Merge several dataframes into one by randomly interleaving their rows.

    The relative order of the rows taken from any single input frame is kept;
    only the way the frames are woven together is random. The weave comes from
    random_interleaving_sequence, called with the frames' row counts and seed.

    Args:
        dfs: Sequence of pandas DataFrames. They are copied, never modified.
            Frames without rows take no part in the result.
        seed: Seed forwarded to random_interleaving_sequence.

    Returns:
        A DataFrame containing the rows of all non-empty input frames, indexed
        by each row's position in the interleaved order. The index is named
        'interleaved_index' and the frame is sorted by it.

    Raises:
        ValueError: If no input frame contains any rows.
    """
    # Copy so the helper column added below never leaks into the caller's frames.
    # Empty frames are left out of the draw: they have no rows to place, and
    # keeping them would shift the source numbering used below.
    populated = [frame.copy() for frame in dfs if frame.shape[0] > 0]
    if not populated:
        raise ValueError('no rows to interleave')

    row_counts = [frame.shape[0] for frame in populated]
    sequence = pd.Series(random_interleaving_sequence(row_counts, seed))

    for source, frame in enumerate(populated):
        # Positions in the interleaved order that were assigned to this source.
        positions = sequence.index[(sequence == source).values]
        # Assign a plain array so the values are placed positionally; a Series
        # would be aligned on the frame's own index labels instead.
        frame['interleaved_index'] = positions.values

    return pd.concat(populated).set_index('interleaved_index').sort_index()

# The total row count doubles as the seed, so the interleaving is the same
# on every run over the same data.
row_count = len(df)
interleaved_df = random_interleave_dfs(class_dfs, seed=row_count)

In [None]:
# Sanity check: splitting the interleaved frame by class should give back
# exactly the per-class frames (same rows, same order) taken from the
# original data. One line is printed per class.
new_class_dfs = split_by_class(interleaved_df)

for old, new in zip(class_dfs, new_class_dfs):
    print('equal' if old.equals(new) else 'not equal')

In [None]:
# Write the interleaved rows without the interleaved_index column.
interleaved_df.to_csv(
    os.path.join(dataset_dir, 'sensorless-drive-interleaved.csv'),
    index=False,
)