In [2]:
import numpy as np
import pandas as pd
import random
from sklearn.datasets import make_classification, make_multilabel_classification

from sklearn.neighbors import NearestNeighbors

def create_dataset(n_sample=1000):
    '''
    Create a synthetic multilabel classification data set using
    make_multilabel_classification.

    args
    n_sample: int, Number of samples to be created

    return
    X: pandas.DataFrame, feature vector dataframe with 20 features
    y: pandas.DataFrame, binary indicator target dataframe with 10 labels
    '''
    # A previous make_classification(...) call was dropped from here: its
    # result was immediately overwritten by the call below, it ignored
    # n_sample, and its class weights summed to more than 1.
    X, y = make_multilabel_classification(
        n_samples=n_sample,
        n_features=20,
        n_classes=10,
        n_labels=3,
        length=50,
        allow_unlabeled=False,        # every generated row carries at least one label
        return_distributions=False,
        random_state=42,              # fixed seed: the same data on every call
    )
    return pd.DataFrame(X), pd.DataFrame(y)

def get_tail_label(df):
    """
    Give tail label columns of the given target dataframe.

    For every label the imbalance ratio is computed as
    (positives of the most frequent label) / (positives of this label).
    A label is a tail label when its ratio exceeds the mean ratio.

    Labels without a single positive row are ignored: they have no
    instances to oversample and would otherwise make the ratio undefined.

    args
    df: pandas.DataFrame, target label df whose tail label has to identified

    return
    tail_label: list, a list containing column name of all the tail label
                (empty when df has no columns or no positive entries)
    """
    columns = df.columns
    if len(columns) == 0:
        return []

    # Number of rows equal to 1 per label, by column position.
    counts = df.eq(1).sum(axis=0).to_numpy(dtype=float)
    present = counts > 0
    if not present.any():
        return []

    # Imbalance ratio per label; absent labels stay NaN and never compare
    # greater than the mean, so they cannot be selected.
    irpl = np.full(counts.shape, np.nan)
    irpl[present] = counts.max() / counts[present]
    mir = irpl[present].mean()

    return [columns[i] for i in np.flatnonzero(present & (irpl > mir))]

def get_index(df):
    """
    Collect the index of every row that carries at least one tail label.

    args
    df: pandas.DataFrame, target label df from which index for tail label has to identified

    return
    index: list, a list containing index number of all the tail label
    """
    index = set()
    for label in get_tail_label(df):
        # Rows where this tail label is set, merged into the running union.
        rows_with_label = df.loc[df[label].eq(1)].index
        index = index | set(rows_with_label)
    return list(index)

def get_minority_instace(X, y):
    """
    Restrict the data set to the rows that carry at least one tail label.

    args
    X: pandas.DataFrame, the feature vector dataframe
    y: pandas.DataFrame, the target vector dataframe

    return
    X_sub: pandas.DataFrame, the feature vector minority dataframe
    y_sub: pandas.DataFrame, the target vector minority dataframe
    Both results are re-indexed from 0.
    """
    tail_rows = get_index(y)

    def _subset(frame):
        # Keep the rows whose index label is a tail-label row, then renumber.
        keep = frame.index.isin(tail_rows)
        return frame.loc[keep].reset_index(drop=True)

    return _subset(X), _subset(y)

def nearest_neighbour(X, n_neighbors=5):
    """
    Give index of the nearest neighbours of all the instances.

    The model is fitted on X and queried with the same X, so every row's
    own position is normally the first entry of its neighbour list.

    args
    X: array-like (np.array or pandas.DataFrame), data whose nearest neighbours have to be found
    n_neighbors: int, number of neighbours returned per instance (default 5,
                 the value that used to be hard-coded)

    return
    indices: np.ndarray of shape (len(X), n_neighbors), positional index of
             the nearest neighbours of each element in X
    """
    nbs = NearestNeighbors(
        n_neighbors=n_neighbors, metric='euclidean', algorithm='kd_tree'
    ).fit(X)
    # Only the indices are needed, so skip returning the distance matrix.
    return nbs.kneighbors(X, return_distance=False)

def MLSMOTE(X,y, n_sample):
    """
    Give the augmented data using MLSMOTE algorithm.

    For each synthetic sample a random reference row and one of its nearest
    neighbours are drawn. The new feature vector lies on the segment between
    the two, and the new label set is the majority vote over the reference's
    neighbourhood.

    Randomness comes from the stdlib `random` module; call random.seed(...)
    beforehand for reproducible output.

    args
    X: pandas.DataFrame, feature vector DataFrame
    y: pandas.DataFrame, target vector dataframe (binary label indicators)
    n_sample: int, number of newly generated sample

    return
    new_X: pandas.DataFrame, the rows of X followed by the n_sample synthetic feature rows
    target: pandas.DataFrame, the rows of y followed by the n_sample synthetic label rows
    Both frames are re-indexed 0..len-1 so the synthetic rows do not reuse
    the index labels of the original rows.
    """
    neighbour_idx = nearest_neighbour(X)
    n = len(neighbour_idx)
    # kneighbors returns row positions, so work positionally rather than via
    # index labels; this also works when X / y do not have a 0..n-1 index.
    X_values = X.to_numpy(dtype=float)
    new_X = np.zeros((n_sample, X.shape[1]))
    target = np.zeros((n_sample, y.shape[1]))
    for i in range(n_sample):
        reference = random.randint(0, n - 1)
        neighbourhood = neighbour_idx[reference]
        # Skip the first entry: it is the reference point itself.
        neighbour = random.choice(neighbourhood[1:])
        # A label is kept when more than half of the neighbourhood has it
        # (equivalent to the former "> 2" with five neighbours).
        votes = y.iloc[neighbourhood].sum(axis=0, skipna=True).to_numpy()
        target[i] = votes > len(neighbourhood) / 2
        ratio = random.random()
        # Interpolate from the reference towards the neighbour. The previous
        # gap (reference - neighbour) pushed the point away from the neighbour.
        gap = X_values[neighbour] - X_values[reference]
        new_X[i] = X_values[reference] + ratio * gap
    new_X = pd.DataFrame(new_X, columns=X.columns)
    target = pd.DataFrame(target, columns=y.columns)
    new_X = pd.concat([X, new_X], axis=0, ignore_index=True)
    target = pd.concat([y, target], axis=0, ignore_index=True)
    return new_X, target

In [3]:
random.seed(42)                             # MLSMOTE draws from the stdlib RNG; seed it so the run is reproducible
X, y = create_dataset()                     # Create the synthetic multilabel data set
X_sub, y_sub = get_minority_instace(X, y)   # Keep only the rows that carry a tail label
X_res, y_res = MLSMOTE(X_sub, y_sub, 200)   # Append 200 synthetic samples to the minority subset

In [4]:
# Size of the minority subset: (rows carrying a tail label, number of labels).
y_sub.shape

(254, 10)

In [5]:
# Shape of the augmented feature matrix: the minority rows plus the synthetic rows.
X_res.to_numpy().shape

(454, 20)

In [6]:
# Shape of the full generated feature frame, for comparison with the subsets above.
X.shape

(1000, 20)

In [7]:
# Display the full target label frame (one column per label).
y

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,0,1,1,0,0,0,0,1,1,0
1,1,1,1,1,0,1,1,0,0,0
2,0,1,0,1,0,0,0,1,0,0
3,1,1,1,0,0,0,1,0,0,1
4,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...
995,0,1,0,0,0,0,0,0,1,0
996,1,1,0,0,0,0,0,0,0,0
997,0,0,0,1,0,0,0,1,0,0
998,0,0,1,0,0,0,0,0,1,0


In [8]:
# Scratch check: generate a smaller (150-sample) multilabel set with the same
# generator settings used inside create_dataset() and look at the raw labels.
# NOTE(review): this rebinds the notebook-level names X and y to the NumPy
# arrays returned by make_multilabel_classification (not DataFrames).
n_labels=3
n_classes=10
length = 50
X, y = make_multilabel_classification(
        n_samples=150,
        n_features=20,
        n_classes=n_classes,
        n_labels=n_labels,
        length=length,
        allow_unlabeled=False,
        return_distributions=False,
        random_state=42,
    )
y

array([[0, 1, 1, ..., 1, 1, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [0, 1, 0, ..., 1, 0, 0],
       ...,
       [1, 0, 0, ..., 0, 0, 0],
       [0, 1, 1, ..., 0, 0, 0],
       [0, 1, 0, ..., 1, 0, 1]])

In [9]:
# Load the precomputed feature and activity arrays from the hh_dataset folder.
# NOTE(review): the paths are relative to the notebook's working directory;
# the files must exist there for the following cells to run.
# Presumably `activities` holds one binary label row per instance (it is used
# as the MLSMOTE target further down) — confirm against the data set docs.
features = np.load('../hh_dataset/hh_npy/fib_hh102_feature.npy')
activities = np.load('../hh_dataset/hh_npy/fib_hh102_activity.npy')

In [10]:
# Collapse every instance (a sequence of vectors) into a single vector:
# the vector at position i is scaled by (i + 1) / len(instance), so later
# positions weigh more, and the scaled vectors are summed element-wise.
out = []
for instance in features:
    out.append(
        np.sum(
            [frame * ((step + 1) / len(instance)) for step, frame in enumerate(instance)],
            axis=0,
        )
    )
new_feature = np.array(out)
new_feature.shape

(4462, 56)

In [11]:
# Shape of the loaded activity array; its row count should match new_feature.
activities.shape

(4462, 17)

In [12]:
random.seed(42)                               # MLSMOTE draws from the stdlib RNG; seed it so the run is reproducible
X, y = pd.DataFrame(new_feature), pd.DataFrame(activities)
X_sub, y_sub = get_minority_instace(X, y)     # Keep only the rows that carry a tail label
X_res, y_res = MLSMOTE(X_sub, y_sub, 1000)    # Append 1000 synthetic samples to the minority subset
y_res.shape

(1347, 17)

In [49]:
# Preview the first rows of the augmented feature frame.
X_res.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,46,47,48,49,50,51,52,53,54,55
0,4.544068,0.0,2.925301,1.299297,1.299297,1.299297,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.423579,0.0,0.895126,0.895126,0.895126,0.895126,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.849343,1.55944,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,1.598558,5.162948,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.87522,0.0,2.728313,2.792073,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
