In [174]:
# -*- coding: utf-8 -*-
# Importing required Library
import numpy as np
import pandas as pd
import random
from sklearn.datasets import make_classification
from sklearn.neighbors import NearestNeighbors



In [175]:
def create_dataset(n_sample=1000):
    ''' 
    Create a unevenly distributed sample data set multilabel  
    classification using make_classification function
    
    args
    nsample: int, Number of sample to be created
    
    return
    X: pandas.DataFrame, feature vector dataframe with 10 features 
    y: pandas.DataFrame, target vector dataframe with 5 labels
    '''
    X, y = make_classification(n_classes=5, class_sep=2, 
                           weights=[0.1,0.025, 0.205, 0.008, 0.9], n_informative=3, n_redundant=1, flip_y=0,
                           n_features=10, n_clusters_per_class=1, n_samples=1000, random_state=10)
    y = pd.get_dummies(y, prefix='class')
    return pd.DataFrame(X), y


In [176]:

def get_tail_label(df):
    """
    Give tail label colums of the given target dataframe
    
    args
    df: pandas.DataFrame, target label df whose tail label has to identified
    
    return
    tail_label: list, a list containing column name of all the tail label
    1.takes value counts of first category in all columns and forms a list with those values
    2.devided the max of list with the each values of list 
    3.calucalte average
    4.iterate thorogh the each value of list and if that value is above avergae the append that column to the fnal list  
    """
    columns = df.columns
    print("columns",columns)
    n = len(columns)
    irpl = np.zeros(n)
    print(irpl)
    for column in range(n):
        irpl[column] = df[columns[column]].value_counts()[1]
    print("after iteration",(max(irpl)/irpl),irpl,np.average(irpl))    
    irpl = max(irpl)/irpl
    
    mir = np.average(irpl)
    print("mir",mir)
    tail_label = []
    for i in range(n):
        if irpl[i] > mir:
            print("columns",columns[i],irpl[i])
            tail_label.append(columns[i])
    print(tail_label)        
    return tail_label

In [177]:
def get_index(df):
  """
  give the index of all tail_label rows
  args
  df: pandas.DataFrame, target label df from which index for tail label has to identified
    
  return
  index: list, a list containing index number of all the tail label
  retursn all the rows index of minroty elements
  """
  tail_labels = get_tail_label(df)
  index = set()
  for tail_label in tail_labels:
    sub_index = set(df[df[tail_label]==1].index)
    index = index.union(sub_index)
  return list(index)


In [178]:
def get_minority_instace(X, y):
    """
    Give minority dataframe containing all the tail labels
    
    args
    X: pandas.DataFrame, the feature vector dataframe
    y: pandas.DataFrame, the target vector dataframe
    
    return
    X_sub: pandas.DataFrame, the feature vector minority dataframe
    y_sub: pandas.DataFrame, the target vector minority dataframe
    """
    index = get_index(y)
    
    X_sub = X[X.index.isin(index)].reset_index(drop = True)
    y_sub = y[y.index.isin(index)].reset_index(drop = True)
    return X_sub, y_sub

In [179]:
def nearest_neighbour(X):
    """
    Give index of 5 nearest neighbor of all the instance
    
    args
    X: np.array, array whose nearest neighbor has to find
    
    return
    indices: list of list, index of 5 NN of each element in X
    """
    nbs=NearestNeighbors(n_neighbors=5,metric='euclidean',algorithm='kd_tree').fit(X)
    euclidean,indices= nbs.kneighbors(X)
    return indices

In [195]:
def MLSMOTE(X,y, n_sample):
    """
    Give the augmented data using MLSMOTE algorithm
    
    args
    X: pandas.DataFrame, input vector DataFrame
    y: pandas.DataFrame, feature vector dataframe
    n_sample: int, number of newly generated sample
    
    return
    new_X: pandas.DataFrame, augmented feature vector data
    target: pandas.DataFrame, augmented target vector data
    """
    indices2 = nearest_neighbour(X)
    print(indices2)
    n = len(indices2)
    new_X = np.zeros((n_sample, X.shape[1]))
    target = np.zeros((n_sample, y.shape[1]))
    for i in range(n_sample):
        reference = random.randint(0,n-1)
        neighbour = random.choice(indices2[reference,1:])
        all_point = indices2[reference]
        nn_df = y[y.index.isin(all_point)]
        ser = nn_df.sum(axis = 0, skipna = True)
        target[i] = np.array([1 if val>2 else 0 for val in ser])
        ratio = random.random()
        gap = X.loc[reference,:] - X.loc[neighbour,:]
        new_X[i] = np.array(X.loc[reference,:] + ratio * gap)
    new_X = pd.DataFrame(new_X, columns=X.columns)
    target = pd.DataFrame(target, columns=y.columns)
    new_X = pd.concat([X, new_X], axis=0)
    target = pd.concat([y, target], axis=0)
    return new_X, target

In [196]:
if __name__=='__main__':
    """
    main function to use the MLSMOTE
    """
    X, y = create_dataset()                     #Creating a Dataframe
      #Applying MLSMOTE to augment the dataframe

In [197]:
y['class_0'].value_counts()

0    900
1    100
Name: class_0, dtype: int64

In [198]:
y['class_1'].value_counts()

0    975
1     25
Name: class_1, dtype: int64

In [199]:
set(y[y['class_1']==1].index).union(set(y[y['class_3']==1].index))

{12,
 54,
 58,
 148,
 188,
 205,
 231,
 267,
 313,
 326,
 346,
 410,
 420,
 433,
 529,
 537,
 562,
 621,
 626,
 633,
 668,
 678,
 695,
 711,
 737,
 744,
 860,
 884,
 928,
 934,
 937,
 948,
 996}

In [200]:
y['class_2'].value_counts()

0    795
1    205
Name: class_2, dtype: int64

In [201]:
y['class_3'].value_counts()

0    992
1      8
Name: class_3, dtype: int64

In [202]:
y['class_4'].value_counts()

1    662
0    338
Name: class_4, dtype: int64

In [203]:
 X_sub, y_sub = get_minority_instace(X, y)   #Getting minority instance of that datframe
    

columns Index(['class_0', 'class_1', 'class_2', 'class_3', 'class_4'], dtype='object')
[0. 0. 0. 0. 0.]
after iteration [ 6.62       26.48        3.22926829 82.75        1.        ] [100.  25. 205.   8. 662.] 200.0
mir 24.015853658536585
columns class_1 26.48
columns class_3 82.75
['class_1', 'class_3']


In [204]:
y_sub['class_1'].value_counts()

1    25
0     8
Name: class_1, dtype: int64

In [205]:
X_res,y_res =MLSMOTE(X_sub, y_sub, 100)  

[[ 0 13  6 12  8]
 [ 1 28 13 29  8]
 [ 2 27 26 10 24]
 [ 3 13 29 21 25]
 [ 4 19 31 18 25]
 [ 5 24 30 10  2]
 [ 6  8  0 13 20]
 [ 7  8  6 13 18]
 [ 8  6 13  7  0]
 [ 9 28  1 31 16]
 [10 26 24 20 27]
 [11 21 32 13  6]
 [12 13  0 18 20]
 [13 12 32 18  0]
 [14 13 25 20  0]
 [15  2 27 10 26]
 [16 12  9 13  1]
 [17 31 25 13 20]
 [18 13 25 28 32]
 [19  4 25 18 17]
 [20  0 17  6 12]
 [21 11 13 32  3]
 [22 18 13 28 31]
 [23 12  1 31 13]
 [24  5 10 30 14]
 [25 17 13 18 32]
 [26 10  2 27 30]
 [27  2 10 26 24]
 [28 32  1 18 13]
 [29  1  8 28 25]
 [30 24 10  5 26]
 [31 17 25 13 12]
 [32 13 28 25 18]]
