# Scikit-MultiLearn
https://www.analyticsvidhya.com/blog/2017/08/introduction-to-multi-label-classification/

https://github.com/scikit-multilearn/scikit-multilearn


## Helper Functions

In [242]:
import json

def write_json(data,filename):
    print("")
    
def read_json(filename):
    if filename:
        with open(filename, 'r') as f:
            datastore = json.load(f)
    
    return datastore

In [254]:
import string

def pre_process(text):
    
    tr = str.maketrans("", "", string.punctuation)
    text=text.translate(tr)
    
    words = text.split()
    words = [word.lower() for word in words]
    words = [word for word in words if word not in string.punctuation]
    sentence = [' '.join(words)]
    
    return sentence

def pre_process_blob(essay):
    arr=[]
    for i in range(len(essay)):
        arr.extend(pre_process(essay[i]))
    return arr


In [253]:
sentence="At eight! o'clock on Thursday morning Arthur didn't feel very good."
essay=[sentence,sentence,sentence]
print(pre_process(sentence))
arr=pre_process_blob(essay)
arr[0]

['at eight oclock on thursday morning arthur didnt feel very good']


'at eight oclock on thursday morning arthur didnt feel very good'

## Prepare data

In [128]:
#Define data files

files = [json_mobile,json_fashion,json_beauty,train_mobile,train_fashion,train_beauty,val_mobile,val_fashion,val_beauty]

In [129]:
import pandas as pd
data=pd.read_csv(train_fashion)
df=pd.DataFrame(data)

In [130]:
df.head()

Unnamed: 0,itemid,title,image_path,Pattern,Collar Type,Fashion Trend,Clothing Material,Sleeves
0,2282553,retro floral dress,fashion_image/78d17fdb159bba51a4250dc3d583245e,2.0,,3.0,,
1,13822218,dress floral sifon,fashion_image/2f77dac9965bbfdb03cbd3724b3552c5,2.0,,,4.0,
2,33555935,korean white chiffon collar dress,fashion_image/6dbe2e7cba5ddbb750d2144d8f248f11,,13.0,10.0,4.0,
3,65755120,women s trendy apricot o neck solid chiffon bl...,fashion_image/dc9b21429604148fc0342d12694f3294,,3.0,,4.0,
4,65857438,big sale baju gamis pesta india aysilla pancar...,fashion_image/6c25c578dd8edce742a805f891f1a51f,,,6.0,17.0,


In [131]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 275142 entries, 0 to 275141
Data columns (total 8 columns):
itemid               275142 non-null int64
title                275142 non-null object
image_path           275142 non-null object
Pattern              164078 non-null float64
Collar Type          113638 non-null float64
Fashion Trend        147084 non-null float64
Clothing Material    175499 non-null float64
Sleeves              177903 non-null float64
dtypes: float64(5), int64(1), object(2)
memory usage: 16.8+ MB


In [239]:
pd.options.display.max_seq_items = 2000
print(df.columns)

Index(['itemid', 'title', 'image_path', 'Pattern', 'Collar Type',
       'Fashion Trend', 'Clothing Material', 'Sleeves'],
      dtype='object')


In [275]:
df=df[:1000]

In [276]:
data = df.drop(columns='image_path')
data.head()

Unnamed: 0,itemid,title,Pattern,Collar Type,Fashion Trend,Clothing Material,Sleeves
0,2282553,retro floral dress,2.0,,3.0,,
1,13822218,dress floral sifon,2.0,,,4.0,
2,33555935,korean white chiffon collar dress,,13.0,10.0,4.0,
3,65755120,women s trendy apricot o neck solid chiffon bl...,,3.0,,4.0,
4,65857438,big sale baju gamis pesta india aysilla pancar...,,,6.0,17.0,


### Text Processing

https://www.analyticsvidhya.com/blog/2018/02/the-different-methods-deal-text-data-predictive-python/

In [277]:
X_lower = pre_process_blob(data.title)

#### Bag of Words - CountVectorizer

In [300]:
from sklearn.feature_extraction.text import CountVectorizer
bow = CountVectorizer(max_features=1000, lowercase=True, ngram_range=(1,1),analyzer = "word")
X = bow.fit_transform(X_lower)
X

<1000x1000 sparse matrix of type '<class 'numpy.int64'>'
	with 10670 stored elements in Compressed Sparse Row format>

In [301]:
X.shape

(1000, 1000)

#### NLTK Tokenizer

In [333]:
import nltk
from nltk.tokenize import ToktokTokenizer
X=nltk.FreqDist(ToktokTokenizer().tokenize(X_lower))
pd.options.display.max_rows = 2000
X

FreqDist({"'": 2000, ',': 999, 'dress': 735, 'lengan': 436, 'wanita': 370, 'untuk': 321, 'neck': 283, 'model': 240, 'pesta': 192, 'panjang': 184, ...})

In [338]:
#To convert to CSR
#temp=pd.DataFrame(X)
#temp.head(columns=X.keys())

In [None]:
# To convert to sparse matrix
import scipy
x_csr = scipy.sparse.csr_matrix(X.values)
x_csr

### Convert Target to one-hot-encoding

Not sure if required or can leave as label encoding

In [353]:
y=data.iloc[:,2:]
y=y.fillna('999')
y.head()

Unnamed: 0,Pattern,Collar Type,Fashion Trend,Clothing Material,Sleeves
0,2,999,3,999,999
1,2,999,999,4,999
2,999,13,10,4,999
3,999,3,999,4,999
4,999,999,6,17,999


In [354]:
#Change y to one-hot-encoding
y_hot = pd.get_dummies(y, dummy_na=False)
y_hot.head()

Unnamed: 0,Pattern_1.0,Pattern_2.0,Pattern_3.0,Pattern_4.0,Pattern_5.0,Pattern_6.0,Pattern_7.0,Pattern_8.0,Pattern_9.0,Pattern_12.0,...,Clothing Material_15.0,Clothing Material_16.0,Clothing Material_17.0,Clothing Material_18.0,Clothing Material_999,Sleeves_0.0,Sleeves_1.0,Sleeves_2.0,Sleeves_3.0,Sleeves_999
0,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,1
1,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,1


In [400]:
import scipy
y_csr = scipy.sparse.csr_matrix(y_hot.values)
y_csr 

<1000x63 sparse matrix of type '<class 'numpy.uint8'>'
	with 5000 stored elements in Compressed Sparse Row format>

In [312]:
pd.options.display.max_rows = 2000
y_hot.columns

Index(['Pattern_1.0', 'Pattern_2.0', 'Pattern_3.0', 'Pattern_4.0',
       'Pattern_5.0', 'Pattern_6.0', 'Pattern_7.0', 'Pattern_8.0',
       'Pattern_9.0', 'Pattern_12.0', 'Pattern_13.0', 'Pattern_14.0',
       'Pattern_15.0', 'Pattern_16.0', 'Pattern_17.0', 'Pattern_18.0',
       'Pattern_19.0', 'Pattern_999', 'Collar Type_0.0', 'Collar Type_1.0',
       'Collar Type_2.0', 'Collar Type_3.0', 'Collar Type_4.0',
       'Collar Type_5.0', 'Collar Type_6.0', 'Collar Type_7.0',
       'Collar Type_8.0', 'Collar Type_10.0', 'Collar Type_11.0',
       'Collar Type_13.0', 'Collar Type_14.0', 'Collar Type_999',
       'Fashion Trend_1.0', 'Fashion Trend_3.0', 'Fashion Trend_4.0',
       'Fashion Trend_6.0', 'Fashion Trend_7.0', 'Fashion Trend_8.0',
       'Fashion Trend_9.0', 'Fashion Trend_10.0', 'Fashion Trend_999',
       'Clothing Material_1.0', 'Clothing Material_2.0',
       'Clothing Material_3.0', 'Clothing Material_4.0',
       'Clothing Material_5.0', 'Clothing Material_6.0',
       

### Generate test data

In [386]:
from sklearn.datasets import make_multilabel_classification

# this will generate a random multi-label dataset
X, y = make_multilabel_classification(sparse = True, n_labels = 20,
return_indicator = 'sparse', allow_unlabeled = False)

#sparse: If True, returns a sparse matrix, where sparse matrix means a matrix having a large number of zero elements.
#n_labels:  The average number of labels for each instance.
#return_indicator: If ‘sparse’ return Y in the sparse binary indicator format.
#allow_unlabeled: If True, some instances might not belong to any class.

In [387]:
X

<100x20 sparse matrix of type '<class 'numpy.float64'>'
	with 1817 stored elements in Compressed Sparse Row format>

In [388]:
y

<100x5 sparse matrix of type '<class 'numpy.int32'>'
	with 468 stored elements in Compressed Sparse Row format>

### Train-test-split

In [313]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y_csr, test_size=0.2, random_state=42)

## Evaluation Metrices

In [314]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix

def evalResults(y_test, y_pred): 
    a=accuracy_score(y_test, y_pred)
    b=precision_score(y_test, y_pred,average='weighted')
    c=recall_score(y_test, y_pred,average='weighted')
    d=f1_score(y_test, y_pred,average='weighted')
    #e=confusion_matrix(y_test, y_pred)
    
    print("accuracy_score "+str(a)+" precision_score "+str(b)+" recall_score "+str(c)+" f1_score "+str(d))
    #print(e)
    
    #return a
    return {'accuracy_score':a,'precision_score':b,'recall_score':c,'f1_score':d}

## Techniques for solving multi-label problems

### Problem Transformation

#### Binary relevance

Treats each label as a separate single class classification problem.

In [315]:
# using binary relevance
from skmultilearn.problem_transform import BinaryRelevance
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier

# initialize binary relevance multi-label classifier
# with a gaussian naive bayes base classifier
classifier = BinaryRelevance(DecisionTreeClassifier())

# train
classifier.fit(X_train, y_train)

# predict
predictions = classifier.predict(X_test)

In [316]:
evalResults(y_test, predictions)

  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


accuracy_score 0.235 precision_score 0.791542870821457 recall_score 0.787 f1_score 0.7860148466817766


{'accuracy_score': 0.235,
 'precision_score': 0.791542870821457,
 'recall_score': 0.787,
 'f1_score': 0.7860148466817766}

#### Classifier Chains

In this, the first classifier is trained just on the input data and then each next classifier is trained on the input space and all the previous classifiers in the chain.  

In [317]:
# using classifier chains
from skmultilearn.problem_transform import ClassifierChain
from sklearn.naive_bayes import GaussianNB

# initialize classifier chains multi-label classifier
# with a gaussian naive bayes base classifier
classifier = ClassifierChain(GaussianNB())

# train
classifier.fit(X_train, y_train)

# predict
predictions = classifier.predict(X_test)

#accuracy_score(y_test,predictions)
evalResults(y_test, predictions)

  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


accuracy_score 0.035 precision_score 0.570171049700501 recall_score 0.455 f1_score 0.4693481092944675


{'accuracy_score': 0.035,
 'precision_score': 0.570171049700501,
 'recall_score': 0.455,
 'f1_score': 0.4693481092944675}

#### Label Powerset

Label powerset gives a unique class to every possible label combination that is present in the training set.

In [318]:
# using Label Powerset
from skmultilearn.problem_transform import LabelPowerset
from sklearn.naive_bayes import GaussianNB

# initialize Label Powerset multi-label classifier
# with a gaussian naive bayes base classifier
classifier = LabelPowerset(GaussianNB())

# train
classifier.fit(X_train, y_train)

# predict
predictions = classifier.predict(X_test)

#accuracy_score(y_test,predictions)
evalResults(y_test, predictions)

  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


accuracy_score 0.1 precision_score 0.5660816062539339 recall_score 0.591 f1_score 0.5532625536021457


{'accuracy_score': 0.1,
 'precision_score': 0.5660816062539339,
 'recall_score': 0.591,
 'f1_score': 0.5532625536021457}

### Adapted Algorithm

http://scikit.ml/api/skmultilearn.adapt.mltsvm.html

http://scikit.ml/api/skmultilearn.adapt.mlknn.html

In [320]:
from skmultilearn.adapt import MLkNN

classifier = MLkNN(k=20)

# train
classifier.fit(X_train, y_train)

# predict
predictions = classifier.predict(X_test)

#accuracy_score(y_test,predictions)
evalResults(y_test, predictions)

  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


accuracy_score 0.09 precision_score 0.6937640187822354 recall_score 0.599 f1_score 0.6275034959860508


{'accuracy_score': 0.09,
 'precision_score': 0.6937640187822354,
 'recall_score': 0.599,
 'f1_score': 0.6275034959860508}

http://scikit.ml/api/skmultilearn.adapt.mlaram.html



### Ensemble Approaches

In [321]:
from skmultilearn.ensemble import MajorityVotingClassifier
from skmultilearn.cluster import FixedLabelSpaceClusterer
from skmultilearn.problem_transform import ClassifierChain
from sklearn.naive_bayes import GaussianNB

classifier = MajorityVotingClassifier(
    clusterer = FixedLabelSpaceClusterer(clusters = [[1,3,4], [0, 2, 5]]),
    classifier = ClassifierChain(classifier=GaussianNB())
)

# train
classifier.fit(X_train, y_train)

# predict
predictions = classifier.predict(X_test)

#accuracy_score(y_test,predictions)
evalResults(y_test, predictions)


  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


accuracy_score 0.0 precision_score 0.01964852607709751 recall_score 0.033 f1_score 0.024227799227799232


{'accuracy_score': 0.0,
 'precision_score': 0.01964852607709751,
 'recall_score': 0.033,
 'f1_score': 0.024227799227799232}

In [322]:
from skmultilearn.ensemble import MajorityVotingClassifier
from skmultilearn.ensemble.partition import LabelSpacePartitioningClassifier
from sklearn.ensemble import RandomForestClassifier
from skmultilearn.cluster import NetworkXLabelGraphClusterer

classifier = LabelSpacePartitioningClassifier(
    classifier = BinaryRelevance(
    classifier = RandomForestClassifier(),
    require_dense = [False, True]
),
    clusterer  = NetworkXLabelGraphClusterer(graph_builder, method='louvain')
)


ImportError: cannot import name 'NetworkXLabelGraphClusterer' from 'skmultilearn.cluster' (C:\ProgramData\Anaconda3\lib\site-packages\skmultilearn\cluster\__init__.py)

In [323]:
from skmultilearn.ensemble import LabelSpacePartitioningClassifier

classifier = LabelSpacePartitioningClassifier(
    classifier = BinaryRelevance(
    classifier = RandomForestClassifier(),
    require_dense = [False, True]
),
    clusterer  = NetworkXLabelGraphClusterer(graph_builder, method='louvain')
)

start=time.time()
classifier.fit(x_train,y_train)

NameError: name 'NetworkXLabelGraphClusterer' is not defined

### Neural Net

In [389]:
#pip install -U skorch torch

import torch
from torch import nn
import torch.nn.functional as F
from skorch import NeuralNetClassifier
from skorch import NeuralNet

In [390]:
import numpy
nodes = 8
input_dim = X_train.shape[1]
hidden_dim = int(input_dim/nodes)
output_dim = len(numpy.unique(y_train.shape[1]))

In [391]:
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(800, 1000)
(800, 63)
(200, 1000)
(200, 63)


In [392]:
class MultiClassClassifierModule(nn.Module):
    def __init__(
            self,
            input_dim=input_dim,
            hidden_dim=hidden_dim,
            output_dim=output_dim,
            dropout=0.5,
    ):
        super(MultiClassClassifierModule, self).__init__()
        self.dropout = nn.Dropout(dropout)

        self.hidden = nn.Linear(input_dim, hidden_dim)
        self.output = nn.Linear(hidden_dim, output_dim)

    def forward(self, X, **kwargs):
        X = F.relu(self.hidden(X))
        X = self.dropout(X)
        X = F.softmax(self.output(X), dim=-1)
        
        X = X.view(-1).float() #To match shape of Tensor
        
        return X

In [393]:
net = NeuralNetClassifier(
    MultiClassClassifierModule,
    #criterion=torch.nn.NLLLoss,
    criterion=torch.nn.BCEWithLogitsLoss,
    #criterion=torch.nn.BCELoss,
    #optimizer
    #lr
    max_epochs=20,
    verbose=0
)

In [395]:
import sklearn
y_train=sklearn.utils.as_float_array(y_train)

In [396]:
y_train.astype(numpy.float32)

<800x63 sparse matrix of type '<class 'numpy.float32'>'
	with 4000 stored elements in Compressed Sparse Row format>

In [397]:
from skmultilearn.problem_transform import LabelPowerset
clf = LabelPowerset(classifier=net, require_dense=[False,False])
#y_shape=y_train.reshape
clf.fit(X_train.astype(numpy.float32),y_train.astype(numpy.float64))

predictions = clf.predict(X_test.astype(numpy.float32))



RuntimeError: output with type torch.IntTensor doesn't match the desired type torch.FloatTensor

In [None]:
#accuracy_score(y_test,predictions)
evalResults(y_test, predictions)

#### BERT Model

In [330]:
from pytorch_pretrained_bert.tokenization import BertTokenizer, WordpieceTokenizer
from pytorch_pretrained_bert.modeling import BertForPreTraining, BertPreTrainedModel, BertModel, BertConfig, BertForMaskedLM, BertForSequenceClassification

class BertForMultiLabelSequenceClassification(BertPreTrainedModel):
    """BERT model for classification.
    This module is composed of the BERT model with a linear layer on top of
    the pooled output.
    """
    def __init__(self, config, num_labels=2):
        super(BertForMultiLabelSequenceClassification, self).__init__(config)
        self.num_labels = num_labels
        self.bert = BertModel(config)
        self.dropout = torch.nn.Dropout(config.hidden_dropout_prob)
        self.classifier = torch.nn.Linear(config.hidden_size, num_labels)
        self.apply(self.init_bert_weights)

    def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None):
        _, pooled_output = self.bert(input_ids, token_type_ids, attention_mask, output_all_encoded_layers=False)
        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)

        if labels is not None:
            loss_fct = BCEWithLogitsLoss()
            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1, self.num_labels))
            return loss
        else:
            return logits
        
    def freeze_bert_encoder(self):
        for param in self.bert.parameters():
            param.requires_grad = False
    
    def unfreeze_bert_encoder(self):
        for param in self.bert.parameters():
            param.requires_grad = True

In [331]:
bert_net = NeuralNet(
    BertForMultiLabelSequenceClassification,
    #criterion=torch.nn.NLLLoss,
    criterion=torch.nn.BCEWithLogitsLoss,
    #optimizer
    #lr
    max_epochs=20,
    verbose=0
)

In [332]:
from skmultilearn.problem_transform import LabelPowerset
clf = LabelPowerset(classifier=bert_net, require_dense=[True,True])
#y_shape=y_train.reshape
clf.fit(X_train.astype(numpy.float32),y_train.astype(numpy.float32))

predictions = clf.predict(X_test.astype(numpy.float32))

TypeError: __init__() missing 1 required positional argument: 'config'

In [None]:
evalResults(y_test, predictions)