In [64]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import GaussianNB
from skmultilearn.problem_transform import BinaryRelevance
from skmultilearn.problem_transform import ClassifierChain
from skmultilearn.problem_transform import LabelPowerset
from sklearn.metrics import hamming_loss

In [None]:
# Load the CSV at data/clean_train_data.csv into a DataFrame.
# low_memory=True makes pandas parse the file in chunks internally.
data = pd.read_csv(
    'data/clean_train_data.csv',
    low_memory=True,
)

In [None]:
# Shuffle the rows and hold out 20% of them as the test partition.
# The fixed random_state keeps the split identical across re-runs.
train, test = train_test_split(
    data,
    test_size=0.20,
    shuffle=True,
    random_state=42,
)

In [None]:
# TF-IDF over word 1-grams through 3-grams, with accents stripped via Unicode
# normalisation. norm=None leaves the resulting row vectors un-normalised.
vectorizer = TfidfVectorizer(
    ngram_range=(1, 3),
    strip_accents='unicode',
    norm=None,
)

In [None]:
# Learn the vocabulary and IDF weights from the training text only, so that no
# information from the test partition leaks into the features.
# A single call is enough: fit() rebuilds everything from scratch each time, so
# repeating it only repeats the (expensive, 1-3 gram) vocabulary construction.
vectorizer.fit(train.Text)

In [None]:
# Training features: the fitted vectorizer applied to the training text column.
x_train = vectorizer.transform(train['Text'])
# Training targets: every column left after removing the id, type and raw text.
y_train = train.drop(columns=['Unique ID', 'Type', 'Text'])

In [None]:
# Test features: reuse the vectorizer fitted on the training text (no re-fit).
x_test = vectorizer.transform(test['Text'])
# Test targets: every column left after removing the id, type and raw text.
y_test = test.drop(columns=['Unique ID', 'Type', 'Text'])

### 1. Problem transformation
Transform the multilabel problem into one or more single-label problems:

1. Binary relevance
2. Classifier chains
3. Label powerset

#### 1.1 Binary relevance (`BinaryRelevance`)

In [None]:
# Binary relevance wraps a base estimator to handle the multilabel targets.
# The base estimator must be an *instance* (GaussianNB()), not the class itself:
# the wrapper copies the object it is given and calls .fit() on the copy, which
# fails when it is handed the bare class.
classifier = BinaryRelevance(GaussianNB())
classifier.fit(x_train, y_train)
y_pred = classifier.predict(x_test)
# Hamming loss: fraction of individual label assignments that are wrong.
loss = hamming_loss(y_test, y_pred)
print('BinaryRelevance hamming loss is {:.2f}'.format(loss))

#### 1.2 Classifier chains (`ClassifierChain`)

In [None]:
# Classifier chain wrapper around a Gaussian naive Bayes base estimator.
# This cell previously re-ran BinaryRelevance (a copy-paste of the cell above),
# so the classifier-chain result was never actually computed.
# The base estimator is passed as an instance (GaussianNB()), not as the class.
classifier = ClassifierChain(GaussianNB())
classifier.fit(x_train, y_train)
y_pred = classifier.predict(x_test)
# Hamming loss: fraction of individual label assignments that are wrong.
loss = hamming_loss(y_test, y_pred)
print('ClassifierChain hamming loss is {:.2f}'.format(loss))

#### 1.3 Label powerset (`LabelPowerset`)

In [None]:
# Label powerset wrapper around a Gaussian naive Bayes base estimator.
# The base estimator must be an *instance* (GaussianNB()), not the class itself,
# otherwise fitting the wrapped estimator fails.
classifier = LabelPowerset(GaussianNB())
classifier.fit(x_train, y_train)
y_pred = classifier.predict(x_test)
# Hamming loss: fraction of individual label assignments that are wrong.
loss = hamming_loss(y_test, y_pred)
print('LabelPowerset hamming loss is {:.2f}'.format(loss))

### 2. Adapted algorithm
Adapt the algorithm itself so that it performs multilabel classification directly.
With scikit-multilearn:

1. MLkNN
2. MLrF
3. Ridge regression

In [None]:
from skmultilearn.adapt import MLkNN
from scipy.sparse import csr_matrix, lil_matrix

In [None]:
# Multilabel k-nearest-neighbours classifier, configured with k=20.
classifier = MLkNN(
    k=20,
)

In [None]:
# Rebind all four feature/label containers to dense NumPy arrays by routing each
# one through a LIL sparse matrix and calling .toarray().
# NOTE(review): this overwrites the names used by the earlier cells, and a dense
# 1-3 gram TF-IDF matrix can be very large -- confirm it fits in memory.
x_train, y_train, x_test, y_test = (
    lil_matrix(matrix).toarray()
    for matrix in (x_train, y_train, x_test, y_test)
)

In [None]:
# Train the classifier on the training arrays.
classifier.fit(x_train, y_train)

# Predict labels for the held-out rows and score them with Hamming loss.
y_pred = classifier.predict(x_test)
loss = hamming_loss(y_true=y_test, y_pred=y_pred)

print('Hamming loss is {:.2f}'.format(loss))

### 3. Ensemble methods
Combine several multilabel classifiers into an ensemble for better results.