https://towardsdatascience.com/multi-label-text-classification-5c505fdedca8

https://towardsdatascience.com/bert-multilabel-text-classification-a7f560db34e5

In [None]:
!pip install scikit-multilearn

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import json
import nltk
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from bs4 import BeautifulSoup
import re
import csv
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import hamming_loss
from skmultilearn.problem_transform import BinaryRelevance
from sklearn.naive_bayes import GaussianNB
from skmultilearn.problem_transform import ClassifierChain
from sklearn.linear_model import LogisticRegression
from skmultilearn.problem_transform import LabelPowerset
from skmultilearn.adapt import MLkNN
from scipy.sparse import csr_matrix, lil_matrix
from sklearn.preprocessing import MultiLabelBinarizer
import pickle
from sklearn.naive_bayes import GaussianNB

In [None]:
# Punctuation that acts as a token separator; clean_text replaces each match
# with a single space. Raw string literals keep the regex escapes (\[ \] \|)
# away from Python's own string-escape handling, which otherwise flags them
# as invalid escape sequences.
REPLACE_BY_SPACE_RE = re.compile(r'[/(){}\[\]\|@,;]')
# Any character other than a digit, a lowercase ASCII letter, a space, '#',
# '+' or '_' matches here; clean_text deletes matches without inserting a space.
BAD_SYMBOLS_RE = re.compile(r'[^0-9a-z #+_]')
# English stop words from NLTK's stopwords corpus, held in a set; clean_text
# tests every token for membership in it.
# NOTE(review): presumably the corpus has to be available locally already
# (e.g. via nltk.download('stopwords')) — confirm for the target environment.
STOPWORDS = set(stopwords.words('english'))

def clean_text(text):
    """Normalise one raw plot string for vectorisation.

    Steps, in order: strip HTML markup with BeautifulSoup (lxml parser),
    lowercase, turn the separator punctuation matched by
    REPLACE_BY_SPACE_RE into spaces, delete every character matched by
    BAD_SYMBOLS_RE, and drop tokens found in STOPWORDS.

    Args:
        text: The raw text to clean.

    Returns:
        The remaining tokens joined by single spaces.
    """
    # HTML decoding followed by case folding.
    plain = BeautifulSoup(text, "lxml").text.lower()
    # Separator punctuation becomes whitespace so neighbouring words stay apart.
    spaced = REPLACE_BY_SPACE_RE.sub(' ', plain)
    # Remaining unwanted symbols are removed outright.
    filtered = BAD_SYMBOLS_RE.sub('', spaced)
    # Splitting on whitespace also collapses runs of spaces.
    kept_tokens = [token for token in filtered.split() if token not in STOPWORDS]
    return ' '.join(kept_tokens)

# Module-level English Snowball stemmer; stemming() calls its .stem() method
# on every word.
stemmer = SnowballStemmer("english")

def stemming(sentence, stem_fn=None):
    """Stem every whitespace-separated word of *sentence*.

    The stems are joined with single spaces. The previous implementation
    stripped the accumulated string inside the loop, which removed the
    separator after every word and glued all stems into one token.

    Args:
        sentence: Text to stem; it is split on whitespace.
        stem_fn: Optional callable mapping one word to its stem. Defaults to
            the ``stem`` method of the module-level ``stemmer``.

    Returns:
        The stemmed words joined by single spaces ("" for empty input).
    """
    if stem_fn is None:
        # Resolved at call time so the module-level stemmer stays the default.
        stem_fn = stemmer.stem
    return " ".join(stem_fn(word) for word in sentence.split())


# Load the movie metadata and the plot summaries, join them on movie_id, clean
# and stem the plots, and add one 0/1 indicator column per genre.
DATA_DIR = "/Users/patsnap/Desktop/Neo4J_and_other_codes/MovieSummaries/"

meta = pd.read_csv(DATA_DIR + "movie.metadata.tsv", sep = '\t', header = None)
# Only the id, title and genre columns get names; the others keep numeric labels.
meta.columns = ["movie_id",1,"movie_name",3,4,5,6,7,"genre"]
# .copy() makes `genres` an independent frame, so the movie_id assignment below
# does not write into a slice of `meta` (which triggers SettingWithCopyWarning).
genres = meta[["movie_id","movie_name","genre"]].copy()
plots = pd.read_csv(DATA_DIR + "plot_summaries.txt", sep = '\t', header = None)
plots.columns = ["movie_id", "plot"]
# Both join keys are cast to str so the merge compares values of the same type.
genres['movie_id'] = genres['movie_id'].astype(str)
plots['movie_id'] = plots['movie_id'].astype(str)
movies = pd.merge(plots, genres, on = 'movie_id')

# Each genre cell is a JSON object; only its values are kept as the label list.
genres_lists = [list(json.loads(raw_genre).values()) for raw_genre in movies['genre']]
movies['genre'] = genres_lists
movies['plot'] = movies['plot'].apply(clean_text)
movies['plot'] = movies['plot'].apply(stemming)

# Fit the binarizer and encode the target in a single pass.
multilabel_binarizer = MultiLabelBinarizer()
y = multilabel_binarizer.fit_transform(movies['genre'])

# One indicator column per genre, named after the genre.
for idx, genre in enumerate(multilabel_binarizer.classes_):
    movies[genre] = y[:,idx]

# Round trip through CSV: the frame is saved with its index and read back, so
# the reloaded frame carries the index as an 'Unnamed: 0' column, which the
# later cells drop explicitly.
movies_csv_path = DATA_DIR + "movies.csv"
movies.to_csv(movies_csv_path)
movies_new = pd.read_csv(movies_csv_path)
movies = movies_new
movies.head(100)

In [None]:
# Hold out 30% of the movies for evaluation; the fixed seed keeps the split
# reproducible.
train, test = train_test_split(movies, random_state=42, test_size=0.30, shuffle=True)
# astype('U') coerces every plot value to a unicode string.
train_text = train['plot'].values.astype('U')
test_text = test['plot'].values.astype('U')

# Word-level TF-IDF over uni- to tri-grams, limited to 10000 features.
vectorizer = TfidfVectorizer(strip_accents='unicode', analyzer='word', ngram_range=(1,3), norm='l2', max_features = 10000)
# Fit on the training text only. Calling fit() again on the test text would
# replace the training vocabulary and IDF weights with ones learned from the
# test set, leaking held-out data into the features.
vectorizer.fit(train_text)
x_train = vectorizer.transform(train_text)
# Everything except the id/text/bookkeeping columns is a genre indicator.
y_train = train.drop(labels = ['movie_id', 'movie_name', 'plot', 'genre', 'Unnamed: 0'], axis=1)
x_test = vectorizer.transform(test_text)
y_test = test.drop(labels = ['movie_id', 'movie_name', 'plot', 'genre', 'Unnamed: 0'], axis=1)

In [None]:
# Binary Relevance: GaussianNB wrapped in skmultilearn's BinaryRelevance
# problem transformation, trained on the TF-IDF features and scored on the
# held-out set.
br_classifier = BinaryRelevance(GaussianNB())
br_classifier.fit(x_train, y_train)
br_predictions = br_classifier.predict(x_test)
print("Accuracy = ",accuracy_score(y_test,br_predictions.toarray()))
# The metric imported from sklearn.metrics is `f1_score`; the previous
# spelling `F1_score` is not defined anywhere and raised NameError.
print("F1 score = ",f1_score(y_test,br_predictions, average="micro"))
print("Hamming loss = ",hamming_loss(y_test,br_predictions))

In [None]:
# Label Powerset: LogisticRegression wrapped in skmultilearn's LabelPowerset
# problem transformation, trained and scored on the same split as the cells above.
lp_base_estimator = LogisticRegression()
lp_classifier = LabelPowerset(lp_base_estimator)
lp_classifier.fit(x_train, y_train)
lp_predictions = lp_classifier.predict(x_test)

# Each metric is computed and reported in turn.
lp_accuracy = accuracy_score(y_test, lp_predictions)
print("Accuracy = ", lp_accuracy)
lp_f1_micro = f1_score(y_test, lp_predictions, average="micro")
print("F1 score = ", lp_f1_micro)
lp_hamming = hamming_loss(y_test, lp_predictions)
print("Hamming loss = ", lp_hamming)

In [None]:
# MLkNN: skmultilearn's adapted k-nearest-neighbours multi-label classifier,
# configured here with k=10.
ml_classifier = MLkNN(k=10)
# to prevent errors when handling sparse matrices.
# NOTE: .toarray() produces dense NumPy arrays, and the names x_train, y_train
# and x_test are rebound to those dense copies from here on. y_test is left
# untouched.
# NOTE(review): densifying the full TF-IDF matrix may need a lot of memory —
# confirm it fits for the dataset size in use.
x_train = lil_matrix(x_train).toarray()
y_train = lil_matrix(y_train).toarray()
x_test = lil_matrix(x_test).toarray()
ml_classifier.fit(x_train, y_train)
# predict
ml_predictions = ml_classifier.predict(x_test)
# accuracy
print("Accuracy = ",accuracy_score(y_test,ml_predictions))

In [None]:
# Classifier Chain: labels that never occur in the training split are removed
# from both y_train and y_test before fitting.
y_train = train.drop(labels = ['movie_id', 'movie_name', 'plot', 'genre', 'Unnamed: 0'], axis=1)
# Keep only the label columns whose training-set sum is positive.
selected_labels = y_train.columns[y_train.sum(axis = 0, skipna = True) > 0].tolist()
y_test = test.drop(labels = ['movie_id', 'movie_name', 'plot', 'genre', 'Unnamed: 0'], axis=1)
y_train = y_train.filter(selected_labels, axis=1)
y_test = y_test.filter(selected_labels, axis=1)
# Rebuild the feature matrices from the fitted vectorizer so this cell does
# not depend on whatever x_train / x_test currently hold.
x_train = vectorizer.transform(train_text)
x_test = vectorizer.transform(test_text)
# 'warn' was only a placeholder default in scikit-learn 0.20/0.21 (it resolved
# to liblinear and emitted a FutureWarning); later releases reject it as an
# invalid solver. Naming liblinear explicitly keeps the same model and runs on
# current versions.
cc_classifier = ClassifierChain(LogisticRegression(solver='liblinear'))
cc_classifier.fit(x_train, y_train)
# Per-label probabilities; the next step thresholds them.
cc_predictions_proba = cc_classifier.predict_proba(x_test)
# Sweep the decision threshold from 0.05 to 0.59 in steps of 0.01, printing
# and recording accuracy, micro-F1 and Hamming loss so they can be plotted as
# a function of the threshold.
th = []
f = []
ham = []
ac = []
for t in range (5,60): # threshold in hundredths
    threshold = t / 100
    y_pred_new = (cc_predictions_proba >= threshold).astype(int)
    # Score each threshold once and reuse the values for printing and plotting.
    accuracy = accuracy_score(y_test,y_pred_new)
    f1_micro = f1_score(y_test,y_pred_new, average="micro")
    hamming = hamming_loss(y_test,y_pred_new)
    print("t =" ,threshold)
    print("Accuracy = ",accuracy)
    print("F1 = ",f1_micro)
    print("Hamming loss = ",hamming)
    # Store the actual threshold (not the integer loop counter) so the x-axis
    # labelled "threshold" shows the same values that are printed above.
    th.append(threshold)
    ac.append(accuracy)
    f.append(f1_micro)
    ham.append(hamming)
plt.rcParams["figure.figsize"] = (12,6)
with plt.style.context('ggplot'):
    plt.plot(th, f)
    plt.plot(th, ham)
    plt.plot(th, ac)
    plt.legend(['F1', 'Hamming loss', 'Accuracy'], loc='center left', fontsize = 14)
    plt.ylabel("metrics", fontsize = 14)
    plt.xlabel("threshold", fontsize = 14)
    plt.title("Classfier Chain Model", fontsize = 18)
plt.show()