In [1]:
#Mount Drive if running in Colab
from sys import path
import os

if 'google.colab' in str(get_ipython()):
  from google.colab import drive

  root_PATH = '/content/drive/My Drive/nlp-seminar/repository'
  drive_mount_location = '/content/drive'

  drive.mount(drive_mount_location, force_remount=True)
  path.append(root_PATH)
else:
  root_PATH = os.path.abspath("../..")

%load_ext autoreload
%autoreload 2

module_path = os.path.abspath(os.path.join('../../src'))
if module_path not in sys.path:
    sys.path.append(module_path)

In [2]:
from data_cleaner import DataCleaner
from data_processor import DataProcessor
from model_evaluator import ModelEvaluator

import pandas as pd
from data_cleaner import DataCleaner
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.multiclass import OneVsRestClassifier
from nltk.corpus import stopwords
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt
from skmultilearn.problem_transform import BinaryRelevance
from sklearn.svm import SVC

In [3]:
#Configuration variables
#remove under-represented categories below this threshold
#This value is handed to DataProcessor.get_underrepresented_topics together with
#the training frame; the topics it returns are then removed from both datasets.
#NOTE(review): presumably a minimum number of samples per topic - confirm against DataProcessor.
underrepresented_threshold = 3000

In [4]:
#Read the train and validation splits from <root_PATH>/data
train_df, validation_df = (
    pd.read_csv(root_PATH + csv_suffix)
    for csv_suffix in ('/data/train.csv', '/data/validation.csv')
)

#The "Topic" column is stored as text in the CSV; evaluate each cell to turn it
#back into a Python object.
#NOTE(review): eval() executes whatever expression the CSV contains. If these files
#can come from an untrusted source, consider ast.literal_eval after confirming the
#column only holds plain literals.
for frame in (train_df, validation_df):
    frame["Topic"] = frame["Topic"].apply(eval)

In [5]:
#Run every conversation through the cleaner for classical methods.
#Each value is coerced with str() first, so non-string cells are cleaned as their text form.
for frame in (train_df, validation_df):
    frame['Conversation'] = frame['Conversation'].map(
        lambda text: DataCleaner.clean_text_for_classical_methods(str(text))
    )

In [6]:
#Build the boolean masks for both splits (train first, then validation)
train_boolean_mask, validation_boolean_mask = (
    DataProcessor.obtain_boolean_mask_from_dataset(frame)
    for frame in (train_df, validation_df)
)

#Under-represented topics are determined on the training split only and then
#removed from both splits, so train and validation keep the same topics.
underrepresented_topics = DataProcessor.get_underrepresented_topics(
    train_df,
    underrepresented_threshold,
)

train_df, remaining_topics = DataProcessor.remove_topics_from_dataset(
    train_df,
    train_boolean_mask,
    underrepresented_topics,
)
validation_df, _ = DataProcessor.remove_topics_from_dataset(
    validation_df,
    validation_boolean_mask,
    underrepresented_topics,
)

#Re-derive the masks from the filtered frames: every column from position 9 onward.
#NOTE(review): assumes the topic columns start at positional index 9 - confirm
#against the frame returned by remove_topics_from_dataset.
#NOTE(review): this cell overwrites train_df/validation_df, so re-running it
#without re-running the load cell operates on already-filtered data.
train_boolean_mask = train_df.iloc[:, 9:]
validation_boolean_mask = validation_df.iloc[:, 9:]

## Binary Relevance Classifiers

In [7]:
#Train and evaluate a Multinomial Naive Bayes (without stemming)
#Binary relevance: one MultinomialNB is fitted per label column of the mask.
x_train = train_df["Conversation"]
x_test = validation_df["Conversation"]
stop_words = set(stopwords.words('english'))

classifier = BinaryRelevance(
    classifier = MultinomialNB(fit_prior=True, class_prior=None),
    require_dense = [False, True]
)

NB_pipeline = Pipeline([
                #TfidfVectorizer documents stop_words as 'english', a list or None;
                #scikit-learn releases with parameter validation reject a set, so
                #the set is converted to a list here.
                ('tfidf', TfidfVectorizer(stop_words=list(stop_words))),
                ('clf', classifier),
            ])

NB_pipeline.fit(x_train, train_boolean_mask)

#predict() returns a sparse matrix; it is densified with toarray() for the per-label report
predictions = NB_pipeline.predict(x_test)
total_accuracy = ModelEvaluator.get_total_accuracy(validation_boolean_mask, predictions)
accuracy_per_label = ModelEvaluator.get_accuracy_per_label(remaining_topics, validation_boolean_mask.to_numpy(), predictions.toarray())

print(f"total accuracy: {total_accuracy}")
print(f"accuracy per label: {accuracy_per_label}")

total accuracy: 0.5236725734101104
accuracy per label: {'Satisfied users': 0.8403588585244807, 'Bugs': 0.9192504692520599, 'Design & UX': 0.9251932682212961, 'Dissatisfied users': 0.9310024496548214, 'Performance': 0.9515413737155219, 'Use cases': 0.9519613145420418, 'Gaming': 0.9571024082970127, 'Feature Requests': 0.9590812203734929, 'Complexity': 0.9695670155568988, 'Security & Accounts': 0.9718003372252092, 'Update': 0.9726274933986575, 'Pricing': 0.9716858079088856, 'Camera & Photos': 0.9750453345210448, 'Video': 0.9755988928832755, 'Customer Support': 0.9751153246587981, 'Notifications & Alerts': 0.9778067635924029, 'Frequency': 0.978926605796456, 'Advertising': 0.9841567779085675, 'Payment': 0.9828396907708459, 'Connectivity': 0.9819679954188274, 'Devices': 0.9840867877708142, 'Audio': 0.9856329335411829, 'Sign Up & Login': 0.9883052842554004, 'Location Services': 0.992644672796106, 'Privacy': 0.9947952788470715, 'Internationalization': 0.9956733369388859, 'no topic': 0.91280702

In [8]:
#Train and evaluate a linear SVC (LinearSVC, without stemming)
#Binary relevance: one LinearSVC is fitted per label column of the mask.
x_train = train_df["Conversation"]
x_test = validation_df["Conversation"]
stop_words = set(stopwords.words('english'))

classifier = BinaryRelevance(
    classifier = LinearSVC(),
    require_dense = [False, True]
)

#NOTE: the variable keeps the name NB_pipeline used by the other model cells,
#although the estimator here is a LinearSVC.
NB_pipeline = Pipeline([
                #TfidfVectorizer documents stop_words as 'english', a list or None;
                #scikit-learn releases with parameter validation reject a set, so
                #the set is converted to a list here.
                ('tfidf', TfidfVectorizer(stop_words=list(stop_words))),
                ('clf', classifier),
            ])

NB_pipeline.fit(x_train, train_boolean_mask)

predictions = NB_pipeline.predict(x_test)
total_accuracy = ModelEvaluator.get_total_accuracy(validation_boolean_mask, predictions)
#toarray() yields a plain ndarray (todense() yields np.matrix); this matches how
#the Naive Bayes cell feeds get_accuracy_per_label.
accuracy_per_label = ModelEvaluator.get_accuracy_per_label(remaining_topics, validation_boolean_mask.to_numpy(), predictions.toarray())

print(f"total accuracy: {total_accuracy}")
print(f"accuracy per label: {accuracy_per_label}")

total accuracy: 0.6553876499220564
accuracy per label: {'Satisfied users': 0.8671714440237966, 'Bugs': 0.9654757738682277, 'Design & UX': 0.9715903668119492, 'Dissatisfied users': 0.9556644291031718, 'Performance': 0.9875544809595012, 'Use cases': 0.9615817771132249, 'Gaming': 0.9826615340565648, 'Feature Requests': 0.9692043393885407, 'Complexity': 0.9896923615308751, 'Security & Accounts': 0.9509941780930868, 'Update': 0.9534247446950657, 'Pricing': 0.954277351827697, 'Camera & Photos': 0.9966786498266154, 'Video': 0.9980720898418859, 'Customer Support': 0.9919447714185728, 'Notifications & Alerts': 0.9982884229949416, 'Frequency': 0.9881716667196895, 'Advertising': 0.9976012470969999, 'Payment': 0.9961441796837718, 'Connectivity': 0.9959278465307161, 'Devices': 0.9959787484490822, 'Audio': 0.9981039035408646, 'Sign Up & Login': 0.9937072503419973, 'Location Services': 0.9992746476632838, 'Privacy': 0.9982184328571883, 'Internationalization': 0.9991537556071645}


In [9]:
#Train and evaluate a Logistic Regression (without stemming)
#Binary relevance: one LogisticRegression (solver='sag') is fitted per label column.
#NOTE(review): the output saved with this notebook shows this cell ending in a
#traceback raised from inside LogisticRegression.fit; the traceback is truncated,
#so the root cause is not visible here - re-run on a fresh kernel to confirm.
x_train = train_df["Conversation"]
x_test = validation_df["Conversation"]
stop_words = set(stopwords.words('english'))

classifier = BinaryRelevance(
    classifier = LogisticRegression(solver='sag'),
    require_dense = [False, True]
)

#NOTE: the variable keeps the name NB_pipeline used by the other model cells,
#although the estimator here is a LogisticRegression.
NB_pipeline = Pipeline([
                #TfidfVectorizer documents stop_words as 'english', a list or None;
                #scikit-learn releases with parameter validation reject a set, so
                #the set is converted to a list here.
                ('tfidf', TfidfVectorizer(stop_words=list(stop_words))),
                ('clf', classifier),
            ])

NB_pipeline.fit(x_train, train_boolean_mask)

predictions = NB_pipeline.predict(x_test)
total_accuracy = ModelEvaluator.get_total_accuracy(validation_boolean_mask, predictions)
#toarray() yields a plain ndarray (todense() yields np.matrix); this matches how
#the Naive Bayes cell feeds get_accuracy_per_label.
accuracy_per_label = ModelEvaluator.get_accuracy_per_label(remaining_topics, validation_boolean_mask.to_numpy(), predictions.toarray())

print(f"total accuracy: {total_accuracy}")
print(f"accuracy per label: {accuracy_per_label}")

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.

Traceback (most recent call last):
  File "C:\Users\prest\anaconda3\envs\NLP-seminar-env\lib\site-packages\IPython\core\interactiveshell.py", line 3418, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-9-7c1ff52b0968>", line 16, in <module>
    NB_pipeline.fit(x_train, train_boolean_mask)
  File "C:\Users\prest\anaconda3\envs\NLP-seminar-env\lib\site-packages\sklearn\pipeline.py", line 335, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "C:\Users\prest\anaconda3\envs\NLP-seminar-env\lib\site-packages\skmultilearn\problem_transform\br.py", line 161, in fit
    classifier.fit(self._ensure_input_format(
  File "C:\Users\prest\anaconda3\envs\NLP-seminar-env\lib\site-packages\sklearn\linear_model\_logistic.py", line 1407, in fit
    fold_coefs_ = Parallel(n_jobs=self.n_jobs, verbose=self.verbose,
  File "C:\Users\prest\

TypeError: object of type 'NoneType' has no len()

In [None]:
#Train and evaluate a KNN classifier (without stemming)
#Uses skmultilearn's BRkNNaClassifier with k=3 in place of a BinaryRelevance wrapper.
from skmultilearn.adapt import BRkNNaClassifier

#Inputs and targets are passed as NumPy arrays in this cell
x_train = train_df["Conversation"].to_numpy()
x_test = validation_df["Conversation"].to_numpy()
stop_words = set(stopwords.words('english'))

classifier = BRkNNaClassifier(k=3)

#NOTE: the variable keeps the name NB_pipeline used by the other model cells,
#although the estimator here is a BRkNNaClassifier.
NB_pipeline = Pipeline([
                #TfidfVectorizer documents stop_words as 'english', a list or None;
                #scikit-learn releases with parameter validation reject a set, so
                #the set is converted to a list here.
                ('tfidf', TfidfVectorizer(stop_words=list(stop_words))),
                ('clf', classifier),
            ])

NB_pipeline.fit(x_train, train_boolean_mask.to_numpy())

predictions = NB_pipeline.predict(x_test)
total_accuracy = ModelEvaluator.get_total_accuracy(validation_boolean_mask, predictions)
#toarray() yields a plain ndarray (todense() yields np.matrix); this matches how
#the Naive Bayes cell feeds get_accuracy_per_label.
accuracy_per_label = ModelEvaluator.get_accuracy_per_label(remaining_topics, validation_boolean_mask.to_numpy(), predictions.toarray())

print(f"total accuracy: {total_accuracy}")
print(f"accuracy per label: {accuracy_per_label}")

#If you use this method please cite the relevant paper: http://scikit.ml/api/skmultilearn.adapt.brknn.html