In [1]:
#Mount Drive if running in Colab
from sys import path
import os

if 'google.colab' in str(get_ipython()):
  from google.colab import drive

  root_PATH = '/content/drive/My Drive/nlp-seminar/repository'
  drive_mount_location = '/content/drive'

  drive.mount(drive_mount_location, force_remount=True)
  path.append(root_PATH)
else:
  root_PATH = os.path.abspath("../..")

%load_ext autoreload
%autoreload 2

module_path = os.path.abspath(os.path.join('../../src'))
if module_path not in sys.path:
    sys.path.append(module_path)

In [2]:
from data_cleaner import DataCleaner
from data_processor import DataProcessor
from model_evaluator import ModelEvaluator

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.multiclass import OneVsRestClassifier
from nltk.corpus import stopwords
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt
from skmultilearn.problem_transform import BinaryRelevance
from sklearn.svm import SVC
from joblib import dump, load

In [3]:
#Configuration variables
# Directory where the fitted pipelines are persisted with joblib.
models_location = root_PATH + '/models/classical_methods'

# The training cells write into this directory only after fitting, so create it
# up front: a missing folder would otherwise fail the save after the expensive fit.
os.makedirs(models_location, exist_ok=True)

In [4]:
# Read the train and validation splits from the repository's data folder.
data_dir = root_PATH + '/data'
train_df = pd.read_csv(data_dir + '/train.csv')
validation_df = pd.read_csv(data_dir + '/validation.csv')

# Columns from position 8 onwards are used as the per-topic target matrix
# (the "boolean mask"); the leading columns are left out of it.
first_label_column = 8
train_boolean_mask = train_df.iloc[:, first_label_column:]
validation_boolean_mask = validation_df.iloc[:, first_label_column:]

# Topic names, taken from the header of the training target matrix.
remaining_topics = train_boolean_mask.columns.to_list()

In [5]:
# Clean the conversation text of both splits for the classical models.
def _clean_conversation(raw_text):
    """Cast one cell to ``str`` and run it through DataCleaner's classical-methods cleaner."""
    return DataCleaner.clean_text_for_classical_methods(str(raw_text))

# Same transformation for both splits; the column is overwritten in place.
for split_df in (train_df, validation_df):
    split_df['conversation'] = split_df['conversation'].map(_clean_conversation)

## Binary Relevance Classifiers

In [10]:
#Train and evaluate a Multinomial Naive Bayes Classifier
x_train = train_df["conversation"]
x_test = validation_df["conversation"]
# TfidfVectorizer documents stop_words as 'english', a list or None, and recent
# scikit-learn releases reject a set during parameter validation. Pass a
# de-duplicated list instead.
stop_words = sorted(set(stopwords.words('english')))

# One independent binary Naive Bayes model per topic column.
classifier = BinaryRelevance(
    classifier = MultinomialNB(fit_prior=True, class_prior=None),
    require_dense = [False, True]
)

# TF-IDF features feed straight into the multi-label classifier.
NB_pipeline = Pipeline([
                ('tfidf', TfidfVectorizer(stop_words=stop_words)),
                ('clf', classifier),
            ])

NB_pipeline.fit(x_train, train_boolean_mask)

#Save model
dump(NB_pipeline, models_location + '/naive_bayes.joblib')

# Evaluate on the validation split; .toarray() turns the sparse prediction
# matrix into a plain ndarray for the per-label comparison.
predictions = NB_pipeline.predict(x_test)
total_accuracy = ModelEvaluator.get_total_accuracy(validation_boolean_mask, predictions)
accuracy_per_label = ModelEvaluator.get_accuracy_per_label(remaining_topics, validation_boolean_mask.to_numpy(), predictions.toarray())

print(f"total accuracy: {total_accuracy}")
print(f"accuracy per label: {accuracy_per_label}")

total accuracy: 0.5254664553215056
accuracy per label: {'Satisfied users': 0.8413256179617924, 'Bugs': 0.9189885988197773, 'Design & UX': 0.9262774605478423, 'Dissatisfied users': 0.9310772873975899, 'Performance': 0.9519126100490805, 'Use cases': 0.9518362202318431, 'Gaming': 0.9580492587004819, 'Feature Requests': 0.9593542514116201, 'Complexity': 0.9704116774567284, 'Pricing': 0.973142613422964, 'Security & Accounts': 0.9732635639669233, 'Update': 0.9723468861600748, 'Camera & Photos': 0.9738428534143065, 'Video': 0.9755807217564565, 'Customer Support': 0.9750523588538981, 'Notifications & Alerts': 0.9771912737365442, 'Frequency': 0.9789546053511067, 'Advertising': 0.9838371878361948, 'Payment': 0.9826595114871187, 'Connectivity': 0.9819083449509514, 'Devices': 0.9836844082017201, 'Audio': 0.9854923005430043, 'Sign Up & Login': 0.9880258961480435, 'Location Services': 0.9927684306348631, 'Privacy': 0.9948118582459625, 'Internationalization': 0.99584948659677, 'no topic': 0.911497730

In [7]:
#Train and evaluate a SVC
x_train = train_df["conversation"]
x_test = validation_df["conversation"]
# TfidfVectorizer documents stop_words as 'english', a list or None, and recent
# scikit-learn releases reject a set during parameter validation. Pass a
# de-duplicated list instead.
stop_words = sorted(set(stopwords.words('english')))

# One independent linear SVM per topic column.
classifier = BinaryRelevance(
    classifier = LinearSVC(),
    require_dense = [False, True]
)

# NOTE(review): the variable keeps the NB_pipeline name used by the other
# training cells, although it holds the LinearSVC pipeline here.
NB_pipeline = Pipeline([
                ('tfidf', TfidfVectorizer(stop_words=stop_words)),
                ('clf', classifier),
            ])

NB_pipeline.fit(x_train, train_boolean_mask)

#Save model
dump(NB_pipeline, models_location + '/SVC.joblib')

# Evaluate on the validation split. .toarray() (as in the Naive Bayes cell)
# yields a plain ndarray, whereas .todense() returns the legacy np.matrix type.
predictions = NB_pipeline.predict(x_test)
total_accuracy = ModelEvaluator.get_total_accuracy(validation_boolean_mask, predictions)
accuracy_per_label = ModelEvaluator.get_accuracy_per_label(remaining_topics, validation_boolean_mask.to_numpy(), predictions.toarray())

print(f"total accuracy: {total_accuracy}")
print(f"accuracy per label: {accuracy_per_label}")

total accuracy: 0.6850575151665617
accuracy per label: {'Satisfied users': 0.8664196729242659, 'Bugs': 0.9658601175130022, 'Design & UX': 0.9714811348980514, 'Dissatisfied users': 0.9550700558282248, 'Performance': 0.9877458001515065, 'Use cases': 0.9614868004761632, 'Gaming': 0.982672243123325, 'Feature Requests': 0.9691512454723119, 'Complexity': 0.9899165441246682, 'Pricing': 0.9899992997600087, 'Security & Accounts': 0.9942580320709916, 'Update': 0.9882232365092399, 'Camera & Photos': 0.9967152378587935, 'Video': 0.9982175709311282, 'Customer Support': 0.9924565055478105, 'Notifications & Alerts': 0.9979565723889006, 'Frequency': 0.9880195303299404, 'Advertising': 0.9974536727587546, 'Payment': 0.9962123382286474, 'Connectivity': 0.9956457804174703, 'Devices': 0.9961932407743381, 'Audio': 0.9979565723889006, 'Sign Up & Login': 0.9935386946253397, 'Location Services': 0.9992233701914202, 'Privacy': 0.9982112051130251, 'Internationalization': 0.9990896880112548, 'no topic': 0.8607203

In [8]:
#Train and evaluate a Logistic Regression
x_train = train_df["conversation"]
x_test = validation_df["conversation"]
# TfidfVectorizer documents stop_words as 'english', a list or None, and recent
# scikit-learn releases reject a set during parameter validation. Pass a
# de-duplicated list instead.
stop_words = sorted(set(stopwords.words('english')))

# One independent logistic regression per topic column. 'sag' is a stochastic
# solver, so the seed is fixed to make repeated runs produce the same model.
classifier = BinaryRelevance(
    classifier = LogisticRegression(solver='sag', random_state=42),
    require_dense = [False, True]
)

# NOTE(review): the variable keeps the NB_pipeline name used by the other
# training cells, although it holds the logistic-regression pipeline here.
NB_pipeline = Pipeline([
                ('tfidf', TfidfVectorizer(stop_words=stop_words)),
                ('clf', classifier),
            ])

NB_pipeline.fit(x_train, train_boolean_mask)

#Save model
dump(NB_pipeline, models_location + '/logistic_regression.joblib')

# Evaluate on the validation split. .toarray() (as in the Naive Bayes cell)
# yields a plain ndarray, whereas .todense() returns the legacy np.matrix type.
predictions = NB_pipeline.predict(x_test)
total_accuracy = ModelEvaluator.get_total_accuracy(validation_boolean_mask, predictions)
accuracy_per_label = ModelEvaluator.get_accuracy_per_label(remaining_topics, validation_boolean_mask.to_numpy(), predictions.toarray())

print(f"total accuracy: {total_accuracy}")
print(f"accuracy per label: {accuracy_per_label}")

KeyboardInterrupt: 

In [None]:
#Train and evaluate a KNN classifier
from skmultilearn.adapt import BRkNNaClassifier

# This classifier is fed plain numpy arrays rather than pandas objects.
x_train = train_df["conversation"].to_numpy()
x_test = validation_df["conversation"].to_numpy()
# TfidfVectorizer documents stop_words as 'english', a list or None, and recent
# scikit-learn releases reject a set during parameter validation. Pass a
# de-duplicated list instead.
stop_words = sorted(set(stopwords.words('english')))

# Binary-relevance kNN using the 3 nearest neighbours.
classifier = BRkNNaClassifier(k=3)

# NOTE(review): the variable keeps the NB_pipeline name used by the other
# training cells, although it holds the kNN pipeline here.
NB_pipeline = Pipeline([
                ('tfidf', TfidfVectorizer(stop_words=stop_words)),
                ('clf', classifier),
            ])

NB_pipeline.fit(x_train, train_boolean_mask.to_numpy())

#Save model
dump(NB_pipeline, models_location + '/KNN.joblib')

# Evaluate on the validation split. .toarray() (as in the Naive Bayes cell)
# yields a plain ndarray, whereas .todense() returns the legacy np.matrix type.
predictions = NB_pipeline.predict(x_test)
total_accuracy = ModelEvaluator.get_total_accuracy(validation_boolean_mask, predictions)
accuracy_per_label = ModelEvaluator.get_accuracy_per_label(remaining_topics, validation_boolean_mask.to_numpy(), predictions.toarray())

print(f"total accuracy: {total_accuracy}")
print(f"accuracy per label: {accuracy_per_label}")

#If you use this method please cite the relevant paper: http://scikit.ml/api/skmultilearn.adapt.brknn.html