In [1]:
#Mount Drive if running in Colab
from sys import path
import os

if 'google.colab' in str(get_ipython()):
  from google.colab import drive

  root_PATH = '/content/drive/My Drive/nlp-seminar/repository'
  drive_mount_location = '/content/drive'

  drive.mount(drive_mount_location, force_remount=True)
  path.append(root_PATH)
else:
  root_PATH = os.path.abspath("../..")

%load_ext autoreload
%autoreload 2

module_path = os.path.abspath(os.path.join('../../src'))
if module_path not in sys.path:
    sys.path.append(module_path)

In [2]:
from data_cleaner import DataCleaner
from data_processor import DataProcessor
from model_evaluator import ModelEvaluator

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.multiclass import OneVsRestClassifier
from nltk.corpus import stopwords
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt
from skmultilearn.problem_transform import BinaryRelevance
from sklearn.svm import SVC
from joblib import dump, load

In [3]:
# Configuration: directory the saved classical-method models are loaded from
models_location = f"{root_PATH}/models/classical_methods"

In [4]:
# Read the test split from the repository's data folder
test_df = pd.read_csv(f"{root_PATH}/data/test.csv")

# Columns from position 8 onward are used as the ground-truth matrix
# NOTE(review): presumably one 0/1 column per topic — confirm against the CSV schema
test_boolean_mask = test_df.iloc[:, 8:]

# The headers of those columns are the topic names reported per label
remaining_topics = test_boolean_mask.columns.tolist()

# Coerce every conversation to str, then run it through the classical-methods cleaner
test_df['conversation'] = test_df['conversation'].map(
    lambda raw_text: DataCleaner.clean_text_for_classical_methods(str(raw_text))
)

In [14]:
# Evaluate the saved Multinomial Naive Bayes model on the test conversations
x_test = test_df["conversation"]
stop_words = set(stopwords.words('english'))

NB_pipeline = load(f"{models_location}/naive_bayes.joblib")

predictions = NB_pipeline.predict(x_test)

# Overall score on the full label matrix, then one score per topic column
total_accuracy = ModelEvaluator.get_total_accuracy(test_boolean_mask, predictions)
accuracy_per_label = ModelEvaluator.get_accuracy_per_label(
    remaining_topics,
    test_boolean_mask.to_numpy(),
    predictions.toarray(),
)

print(
    f"total accuracy: {total_accuracy}",
    f"accuracy per label: {accuracy_per_label}",
    sep="\n",
)

total accuracy: 0.5243524371534608
accuracy per label: {'Satisfied users': 0.8418094201376289, 'Bugs': 0.9188294533671995, 'Design & UX': 0.9250934183806632, 'Dissatisfied users': 0.9304725346777941, 'Performance': 0.9508877133344792, 'Use cases': 0.9520080973206272, 'Gaming': 0.9569416063505402, 'Feature Requests': 0.9578773816116978, 'Complexity': 0.9697432665559015, 'Pricing': 0.9727733959729835, 'Security & Accounts': 0.9732699297850264, 'Update': 0.9720540585273316, 'Camera & Photos': 0.9742120708642871, 'Video': 0.9755807217564565, 'Customer Support': 0.9750523588538981, 'Notifications & Alerts': 0.9768475195589761, 'Frequency': 0.9789991660778284, 'Advertising': 0.9838881143810196, 'Payment': 0.9825894874879845, 'Connectivity': 0.9813672504121868, 'Devices': 0.9842573318310003, 'Audio': 0.9854986663611074, 'Sign Up & Login': 0.9879495063308061, 'Location Services': 0.992539261183151, 'Privacy': 0.9950155644252621, 'Internationalization': 0.9956712436898828, 'no topic': 0.9133378

In [None]:
#Test SVC
x_test = test_df["conversation"]

# Named after the model it actually holds (this was previously bound to NB_pipeline).
# joblib.load unpickles the file, so only load artifacts produced by this project.
svc_pipeline = load(models_location + '/SVC.joblib')

predictions = svc_pipeline.predict(x_test)

# NOTE(review): depending on how the model was wrapped at training time, predict()
# may return a sparse matrix or a dense array — densify only when .toarray() exists.
predictions_dense = predictions.toarray() if hasattr(predictions, 'toarray') else predictions

total_accuracy = ModelEvaluator.get_total_accuracy(test_boolean_mask, predictions)
accuracy_per_label = ModelEvaluator.get_accuracy_per_label(remaining_topics, test_boolean_mask.to_numpy(), predictions_dense)

print(f"total accuracy: {total_accuracy}")
print(f"accuracy per label: {accuracy_per_label}")

In [None]:
#Test Multinomial logistic regression
x_test = test_df["conversation"]

# Named after the model it actually holds (this was previously bound to NB_pipeline).
# joblib.load unpickles the file, so only load artifacts produced by this project.
logreg_pipeline = load(models_location + '/logistic_regression.joblib')

predictions = logreg_pipeline.predict(x_test)

# NOTE(review): depending on how the model was wrapped at training time, predict()
# may return a sparse matrix or a dense array — densify only when .toarray() exists.
predictions_dense = predictions.toarray() if hasattr(predictions, 'toarray') else predictions

total_accuracy = ModelEvaluator.get_total_accuracy(test_boolean_mask, predictions)
accuracy_per_label = ModelEvaluator.get_accuracy_per_label(remaining_topics, test_boolean_mask.to_numpy(), predictions_dense)

print(f"total accuracy: {total_accuracy}")
print(f"accuracy per label: {accuracy_per_label}")

In [None]:
#Test KNN (model loaded from KNN.joblib)
x_test = test_df["conversation"]

# Named after the model it actually holds (this was previously bound to NB_pipeline).
# joblib.load unpickles the file, so only load artifacts produced by this project.
knn_pipeline = load(models_location + '/KNN.joblib')

predictions = knn_pipeline.predict(x_test)

# NOTE(review): depending on how the model was wrapped at training time, predict()
# may return a sparse matrix or a dense array — densify only when .toarray() exists.
predictions_dense = predictions.toarray() if hasattr(predictions, 'toarray') else predictions

total_accuracy = ModelEvaluator.get_total_accuracy(test_boolean_mask, predictions)
accuracy_per_label = ModelEvaluator.get_accuracy_per_label(remaining_topics, test_boolean_mask.to_numpy(), predictions_dense)

print(f"total accuracy: {total_accuracy}")
print(f"accuracy per label: {accuracy_per_label}")