In [1]:
#Mount Drive if running in Colab
from sys import path
import os

if 'google.colab' in str(get_ipython()):
  from google.colab import drive

  root_PATH = '/content/drive/My Drive/nlp-seminar/repository'
  drive_mount_location = '/content/drive'

  drive.mount(drive_mount_location, force_remount=True)
  path.append(root_PATH)
else:
  root_PATH = os.path.abspath("../..")

%load_ext autoreload
%autoreload 2

module_path = os.path.abspath(os.path.join('../../src'))
if module_path not in sys.path:
    sys.path.append(module_path)

In [2]:
import ast

import matplotlib.pyplot as plt
import pandas as pd
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.multiclass import OneVsRestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from skmultilearn.problem_transform import BinaryRelevance
from wordcloud import WordCloud, STOPWORDS

from data_cleaner import DataCleaner

In [3]:
# Read the training and validation splits from <root_PATH>/data.
train_df = pd.read_csv(f"{root_PATH}/data/train.csv")
validation_df = pd.read_csv(f"{root_PATH}/data/validation.csv")

# Optional: uncomment to work on a truncated sample while iterating.
#train_df = train_df.head(100000)
#validation_df = validation_df.head(100000)

In [4]:
# Run the naive-Bayes text cleaner over the 'Conversation' column of both splits.
for frame in (train_df, validation_df):
    # Each cell is cast to str before it is handed to the cleaner.
    frame['Conversation'] = frame['Conversation'].map(
        lambda text: DataCleaner.clean_text_naive_bayes(str(text))
    )

In [5]:
#Parse the "Topic" column (list literals stored as text in the CSV) into Python lists
def _parse_topic_list(raw):
    """Convert one "Topic" cell into a Python list.

    Args:
        raw: The cell value. Either the textual form of a Python literal
            (as read from the CSV) or a list that was already parsed.

    Returns:
        The parsed Python object; lists are returned unchanged.

    Raises:
        ValueError / SyntaxError: if ``raw`` is not a valid Python literal.
    """
    # Already parsed (the cell was re-run): leave it alone so the cell is idempotent.
    if isinstance(raw, list):
        return raw
    # ast.literal_eval only accepts literals, so text coming from the CSV file
    # cannot execute arbitrary code the way eval() would.
    return ast.literal_eval(raw)

train_df["Topic"] = train_df["Topic"].apply(_parse_topic_list)
validation_df["Topic"] = validation_df["Topic"].apply(_parse_topic_list)

def list_to_series(series):
    """Flatten an iterable of iterables into a single pandas Series.

    Args:
        series: Iterable whose elements are themselves iterable (e.g. a
            Series of lists).

    Returns:
        pd.Series containing every inner element, in encounter order.
    """
    flattened = []
    for inner in series:
        flattened.extend(inner)
    return pd.Series(flattened)

In [6]:
#Add a boolean mask to the dataset
def boolean_df(item_lists, unique_items):
    """Build a membership table with one boolean column per item.

    Args:
        item_lists: pandas Series whose elements are containers (e.g. lists
            of tags).
        unique_items: Iterable of items; each becomes a column.

    Returns:
        pd.DataFrame indexed like ``item_lists`` where cell (row, item) is
        True when ``item`` is contained in that row's container.
    """
    def membership(label):
        # Row-wise containment test for a single label.
        return item_lists.apply(lambda entry: label in entry)

    return pd.DataFrame({label: membership(label) for label in unique_items})

# Distinct topics seen in the training split, in value_counts() order.
train_topics = (
    list_to_series(train_df["Topic"])
    .value_counts()
    .index
    .tolist()
)
train_boolean_mask = boolean_df(train_df["Topic"], train_topics).astype(int)

# Distinct topics seen in the validation split.
validation_topics = (
    list_to_series(validation_df["Topic"])
    .value_counts()
    .index
    .tolist()
)
# The validation mask is built from train_topics (not validation_topics) so
# that it has the same columns, in the same order, as the training mask.
validation_boolean_mask = boolean_df(validation_df["Topic"], train_topics).astype(int)

## Binary Relevance Classifiers

In [7]:
#Train and evaluate a Multinomial Naive Bayes (without stemming)
x_train = train_df["Conversation"]
x_test = validation_df["Conversation"]
# TfidfVectorizer accepts stop_words as 'english', a list, or None; a set is
# rejected by scikit-learn's parameter validation, so pass the NLTK words as a list.
stop_words = list(stopwords.words('english'))

# Binary relevance: the base estimator is fitted separately for each label column.
classifier = BinaryRelevance(
    classifier = MultinomialNB(fit_prior=True, class_prior=None),
    require_dense = [False, True]
)

NB_pipeline = Pipeline([
                ('tfidf', TfidfVectorizer(stop_words=stop_words)),
                ('clf', classifier),
            ])

NB_pipeline.fit(x_train, train_boolean_mask)

# Predictions are made on the validation split.
prediction = NB_pipeline.predict(x_test)

# With multilabel targets accuracy_score is subset accuracy: a sample counts as
# correct only when every label matches.
print('Test accuracy is {}'.format(accuracy_score(validation_boolean_mask, prediction)))

Test accuracy is 0.5239436440624086


In [8]:
#Train and evaluate a linear SVM (LinearSVC) with binary relevance (without stemming)
x_train = train_df["Conversation"]
x_test = validation_df["Conversation"]
# TfidfVectorizer accepts stop_words as 'english', a list, or None; a set is
# rejected by scikit-learn's parameter validation, so pass the NLTK words as a list.
stop_words = list(stopwords.words('english'))

# Binary relevance: the base estimator is fitted separately for each label column.
classifier = BinaryRelevance(
    classifier = LinearSVC(),
    require_dense = [False, True]
)

# NOTE: the variable keeps its historical name `NB_pipeline` even though the
# estimator in this cell is a LinearSVC, not naive Bayes.
NB_pipeline = Pipeline([
                ('tfidf', TfidfVectorizer(stop_words=stop_words)),
                ('clf', classifier),
            ])

NB_pipeline.fit(x_train, train_boolean_mask)

# Predictions are made on the validation split.
prediction = NB_pipeline.predict(x_test)

# With multilabel targets accuracy_score is subset accuracy: a sample counts as
# correct only when every label matches.
print('Test accuracy is {}'.format(accuracy_score(validation_boolean_mask, prediction)))

Test accuracy is 0.6850705085004387


In [9]:
#Train and evaluate a Logistic Regression with binary relevance (without stemming)
x_train = train_df["Conversation"]
x_test = validation_df["Conversation"]
# TfidfVectorizer accepts stop_words as 'english', a list, or None; a set is
# rejected by scikit-learn's parameter validation, so pass the NLTK words as a list.
stop_words = list(stopwords.words('english'))

# Binary relevance: the base estimator is fitted separately for each label column.
# The 'sag' solver shuffles the data, so random_state is pinned to make the
# reported score reproducible across runs.
classifier = BinaryRelevance(
    classifier = LogisticRegression(solver='sag', random_state=42),
    require_dense = [False, True]
)

# NOTE: the variable keeps its historical name `NB_pipeline` even though the
# estimator in this cell is a LogisticRegression, not naive Bayes.
NB_pipeline = Pipeline([
                ('tfidf', TfidfVectorizer(stop_words=stop_words)),
                ('clf', classifier),
            ])

NB_pipeline.fit(x_train, train_boolean_mask)

# Predictions are made on the validation split.
prediction = NB_pipeline.predict(x_test)

# With multilabel targets accuracy_score is subset accuracy: a sample counts as
# correct only when every label matches.
print('Test accuracy is {}'.format(accuracy_score(validation_boolean_mask, prediction)))

Test accuracy is 0.6799270119400328


In [10]:
#Train and evaluate a binary-relevance kNN classifier (BRkNNaClassifier, k=3)
from skmultilearn.adapt import BRkNNaClassifier

# This cell hands NumPy arrays (not pandas objects) to the pipeline and metric.
x_train = train_df["Conversation"].to_numpy()
x_test = validation_df["Conversation"].to_numpy()
# TfidfVectorizer accepts stop_words as 'english', a list, or None; a set is
# rejected by scikit-learn's parameter validation, so pass the NLTK words as a list.
stop_words = list(stopwords.words('english'))

classifier = BRkNNaClassifier(k=3)

# NOTE: the variable keeps its historical name `NB_pipeline` even though the
# estimator in this cell is a BRkNNaClassifier, not naive Bayes.
NB_pipeline = Pipeline([
                ('tfidf', TfidfVectorizer(stop_words=stop_words)),
                ('clf', classifier),
            ])

NB_pipeline.fit(x_train, train_boolean_mask.to_numpy())

# Predictions are made on the validation split.
prediction = NB_pipeline.predict(x_test)

# With multilabel targets accuracy_score is subset accuracy: a sample counts as
# correct only when every label matches.
print('Test accuracy is {}'.format(accuracy_score(validation_boolean_mask.to_numpy(), prediction)))

#If you use this method please cite the relevant paper: http://scikit.ml/api/skmultilearn.adapt.brknn.html

Test accuracy is 0.5669481072695599
