In [1]:
#Mount Drive if running in Colab
from sys import path
import os

if 'google.colab' in str(get_ipython()):
  from google.colab import drive

  root_PATH = '/content/drive/My Drive/nlp-seminar/repository'
  drive_mount_location = '/content/drive'

  drive.mount(drive_mount_location, force_remount=True)
  path.append(root_PATH)
else:
  root_PATH = os.path.abspath("../..")

%load_ext autoreload
%autoreload 2

module_path = os.path.abspath(os.path.join('../../src'))
if module_path not in sys.path:
    sys.path.append(module_path)

In [2]:
import pandas as pd
from data_cleaner import DataCleaner
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.multiclass import OneVsRestClassifier
from nltk.corpus import stopwords
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

train_df = pd.read_csv(root_PATH + '/data/train.csv')
validation_df = pd.read_csv(root_PATH + '/data/validation.csv')

In [3]:
#Load and clean the datasets
#train_df = train_df.head(1000)
#validation_df = validation_df.head(1000)

train_df['Conversation'] = train_df['Conversation'].map(lambda com : DataCleaner.clean_text_naive_bayes(str(com)))
validation_df['Conversation'] = validation_df['Conversation'].map(lambda com : DataCleaner.clean_text_naive_bayes(str(com)))

In [4]:
#Obtain a list with all the topics in the dataset
train_df["Topic"] = train_df["Topic"].apply(eval)
validation_df["Topic"] = validation_df["Topic"].apply(eval)

def list_to_series(series):
    return pd.Series([x for _list in series for x in _list])

topics_list_train = list_to_series(train_df["Topic"]).unique()
topics_list_validation = list_to_series(validation_df["Topic"]).unique()

In [9]:
#Convert datasets into a boolean mask
def boolean_df(item_lists, unique_items):# Create empty dict
    bool_dict = {}
    
    # Loop through all the tags
    for i, item in enumerate(unique_items):
        
        # Apply boolean mask
        bool_dict[item] = item_lists.apply(lambda x: item in x)
            
    # Return the results as a dataframe
    return pd.DataFrame(bool_dict)

train_unique_items_topics = list_to_series(train_df["Topic"]).value_counts().index.tolist()
train_data_bool_topics = boolean_df(train_df["Topic"],train_unique_items_topics)
train_boolean_mask = train_data_bool_topics.astype(int)

validation_unique_items_topics = list_to_series(validation_df["Topic"]).value_counts().index.tolist()
validation_data_bool_topics = boolean_df(validation_df["Topic"],validation_unique_items_topics)
validation_boolean_mask = validation_data_bool_topics.astype(int)

In [6]:
#Run Naive Bayes Classifier

x_train = train_df["Conversation"]
x_test = validation_df["Conversation"]
stop_words = set(stopwords.words('english'))

NB_pipeline = Pipeline([
                ('tfidf', TfidfVectorizer(stop_words=stop_words)),
                ('clf', OneVsRestClassifier(MultinomialNB(
                    fit_prior=True, class_prior=None))),
            ])
for topic in topics_list:
    print('... Processing {}'.format(topic))
    # train the model using X_dtm & y
    NB_pipeline.fit(x_train, train_boolean_mask[topic])
    # compute the testing accuracy
    prediction = NB_pipeline.predict(x_test)
    print('Test accuracy is {}'.format(accuracy_score(validation_boolean_mask[topic], prediction)))

... Processing Advertising
Test accuracy is 0.985
... Processing Dissatisfied users
Test accuracy is 0.937
... Processing Gaming
Test accuracy is 0.944
... Processing Performance
Test accuracy is 0.949
... Processing Video
Test accuracy is 0.972
... Processing Notifications & Alerts
Test accuracy is 0.971
... Processing Satisfied users
Test accuracy is 0.831
... Processing Complexity
Test accuracy is 0.953
... Processing Design & UX
Test accuracy is 0.904
... Processing Feature Requests
Test accuracy is 0.957
... Processing Bugs
Test accuracy is 0.887
... Processing Pricing
Test accuracy is 0.978
... Processing Use cases
Test accuracy is 0.943
... Processing Frequency
Test accuracy is 0.974
... Processing Security & Accounts
Test accuracy is 0.973
... Processing Internationalization
Test accuracy is 0.995
... Processing Audio
Test accuracy is 0.977
... Processing Sign Up & Login
Test accuracy is 0.993
... Processing Customer Support
Test accuracy is 0.982
... Processing Update
Test acc

KeyError: 'Battery'