In [1]:
import pandas as pd
from nltk.tokenize import word_tokenize
from collections import defaultdict
from nltk.corpus import wordnet as wn
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
from nltk.corpus import stopwords
import joblib
from sklearn.metrics import accuracy_score

In [2]:
corpus = pd.read_csv(r"./Sample Data 2.csv")

In [3]:
# Remove blank rows
corpus['Description'].dropna(inplace=True)
# Change all text to lower case
corpus['Description'] = [entry.lower() for entry in corpus['Description']]
# Tokenization
corpus['Description']= [word_tokenize(entry) for entry in corpus['Description']]
corpus.head()

Unnamed: 0,Genre,Description
0,Arts,"[bestselling, author, elizabeth, gilbert, retu..."
1,Video Games,"[the, leaders, in, gaming, news, hand-pick, th..."
2,Business News,"[the, tech, m, &, a, podcast, pulls, from, the..."
3,Management & Marketing,"[a, podcast, about, entrepreneurs, who, quit, ..."
4,Medicine,"[a, podcast, about, how, doctors, think, ., pr..."


In [4]:
# Remove stop words, non-numeric and perfom Word Stemming/Lemmenting.
tag_map = defaultdict(lambda : wn.NOUN)
tag_map['J'] = wn.ADJ
tag_map['V'] = wn.VERB
tag_map['R'] = wn.ADV
for index,entry in enumerate(corpus['Description']):
    Final_words = []
    word_Lemmatized = WordNetLemmatizer()
    for word, tag in pos_tag(entry):
        if word not in stopwords.words('english') and word.isalpha():
            word_Final = word_Lemmatized.lemmatize(word,tag_map[tag[0]])
            Final_words.append(word_Final)
    corpus.loc[index,'description_final'] = str(Final_words)

In [5]:
# Save Models
NBmodel = joblib.load("NBmodel")
SVMmodel = joblib.load("SVMmodel")

In [6]:
# Predict the categories of the test data
NB_predicted_categories = NBmodel.predict(corpus['description_final'])
SVM_predicted_categories = SVMmodel.predict(corpus['description_final'])

In [7]:
print("Naive Bayes Accuracy Score -> ",accuracy_score(NB_predicted_categories, corpus['Genre'])*100)
print("SVM Accuracy Score -> ",accuracy_score(SVM_predicted_categories, corpus['Genre'])*100)

Naive Bayes Accuracy Score ->  48.16204051012753
SVM Accuracy Score ->  52.16304076019005


In [8]:
# Join the predictions to the corpus
corpus['NB prediction'] = NB_predicted_categories
corpus['SVM prediction'] = SVM_predicted_categories
corpus['SVM probability'] = [[max(i)] for i in SVMmodel.predict_proba(SVM_predicted_categories)]

In [9]:
corpus.head(50)

Unnamed: 0,Genre,Description,description_final,NB prediction,SVM prediction,SVM probability
0,Arts,"[bestselling, author, elizabeth, gilbert, retu...","['bestselling', 'author', 'elizabeth', 'gilber...",Careers,Literature,[0.1591386298538592]
1,Video Games,"[the, leaders, in, gaming, news, hand-pick, th...","['leader', 'game', 'news', 'day', 'hot', 'news...",Video Games,News & Politics,[0.9749585562910643]
2,Business News,"[the, tech, m, &, a, podcast, pulls, from, the...","['tech', 'podcast', 'pull', 'best', 'tech', 'm...",Gadgets,Gadgets,[0.07006283535605352]
3,Management & Marketing,"[a, podcast, about, entrepreneurs, who, quit, ...","['podcast', 'entrepreneur', 'quit', 'job', 'st...",Careers,Careers,[0.07006283535605352]
4,Medicine,"[a, podcast, about, how, doctors, think, ., pr...","['podcast', 'doctor', 'think', 'present', 'fig...",Medicine,Personal Journals,[0.34869915425887493]
5,Christianity,"[through, this, podcast, ,, john, and, lisa, b...","['podcast', 'john', 'lisa', 'bevere', 'hope', ...",Christianity,Christianity,[0.1132830671817752]
6,Comedy,"[the, official, podcast, of, comedians, ethan,...","['official', 'podcast', 'comedian', 'ethan', '...",Comedy,Comedy,[0.974925115510076]
7,Performing,"[an, old, time, radio, podcast, of, the, 1940s...","['old', 'time', 'radio', 'podcast', 'superman'...",Performing,Performing,[0.07000310398563517]
8,Medicine,"[ucsf, 's, osher, center, for, integrative, me...","['ucsf', 'osher', 'center', 'integrative', 'me...",Medicine,Medicine,[0.9532550037987703]
9,Language Courses,"[russian, teacher, daria, will, help, you, to,...","['russian', 'teacher', 'daria', 'help', 'impro...",Language Courses,Language Courses,[0.9981583490381192]
