### This file defines the GenreDataFrame class, which holds some of the cleaning steps that rely on large genre-mapping dictionaries

# In [3]:
import os
import re

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import spacy
from scipy.sparse import identity
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import precision_recall_fscore_support
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from spacy.attrs import LOWER, POS, ENT_TYPE, IS_ALPHA, LEMMA
from spacy.tokens import Doc

import cleaning
nlp = spacy.load('en_core_web_sm')

# In [1]:
class GenreDataFrame:
    """Wrap a book-summary DataFrame and expose chainable cleaning steps.

    The wrapped frame is stored on ``self.df`` and is modified in place by
    ``remove_names`` and ``sep_genres`` (both return ``self`` so calls can be
    chained).  ``single_row`` returns a new, reshaped DataFrame instead.

    The two class-level dictionaries are lookup tables; none of the methods
    defined here read them, so they are presumably applied by callers.
    """

    # Maps a genre label to either 'Fiction' or 'Non-fiction'.
    ficnonfic_dict = {'Anthology':'Fiction', 'Comic book':'Fiction', 'Comics':'Fiction','Epistolary novel':'Fiction',
                      'Ergodic literature':'Fiction', 'Experimental literature':'Fiction','Fiction':'Fiction',
                      'First-person narrative':'Fiction', 'Gamebook':'Fiction', 'Graphic novel':'Fiction',
                      'Non-fiction novel':'Fiction', 'Novel':'Fiction', 'Novella':'Fiction','Parallel novel':'Fiction',
                      'Picture book':'Fiction', 'Play':'Fiction', 'Poetry':'Fiction', 'Prose':'Fiction','Prose poetry':'Fiction',
                      'Religious text':'Fiction', 'Role-playing game':'Fiction', 'Serial':'Fiction', 'Short story':'Fiction',
                      'Anthropology':'Non-fiction', 'Autobiographical novel':'Non-fiction', 'Autobiography':'Non-fiction',
                      'Biographical novel':'Non-fiction', 'Biography':'Non-fiction', 'Business':'Non-fiction',
                      'Computer Science':'Non-fiction', 'Creative nonfiction':'Non-fiction', 'Economics':'Non-fiction',
                      'Essay':'Non-fiction', 'Literary criticism':'Non-fiction', 'Mathematics':'Non-fiction',
                      'Memoir':'Non-fiction', 'Neuroscience':'Non-fiction', 'Non-fiction':'Non-fiction',
                      'Personal journal':'Non-fiction', 'Philosophy':'Non-fiction', 'Photography':'Non-fiction',
                      'Political philosophy':'Non-fiction', 'Popular culture':'Non-fiction', 'Popular science':'Non-fiction',
                      'Science':'Non-fiction', 'Sociology':'Non-fiction', 'Spirituality':'Non-fiction',
                      'Sports':'Non-fiction', 'Travel literature':'Non-fiction', 'Treatise':'Non-fiction'}

    # Maps a specific (sub-)genre label to the broader genre it is grouped under.
    com_genre_dict = {'Adventure':'Adventure','Adventure novel':'Adventure', 'Naval Adventure':'Adventure',
                      'Alternate history':'Alternate history',
                      "Children's literature":"Children's literature","Boys' school stories":"Children's literature",
                      'Juvenile fantasy':"Children's literature",
                      'Comedy':'Comedy', 'Satire':'Comedy','Farce':'Comedy','Comedy of manners':'Comedy','Parody':'Comedy',
                      'Tragic comedy':'Comedy','Black comedy':'Comedy','Comic novel':'Comedy',
                      'Crime fiction':'Crime fiction', 'Cozy':'Crime fiction',
                      'Detective fiction':'Detective fiction', 'Hardboiled':'Detective fiction',
                      'Police procedural':'Detective fiction',
                      'Spy fiction':'Spy fiction','Cabal':'Spy fiction',
                      'Fantasy':'Fantasy','Bangsian fantasy':'Fantasy','Comic fantasy':'Fantasy','Contemporary fantasy':'Fantasy',
                      'Dark fantasy':'Fantasy','Fantasy of manners':'Fantasy','Heroic fantasy':'Fantasy','High fantasy':'Fantasy',
                      'Historical fantasy':'Fantasy','Low fantasy':'Fantasy','Magic realism':'Fantasy','Sword and sorcery':'Fantasy',
                      'Urban fantasy':'Fantasy',
                      'Historical fiction':'Historical fiction','Historical novel':'Historical fiction',
                      'Historical whodunnit':'Historical fiction','Wuxia':'Historical fiction',
                      'Horror':'Horror','American Gothic Fiction':'Horror','Gothic fiction':'Horror','Vampire fiction':'Horror',
                      'Zombie':'Horror','Zombies in popular culture':'Horror',
                      'Mystery':'Mystery','Whodunit':'Mystery','Locked room mystery':'Mystery',
                      'Romance novel':'Romance','Chivalric romance':'Romance','Colonial United States romance':'Romance',
                      # 'Elizabthan romance' is a misspelled key kept for backward
                      # compatibility; the correctly spelled key is added next to it.
                      'Elizabthan romance':'Romance','Elizabethan romance':'Romance',
                      'Georgian romance':'Romance','Historical romance':'Romance',
                      'Medieval romance':'Romance','Regency romance':'Romance','Romantic comedy':'Romance',
                      'Paranormal romance':'Romance',
                      'Science Fiction':'Science fiction','Alien invasion':'Science fiction','Comic science fiction':'Science fiction',
                      'Cyberpunk':'Science fiction','Dying Earth subgenre':'Science fiction','Edisonade':'Science fiction',
                      # 'Feminist science ficiton' is a misspelled key kept for backward
                      # compatibility; the correctly spelled key is added next to it.
                      'Feminist science ficiton':'Science fiction','Feminist science fiction':'Science fiction',
                      'Future history':'Science fiction','Hard science fiction':'Science fiction',
                      'Human extinction':'Science fiction','Invasion literature':'Science fiction','Military science fiction':'Science fiction',
                      'Planetary romance':'Science fiction','Postcyberpunk':'Science fiction','Scientific romance':'Science fiction',
                      'Social science fiction':'Science fiction','Soft science fiction':'Science fiction','Space opera':'Science fiction',
                      'Sword and planet':'Science fiction','Time travel':'Science fiction',
                      'Apocalyptic and post-apocalyptic fiction':'Science fiction','Biopunk':'Science fiction','Steampunk':'Science fiction',
                      'Subterranean fiction':'Science fiction',
                      'Speculative fiction':'Speculative fiction','Dystopia':'Speculative fiction','Superhero fiction':'Speculative fiction',
                      'Supernatural':'Speculative fiction','Ghost story':'Speculative fiction','Utopian and dystopian fiction':'Speculative fiction',
                      'Utopian fiction':'Speculative fiction',
                      # The 'Spy fiction' / 'Cabal' entries used to be repeated here;
                      # they are already defined above with the same values.
                      'Suspense':'Suspense',
                      'Thriller':'Thriller','Techno-thriller':'Thriller','Conspiracy':'Thriller','Conspiracy fiction':'Thriller',
                      'Young adult literature':'Young adult literature', 'Bildungsroman':'Young adult literature',
                      'Coming of age':'Young adult literature','English public-school stories':'Young adult literature',
                      'School story':'Young adult literature','Light novel':'Young adult literature'}

    def __init__(self, df: "pd.DataFrame") -> None:
        """Store ``df`` (by reference, not copied) as ``self.df``."""
        self.df = df

    def remove_names(self) -> "GenreDataFrame":
        """Add an ``'nlpSumm'`` column with filtered plot summaries.

        Each value of ``self.df['Plot Summary']`` is parsed with the
        module-level spaCy pipeline ``nlp``; tokens whose coarse part-of-speech
        tag is ``PUNCT``, ``SPACE`` or ``PROPN`` are dropped and the text of the
        remaining tokens is stored in ``self.df['nlpSumm']``.

        Returns:
            ``self``, so calls can be chained.
        """
        # LEMMA must be imported from spacy.attrs alongside the other attribute
        # IDs; it was previously referenced here without being imported.
        attrs = [LOWER, POS, ENT_TYPE, IS_ALPHA, LEMMA]
        dropped_pos = ('PUNCT', 'SPACE', 'PROPN')

        def drop_matching_tokens(doc):
            """Return the text of ``doc`` without tokens tagged as ``dropped_pos``."""
            indexes = [i for i, token in enumerate(doc) if token.pos_ in dropped_pos]
            dropped = set(indexes)  # O(1) membership test while rebuilding the word list
            kept_attrs = np.delete(doc.to_array(attrs), indexes, axis=0)
            doc2 = Doc(doc.vocab, words=[t.text for i, t in enumerate(doc) if i not in dropped])
            # Copy the token attributes over to the rebuilt Doc.  Only the
            # text is returned, but the call is kept so the method behaves
            # exactly as before.
            doc2.from_array(attrs, kept_attrs)
            return doc2.text

        # Parse and filter one summary at a time so the parsed Docs of the
        # whole corpus are not all held in memory together.
        self.df['nlpSumm'] = [drop_matching_tokens(nlp(summary)) for summary in self.df['Plot Summary']]
        return self

    def sep_genres(self) -> "GenreDataFrame":
        """Add a ``'Sep Genres'`` column holding a list of genre names per row.

        The names are extracted from the string column ``self.df['Genres']``:
        every match starts at an uppercase letter and runs up to (not
        including) the next double quote or backslash.

        Returns:
            ``self``, so calls can be chained.
        """
        # Raw string: same pattern as before, without relying on the invalid
        # '\w' string escape that newer Python versions warn about.
        self.df['Sep Genres'] = self.df['Genres'].str.findall(r'[A-Z]\w.+?(?="|\\)')
        return self

    def single_row(self) -> "pd.DataFrame":
        """Return a long-format frame with one row per (book, genre) pair.

        Requires the columns ``'Wiki ID'``, ``'Freebase ID'``, ``'Pub Date'``,
        ``'Genres'``, ``'Sep Genres'``, ``'Title'``, ``'Author'``,
        ``'Plot Summary'`` and ``'finalSumm'`` on ``self.df``.  ``self.df``
        itself is not modified.

        Returns:
            A DataFrame with columns ``Title``, ``Author``, ``Plot Summary``,
            ``finalSumm`` and ``genre``.
        """
        # NOTE(review): 'finalSumm' is not created anywhere in this class
        # (remove_names writes 'nlpSumm'); presumably another step adds it.
        trimmed = self.df.drop(['Wiki ID', 'Freebase ID', 'Pub Date', 'Genres'], axis=1)

        # Spread each genre list over numbered columns (0, 1, 2, ...) and
        # attach them to the remaining book columns by index.
        genre_columns = trimmed['Sep Genres'].apply(pd.Series)
        wide = genre_columns.merge(trimmed, left_index=True, right_index=True)
        wide = wide.drop(['Sep Genres'], axis=1)

        # Every column that is not an id column is melted into 'genre'.
        long_form = wide.melt(id_vars=['Title', 'Author', 'Plot Summary', 'finalSumm'],
                              value_name='genre')
        long_form = long_form.drop('variable', axis=1)

        # NOTE(review): dropna() removes rows with a missing value in ANY
        # column, so books lacking e.g. an Author are dropped as well as the
        # padding rows produced for books with fewer genres.
        return long_form.dropna()