In [1]:
import numpy as np
import pandas as pd
import re

#breakpoint()
## IMPORT DATA

# books
books = pd.read_csv('Datasets/Cleaned_Books.csv')
books = pd.DataFrame(books).rename(columns = {'BookTitle': 'Title'}).drop(columns = ['Unnamed: 0', 'Index', 'ID','Summary Length','Genre Count','Author'])

# movies
movies = pd.read_csv('Datasets/mpst_full_data.csv')
movies = pd.DataFrame(movies).rename(columns={'title': 'Title', 'plot_synopsis': 'Summary', 'tags': 'Genres'}).drop(columns = ['imdb_id', 'split', 'synopsis_source'])

# combine datasets
books_movies = pd.concat([books, movies], ignore_index=True)



## CLEAN DATA

def preprocess_text(text):
    text = text.lower()  # Lowercase text
    text = text.strip("""""")  # Remove punctuation
    text = text.replace("'", '')
    text = text.replace('"', '')
    text = text.strip('[]')
    text = text.replace(', ', ',')
    text = " ".join(text.split())  # Remove extra spaces, tabs, and new lines
    return text.split(",") #list(text.split(", "))

books_movies["Genres"] = books_movies["Genres"].map(preprocess_text)



## INPUT FIRST RECOGNISED GENRE FROM GENRE LISTS

split_genres = pd.DataFrame(books_movies["Genres"].tolist()).fillna('z')

# basic list based on the top 30 genres by count
valid_genres = ['murder','violence', 'crime fiction', 'gothic',
                'speculative fiction','fantasy', 'science fiction', 'sci-fi','alternate reality',
                'romantic', 'romance novel', 'romantic, comedy',
                'suspenseful', 'suspense','mystery','revenge', 'tragedy', 'detective fiction',
                'novel', 'historical novel',
                'childrens literature', 'young adult literature',
                'comedy', 'satire', 'humor', 'entertaining','prank',
                'horror', 'tragedy', 'dark','sadist', 'cult', 'psychedelic', 'insanity', 'cruelty','paranormal',
                'action', 'neo noir', 'thriller', 'dramatic', 'adventure novel']


# go through all genres by column and reassign with the genre label that appears in the valid_genres list
tmp_list = []
for j in range(0,24):
    tmp_list.append(split_genres[j].apply(lambda x: ', '.join([i for i in valid_genres if i in x])))

# concat all the valid genres into one column and replace helper blanks with NANs, return first in new list that is not NaN
valid_genre_df = pd.concat(tmp_list, axis=1).replace(r'^\s*$', np.nan, regex=True).fillna(method='bfill', axis=1).iloc[:, 0]

# concat updated list of genres with original book list
books_movies_genre = pd.concat([books_movies, valid_genre_df],axis=1).rename(columns={0:"Genre_new"})



## REPLACE GENRES WITH GENRE CATEGORY

# dictionary with genres
valid_genres_dict = {'Crime': ['murder','violence', 'crime fiction', 'gothic'],
                'Fantasy': ['speculative fiction','fantasy', 'science fiction', 'sci-fi','alternate reality'],
                'Romance': ['romantic', 'romance novel', 'romance novel, novel', 'romantic, comedy'],
                'Mystery': ['suspenseful', 'suspense','mystery', 'suspenseful, suspense','revenge', 'tragedy', 'detective fiction'],
                'Novel': ['novel', 'historical novel', 'novel, historical novel'],
                'Childrens Literature': ['childrens literature','young adult literature'],
                'Comedy': ['comedy', 'satire', 'humor', 'entertaining','prank'],
                'Horror': ['horror', 'tragedy', 'dark','sadist', 'cult', 'psychedelic', 'insanity', 'cruelty','paranormal'],
                'Action': ['action', 'neo noir', 'thriller', 'dramatic', 'adventure novel']}


# function to lookup an item in a dictionary values list, and return the dictionary key

def return_genre_cat(x):
    results = [k if x == v or x in v else None for k, v in valid_genres_dict.items()]
    tmp_result = [i for i in results if i is not None]
    tmp_result = tmp_result[0] if len(tmp_result) > 0 else np.nan
    return tmp_result


# replace the specific genres with genre category
books_movies_genre['Genre_Grp'] = books_movies_genre['Genre_new'].apply(return_genre_cat)


## Clean Summary Text

# function to clean the summary text
def clean_text(text):
    """
    - remove any html tags (< /br> often found)
    - Keep only ASCII + Latin chars, digits and whitespaces
    - pad punctuation chars with whitespace
    - convert all whitespaces (tabs etc.) to single wspace
    """
    RE_PUNCTUATION = re.compile("([!?.,;-])")
    RE_TAGS = re.compile(r"<[^>]+>")
    RE_ASCII = re.compile(r"[^A-Za-zÀ-ž,.!?0-9 ]", re.IGNORECASE)
    RE_WSPACE = re.compile(r"\s+", re.IGNORECASE)
    text = re.sub(RE_TAGS, " ", text)
    text = re.sub(RE_ASCII, " ", text)
    text = re.sub(RE_PUNCTUATION, r" \1 ", text)
    text = re.sub(RE_WSPACE, " ", text)
    return text


# Clean Comments. Only keep long enough
books_movies_genre['Summary_clean'] = books_movies_genre.loc[books_movies_genre['Summary'].str.len() > 10, "Summary"]
books_movies_genre['Summary_clean'] = books_movies_genre["Summary_clean"].apply(clean_text)
books_movies_genre_short = books_movies_genre[books_movies_genre["Summary_clean"].str.len() <=3000]


# drop blank genres
books_movies_genre_short = books_movies_genre_short.dropna(how = 'all', subset='Genre_new')


## PUSH TO CSV

books_movies_genre_short[['Title', 'Summary_clean', 'Genre_Grp']].to_csv("Datasets/book_movies_final.csv", index=False)


In [24]:
books_movies_genre

Unnamed: 0,Title,Summary,Genres,Genre_new,Genre_Grp,Summary_clean
0,Animal Farm,"Old Major, the old boar on the Manor Farm, ca...","[roman à clef, satire, childrens literature, s...",satire,Comedy,"Old Major , the old boar on the Manor Farm , ..."
1,A Clockwork Orange,"Alex, a teenager living in near-future Englan...","[science fiction, novella, speculative fiction...",science fiction,Fantasy,"Alex , a teenager living in near future Engla..."
2,The Plague,The text of The Plague is divided into five p...,"[existentialism, fiction, absurdist fiction, n...",novel,Novel,The text of The Plague is divided into five p...
3,A Fire Upon the Deep,The novel posits that space around the Milky ...,"[hard science fiction, science fiction, specul...",science fiction,Fantasy,The novel posits that space around the Milky ...
4,All Quiet on the Western Front,"The book tells the story of Paul Bäumer, a Ge...","[war novel, roman à clef]",novel,Novel,"The book tells the story of Paul Bäumer , a G..."
...,...,...,...,...,...,...
27664,Lucky Numbers,"In 1988 Russ Richards (John Travolta), the wea...","[comedy, murder]",comedy,Comedy,"In 1988 Russ Richards John Travolta , the weat..."
27665,Iron Man 2,"In Russia, the media covers Tony Stark's discl...","[good versus evil, violence]",violence,Crime,"In Russia , the media covers Tony Stark s disc..."
27666,Play Dirty,During the North African Campaign in World War...,[anti war],,,During the North African Campaign in World War...
27667,High Wall,Steven Kenet catches his unfaithful wife in th...,[murder],murder,Crime,Steven Kenet catches his unfaithful wife in th...


In [2]:
books_movies_genre_short

Unnamed: 0,Title,Summary,Genres,Genre_new,Genre_Grp,Summary_clean
6,Blade Runner 3: Replicant Night,"Living on Mars, Deckard is acting as a consul...","[science fiction, speculative fiction]",science fiction,Fantasy,"Living on Mars , Deckard is acting as a consu..."
7,Blade Runner 2: The Edge of Human,Beginning several months after the events in ...,"[science fiction, speculative fiction]",science fiction,Fantasy,Beginning several months after the events in ...
9,Crash,The story is told through the eyes of narrato...,"[speculative fiction, fiction, novel]",speculative fiction,Fantasy,The story is told through the eyes of narrato...
24,Farmer Giles of Ham,Farmer Giles (Ægidius Ahenobarbus Julius Agri...,"[fantasy, fiction]",fantasy,Fantasy,Farmer Giles Ægidius Ahenobarbus Julius Agric...
25,Gaudy Night,Harriet Vane returns reluctantly to Oxford to...,"[mystery, detective fiction, novel, fiction, s...",mystery,Mystery,Harriet Vane returns reluctantly to Oxford to...
...,...,...,...,...,...,...
27657,Thunderheart,"During the early 1970s, FBI agent Ray Levoi is...","[suspenseful, murder, flashback]","suspenseful, suspense",Mystery,"During the early 1970s , FBI agent Ray Levoi i..."
27659,The Prisoner of Zenda,"On his deathbed, the king of Ruritania announc...",[action],action,Action,"On his deathbed , the king of Ruritania announ..."
27662,One Night of Love,Opera singer Mary Barrett (Grace Moore) leaves...,[romantic],romantic,Romance,Opera singer Mary Barrett Grace Moore leaves t...
27664,Lucky Numbers,"In 1988 Russ Richards (John Travolta), the wea...","[comedy, murder]",comedy,Comedy,"In 1988 Russ Richards John Travolta , the weat..."


In [53]:
import numpy
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.naive_bayes import MultinomialNB

df = pd.read_csv("Datasets/book_movies_final.csv", sep=",", encoding="latin-1")


In [14]:
df.drop(columns = ['Title'])

In [76]:
X = books_movies_genre['Summary_clean']
y = books_movies_genre['Genres']

In [77]:
model = CountVectorizer(ngram_range=(1,2))

In [78]:
model.fit(y_train)

In [79]:
model.get_feature_names()

['genres']

In [59]:

pipeline = Pipeline([
    ('count_vectorizer',   CountVectorizer(ngram_range=(1,2))),
    ('transformer',    MultiLabelBinarizer()),
    ('classifier',         MultinomialNB())
])

# k_fold = KFold(n=len(df), n_folds=6, shuffle=True)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42,shuffle=True)

In [61]:

# for train_indices, test_indices in k_fold:
#     train_text = data.iloc[train_indices]['text'].values
#     train_y = data.iloc[train_indices]['class'].values.astype(str)

#     test_text = data.iloc[test_indices]['text'].values
#     test_y = data.iloc[test_indices]['class'].values.astype(str)

pipeline.fit(y_train)
# predictions = pipeline.predict(X_test)
# confusion = confusion_matrix(y_test, predictions)

# accuracy = accuracy_score(y_test, predictions)

# print(accuracy)


TypeError: MultiLabelBinarizer.fit_transform() takes 2 positional arguments but 3 were given