# Labels (genres) EDA

In [None]:
import pandas as pd
import ast
import matplotlib.pyplot as plt
import os

In [None]:
# Load the raw book data set; the path is relative to the notebook's working directory
df = pd.read_csv('../data/dataset.csv')
# Print the plain-text representation of the loaded frame for a first look
print(df)


In [None]:
# Work on an independent copy of the 'genres' column, so changes made to `genres` during the
# exploration do not write into df
genres = df['genres'].copy()
print(genres)

In [None]:
# Select every entry whose genres value equals the value of the entry labeled 3
# NOTE(review): presumably entry 3 is a book without any genre, which makes this a filter for
# unlabeled books — confirm against the data
genres[genres==genres[3]]

In [None]:
# Count the entries whose genres value equals that of entry 3 and report them as unlabeled
# NOTE(review): relies on entry 3 being an unlabeled book — confirm against the data
print('There are ', str(len(genres[genres==genres[3]])), 'unlabeled books.')

In [None]:
# Genres of each book are stored as the string representation of a list, so every value is
# parsed into a real list and its labels are lowercased. A new Series is built with map()
# instead of assigning into `genres` while iterating over it. Values that are already lists
# (this cell was run before on the same kernel) are only lowercased again, which keeps the
# cell safe to re-run.
genres = genres.map(
    lambda value: [
        genre.lower()
        for genre in (ast.literal_eval(value) if isinstance(value, str) else value)
    ]
)

# Collect every distinct (lowercased) genre that occurs in the data set
unique_genres_unfiltered = set()
for genres_lowercase in genres:
    unique_genres_unfiltered.update(genres_lowercase)

In [None]:
# Report the number of distinct lowercased genres collected from the raw labels
print('There are ', str(len(unique_genres_unfiltered)), 'book genres in the data set.')

In [None]:
# Counting the number of books of each genre
def GetGenresFrequencies(unique_genres, genres):
    """
    Count how often each of the given genres occurs among the book labels.

    Input:
        unique_genres: list/set of the possible genres (a genre listed more than once is
                       still counted only once)
        genres: labels(genres) of the books in the data set; any object whose .items() yields
                (index, list of genres) pairs, e.g. a pandas Series of lists

    Output: Dictionary of the book genres(keys) and their frequencies(values) in the data set,
            sorted so that the most frequent genres come first. Genres from unique_genres that
            never occur are reported with a frequency of 0.
    """
    # Start every requested genre at zero, so genres that never occur (or an empty data set)
    # still show up in the result
    genre_freq = dict.fromkeys(unique_genres, 0)

    # One pass over the labels instead of re-scanning every book once per genre
    for _, genres_list in genres.items():
        for genre in genres_list:
            if genre in genre_freq:
                genre_freq[genre] += 1

    # Sort the frequency dictionary, such that the most frequent genres appear first
    genre_freq_sorted = dict(sorted(genre_freq.items(), key=lambda item: item[1], reverse=True))

    return genre_freq_sorted

In [None]:
# Frequencies of all genres found in the raw labels (before any filtering or mapping),
# most frequent first
genre_freq_unfiltered = GetGenresFrequencies(unique_genres_unfiltered, genres)
print(genre_freq_unfiltered)

In [None]:
def PrintBooksOfCertainGenre(genre):
    """
    Print the name of every book that carries the given genre.

    Reads the notebook-level `genres` labels and the `df` frame. The index of an entry in
    `genres` is used as a row position in `df`, so both are expected to describe the same
    books in the same order.

    Input: genre: type - string, specifies the genre for the book query
    """
    for position, book_genres in genres.items():
        # Skip books that do not carry the requested genre
        if genre not in book_genres:
            continue
        row_label = df.index[position]
        print(df.loc[row_label, 'name'])

In [None]:
def PrintAllGenresOfBookWihCertainGenre(genre):
    """
    Print the name together with the full genres entry of every book that carries the given genre.

    Reads the notebook-level `genres` labels and the `df` frame. The index of an entry in
    `genres` is used as a row position in `df`, so both are expected to describe the same
    books in the same order.

    Input: genre - type:string, specifies the genre for the book query
    """
    for position, book_genres in genres.items():
        # Skip books that do not carry the requested genre
        if genre not in book_genres:
            continue
        row_label = df.index[position]
        print(df.loc[row_label, ['name', 'genres']])

# Cleaning the data set (based on labels)

In [None]:
# Drop the first column, as it contains only indices
df.drop(columns=df.columns[0], inplace=True)

In [None]:
# Parse each book's genres string into a list and set all labels in the dataframe to lowercase
for row_label, raw_genres in df['genres'].items():
    df.at[row_label, 'genres'] = [label.lower() for label in ast.literal_eval(raw_genres)]

In [None]:
# Keep only the books that have at least one genre label, then renumber the rows from 0
df = df[df['genres'].map(len) > 0].reset_index(drop=True)

In [None]:
# Display the frame as it stands after the cleaning cells above
df

## Label filtering and mapping 

The least frequent genres and the broadest genres (which would make the data set extremely imbalanced) are dropped, and the remaining genres are grouped under broader parent genres.

In [None]:
# Maps an original (lowercased) genre label to the broader genre it is grouped under.
# Labels that do not appear as a key are dropped by the mapping step.
# All keys must be lowercase, because the labels are lowercased before the lookup.
genre_mapping = {
    # --- science fiction ---
    'science fiction': 'science fiction',
    'dystopia': 'science fiction',
    'apocalyptic and post-apocalyptic fiction': 'science fiction',
    'steampunk': 'science fiction',
    'feminist science fiction': 'science fiction',
    'cyberpunk': 'science fiction',
    'military science fiction': 'science fiction',
    'time travel': 'science fiction',
    'hard science fiction': 'science fiction',

    # --- fantasy ---
    'fantasy': 'fantasy',
    'high fantasy': 'fantasy',
    #'speculative fiction':'fantasy',
    'urban fantasy': 'fantasy',
    'fantasy of manners': 'fantasy',
    'contemporary fantasy': 'fantasy',
    'fairy tale': 'fantasy',
    'science fantasy': 'fantasy',
    'lost world': 'fantasy',
    'historical fantasy': 'fantasy',
    'sword and sorcery': 'fantasy',
    'dark fantasy': 'fantasy',

    # --- mystery ---
    'mystery': 'mystery',

    # --- thriller ---
    'thriller': 'thriller',
    'suspense': 'thriller',
    'techno-thriller': 'thriller',
    # Original misspelled key, kept so data carrying this exact spelling is still mapped
    'techno-triller': 'thriller',

    # --- crime ---
    'true crime': 'crime',
    'crime fiction': 'crime',
    'detective fiction': 'crime',
    'spy fiction': 'crime',
    'historical whodunnit': 'crime',
    'whodunit': 'crime',
    'hardboiled': 'crime',

    # --- historical ---
    'historical novel': 'historical',
    'historical fiction': 'historical',
    'war novel': 'historical',
    'post-holocaust': 'historical',
    'wuxia': 'historical',

    # --- horror ---
    'horror': 'horror',
    'gothic fiction': 'horror',
    'vampire fiction': 'horror',

    # --- romance ---
    'romance novel': 'romance',
    'paranormal romance': 'romance',
    'historical romance': 'romance',
    'regency romance': 'romance',
    'scientific romance': 'romance',
    'chivalric romance': 'romance',

    # --- adventure ---
    'adventure': 'adventure',
    'adventure novel': 'adventure',

    # --- non-fiction ---
    'non-fiction': 'non-fiction',
    'biography': 'non-fiction',
    'autobiography': 'non-fiction',
    'philosophy': 'non-fiction',
    'personal journal': 'non-fiction',
    # These two were fused into the single key 'sports, Popular science', which could never
    # match a lowercased label
    'sports': 'non-fiction',
    'popular science': 'non-fiction',
    'travel literature': 'non-fiction',
    'mathematics': 'non-fiction',
    'economics': 'non-fiction',
    'politics': 'non-fiction',
    'business': 'non-fiction',
    'sociology': 'non-fiction',
    'psychology': 'non-fiction',
    'travel': 'non-fiction',
    'science': 'non-fiction',

    # --- humor ---
    'comedy': 'humor',
    'tragicomedy': 'humor',
    'satire': 'humor',
    'comic novel': 'humor',
    'black comedy': 'humor',
    'comics': 'humor',
    'comic science fiction': 'humor',
    'parody': 'humor',

    # --- realistic fiction ---
    'novel': 'realistic fiction',
    'literary realism': 'realistic fiction',
    'industrial novel': 'realistic fiction',
    'social novel': 'realistic fiction',
    'psychological novel': 'realistic fiction',
    'roman à clef': 'realistic fiction',

    # --- western ---
    'western': 'western',
    'western fiction': 'western',

    # --- coming of age ---
    'bildungsroman': 'coming of age',
    'künstlerroman': 'coming of age',

    # --- children's literature ---
    "children's literature": "children's literature",
}

In [None]:
# Map the genres as specified in the genre_mapping; a genre without an entry in the
# mapping becomes None
for row_label, original_genres in df['genres'].items():
    df.at[row_label, 'genres'] = [genre_mapping.get(original) for original in original_genres]

In [None]:
for idx, gnr in df['genres'].items():
    # Several labels of the original data set can be mapped to the same label, and labels
    # without a mapping are None. dict.fromkeys keeps the first occurrence of each label, so
    # duplicates are dropped while the order of the remaining labels is the same on every run.
    # (list(set(...)) orders string labels differently between interpreter sessions, which
    # made the saved data set differ from run to run.)
    df.at[idx, 'genres'] = list(dict.fromkeys(label for label in gnr if label is not None))

# Dropping books that are left without label after the processing above
df = df[df['genres'].map(len) > 0].reset_index(drop=True)

In [None]:
# Saving the modified data set next to the raw one. The target is the same file as before,
# but the path is given relative to the notebook's working directory (like the input path)
# instead of changing the process-wide working directory: os.chdir('..') broke the relative
# input path when the notebook was re-run on the same kernel and moved one directory further
# up on every execution of this cell.
df.to_csv('../data/dataset_filtered_labels.csv')

In [None]:
# Gather the distinct genres that remain after filtering and mapping
unique_genres = {label for book_labels in df['genres'] for label in book_labels}
print('There are ', str(len(unique_genres)), 'book genres in the data set.')

In [None]:
# Frequencies of the genres that remain after filtering and mapping, most frequent first
genre_freq = GetGenresFrequencies(unique_genres, df['genres'])
print(genre_freq)

## Visualisations

In [None]:
# Bar chart of the number of books per genre; bars follow the order of genre_freq
bar_positions = range(len(genre_freq))
plt.bar(bar_positions, list(genre_freq.values()), align='center')
# One tick per bar, labeled with the genre name and rotated to keep the labels readable
plt.xticks(bar_positions, list(genre_freq.keys()), rotation=70)
plt.title('Genres frequency among books')
plt.xlabel('Genres')
plt.ylabel('Number of books')
plt.show()

**Conclusion**: The dataset is rather imbalanced, which should be taken into account when implementing the models.