# Dataset Preparation

This notebook covers the following steps:
1. Import of necessary packages and loading the dataset
2. Author and Title text cleaning
3. Google API
4. Gender Detection
5. Open Library API
6. Cleaning the ratings data
7. Thresholds
8. Cornac preparations

For this project I'm using the Book-Crossing dataset, obtained from http://www2.informatik.uni-freiburg.de/~cziegler/BX/.

# 1. Import of necessary packages and the dataset

In [1]:
import urllib.request
from tqdm import tqdm
from bs4 import BeautifulSoup
import spacy
import requests
import json
import pandas as pd
import gender_guesser.detector as gender
import re
import numpy as np
import pickle
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import plotly.io as pio
pio.templates.default = "plotly_white"

In [2]:
books = pd.read_csv('book-crossing/Books.csv', sep=';', names = ['isbn', 'title', 'author', 'year', 'publisher'],  header=0)
ratings = pd.read_csv('book-crossing/Ratings.csv', sep=';', names = ['user_id', 'isbn', 'rating'],  header=0)

# 2. Author and Title text cleaning

In [3]:
books = books.astype({"author": str, "title": str, 'isbn': str, 'publisher': str}, errors='raise')

In [None]:
def clean(text):
    """Normalise a raw title or author string.

    Steps: HTML tags are replaced by a space, newlines are deleted, the
    characters ``- , : ? !`` and the copyright sign become spaces, every run
    of spaces is collapsed to a single space, and the result is lower-cased.
    Leading/trailing spaces are not stripped.

    Args:
        text: the raw string (must be a ``str``).

    Returns:
        The normalised, lower-case string.
    """
    # Non-greedy so that each tag is removed on its own.
    text = re.sub(r'<.*?>', ' ', text)

    # Newlines are removed outright (not turned into spaces).
    text = text.replace('\n', '')

    # All separators are blanked out *before* collapsing spaces, otherwise
    # a late replacement (the copyright sign) re-introduces double spaces.
    text = text.translate(str.maketrans({ch: ' ' for ch in '-,:?!©'}))

    # Collapse runs of any length; chained replace('  ', ' ') calls leave
    # double spaces behind for runs of three or more.
    text = re.sub(r' {2,}', ' ', text)

    return text.lower()

def clean_author(text):
    """Normalise an author name that has already been passed through ``clean``.

    The degree patterns below are lower-case, so the input is expected to be
    lower-cased already.

    Args:
        text: the author string.

    Returns:
        The cleaned name, or ``None`` when the string is blank or contains
        ``'not applicable'``.
    """
    # Remove PhD / master's suffixes (longest patterns first).
    for degree in ('m. ph. d.', 'ph. d.', 'm ph. d ', 'ph. d ', 'ph d.', 'phd '):
        text = text.replace(degree, ' ')

    # "a.b smith" -> "a. b smith": put a space after a dot glued to a letter.
    text = re.sub(r'(?<=\S)\.(?=\w)', '. ', text)

    parts = text.split()
    if not parts:
        # Nothing left that could be a name.
        return None

    first, rest = parts[0], parts[1:]
    if len(first) == 1:
        # "j amando smith" -> "j. amando smith". The remaining parts are
        # joined with spaces so multi-part surnames are not glued together.
        text = ' '.join([first + '.'] + rest)
    elif len(first) > 2 and first[-1] == '.':
        # "michael. kilian" -> "michael kilian": a full first name must not
        # carry an abbreviation dot.
        text = ' '.join([first.replace('.', ' ')] + rest)

    # Collapse any run of spaces created by the replacements above.
    text = re.sub(r' {2,}', ' ', text)

    # Placeholder author used in the dataset -> treat as missing.
    if 'not applicable' in text:
        return None

    return text

def clean_title(text):
    """Apply the title-specific clean-up on top of ``clean``.

    Backslashes, slashes, double quotes, parentheses and dots are replaced by
    spaces, then every run of spaces is collapsed to a single space (adjacent
    punctuation such as ``"(`` otherwise leaves double spaces behind).

    Args:
        text: the title string.

    Returns:
        The cleaned title.
    """
    text = text.translate(str.maketrans({ch: ' ' for ch in '\\/"().'}))
    return re.sub(r' {2,}', ' ', text)

In [None]:
# Generic normalisation first, then the field-specific rules.
books.title = books.title.apply(clean).apply(clean_title)
books.author = books.author.apply(clean).apply(clean_author)

In [None]:
books = books.drop_duplicates(subset='isbn', ignore_index=True) # remove one isbn duplicate
books = books.reset_index(drop = True)
books.to_csv('books.csv', index=False)

# 3. Google API
The code below only illustrates how the Google Books API is queried. The code that was actually run to collect the data is in 'code/google_api.py'.

In [None]:
def google_api(isbn):
    """Look up the first listed author of a book via the Google Books API.

    Args:
        isbn: the ISBN as a string; it is appended to the query URL as-is.

    Returns:
        The first entry of ``authors`` from the first returned volume, or
        ``None`` when the response has no (or an empty) ``items`` list or the
        volume lists no authors.

    Raises:
        urllib.error.URLError: propagated from ``urlopen`` on network/HTTP
            failures.
    """
    base_api_link = "https://www.googleapis.com/books/v1/volumes?q=isbn:"
    with urllib.request.urlopen(base_api_link + isbn) as f:
        obj = json.loads(f.read().decode("utf-8"))

    # .get() plus truthiness also covers an *empty* list, which would
    # otherwise raise IndexError on [0].
    items = obj.get("items")
    if not items:
        return None

    authors = items[0].get("volumeInfo", {}).get("authors")
    return authors[0] if authors else None

In [None]:
google_api('0679425608') # example

In [None]:
#check nan values

print('Percentage of data')
print('author:', books.author_google.notna().sum()/len(books))

## 3.1. Correcting author names according to Google

If the first name of the author is abbreviated, check whether the first letter and the last name match the author returned by Google.
3482 author entries were changed, which is 1.28% of the books.

In [None]:
# Fill `author_corrected` from the Google author where that is safe:
#   * the dataset author is missing -> take Google's author (or None);
#   * the dataset author starts with an abbreviation ("j. smith") and
#     Google's author has the same first letter, the same last word and a
#     first name that is *not* abbreviated -> take Google's author.
# NOTE(review): the comparison is case-sensitive; `author` was lower-cased by
# clean(), so this presumably relies on author_google being lower-cased too —
# confirm against code/google_api.py.
abbreviation = re.compile(r'\w\.')  # raw string: '\w' is an invalid str escape

for index_label, row_series in books.iterrows():
    author = row_series.author
    author_google = row_series.author_google

    if pd.isna(author):
        books.at[index_label, 'author_corrected'] = None if pd.isna(author_google) else author_google
        continue

    if pd.isna(author_google):
        # Nothing to correct with; avoids comparing against the string 'nan'.
        continue

    author_words = str(author).split()
    google_words = str(author_google).split()
    if not author_words or not google_words:
        # Blank strings have no first/last word to compare.
        continue

    if (
        abbreviation.match(author_words[0])                # starts with an abbreviation
        and author_words[0][0] == google_words[0][0]       # same first letter
        and author_words[-1] == google_words[-1]           # same last name
        and not abbreviation.match(google_words[0])        # Google has the full first name
    ):
        books.at[index_label, 'author_corrected'] = author_google

In [None]:
len(books[books['author_corrected'].notna()])/len(books)

In [None]:
# Where Google supplied no correction, fall back to the original author name.
books['author_corrected'] = books['author_corrected'].fillna(books['author'])

In [None]:
# Share of authors whose first name is an abbreviation ("j. smith").
# The pattern is a raw string with an escaped dot: unescaped, '.' matches any
# character, so two-letter first names such as "jo smith" were counted too.
# NOTE(review): this inspects `author`, not `author_corrected` — confirm which
# column is meant by "left" after the Google correction.
books.author.str.contains(r'^\w\. ', regex=True, na=False).sum()/len(books)

# 4. Gender Detection

In [None]:
books = pd.read_csv('books.csv')

## 4.1. First name extraction

In [None]:
# Get the first word of the corrected author name, unless that word is an
# abbreviation such as "j." (no usable first name for gender detection).
def _first_name(author):
    """Return the first word of `author`, or None if missing/blank/abbreviated."""
    if pd.isna(author):
        # str(NaN) would otherwise yield the bogus first name 'nan'.
        return None
    words = str(author).split()
    if not words or re.match(r'\w\.', words[0]):
        return None
    return words[0]


books['name'] = books.author_corrected.apply(_first_name)

In [None]:
# check how many names exist
sum(books.name.isna())/len(books)

## 4.2. Apply Gender Guesser

Again, the code below only illustrates how Gender Guesser works. The code that was actually run to obtain the data is in 'code/gender_guesser_api.py'.

In [None]:
d = gender.Detector()
name = 'Rosa'
d.get_gender(name)

In [None]:
books.gender.value_counts()

In [None]:
books.gender.value_counts(normalize=True)

## 4.3. Unifying gender across book_codes

In [None]:
# One gender per book_code: the most frequent gender among the code's rows,
# except that 'unknown' gives way to the runner-up when there is one.
book_to_gender = {}

# Group once instead of re-filtering the whole frame for every code;
# sort=False keeps the codes in order of first appearance.
for code, code_genders in tqdm(books.groupby('book_code', sort=False)['gender']):
    ranked = code_genders.value_counts().index  # most frequent first, NaN excluded
    if len(ranked) == 0:
        # Every gender value of this code is missing.
        top_gender = 'unknown'
    elif ranked[0] == 'unknown' and len(ranked) > 1:
        top_gender = ranked[1]
    else:
        top_gender = ranked[0]
    # Deliberately not named `gender`: that name is the gender_guesser
    # module alias imported at the top of the notebook.
    book_to_gender[code] = top_gender

with open('book_to_gender.json', 'w') as json_file:
    json.dump(book_to_gender, json_file)

In [None]:
# Overwrite each row's gender with the unified gender of its book_code.
# (The original loop iterated over the undefined name `book`.)
books['gender'] = books.book_code.map(book_to_gender)

# 5. Open Library API

To get the book and author keys used for item linking. Again, the code below is for illustration only; the actual code is in 'code/ol_api.py'.

In [None]:
def ol_api(isbn):
    """Fetch the Open Library author key and work key for an ISBN.

    Args:
        isbn: the ISBN as a string; it is inserted into the URL as-is.

    Returns:
        ``(author_key, book_key)``: the part after ``authors/`` of the first
        author's key and the part after ``works/`` of the first work's key.
        Either element is ``None`` when the record has no (or an empty)
        ``authors`` / ``works`` list.

    Raises:
        urllib.error.URLError: propagated from ``urlopen`` on network/HTTP
            failures.
    """
    base_api_link = 'https://openlibrary.org/isbn/'
    with urllib.request.urlopen(base_api_link + isbn + '.json') as f:
        obj = json.loads(f.read().decode("utf-8"))
    print(obj)  # kept from the original: shows the raw record

    def first_key(field, prefix):
        # Truthiness also covers an empty list, which would raise on [0].
        entries = obj.get(field)
        if not entries:
            return None
        return entries[0]['key'].split(prefix, 1)[1]

    return first_key("authors", "authors/"), first_key("works", "works/")

In [None]:
print(ol_api('0446524484'))

In [None]:
print('Percentage of data')
print('author_code:', books.author_key_ol.notna().sum()/len(books))
print('book_code:', books.book_key_ol.notna().sum()/len(books))

In [None]:
len(books.book_key_ol.unique())

## 5.1. Book matching using ol book key

In [None]:
# Give every distinct Open Library work key one book_code (a running counter
# rendered as a string). Rows without a key cannot be linked to anything, so
# each of them gets a fresh code of its own.
unique_books = {}  # work key -> book_code (missing keys are never stored)
count = 0
book_codes = []

for item in books['book_key_ol']:
    if pd.isna(item):
        code = str(count)
        count += 1
    elif item in unique_books:
        code = unique_books[item]
    else:
        code = unique_books[item] = str(count)
        count += 1
    book_codes.append(code)

# Single column assignment instead of one .at write per row.
books['book_code'] = book_codes

# 6. Cleaning the ratings data

In [None]:
print(len(ratings))
print(len(ratings.isbn.unique()))
print(len(ratings.user_id.unique()))

In [None]:
ratings = ratings[ratings.isbn.isin(list(books.isbn))]

In [None]:
1 - len(ratings[ratings.isbn.isin(list(books.isbn))])/len(ratings)

In [None]:
print(len(ratings))
print(len(ratings.isbn.unique()))
print(len(ratings.user_id.unique()))

# 7. Updating the ratings data

## 7.1. Updating the book code

In [None]:
# ISBN -> book_code lookup built straight from the two columns (for a
# duplicated ISBN the last row wins, as with the row-by-row loop).
isbn_to_code = dict(zip(books['isbn'], books['book_code']))

# Vectorised lookup; ISBNs absent from the lookup get a missing value.
ratings['book_code'] = ratings['isbn'].map(isbn_to_code)

### 7.1.1. Removing book_code duplicates

In [None]:
sum(ratings.duplicated(['user_id', 'book_code']))/len(ratings)

In [None]:
ratings = ratings.drop_duplicates(['user_id', 'book_code'])

In [None]:
sum(ratings.duplicated(['user_id', 'book_code']))/len(ratings)

## 7.2. Updating author gender

In [None]:
# Load the book_code -> gender lookup; the context manager closes the file.
with open('code/book_to_gender.json') as f:
    book_to_gender = json.load(f)

In [None]:
# Attach the unified author gender to every rating via its book_code.
# A vectorised map replaces the row-by-row loop; the explicit check keeps the
# loop's behaviour of failing with KeyError on a code that is missing from
# the lookup instead of silently producing NaN.
code_known = ratings.book_code.isin(list(book_to_gender))
if not code_known.all():
    raise KeyError(ratings.book_code[~code_known].iloc[0])
ratings['gender'] = ratings.book_code.map(book_to_gender)

### 7.2.1. Redefining author gender

Changing mostly_female, mostly_male and andy (Gender Guesser's label for androgynous names) to unknown.

In [None]:
gender_dict = {'mostly_female': 'unknown', 'mostly_male': 'unknown', 'andy': 'unknown'}
ratings.gender = ratings.gender.replace(gender_dict)

In [None]:
ratings.gender.value_counts()

In [None]:
ratings.gender.value_counts(normalize=True)

## 7.3. Dropping unknown genders

In [None]:
ratings = ratings[ratings.gender!='unknown']

In [None]:
print(len(ratings))
print(len(ratings.isbn.unique()))
print(len(ratings.user_id.unique()))
print(len(ratings.book_code.unique()))
print(ratings.gender.value_counts())
print(ratings.gender.value_counts(normalize=True))

In [None]:
ratings.to_csv('ratings-cleaned.csv', index=False)

## 7.4 Dropping explicit ratings

In [None]:
ratings = ratings[ratings.rating == 0] #implicit

# 8.0 Thresholds etc

In [None]:
item_threshold = 10 # remove users with less than item_threshold items
user_threshold = 10 # remove items with less than user_threshold users
top_threshold = 200 # remove users who have rated more than top_threshold items

In [None]:
def user_distribution(df_events, user_col='user_id', prnt = False):
    """Count how many rows each user has.

    Args:
        df_events: frame with one row per (user, book) interaction.
        user_col: name of the user column.
        prnt: when true, print mean/min/max rows per user (rounded to 1 dp).

    Returns:
        ``(user_dist, num_users)``: the ``value_counts`` Series of
        ``user_col`` and its length.
    """
    rows_per_user = df_events[user_col].value_counts()
    if prnt:
        summary = (
            ('Mean books per user: ', rows_per_user.mean()),
            ('Min books per user: ', rows_per_user.min()),
            ('Max books per user: ', rows_per_user.max()),
        )
        for label, stat in summary:
            print(label + str(np.round(stat, 1)))
    return rows_per_user, len(rows_per_user)

def user_gender_distribution(df, user_col='user_id'):
    """Per user, count rows with female and with male authors.

    Args:
        df: frame with a ``gender`` column and the user column.
        user_col: name of the user column.

    Returns:
        ``(user_gender_dist, num_users)``. ``user_gender_dist`` is indexed by
        user (in order of first appearance) with the columns ``num_female``,
        ``num_male``, ``num_total`` (their sum), ``ratio_female``
        (``num_female / num_total``) and ``male_female_difference``
        (``(num_male - num_female) / num_total``). Rows whose gender is
        neither ``'female'`` nor ``'male'`` count towards neither column.
        ``num_users`` is the number of rows of that frame.
    """
    # One grouped sum over two boolean flags replaces a full-frame scan per
    # user. NOTE: groupby drops rows whose user id is missing.
    flags = pd.DataFrame({
        "num_female": (df["gender"] == "female").to_numpy(),
        "num_male": (df["gender"] == "male").to_numpy(),
    })
    user_gender_dist = flags.groupby(df[user_col].to_numpy(), sort=False).sum()

    user_gender_dist["num_total"] = user_gender_dist["num_female"] + user_gender_dist["num_male"]
    user_gender_dist["ratio_female"] = user_gender_dist["num_female"]/user_gender_dist["num_total"]
    user_gender_dist["male_female_difference"] = (user_gender_dist["num_male"] - user_gender_dist["num_female"])/user_gender_dist["num_total"]
    num_users = len(user_gender_dist)
    return user_gender_dist, num_users

def item_distribution(df_events, user_col='user_id', prnt = False):
    """Count how many rows each ``book_code`` has.

    Args:
        df_events: frame with a ``book_code`` column.
        user_col: accepted for signature parity with ``user_distribution``;
            not used.
        prnt: when true, print mean/min/max rows per book (rounded to 1 dp).

    Returns:
        ``(item_dist, num_items)``: the ``value_counts`` Series of
        ``book_code`` and its length.
    """
    rows_per_book = df_events['book_code'].value_counts()
    if prnt:
        summary = (
            ('Mean users per book: ', rows_per_book.mean()),
            ('Min users per book: ', rows_per_book.min()),
            ('Max users per book: ', rows_per_book.max()),
        )
        for label, stat in summary:
            print(label + str(np.round(stat, 1)))
    return rows_per_book, len(rows_per_book)

def isbn_distribution(df_events, user_col='user_id', prnt = False):
    """Count how many rows each ``isbn`` has.

    Same as ``item_distribution`` but keyed on the raw ISBN, so the two can
    be compared to see the effect of book linking.

    Args:
        df_events: frame with an ``isbn`` column.
        user_col: accepted for signature parity with the sibling functions;
            not used.
        prnt: when true, print mean/min/max rows per ISBN (rounded to 1 dp).

    Returns:
        ``(item_dist, num_items)``: the ``value_counts`` Series of ``isbn``
        and its length (previously computed but never returned).
    """
    item_dist = df_events['isbn'].value_counts()
    num_items = len(item_dist)
    if prnt:
        print('Mean users per book: ' + str(np.round(item_dist.mean(),1)))
        print('Min users per book: ' + str(np.round(item_dist.min(),1)))
        print('Max users per book: ' + str(np.round(item_dist.max(),1)))
    return item_dist, num_items

In [None]:
print('\nBefore changes.')
user_dist, num_users = user_distribution(ratings, prnt = True) # create dataset for all users 
item_dist, num_items = item_distribution(ratings, prnt = True) # create dataset for all items

In [None]:
print('\nAfter removing top_threshold.')
# Keep only the users with fewer than top_threshold ratings ...
user_dist = user_dist.loc[user_dist < top_threshold]
# ... and drop every rating made by the other users.
ratings = ratings.loc[ratings.user_id.isin(user_dist.index)].reset_index(drop=True)
# Recompute both distributions on the reduced ratings set.
user_dist, num_users = user_distribution(ratings, prnt=True)
item_dist, num_items = item_distribution(ratings, prnt=True)

In [None]:
# Prune iteratively until every remaining user has at least item_threshold
# ratings and every remaining item has at least user_threshold users:
# removing one side can push the other back under its threshold.
# .min() is used instead of .iloc[-1] so the check does not depend on the
# ordering of value_counts; on an empty distribution .min() is NaN, every
# comparison with NaN is False, and the loop ends instead of raising.
while item_dist.min() < user_threshold or user_dist.min() < item_threshold:
    kept_items = item_dist.index[item_dist >= user_threshold]
    kept_users = user_dist.index[user_dist >= item_threshold]
    keep_row = ratings.user_id.isin(kept_users) & ratings['book_code'].isin(kept_items)
    ratings = ratings[keep_row].reset_index(drop=True)
    user_dist, num_users = user_distribution(ratings) # update dataset for all users
    item_dist, num_items = item_distribution(ratings) # update dataset for all items

print('\nAfter removing item and user thresholds.')
user_dist, num_users = user_distribution(ratings, prnt=True) # final update dataset for all users
item_dist, num_items = item_distribution(ratings, prnt=True) # final update dataset for all items

In [None]:
isbn_distribution(ratings, prnt = True) # to check how much book linking has helped

In [None]:
print(len(ratings))
print(len(ratings.isbn.unique()))
print(len(ratings.user_id.unique()))
print(len(ratings.book_code.unique()))
print(ratings.gender.value_counts())
print(ratings.gender.value_counts(normalize=True))

In [None]:
ratings.to_csv('ratings.csv', index=False)

# 10. Prepare for the recommendation system

In [None]:
protected = ratings.book_code[ratings.gender == 'female']

In [None]:
with open("protected", "wb") as fp:   #Pickling
    pickle.dump(protected, fp)
    
#with open("protected", "rb") as fp:   # Unpickling
#    protected = pickle.load(fp)

In [None]:
# Export the interactions as implicit feedback: every kept (user, book) pair
# gets rating 1. assign() builds a new frame, so nothing is written into a
# slice of the previous `ratings` frame (no SettingWithCopyWarning).
ratings = ratings[['user_id', 'book_code', 'rating']].assign(rating=1)
ratings.to_csv('data.csv', index=False)

# 11. Gender Validation

In [None]:
x = books.sample(100)

In [None]:
count_correct = 0
count_incorrect = 0
count_not_found = 0

In [None]:
count_not_found +=1

In [None]:
count_incorrect +=1

In [None]:
count_correct +=1

# 12. Check how many books are in different languages

In [None]:
# Language mix over all books. Printed explicitly: only the last expression
# of a cell is displayed, so this result was previously discarded.
print(books.language.value_counts(normalize=True).head())

# Share of book_codes whose rows span more than one language. One grouped
# nunique() replaces a full-frame scan per code; missing languages are not
# counted, as with value_counts().
bku = list(books.book_code.unique())
languages_per_code = books.groupby('book_code').language.nunique()
multi = int((languages_per_code > 1).sum())
multi/len(bku)