### Load data

In [196]:
import pandas as pd
# Read only the first ROW_LIMIT rows of each raw CSV; the same cap is
# applied to the book metadata file and to the ratings file.
ROW_LIMIT = 10000
books_data, books_ratings = (
    pd.read_csv(csv_path, nrows=ROW_LIMIT)
    for csv_path in ('data/books_data.csv', 'data/books_rating.csv')
)

In [197]:
# Drop the columns that are not used further on and give the remaining
# ones lower-case snake_case names. The raw frames (books_data,
# books_ratings) are left untouched; the trimmed copies get a `_1` suffix.
books_data_1 = (
    books_data
    .drop(columns=['image', 'previewLink', 'infoLink', 'publisher', 'ratingsCount'])
    .rename(columns={'Title': 'title', 'publishedDate': 'published_date'})
)

books_ratings_1 = (
    books_ratings
    .drop(columns=['Id', 'Price', 'profileName', 'review/time'])
    .rename(columns={
        'Title': 'title',
        'User_id': 'user_id',
        'review/score': 'score',
        'review/helpfulness': 'helpfulness',
        'review/text': 'text',
        'review/summary': 'summary',
    })
)

# Show the resulting column names of both frames as a sanity check.
for trimmed_frame in (books_data_1, books_ratings_1):
    print(list(trimmed_frame.columns))


['title', 'description', 'authors', 'published_date', 'categories']
['title', 'user_id', 'helpfulness', 'score', 'summary', 'text']


In [198]:
# Keep only the book rows whose title occurs in the ratings frame.
ratings_titles = books_ratings_1['title'].unique()
title_is_rated = books_data_1['title'].isin(ratings_titles)
books_data_1 = books_data_1.loc[title_is_rated]

In [199]:
# Convert the textual `helpfulness` column ("<helpful>/<total>", e.g. "7/9")
# into two numeric columns:
#   helpfulness_pct   - helpful / total, or 0 when total is 0
#   helpfulness_count - total number of votes, or 0 when it cannot be parsed
#
# The parsing is vectorised and does not rely on a chained in-place
# `replace` (which is not guaranteed to write back to the frame) nor on the
# index being 0..n-1. Values that are missing or do not match the
# "<int>/<int>" pattern are treated like "0/0" instead of raising.
vote_parts = (
    books_ratings_1['helpfulness']
    .astype(str)
    .str.extract(r'^\s*(\d+)\s*/\s*(\d+)\s*$')  # column 0: helpful, column 1: total
    .astype(float)
)
helpful_votes = vote_parts[0]
total_votes = vote_parts[1]

books_ratings_1 = (
    books_ratings_1
    .assign(
        # `where` keeps the ratio only where total > 0; "0/0" and unparsable
        # rows (NaN) fall through to 0.0.
        helpfulness_pct=(helpful_votes / total_votes).where(total_votes > 0, 0.0),
        helpfulness_count=total_votes.fillna(0.0),
    )
    # The raw text column is no longer needed once it has been parsed.
    .drop(columns=['helpfulness'])
)

In [200]:
import numpy as np

# A book is kept only when it has at least this many rows in the ratings
# frame (equivalent to the previous `count > 2` test).
MIN_RATINGS = 3

# Rows without a title cannot be matched to any rating.
books_data_1 = books_data_1.dropna(subset=['title'])

# One pass over the ratings: number of rating rows and mean score per title.
# `size` counts rows (like value_counts on the title column); `mean` skips
# missing scores.
title_stats = (
    books_ratings_1
    .groupby('title')['score']
    .agg(ratings_count='size', average_rating='mean')
)
title_stats = title_stats[title_stats['ratings_count'] >= MIN_RATINGS]

# Attach the statistics to the books. The inner join drops every book whose
# title did not reach MIN_RATINGS while keeping the existing row order.
# Dropping any previous copies of the two columns first makes the cell safe
# to re-run. `ratings_count` is an integer column.
books_data_1 = (
    books_data_1
    .drop(columns=['ratings_count', 'average_rating'], errors='ignore')
    .join(title_stats, on='title', how='inner')
    .reset_index(drop=True)
)

# Keep only the ratings that belong to one of the remaining books, i.e.
# remove ratings for titles with fewer than MIN_RATINGS ratings.
books_ratings_1 = books_ratings_1[books_ratings_1['title'].isin(books_data_1['title'])]

In [201]:
# Reduce `published_date` to the publication year as an integer.
#
# The year is the run of leading digits that is followed by '-', '*' or the
# end of the string, so "2005-03-01", "1996*" and "2005" all work. Anything
# else (missing values, strings that do not start with such a number) is
# stored as 0 instead of raising. Going through `astype(str)` also keeps the
# cell re-runnable: a column that already holds integer years is left as is.
year_text = (
    books_data_1['published_date']
    .astype(str)
    .str.strip()
    .str.extract(r'^(\d+)(?:[-*]|$)', expand=False)
)

# Unparsable entries come back as NaN from `extract`; map them to year 0.
books_data_1['published_date'] = year_text.fillna('0').astype(int)

print(books_data_1['published_date'])

0      2005
1      2000
2      2005
3      1996
4      1988
       ... 
431    2001
432    2011
433    1966
434    2003
435    2013
Name: published_date, Length: 436, dtype: int32


In [202]:
# Write the cleaned frames to the `clean_data` folder.
import os

# Create the output folder if needed; `exist_ok=True` makes this a no-op
# when the folder is already there, without a separate existence check.
os.makedirs('clean_data', exist_ok=True)

# Save both cleaned frames as CSV. `to_csv` is called without extra
# options, exactly as before, so the layout of the files is unchanged.
for cleaned_frame, csv_path in (
    (books_data_1, 'clean_data/books_data_clean.csv'),
    (books_ratings_1, 'clean_data/books_rating_clean.csv'),
):
    cleaned_frame.to_csv(csv_path)