# Data Exploration and more for the goodreads data

# 1. Start

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt # plotting

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the CSV. A few rows are malformed, so they are skipped with a warning
# instead of aborting the load. `on_bad_lines` replaces `error_bad_lines`,
# which was deprecated in pandas 1.3 and removed in pandas 2.0.
df = pd.read_csv('/kaggle/input/goodreadsbooks/books.csv', on_bad_lines='warn', index_col='bookID')

In [None]:
# Preview the first rows of the raw frame (rich display via the cell's last expression)
df.head()

In [None]:
# Column names, dtypes and non-null counts of the raw frame
df.info()

In [None]:
# Summary statistics of the raw frame
df.describe()

In [None]:
# Show the exact column labels as loaded from the CSV
df.columns

In [None]:
# Keep only the cells equal to 102 (everything else becomes NaN) and sum what is left per column.
# NOTE(review): the meaning of the constant 102 is not evident from this notebook — confirm the intent.
df.where(df == 102).sum()

# 2. What types of languages are there?

In [None]:
# Distinct values of "language_code", in order of first appearance
list_of_lang = pd.unique(df['language_code'])
print(list_of_lang)

In [None]:
# Number of books per language code, most frequent first
langs = df['language_code'].value_counts()
print(langs)

In [None]:
import seaborn as sns

# Bar chart of the number of books per language code.
# Uses an explicit Axes so labels and title are attached to this figure.
fig, ax = plt.subplots(figsize=(14, 6))
sns.barplot(x=langs.index, y=langs.values, ax=ax)
ax.set(xlabel='language_code', ylabel='number of books', title='Books per language code')
plt.show()

# 3. Convert en-US, en-GB and en-CA to eng

It seems unnecessary to have so many 'different' English language codes, so we change all of them to just `eng`.

In [None]:
# Collapse the regional English codes into a single 'eng' code.
# The replacement is limited to the language_code column so that a cell in
# another column that happens to equal one of these codes is left untouched.
df_better = df.copy()
df_better['language_code'] = df_better['language_code'].replace(['en-US', 'en-GB', 'en-CA'], 'eng')
df_better['language_code'].value_counts()

In [None]:
# Per-column sum of the cells equal to 102 in the cleaned frame (other cells are masked to NaN).
# NOTE(review): presumably a sanity check that the replacement changed nothing else — confirm.
df_better.where(df_better == 102).sum()

# 4. change '  num_pages' to 'num_pages'

For some reason, the column name '  num_pages' starts with two spaces. We want to get rid of them.

In [None]:
# Strip the two leading spaces from the page-count column label
df_better = df_better.rename(columns={'  num_pages': 'num_pages'})
print(df_better.columns)

In [None]:
# Per-column sum of the cells equal to 102 (other cells are masked to NaN).
# NOTE(review): this inspects the original `df`, while the rename above was applied to `df_better` — confirm which frame was meant.
df.where(df == 102).sum()

# 5. Explore number of pages in Histogram

In [None]:
# Histogram (60 bins) of the page counts with a KDE overlay.
# The Axes created here is passed to seaborn explicitly so the plot and its
# title land on this figure rather than on whatever Axes is current.
fig, ax = plt.subplots(figsize=(16, 4))
sns.histplot(df_better['num_pages'], bins=60, kde=True, ax=ax)
ax.set_title('Distribution of num_pages')
plt.show()

The data seems skewed; let's visualize it better.

In [None]:
# Probability plot of num_pages to visualize the skew.
from scipy import stats
stats.probplot(df_better['num_pages'], plot=plt)
# End with plt.show() so the cell renders only the figure and does not echo
# the tuple of arrays that probplot returns.
plt.show()

The data indeed is skewed.

# 6. are there any duplicates??

In [None]:
# Number of rows that are complete duplicates of an earlier row (all columns equal)
df_better.duplicated().sum()

In [None]:
# Number of rows whose title already appeared in an earlier row
df_better.duplicated(subset = 'title').sum()

In [None]:
# The ten most frequently repeated titles
df_better['title'].value_counts().head(10)

In [None]:
# Number of rows whose authors value already appeared in an earlier row
df_better.duplicated(subset = 'authors').sum()

In [None]:
# The ten authors values with the most books
df_better['authors'].value_counts().head(10)

In [None]:
# Boolean mask of rows whose isbn13 already appeared in an earlier row, and how many there are
duple_isbn = df_better.duplicated(subset = 'isbn13')
duple_isbn.sum()

# 7. Correlations??

In [None]:
# Inspect dtypes of the cleaned frame before selecting columns for the correlation
df_better.info()

In [None]:
# Only use sensible columns for the correlation: keep the numeric columns and
# drop isbn13, which is an identifier rather than a measurement.
# Selecting by include='number' (instead of excluding 'object') also leaves out
# string columns when pandas stores them with a non-object string dtype.
df_better_corr = df_better.select_dtypes(include='number').drop(columns=['isbn13'])

In [None]:
# Confirm which columns remain for the correlation analysis
df_better_corr.info()

In [None]:
# Pairwise Pearson correlations (the default method), drawn as an annotated heatmap
corr = df_better_corr.corr()
sns.heatmap(corr, annot=True)

In [None]:
# Grid of pairwise scatterplots over every column kept for the correlation
sns.pairplot(df_better_corr, height = 2)

In [None]:
# Joint plot of average_rating against num_pages with a regression fit, using all books
sns.jointplot(x="average_rating", y="num_pages", data = df_better, kind='reg')

In [None]:
# Same joint plot, restricted to books with fewer than 1000 pages
books_under_1000_pages = df_better.loc[df_better['num_pages'] < 1000]
sns.jointplot(data=books_under_1000_pages, x="average_rating", y="num_pages",
              kind='reg', color='darkcyan')

# 8. add a better rating system (WR)

This rating system takes into account the number of ratings. So a book with one single 5.0 rating won't be the best:

Weighted rating (WR) = (v ÷ (v+m)) × R + (m ÷ (v+m)) × C , where:

* R = average rating of the book (`average_rating`)
* v = number of ratings for the book (`ratings_count`)
* m = minimum number of ratings required to be listed (the source formula's Top 250 uses 3000; here we use m = 100)
* C = the mean rating across the whole dataset (6.9 in the source formula; here it is computed from the data)

from: https://stats.stackexchange.com/questions/6418/rating-system-taking-account-of-number-of-votes


In [None]:
# m: minimum number of ratings used by the weighted rating
m = 100

# C: mean of average_rating over all books
# (assumed to be computed before any rows are dropped according to m)
C = df_better.loc[:, 'average_rating'].mean()
print('mean vote across report', C)

In [None]:
# define weighted rating function
def WR(R, v, m, C):
    """Return the weighted rating (v / (v + m)) * R + (m / (v + m)) * C.

    Parameters
    ----------
    R : average rating of the item.
    v : number of ratings the item received.
    m : minimum-ratings constant that controls the pull towards C.
    C : mean rating across all items.

    The function uses only arithmetic operators, so any operands that
    support them can be passed. Nothing guards against v + m being zero.
    """
    total = v + m
    rating_share = v / total
    prior_share = m / total
    return rating_share * R + prior_share * C

In [None]:
# Count the books that do not have more than m ratings
# (a missing ratings_count also counts as "not more than m")
m_out = ~(df_better['ratings_count'] > m)
m_out_sum = m_out.sum()
print('books below m ratins:', m_out_sum)

In [None]:
# Keep only the books with more than m ratings, then drop rows that contain any NaN.
# A boolean filter is used instead of `df[col].where(..., inplace=True)`: that form
# is a chained in-place assignment, which does not update the parent frame under
# pandas Copy-on-Write, so no rows would be filtered.
has_enough_ratings = df_better['ratings_count'] > m
df_better_WR = df_better.loc[has_enough_ratings].dropna(axis=0, how='any').copy()
df_better_WR.shape

In [None]:
# add column with WR

# Compute WR from the filtered frame itself, so the new column is built from
# exactly the rows it is stored on and does not rely on index alignment with
# the unfiltered df_better.
WR_list = WR(df_better_WR['average_rating'], df_better_WR['ratings_count'], m, C)
# add the result as a column
df_better_WR['WR'] = WR_list

In [None]:
# Confirm the new WR column by previewing the first rows
df_better_WR.head()

In [None]:
# List the top three books according to WR; keep='all' also returns rows tied with the third value
df_better_WR.nlargest(3, 'WR', keep='all')

# 9. Rating predictions

We cannot use WR here because it depends on the number of ratings

## 9.1 First approach

In [None]:
# True if any cell of the filtered frame is missing
df_better_WR.isna().to_numpy().any()

In [None]:
from sklearn.model_selection import train_test_split

# Target (y) and baseline feature matrix (X)
feature_cols = ['num_pages', 'ratings_count', 'text_reviews_count']
y = df_better_WR['average_rating'].copy()
X = df_better_WR[feature_cols].copy()

# Hold out 20% of the rows as a validation set
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, train_size=0.8, test_size=0.2, random_state=0
)

In [None]:
from sklearn.metrics import mean_absolute_error
from xgboost import XGBRegressor


# Define the model.
# early_stopping_rounds is set on the estimator: passing it to fit() was
# deprecated in XGBoost 1.6 and removed in 2.0.
my_model = XGBRegressor(random_state=0, n_estimators=500, learning_rate=0.1,
                        early_stopping_rounds=5)

# Fit the model; the validation set drives early stopping
my_model.fit(X_train, y_train,
             eval_set=[(X_valid, y_valid)],
             verbose=False)

# Get predictions
predictions = my_model.predict(X_valid)

# Calculate MAE (arguments in scikit-learn's y_true, y_pred order).
# NOTE: the same validation set is used for early stopping and for scoring,
# so this MAE is an optimistic estimate.
mae = mean_absolute_error(y_valid, predictions)

print("Mean Absolute Error:" , mae)

## 9.2 Feature Engineering

### 9.2.1 Number of books per author

First idea is to create a new feature from the author data, like number of books per author

In [None]:
# Map each authors value to the number of rows (books) it has in the filtered frame
author_dict = df_better_WR['authors'].value_counts().to_dict()

In [None]:
# Look up each row's author count and store it as a new column
df_better_WR['no_books_author'] = df_better_WR['authors'].map(author_dict)
# Second feature matrix: the baseline features plus the author count
X2 = X.assign(no_books_author=df_better_WR['no_books_author'])
y2 = y.copy()

In [None]:
# Preview the feature matrix that includes no_books_author
X2.head()

In [None]:
# new training with the no_books_author feature

# Break off validation set from training data
X_train2, X_valid2, y_train2, y_valid2 = train_test_split(X2, y2, train_size=0.8, test_size=0.2, random_state=0)

# Define the model.
# early_stopping_rounds is set on the estimator: passing it to fit() was
# deprecated in XGBoost 1.6 and removed in 2.0.
my_model2 = XGBRegressor(random_state=0, n_estimators=500, learning_rate=0.1,
                         early_stopping_rounds=5)

# Fit the model; the validation set drives early stopping
my_model2.fit(X_train2, y_train2,
              eval_set=[(X_valid2, y_valid2)],
              verbose=False)

# Get predictions
predictions2 = my_model2.predict(X_valid2)

# Calculate MAE (arguments in scikit-learn's y_true, y_pred order).
# NOTE: the same validation set is used for early stopping and for scoring,
# so this MAE is an optimistic estimate.
mae2 = mean_absolute_error(y_valid2, predictions2)

print("Mean Absolute Error:" , mae2)

Improved only slightly

### 9.2.2 Include year

In [None]:
# Look at the raw publication_date values before deriving a year feature from them
df_better_WR['publication_date']

In [None]:
# Take the last four characters of publication_date and store them as an
# int32 'publication_year' feature
year_text = df_better_WR['publication_date'].str[-4:]
df_better_WR['publication_year'] = year_text.astype('int32')

# Third feature matrix: X2 plus the publication year
X3 = X2.assign(publication_year=df_better_WR['publication_year'])
y3 = y.copy()

In [None]:
# Train and score the model with the publication_year feature added

X_train3, X_valid3, y_train3, y_valid3 = train_test_split(X3, y3, train_size=0.8, test_size=0.2, random_state=0)

# early_stopping_rounds is set on the estimator: passing it to fit() was
# deprecated in XGBoost 1.6 and removed in 2.0.
my_model3 = XGBRegressor(random_state=0, n_estimators=500, learning_rate=0.1,
                         early_stopping_rounds=5)
my_model3.fit(X_train3, y_train3,
              eval_set=[(X_valid3, y_valid3)],
              verbose=False)
predictions3 = my_model3.predict(X_valid3)

# Calculate MAE (arguments in scikit-learn's y_true, y_pred order).
# NOTE: the same validation set is used for early stopping and for scoring,
# so this MAE is an optimistic estimate.
mae3 = mean_absolute_error(y_valid3, predictions3)

# print MAE
print("Mean Absolute Error:" , mae3)

even better!

In [None]:
# Validation MAE of the three models side by side: baseline, + no_books_author, + publication_year
print(mae, mae2, mae3)

The MAE got better with every added feature.