<h1><center>Book review ratings. Analysis and visualization</center></h1>

<center><img src="https://www.detroitlabs.com/wp-content/uploads/2018/02/alfons-morales-YLSwjSy7stw-unsplash.jpg"></center>

<a id="top"></a>

<div class="list-group" id="list-tab" role="tablist">
<h3 class="list-group-item list-group-item-action active" data-toggle="list" style='background:Brown; border:0' role="tab" aria-controls="home"><center>Quick navigation</center></h3>

* [1. Data visualization](#1)
* [2. Feature engineering](#2)

In [None]:
import numpy as np
import pandas as pd
import plotly.express as px
from iso3166 import countries

<a id="1"></a>
<h2 style='background:brown; border:0; color:white'><center>1. Data visualization</center></h2>

In [None]:
# Column names applied to each table in place of whatever the files provide.
u_cols = ['user_id', 'location', 'age']
b_cols = ['isbn', 'book_title' ,'book_author','year_of_publication', 'publisher', 'img_s', 'img_m', 'img_l']
r_cols = ['user_id', 'isbn', 'rating']

# Parser settings shared by all three files: ';'-separated, latin-1 encoded,
# first line skipped (presumably the header row, since names are supplied above).
csv_options = dict(sep=';', encoding='latin-1', low_memory=False, skiprows=1)

users = pd.read_csv(
    '../input/bookcrossing-dataset/Book reviews/BX-Users.csv',
    names=u_cols,
    **csv_options,
)
books = pd.read_csv(
    '../input/bookcrossing-dataset/Book reviews/BX_Books.csv',
    names=b_cols,
    **csv_options,
)
ratings = pd.read_csv(
    '../input/bookcrossing-dataset/Book reviews/BX-Book-Ratings.csv',
    names=r_cols,
    **csv_options,
)

In [None]:
# Join every rating to the user who gave it, then to the book it refers to.
# Both joins use pandas' default (inner) merge on the named key.
user_ratings = users.merge(ratings, on='user_id')
df = user_ratings.merge(books, on='isbn')
df

First, let's look at the rating distribution.

In [None]:
# Frequency table of rating values: one row per distinct rating.
ds = (
    df['rating']
    .value_counts()
    .rename_axis('value')
    .reset_index(name='count')
)

# Vertical bar chart of how often each rating value occurs.
fig = px.bar(
    ds,
    x='value',
    y='count',
    orientation='v',
    title='Ranking distribution',
    width=800,
    height=600,
)
fig.show()

What about the year of publication?

In [None]:
# Count ratings per publication year, label each year as text so the axis is
# treated as categorical, and keep the 50 most frequent years.
ds = (
    df['year_of_publication']
    .value_counts()
    .rename_axis('value')
    .reset_index(name='count')
    .assign(value=lambda table: table['value'].astype(str) + ' year')
    .sort_values('count')
    .tail(50)
)

fig = px.bar(
    ds,
    x='count',
    y='value',
    orientation='h',
    title='Top 50 years of publishing',
    width=800,
    height=900,
)
fig.show()

Let's check the authors now.

In [None]:
# Number of ratings per author; keep the 50 authors with the most ratings.
ds = (
    df['book_author']
    .value_counts()
    .rename_axis('author')
    .reset_index(name='count')
    .sort_values('count')
    .tail(50)
)

fig = px.bar(
    ds,
    x='count',
    y='author',
    orientation='h',
    title='Authors with largest number of votes',
    width=800,
    height=900,
)
fig.show()

Let's do the same for books.

In [None]:
# Number of ratings per book title; keep the 50 titles with the most ratings.
ds = (
    df['book_title']
    .value_counts()
    .rename_axis('book_title')
    .reset_index(name='count')
    .sort_values('count')
    .tail(50)
)

fig = px.bar(
    ds,
    x='count',
    y='book_title',
    orientation='h',
    title='Books with largest number of votes',
    width=800,
    height=900,
)
fig.show()

The age of the users looks really impressive: some of them are more than 200 years old!

In [None]:
# Histogram of the age column over all rows of the merged frame
# (one row per rating, so active users are counted once per rating).
fig = px.histogram(
    data_frame=df,
    x="age",
    nbins=100,
    title='Age distribution',
    width=700,
    height=600,
)
fig.show()

Let's see the average age for every rating.

In [None]:
# Mean age of the users behind each rating value (missing ages are skipped by mean()).
data = df.groupby('rating', as_index=False)['age'].mean()

fig = px.bar(
    data,
    x="rating",
    y="age",
    orientation='v',
    title='Average age for every raiting',
    width=800,
    height=700,
)
fig.show()

Let's check the users who wrote the most reviews.

In [None]:
# Ratings per user, with the id turned into a text label ("user <id>") so the
# chart axis is categorical, ordered from fewest to most ratings.
users = (
    df['user_id']
    .value_counts()
    .rename_axis('user_id')
    .reset_index(name='evaluation_count')
    .assign(user_id=lambda table: 'user ' + table['user_id'].astype(str))
    .sort_values('evaluation_count')
)

# The last 50 rows of the ascending sort are the 50 most active users.
top_reviewers = users.tail(50)

fig = px.bar(
    top_reviewers,
    x="evaluation_count",
    y="user_id",
    orientation='h',
    title='Top 50 book reviewers',
    width=800,
    height=900,
)
fig.show()

What about the users with the best loyalty?

In [None]:
# Ratings per user as a two-column table (user_id, evaluation_count).
users = (
    df['user_id']
    .value_counts()
    .rename_axis('user_id')
    .reset_index(name='evaluation_count')
)

# Attach each user's rating count to every row of the main frame.
df = df.merge(users)

# Mean rating per user, restricted to users with more than 100 ratings,
# ordered from lowest to highest mean.
is_frequent_rater = df['evaluation_count'] > 100
mean_df = (
    df.loc[is_frequent_rater]
    .groupby('user_id')['rating']
    .mean()
    .reset_index()
    .sort_values('rating')
)
mean_df['user_id'] = 'user ' + mean_df['user_id'].astype(str)

fig = px.bar(
    mean_df.tail(50),
    x="rating",
    y="user_id",
    orientation='h',
    title='Top 50 users with highest avarage rating (more than 100 evaluations)',
    width=800,
    height=900,
)
fig.show()

In [None]:
# Number of ratings per book title. value_counts() leaves out missing titles.
title_counts = df['book_title'].value_counts()
books = title_counts.rename_axis('book_title').reset_index(name='book_evaluation_count')

# Attach the per-title count to every row with map() instead of an inner merge.
# An inner merge against the counts table would silently remove every row whose
# title is missing (those titles never appear in the counts), shrinking `df`
# for all later cells. With map() such rows are kept and simply get NaN, and
# re-running the cell just overwrites the column.
df = df.assign(book_evaluation_count=df['book_title'].map(title_counts))

# Mean rating per title, restricted to titles with more than 100 ratings
# (NaN counts compare as False and are excluded), sorted ascending.
mean_df = df[df['book_evaluation_count'] > 100]
mean_df = mean_df.groupby('book_title')['rating'].mean().reset_index().sort_values('rating')

# The last 50 rows of the ascending sort are the 50 highest-rated titles.
fig = px.bar(
    mean_df.tail(50),
    x="rating",
    y="book_title",
    orientation='h',
    title='Top 50 books with highest avarage rating (more than 100 evaluations)',
    width=1000,
    height=900
)
fig.show()

In [None]:
# Number of ratings per publisher. value_counts() leaves out missing publishers.
publisher_counts = df['publisher'].value_counts()
books = publisher_counts.rename_axis('publisher').reset_index(name='publisher_evaluation_count')

# Attach the per-publisher count to every row with map() instead of an inner
# merge. An inner merge against the counts table would silently remove every
# row whose publisher is missing (it never appears in the counts), shrinking
# `df` for all later cells. With map() such rows are kept and get NaN, and
# re-running the cell just overwrites the column.
df = df.assign(publisher_evaluation_count=df['publisher'].map(publisher_counts))

# Mean rating per publisher, restricted to publishers with more than 100
# ratings (NaN counts compare as False and are excluded), sorted ascending.
mean_df = df[df['publisher_evaluation_count'] > 100]
mean_df = mean_df.groupby('publisher')['rating'].mean().reset_index().sort_values('rating')

# The last 50 rows of the ascending sort are the 50 highest-rated publishers.
fig = px.bar(
    mean_df.tail(50),
    x="rating",
    y="publisher",
    orientation='h',
    title='Top 50 publishers with highest avarage rating (more than 100 evaluations)',
    width=1000,
    height=900
)
fig.show()

In [None]:
# Number of ratings per author. value_counts() leaves out missing authors.
author_counts = df['book_author'].value_counts()
books = author_counts.rename_axis('book_author').reset_index(name='author_evaluation_count')

# Attach the per-author count to every row with map() instead of an inner
# merge. An inner merge against the counts table would silently remove every
# row whose author is missing (it never appears in the counts), shrinking `df`
# for all later cells. With map() such rows are kept and get NaN, and
# re-running the cell just overwrites the column.
df = df.assign(author_evaluation_count=df['book_author'].map(author_counts))

# Mean rating per author, restricted to authors with more than 100 ratings
# (NaN counts compare as False and are excluded), sorted ascending.
mean_df = df[df['author_evaluation_count'] > 100]
mean_df = mean_df.groupby('book_author')['rating'].mean().reset_index().sort_values('rating')

# The last 50 rows of the ascending sort are the 50 highest-rated authors.
fig = px.bar(
    mean_df.tail(50),
    x="rating",
    y="book_author",
    orientation='h',
    title='Top 50 authors with highest avarage rating (more than 100 evaluations)',
    width=1000,
    height=900
)
fig.show()

<a id="2"></a>
<h2 style='background:brown; border:0; color:white'><center>2. Feature engineering</center></h2>

### Let's create columns based on the location column

In [None]:
# Split the comma-separated location once and reuse the pieces.
# NOTE(review): assumes locations look like "city, state, country" — rows with
# fewer than three parts get NaN for the missing positions; confirm against the data.
location_parts = df['location'].str.split(',')

# Add the whitespace-trimmed pieces as columns, then drop the raw location and
# the three image-URL columns.
df = df.assign(
    country=location_parts.str[2].str.strip(),
    state=location_parts.str[1].str.strip(),
    city=location_parts.str[0].str.strip(),
).drop(columns=['location', 'img_s', 'img_m', 'img_l'])

# Work in Progress