# Data Preprocessing and Data Analysis

### Getting Ready

Importing libraries and setting some variables

In [1]:
import ast
import glob
import os

import numpy as np
import pandas as pd

# Books need more than this many ratings to be kept by `preprocess_df`; the same
# value is the weight given to the mean star rating in its `w_rating` formula.
RATING_THRESHOLD = 1000

Preprocessing function

In [2]:
def preprocess_df(df, rating_threshold=None):
    """Clean a raw books DataFrame and derive the `w_rating` and `tags` columns.

    The input frame is not modified; a new DataFrame is returned (the original
    index labels of the surviving rows are kept).

    Steps:
      1. Drop rows missing `author`, `summary` or `genres`, and drop the
         unneeded columns.
      2. Parse `author` (string repr of a list) and keep only the first entry.
      3. Keep rows whose author is not a placeholder ('Anonymous', 'Unknown',
         'Various'), whose summary has at least 75 characters, and whose
         `num_ratings` is strictly greater than `rating_threshold`.
      4. Parse `genres` (string repr of a list) into a space-separated string.
      5. Compute `w_rating`: `star_rating` shrunk toward the mean star rating
         of the surviving rows, with `rating_threshold` as the weight of the
         mean; rounded to 2 decimals.
      6. Build `tags` = lower-cased genres + author (spaces removed) + summary,
         with punctuation stripped and whitespace runs collapsed.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'author', 'summary', 'genres', 'num_ratings',
        'star_rating', 'about_author', 'community_reviews', 'kindle_price',
        'num_reviews' and 'id'.
    rating_threshold : number, optional
        Minimum-ratings cut-off and smoothing weight. Defaults to the
        notebook-level `RATING_THRESHOLD` when omitted.

    Returns
    -------
    pandas.DataFrame
        The cleaned frame with the extra columns 'w_rating' and 'tags'.

    Raises
    ------
    KeyError
        If one of the required columns is missing.
    IndexError
        If an `author` value parses to an empty list.
    """
    if rating_threshold is None:
        # Looked up at call time so the notebook-level setting stays the default.
        rating_threshold = RATING_THRESHOLD

    min_summary_len = 75
    placeholder_authors = ['Anonymous', 'Unknown', 'Various']

    # Work on an explicit copy so the caller's frame is never touched.
    books = (
        df.dropna(subset=['author', 'summary', 'genres'])
          .drop(columns=['about_author', 'community_reviews', 'kindle_price', 'num_reviews', 'id'])
          .copy()
    )

    # 'author' holds the string repr of a list; keep only the first listed author.
    books['author'] = books['author'].apply(lambda x: ast.literal_eval(x)[0])

    # Apply all row filters in one mask. The summary test is written as `>=`
    # on purpose: `~s.str.len() < 75` negates the lengths bitwise before
    # comparing, which is always True and therefore filters nothing.
    keep = (
        ~books['author'].isin(placeholder_authors)
        & (books['summary'].str.len() >= min_summary_len)
        & (books['num_ratings'] > rating_threshold)
    )
    # .copy() so the column assignments below write to an independent frame.
    books = books[keep].copy()

    # 'genres' holds the string repr of a list; flatten it to one string.
    books['genres'] = books['genres'].apply(lambda x: " ".join(ast.literal_eval(x)))

    # Weighted rating: the more ratings a book has, the closer it stays to its
    # own star rating; otherwise it is pulled toward the mean of the kept rows.
    votes = books['num_ratings']
    mean_rating = books['star_rating'].mean()
    books['w_rating'] = (votes / (votes + rating_threshold) * books['star_rating'] +
                         rating_threshold / (votes + rating_threshold) * mean_rating).round(2)

    # Collapse the author into a single token. `.str.replace` works on
    # substrings; plain `Series.replace` would only match whole cell values.
    author_token = books['author'].str.lower().str.replace(' ', '', regex=False)
    tags = books['genres'].str.lower() + ' ' + author_token + ' ' + books['summary'].str.lower()

    # Strip punctuation, then squeeze runs of whitespace into single spaces.
    books['tags'] = (
        tags.str.replace(r'[^\w\s]', '', regex=True)
            .str.replace(r'\s+', ' ', regex=True)
    )

    return books

### 1. Data Preprocessing stage

Detecting all `.parquet` files

In [3]:
# Collect the raw input files in a deterministic order (glob's own order is
# unspecified). The processed output is saved into this same directory, so it
# is excluded here; otherwise a re-run would feed it back in as raw input.
parquet_files = sorted(
    path
    for path in glob.glob('../datasets/*.parquet')
    if os.path.basename(path) != 'processed_books.parquet'
)

Preprocessing all `.parquet` files and appending to the `result` DataFrame.

In [None]:
# Preprocess every raw file, then concatenate once at the end (growing a
# DataFrame with concat inside the loop copies all previous rows each time).
# NOTE(review): each file is preprocessed on its own, so the mean star rating
# used inside `w_rating` is that file's mean, not the whole dataset's.
processed_frames = []
for file in parquet_files:
    print(f"Processing file: {file}")
    processed_frames.append(preprocess_df(pd.read_parquet(file)))

# pd.concat raises on an empty list, so fall back to an empty frame.
result = pd.concat(processed_frames, ignore_index=True) if processed_frames else pd.DataFrame()

Processing file: ../datasets\0000.parquet
Processing file: ../datasets\0001.parquet
Processing file: ../datasets\0002.parquet
Processing file: ../datasets\0003.parquet
Processing file: ../datasets\0004.parquet
Processing file: ../datasets\0005.parquet
Processing file: ../datasets\0006.parquet
Processing file: ../datasets\0007.parquet
Processing file: ../datasets\0008.parquet
Processing file: ../datasets\0009.parquet


Resetting index

In [9]:
# Renumber the rows 0..n-1 and discard the previous index labels.
result = result.reset_index(drop=True)

Saving the `result` DataFrame

In [14]:
# Persist the cleaned dataset without writing the index as a column.
processed_path = '../datasets/processed_books.parquet'
result.to_parquet(processed_path, index=False)

### 2. Data Analysis with the whole processed dataset