# Data Preprocessing

Import libraries

In [111]:
import pandas as pd
import numpy as np
import unicodedata
import string


Preprocessing functions

In [77]:
def clean_text(text):
    text = ''.join(char for char in text if char in string.printable)
    return text

In [104]:
def preprocessing(raw_df: pd.DataFrame):
    raw_df = raw_df.loc[:, ['title', 'author', 'desc', 'genre', 'rating', 'reviews', 'totalratings',
                            'pages', 'img', 'link', 'isbn']]
    raw_df.dropna(subset=['desc', 'title', 'img'], inplace=True)
    raw_df['desc'] = raw_df['desc'].apply(clean_text)
    raw_df['score'] = raw_df['rating'] * raw_df['totalratings']
    raw_df['score'] = (raw_df['score'] / raw_df['score'].max()).round(3)
    raw_df.drop_duplicates(subset=['title'], inplace=True)
    raw_df['genre'] = raw_df['genre'].str.split(',')
    raw_df['author'] = raw_df['author'].str.split(',')
    raw_df['genre'].fillna('[]', inplace=True)
    raw_df['isbn'].fillna('Missing', inplace=True)
    raw_df = raw_df[raw_df['desc'].str.len() > 75]
    return raw_df

Preprocess the raw dataset and save it

In [105]:
df = pd.read_csv(r'..\datasets\GoodReads_100k_books.csv')
df = preprocessing(df)
df.to_csv(r'..\datasets\dataset.csv', index=False)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  raw_df['genre'].fillna('[]', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  raw_df['isbn'].fillna('Missing', inplace=True)


Load the preprocessed dataset

In [106]:
df = pd.read_csv(r'..\datasets\dataset.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 87687 entries, 0 to 87686
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   title         87687 non-null  object 
 1   author        87687 non-null  object 
 2   desc          87687 non-null  object 
 3   genre         87687 non-null  object 
 4   rating        87687 non-null  float64
 5   reviews       87687 non-null  int64  
 6   totalratings  87687 non-null  int64  
 7   pages         87687 non-null  int64  
 8   img           87687 non-null  object 
 9   link          87687 non-null  object 
 10  isbn          87687 non-null  object 
 11  score         87687 non-null  float64
dtypes: float64(2), int64(3), object(7)
memory usage: 8.0+ MB


### Get the dataset ready for modelling

Create a new column `tags`

In [None]:
df['desc'] = df['desc'].str.replace(r'[^\w\s]', ' ', regex=True).str.replace(r'\s+', ' ', regex=True).str.lower()

In [None]:
df['author'] = df['author'].apply(lambda x: " ".join(i.replace(" ", "") for i in ast.literal_eval(x)).lower())

In [None]:
df['genre'] = df['genre'].apply(lambda x: " ".join(i.replace(" ", "") for i in ast.literal_eval(x)).lower())

Create a `tags` column which combines `author`, `genre` and `desc`

In [None]:
df['tags'] = df['author'] + ' ' + df['genre'] + ' ' + df['desc']

Revert these columns to initial state

In [None]:
reset_df = pd.read_csv(r'..\datasets\dataset.csv')

In [None]:
df['desc'], df['author'], df['genre'] = reset_df['desc'], reset_df['author'], reset_df['genre']

In [None]:
df.to_csv(r'..\datasets\dataset_with_tags.csv', index=False)