In [1]:
import pandas as pd

import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Show every column when a DataFrame is displayed (no column truncation).
pd.options.display.max_columns = None
# pd.options.display.max_rows = None

In [3]:
# Load the raw, uncleaned Audible export; every column comes in as `object`.
author = pd.read_csv(filepath_or_buffer="./data/audible_uncleaned.csv")

In [4]:
# Preview the first five rows (head() defaults to n=5).
author.head()

Unnamed: 0,name,author,narrator,time,releasedate,language,stars,price
0,Geronimo Stilton #11 & #12,Writtenby:GeronimoStilton,Narratedby:BillLobely,2 hrs and 20 mins,04-08-08,English,5 out of 5 stars34 ratings,468.0
1,The Burning Maze,Writtenby:RickRiordan,Narratedby:RobbieDaymond,13 hrs and 8 mins,01-05-18,English,4.5 out of 5 stars41 ratings,820.0
2,The Deep End,Writtenby:JeffKinney,Narratedby:DanRussell,2 hrs and 3 mins,06-11-20,English,4.5 out of 5 stars38 ratings,410.0
3,Daughter of the Deep,Writtenby:RickRiordan,Narratedby:SoneelaNankani,11 hrs and 16 mins,05-10-21,English,4.5 out of 5 stars12 ratings,615.0
4,"The Lightning Thief: Percy Jackson, Book 1",Writtenby:RickRiordan,Narratedby:JesseBernstein,10 hrs,13-01-10,English,4.5 out of 5 stars181 ratings,820.0


## Checking datatypes

In [5]:
# Summarise column names, non-null counts and dtypes of the raw frame.
author.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 87489 entries, 0 to 87488
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   name         87489 non-null  object
 1   author       87489 non-null  object
 2   narrator     87489 non-null  object
 3   time         87489 non-null  object
 4   releasedate  87489 non-null  object
 5   language     87489 non-null  object
 6   stars        87489 non-null  object
 7   price        87489 non-null  object
dtypes: object(8)
memory usage: 5.3+ MB


- `time` should be converted into a number of minutes
- `releasedate` should be converted into a datetime
- `price` should be converted into a numeric type

## Analyzing each feature

### name

In [6]:
# Count how often each title occurs, most frequent first.
author["name"].value_counts().reset_index()

Unnamed: 0,name,count
0,The Art of War,20
1,Sterling Biographies,19
2,The Odyssey,16
3,Sterling Point Books,16
4,Hamlet,15
...,...,...
82762,Resumen de Inteligencia artificial de Pablo Ro...,1
82763,Resumen de La clave de la venta de Jeffrey Lip...,1
82764,SDGsが生み出す未来のビジネス（できるビジネス）,1
82765,弱者のポジショニング戦略,1


### author

In [7]:
# Strip the "Writtenby:" prefix by keeping the text after the colon.
(
    author["author"]
    .str.split(":")
    .str.get(1)
)

0        GeronimoStilton
1            RickRiordan
2             JeffKinney
3            RickRiordan
4            RickRiordan
              ...       
87484       ChrisStewart
87485      StephenO'Shea
87486          MarkTwain
87487     LaurenceSterne
87488      MarkKurlansky
Name: author, Length: 87489, dtype: object

In [8]:
### All ratings are out of 5, so keep only the star value; the number of ratings could become a separate feature

In [9]:
# Keep only the leading star value from strings such as
# "4.5 out of 5 stars41 ratings"; unrated rows come out as "Not".
(
    author["stars"]
    .str.split("stars")
    .str.get(0)
    .str.split(" ", n=1)
    .str.get(0)
)

0          5
1        4.5
2        4.5
3        4.5
4        4.5
        ... 
87484    Not
87485    Not
87486    Not
87487    Not
87488    Not
Name: stars, Length: 87489, dtype: object

In [10]:
def clean_numeric(ser: pd.Series):
    """Placeholder for a numeric-cleaning helper; not implemented yet.

    Currently does nothing with ``ser`` and returns ``None``.
    """
    return None

In [11]:
### cleaning data

In [12]:
def _duration_to_minutes(time: pd.Series) -> pd.Series:
    """Convert duration text into whole minutes.

    Handles strings such as "2 hrs and 20 mins", "10 hrs", "45 mins" and
    "Less than 1 minute". Hours and minutes are located by their unit, so a
    value that has only one of them is not counted twice. A unit that is
    absent contributes 0.
    """
    def _unit(pattern: str) -> pd.Series:
        # extract() yields NaN where the unit does not occur; treat that as 0.
        return (
            pd.to_numeric(time.str.extract(pattern, expand=False))
            .fillna(0)
            .astype("int64")
        )

    return _unit(r"(\d+)\s*hr") * 60 + _unit(r"(\d+)\s*min")


def clean(df):
    """Return a cleaned copy of the raw Audible frame.

    Transformations:
      * ``author`` / ``narrator``: drop the text up to the first colon
        (e.g. "Writtenby:RickRiordan" -> "RickRiordan").
      * ``time_min``: new integer column holding the duration in minutes,
        derived from the free-text ``time`` column.
      * ``price``: "Free" becomes 0, thousands separators are removed and the
        column is cast to float.

    The raw ``time``, ``releasedate`` and ``stars`` columns are dropped from
    the result, so they are not parsed here.

    Parameters
    ----------
    df : pd.DataFrame
        Frame with string columns ``author``, ``narrator``, ``time``,
        ``releasedate``, ``stars`` and ``price``.

    Returns
    -------
    pd.DataFrame
        New frame; the input is not modified.
    """
    return (
        df
        .assign(
            # n=1 keeps any further colons that belong to the name itself.
            author=lambda df_: df_.author.str.split(":", n=1).str[1],
            narrator=lambda df_: df_.narrator.str.split(":", n=1).str[1],

            # Unit-aware parsing: "10 hrs" is 600 minutes, "45 mins" is 45.
            time_min=lambda df_: _duration_to_minutes(df_.time),

            price=lambda df_: (
                df_.price
                .str.replace("Free", "0").str.replace(",", "")
                .astype("float")
            ),
        )
        .drop(columns=["releasedate", "time", "stars"])
    )

In [13]:
# Run the cleaning pipeline on the raw frame.
clean_author = author.pipe(clean)

In [14]:
# Random preview of the cleaned data; the fixed seed makes the same rows
# appear on every Restart & Run All.
clean_author.sample(5, random_state=0)

Unnamed: 0,name,author,narrator,language,price,time_min
48034,Bible Stories,LoganMarshall,MichaelStevens,English,434.0,293
4504,Disney Classics,DisneyBookGroup,DisneyBookGroup,English,117.0,2013
84534,I Am Princess X,CheriePriest,MaryRobinetteKowal,English,575.0,419
4427,"Pre- and Re-, Mis- and Dis-",BrianP.Cleary,Intuitive,English,164.0,183
18969,あした、元気になぁれ！~新人Vtuber がV のこれからについて語った本~,ぷろぽりす幸子,ぷろぽりす幸子,japanese,530.0,1891


In [15]:
# Another random preview of the cleaned data, seeded so the same rows
# appear on every run.
clean_author.sample(5, random_state=1)

Unnamed: 0,name,author,narrator,language,price,time_min
56775,Sacred Sex,TonyEvans,MirronWillis,English,351.0,104
57831,Red Alert,"StephenLeeb,GregoryDorsey",BrianBascle,English,500.0,456
36564,The Squeaky Wheel,GuyWinchPh.D.,GuyWinchPh.D.,English,134.0,461
9914,Frau Honig und das Glück der kleinen Dinge,SabineBohlmann,SabineBohlmann,german,300.0,190
29085,Meatball Sundae,SethGodin,SethGodin,English,632.0,291


In [16]:
# Confirm the dtypes after cleaning: price is float64 and time_min is int64.
clean_author.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 87489 entries, 0 to 87488
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   name      87489 non-null  object 
 1   author    87489 non-null  object 
 2   narrator  87489 non-null  object 
 3   language  87489 non-null  object 
 4   price     87489 non-null  float64
 5   time_min  87489 non-null  int64  
dtypes: float64(1), int64(1), object(4)
memory usage: 4.0+ MB


In [17]:
# Export is disabled; uncomment to write the cleaned data to disk.
# clean_author.to_csv("clean_author.csv", index=False)