In [1]:
import pandas as pd
import numpy as np
import re
from datetime import datetime

In [2]:
audible = pd.read_csv('audible_sample.csv')
audible.head()

Unnamed: 0,name,author,narrator,time,releasedate,language,stars,price
0,Geronimo Stilton #11 & #12,Writtenby:GeronimoStilton,Narratedby:BillLobely,2 hrs and 20 mins,04-08-08,English,5 out of 5 stars34 ratings,468.0
1,"The Lightning Thief: Percy Jackson, Book 1",Writtenby:RickRiordan,Narratedby:JesseBernstein,10 hrs,13-01-10,English,4.5 out of 5 stars181 ratings,820.0
2,Magic Tree House Collection: Books 9-16,Writtenby:MaryPopeOsborne,Narratedby:MaryPopeOsborne,5 hrs and 23 mins,24-08-11,English,5 out of 5 stars6 ratings,1206.0
3,Magnus Chase and the Ship of the Dead,Writtenby:RickRiordan,Narratedby:MichaelCrouch,12 hrs and 58 mins,03-10-17,English,5 out of 5 stars41 ratings,820.0
4,Geronimo Stilton #13 and #14,Writtenby:GeronimoStilton,Narratedby:BillLobley,2 hrs and 25 mins,08-02-08,English,4.5 out of 5 stars33 ratings,467.0


In [3]:
audible.isnull().sum()

name           0
author         0
narrator       0
time           0
releasedate    0
language       0
stars          0
price          0
dtype: int64

In [4]:
audible.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26231 entries, 0 to 26230
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   name         26231 non-null  object
 1   author       26231 non-null  object
 2   narrator     26231 non-null  object
 3   time         26231 non-null  object
 4   releasedate  26231 non-null  object
 5   language     26231 non-null  object
 6   stars        26231 non-null  object
 7   price        26231 non-null  object
dtypes: object(8)
memory usage: 1.6+ MB


In [5]:
audible['author'].nunique()

18341

In [6]:
def remove_unwanted_words(text):
    unwanted_words = [
        'traductrice', 'illustrateur', 'epilogue', 'translator', 'foreword', 'introduction', 'forewordby', 'editor', 
        'coverillustrator', 'foreward', 'compilation', 'with', 'curatore', 'translation', 'interviewer',
        'afterword', 'contribution', 'traductor', 'traduttore', 'ilustrador', 'music', 'preface', 'collaborator',
        'illustratore', 'AbridgedandIntroducedby', 'traducteur', 'contributor', 'forewordintroduction', 'producer',
        'translator/editor', 'editorforeword', 'adaptation', 'Afterwordby', 'abridgementandintroduction', 'forward', 
        'translatedby', 'commentaries', 'photographer', 'illustrator', 'featuring', 'editorintroduction',
        'introductionandtranslation', 'translatorintroduction', 'translatortransator', 'serieseditor', 
        'introductioncontributor'
    ]
    
    # Pattern to match unwanted words
    pattern = r'[-,\s]?(?:' + '|'.join(unwanted_words) + r')\b-?(?=,|$)'  # Updated pattern
    
    # Removing unwanted words
    text = re.sub(pattern, '', text, flags = re.IGNORECASE)
    
    # Removing extra spaces and hyphens
    text = re.sub(r'[-,\s]+$', '', text).strip()
    
    # Removing trailing comma
    if text.endswith(','):
        text = text[:-1]
    
    return text

In [7]:
audible['author'] = audible['author'].apply(remove_unwanted_words)
audible['author']

0        Writtenby:GeronimoStilton
1            Writtenby:RickRiordan
2        Writtenby:MaryPopeOsborne
3            Writtenby:RickRiordan
4        Writtenby:GeronimoStilton
                   ...            
26226         Writtenby:JeffShaara
26227     Writtenby:StephenBrennan
26228       Writtenby:ColinThubron
26229       Writtenby:ChrisStewart
26230      Writtenby:MarkKurlansky
Name: author, Length: 26231, dtype: object

In [8]:
def author_name(text):
    return text.split(':')[1]

audible['author'] = audible['author'].apply(author_name)
audible['author']

0        GeronimoStilton
1            RickRiordan
2        MaryPopeOsborne
3            RickRiordan
4        GeronimoStilton
              ...       
26226         JeffShaara
26227     StephenBrennan
26228       ColinThubron
26229       ChrisStewart
26230      MarkKurlansky
Name: author, Length: 26231, dtype: object

In [9]:
audible['narrator'] = audible['narrator'].apply(remove_unwanted_words)
audible['narrator']

0             Narratedby:BillLobely
1         Narratedby:JesseBernstein
2        Narratedby:MaryPopeOsborne
3          Narratedby:MichaelCrouch
4             Narratedby:BillLobley
                    ...            
26226      Narratedby:RobertsonDean
26227     Narratedby:KevinStillwell
26228     Narratedby:JonathanKeeble
26229       Narratedby:ChrisStewart
26230        Narratedby:FleetCooper
Name: narrator, Length: 26231, dtype: object

In [10]:
def narrator_name(text):
    return text.split(':')[1]

audible['narrator'] = audible['narrator'].apply(narrator_name)
audible['narrator']

0             BillLobely
1         JesseBernstein
2        MaryPopeOsborne
3          MichaelCrouch
4             BillLobley
              ...       
26226      RobertsonDean
26227     KevinStillwell
26228     JonathanKeeble
26229       ChrisStewart
26230        FleetCooper
Name: narrator, Length: 26231, dtype: object

In [11]:
def extract_numbers(text):
    pattern = r'\d+' # Regular expression pattern to match digits
    numbers = re.findall(pattern, text) # Finding all matches of the pattern in the text  
    return [int(num) for num in numbers] # Converting the numbers from strings to integers

In [12]:
audible['time'] = audible['time'].apply(extract_numbers)
audible['time']

0         [2, 20]
1            [10]
2         [5, 23]
3        [12, 58]
4         [2, 25]
           ...   
26226        [31]
26227    [12, 17]
26228    [11, 51]
26229     [7, 34]
26230      [6, 1]
Name: time, Length: 26231, dtype: object

In [13]:
def hour_to_minutes(time):
    if len(time) == 2:
        hours, mins = time
        return hours * 60 + mins
    elif len(time) == 1:
        hours = time[0]
        return hours * 60

audible['time'] = audible['time'].apply(hour_to_minutes)
audible['time']

0         140
1         600
2         323
3         778
4         145
         ... 
26226    1860
26227     737
26228     711
26229     454
26230     361
Name: time, Length: 26231, dtype: int64

In [14]:
audible.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26231 entries, 0 to 26230
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   name         26231 non-null  object
 1   author       26231 non-null  object
 2   narrator     26231 non-null  object
 3   time         26231 non-null  int64 
 4   releasedate  26231 non-null  object
 5   language     26231 non-null  object
 6   stars        26231 non-null  object
 7   price        26231 non-null  object
dtypes: int64(1), object(7)
memory usage: 1.6+ MB


In [15]:
def date_change(date_str):
    formats = ['%d/%m/%Y', '%d-%m-%y'] # DD/MM/YYYY or dd-mm-yy
    for fmt in formats:
        try:
            date_object = datetime.strptime(date_str, fmt) # Parsing the date string into a datetime object
            formatted_date = date_object.strftime('%Y-%m-%d') # Formatting the datetime object into yyyy-mm-dd format
            return formatted_date 
        except ValueError:
            pass 

In [16]:
audible['releasedate'] = audible['releasedate'].apply(date_change)

In [17]:
audible.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26231 entries, 0 to 26230
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   name         26231 non-null  object
 1   author       26231 non-null  object
 2   narrator     26231 non-null  object
 3   time         26231 non-null  int64 
 4   releasedate  26231 non-null  object
 5   language     26231 non-null  object
 6   stars        26231 non-null  object
 7   price        26231 non-null  object
dtypes: int64(1), object(7)
memory usage: 1.6+ MB


In [18]:
audible['releasedate'] = pd.to_datetime(audible['releasedate'], errors='coerce')
audible['releasedate']

0       2008-08-04
1       2010-01-13
2       2011-08-24
3       2017-10-03
4       2008-02-08
           ...    
26226   2007-03-26
26227   2013-02-28
26228   2018-01-01
26229   2017-03-09
26230   2017-03-07
Name: releasedate, Length: 26231, dtype: datetime64[ns]

In [19]:
audible['language'].unique()

array(['English', 'Hindi', 'french', 'italian', 'spanish', 'swedish',
       'german', 'finnish', 'catalan', 'dutch', 'russian', 'danish',
       'afrikaans', 'polish', 'galician', 'romanian', 'japanese',
       'portuguese', 'icelandic', 'czech', 'bulgarian',
       'mandarin_chinese', 'hungarian', 'urdu', 'korean', 'greek',
       'turkish', 'arabic', 'tamil', 'norwegian', 'slovene'], dtype=object)

In [20]:
def proper_case(text):
    return text.title()

audible['language'] = audible['language'].apply(proper_case)
audible['language']

0        English
1        English
2        English
3        English
4        English
          ...   
26226    English
26227    English
26228    English
26229    English
26230    English
Name: language, Length: 26231, dtype: object

In [21]:
audible['language'].unique()

array(['English', 'Hindi', 'French', 'Italian', 'Spanish', 'Swedish',
       'German', 'Finnish', 'Catalan', 'Dutch', 'Russian', 'Danish',
       'Afrikaans', 'Polish', 'Galician', 'Romanian', 'Japanese',
       'Portuguese', 'Icelandic', 'Czech', 'Bulgarian',
       'Mandarin_Chinese', 'Hungarian', 'Urdu', 'Korean', 'Greek',
       'Turkish', 'Arabic', 'Tamil', 'Norwegian', 'Slovene'], dtype=object)

In [22]:
def extract_ratings(text):
    if 'stars' in text:
        match = re.search(r'\d+', text.split('stars')[1])
        if match:
            return int(match.group()) # returns a string representing the matched digits (if any) in the text
    return 0

audible['ratings'] = audible['stars'].apply(extract_ratings)

In [23]:
def extract_stars(text):
    i = text.split(' ')[0]
    try:
        return float(i)
    except ValueError:
        return i

audible['stars'] = audible['stars'].apply(extract_stars)
audible['stars']

0          5
1        4.5
2          5
3          5
4        4.5
        ... 
26226    Not
26227    Not
26228    Not
26229    Not
26230    Not
Name: stars, Length: 26231, dtype: object

In [24]:
audible['stars'] = audible['stars'].replace('Not', 0)
audible['stars'] = pd.to_numeric(audible['stars'])

In [25]:
audible['stars'].unique()

array([5. , 4.5, 4. , 0. , 3.5, 3. , 2. , 1. , 2.5, 1.5])

In [26]:
audible.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26231 entries, 0 to 26230
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   name         26231 non-null  object        
 1   author       26231 non-null  object        
 2   narrator     26231 non-null  object        
 3   time         26231 non-null  int64         
 4   releasedate  26231 non-null  datetime64[ns]
 5   language     26231 non-null  object        
 6   stars        26231 non-null  float64       
 7   price        26231 non-null  object        
 8   ratings      26231 non-null  int64         
dtypes: datetime64[ns](1), float64(1), int64(2), object(5)
memory usage: 1.8+ MB


In [27]:
audible.head()

Unnamed: 0,name,author,narrator,time,releasedate,language,stars,price,ratings
0,Geronimo Stilton #11 & #12,GeronimoStilton,BillLobely,140,2008-08-04,English,5.0,468.0,34
1,"The Lightning Thief: Percy Jackson, Book 1",RickRiordan,JesseBernstein,600,2010-01-13,English,4.5,820.0,181
2,Magic Tree House Collection: Books 9-16,MaryPopeOsborne,MaryPopeOsborne,323,2011-08-24,English,5.0,1206.0,6
3,Magnus Chase and the Ship of the Dead,RickRiordan,MichaelCrouch,778,2017-10-03,English,5.0,820.0,41
4,Geronimo Stilton #13 and #14,GeronimoStilton,BillLobley,145,2008-02-08,English,4.5,467.0,33


In [28]:
def price_mod(text):
    if 'Free' in text:
        return 0.00
    else:
        text = text.replace(',', '')
        return float(text)
    
audible['price'] = audible['price'].apply(price_mod)

In [29]:
audible['price']
audible['price'] = pd.to_numeric(audible['price'])

In [30]:
audible.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26231 entries, 0 to 26230
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   name         26231 non-null  object        
 1   author       26231 non-null  object        
 2   narrator     26231 non-null  object        
 3   time         26231 non-null  int64         
 4   releasedate  26231 non-null  datetime64[ns]
 5   language     26231 non-null  object        
 6   stars        26231 non-null  float64       
 7   price        26231 non-null  float64       
 8   ratings      26231 non-null  int64         
dtypes: datetime64[ns](1), float64(2), int64(2), object(4)
memory usage: 1.8+ MB


In [31]:
audible.describe()

Unnamed: 0,time,stars,price,ratings
count,26231.0,26231.0,26231.0,26231.0
mean,652.72178,0.761008,559.138881,2.372041
std,644.13113,1.707095,335.070331,21.111288
min,60.0,0.0,0.0,0.0
25%,292.0,0.0,268.0,0.0
50%,481.0,0.0,585.0,0.0
75%,720.0,0.0,755.0,0.0
max,8595.0,5.0,7198.0,930.0


In [32]:
audible.head(20)

Unnamed: 0,name,author,narrator,time,releasedate,language,stars,price,ratings
0,Geronimo Stilton #11 & #12,GeronimoStilton,BillLobely,140,2008-08-04,English,5.0,468.0,34
1,"The Lightning Thief: Percy Jackson, Book 1",RickRiordan,JesseBernstein,600,2010-01-13,English,4.5,820.0,181
2,Magic Tree House Collection: Books 9-16,MaryPopeOsborne,MaryPopeOsborne,323,2011-08-24,English,5.0,1206.0,6
3,Magnus Chase and the Ship of the Dead,RickRiordan,MichaelCrouch,778,2017-10-03,English,5.0,820.0,41
4,Geronimo Stilton #13 and #14,GeronimoStilton,BillLobley,145,2008-02-08,English,4.5,467.0,33
5,Exile,ShannonMessenger,CaitlinKelly,881,2018-11-06,English,5.0,836.0,20
6,Merlin Mission Collection,MaryPopeOsborne,MaryPopeOsborne,618,2017-05-02,English,5.0,1256.0,11
7,Neverseen,ShannonMessenger,CaitlinKelly,1002,2018-11-06,English,5.0,1003.0,13
8,Magic Tree House Collection: Books 25-32,MaryPopeOsborne,MaryPopeOsborne,429,2020-09-01,English,5.0,1256.0,3
9,Geronimo Stilton #20 and #21,GeronimoStilton,BillLobley,152,2010-10-01,English,5.0,469.0,18


In [33]:
audible.to_csv('audible_edited.csv', index = False)